<a href="https://colab.research.google.com/github/hasun/ProgrammingCI/blob/master/vocabToken.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
# Sample Korean sentence; its first two words are repeated at the end, so the
# token list contains duplicates.
sentence = "이것이 무엇인가 보고 듣고 먹고 냄새맞고 생각하는 이것이 무엇인가"
# Tokenize on whitespace (bare expression: a notebook cell displays its value).
sentence.split()

In [None]:
import numpy as np
import pandas as pd

# Whitespace tokens of `sentence`, which must already be defined.
token_sequence = str.split(sentence)
# Sorted unique tokens; their order fixes the column order of the matrix.
vocab = sorted(set(token_sequence))
num_tokens = len(token_sequence)
vocab_size = len(vocab)

# Map every word to its column once, instead of scanning `vocab` with
# list.index() for each token.
column_of = {word: col for col, word in enumerate(vocab)}

# One row per token, one column per vocabulary word.
onehot_vectors = np.zeros((num_tokens, vocab_size), int)
for i, word in enumerate(token_sequence):
    onehot_vectors[i, column_of[word]] = 1

print(onehot_vectors)

# Same matrix with the vocabulary words as column labels (bare expression:
# a notebook cell displays its value).
pd.DataFrame(onehot_vectors, columns=vocab)

In [None]:
def preprocess(text):
    '''Convert raw text into a corpus of word IDs.

    The text is lower-cased, every period is split off as its own token, and
    the result is tokenized on whitespace. IDs are assigned in order of first
    appearance, starting at 0.

    :param text: input string
    :return: tuple ``(corpus, word_to_id, id_to_word)``: a 1-D NumPy integer
        array of word IDs, the word -> ID dict, and the ID -> word dict
    '''
    text = text.lower().replace('.', ' .')
    # split() without an argument collapses runs of whitespace. A period that
    # already follows a space (now two spaces), tabs, newlines, or
    # leading/trailing blanks therefore never produce an empty-string "word".
    words = text.split()

    word_to_id = {}
    for word in words:
        # setdefault keeps the ID of the first occurrence; len() is evaluated
        # before the insertion, so IDs are consecutive from 0.
        word_to_id.setdefault(word, len(word_to_id))
    id_to_word = {word_id: word for word, word_id in word_to_id.items()}

    # dtype=int keeps an integer array even when the text has no tokens.
    corpus = np.array([word_to_id[w] for w in words], dtype=int)

    return corpus, word_to_id, id_to_word

def convert_one_hot(corpus, vocab_size):
    '''Convert word IDs to a one-hot representation.

    A trailing axis of length ``vocab_size`` is appended to the input shape,
    so a 1-D input gives a 2-D result and a 2-D input gives a 3-D result.
    Inputs of any other rank are handled the same way instead of failing
    with an unbound local variable.

    :param corpus: word IDs (NumPy integer array, or anything
        ``np.asarray`` turns into one)
    :param vocab_size: vocabulary size (length of each one-hot vector)
    :return: int32 NumPy array of shape ``corpus.shape + (vocab_size,)``
    :raises IndexError: if an ID does not fit into ``vocab_size`` columns
    '''
    ids = np.asarray(corpus)
    one_hot = np.zeros(ids.shape + (vocab_size,), dtype=np.int32)

    if ids.size:
        # View the output as (number of IDs, vocab_size); reshape of the
        # freshly allocated contiguous array is a view, so writing into
        # `rows` fills `one_hot` in a single vectorized assignment.
        rows = one_hot.reshape(ids.size, vocab_size)
        rows[np.arange(ids.size), ids.reshape(-1)] = 1

    return one_hot

def create_contexts_target(corpus, window_size=1):
    '''Build the context/target pairs of a corpus.

    Every position that has ``window_size`` neighbours on both sides becomes
    a target; its context is those neighbours, left to right, without the
    target itself.

    :param corpus: corpus (sequence of word IDs, normally a 1-D NumPy array)
    :param window_size: window size (with 1, the context is one word to the
        left and one word to the right of the target)
    :return: tuple ``(contexts, target)`` of NumPy arrays; ``contexts`` has
        one row of ``2 * window_size`` IDs per entry of ``target``
    :raises ValueError: if ``window_size`` is negative
    '''
    if window_size < 0:
        raise ValueError('window_size must be non-negative')

    # Explicit stop index: the slice [window_size:-window_size] would be
    # empty for window_size == 0, leaving contexts and target out of step.
    stop = len(corpus) - window_size
    target = corpus[window_size:stop]

    # Offsets relative to the target, skipping 0 (the target itself).
    offsets = [t for t in range(-window_size, window_size + 1) if t != 0]
    contexts = [
        [corpus[idx + t] for t in offsets]
        for idx in range(window_size, stop)
    ]

    return np.array(contexts), np.array(target)

In [None]:
window_size = 1
# Turn the sentence into word IDs plus the two lookup tables.
corpus, word_to_id, id_to_word = preprocess(sentence)

print(corpus)
print("키워드 맵핑=======")
print(word_to_id)

vocab_size = len(word_to_id)
contexts, target = create_contexts_target(corpus, window_size)
# Debug aid: print `target` (target word IDs) and `contexts` (IDs of the
# words around each target) here to inspect them before one-hot encoding.

# One-hot encode the targets and show them with the words as column labels.
target = convert_one_hot(target, vocab_size)
print(pd.DataFrame(target, columns=word_to_id))

# One-hot encode the surrounding (context) words.
contexts = convert_one_hot(contexts, vocab_size)
print(contexts)

In [None]:
import tensorflow as tf
from tensorflow.keras import preprocessing
import numpy as np

In [None]:
# Training sentences (Korean).
samples = [
    '너 오늘 이뻐 보인다',
    '나는 오늘 기분이 더러워',
    '끝내주는데, 좋은 일이 있나봐',
    '나 좋은 일이 생겼어',
    '아 오늘 진짜 짜증나',
    '환상적인데, 정말 좋은거 같아',
]

# One label per training sentence, each wrapped in its own list.
# Presumably 1 = positive and 0 = negative sentiment; confirm with the author.
targets = [[1], [0], [1], [1], [0], [1]]

# Shorter sentences used later for prediction.
test_samples = [
    '이뻐 보인다',
    '안좋아',
    '좋은 일이 있나봐',
    '일이 생겼어',
    '짜증나',
    '좋은거 같아',
]

# Fit a Keras tokenizer on the training sentences and encode them as
# sequences of integer word indices.
tokenizer = preprocessing.text.Tokenizer()
tokenizer.fit_on_texts(samples)
sequences = tokenizer.texts_to_sequences(samples)
# NOTE(review): np.array() only gives a rectangular array if every sequence
# has the same length; that looks true for these samples, but verify before
# changing them.
input_sequences = np.array(sequences)
labels = np.array(targets)

# Word -> index mapping learned by the tokenizer.
word_index = tokenizer.word_index

In [None]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression

In [None]:
RANDOM_SEED = 42
TEST_SPLIT = 0.2

# TF-IDF over character 1- to 3-grams with sublinear TF scaling, capped at
# 5000 features.
vectorizer = TfidfVectorizer(min_df=0.0, analyzer="char", sublinear_tf=True,
                             ngram_range=(1, 3), max_features=5000)

X = vectorizer.fit_transform(samples)
# `targets` holds one single-element list per sample; flatten the resulting
# column vector to the 1-D label array scikit-learn expects for `y`.
y = np.array(targets).ravel()

# get_feature_names() was removed in scikit-learn 1.2. Prefer its replacement
# and fall back to the old method on versions that predate it.
if hasattr(vectorizer, "get_feature_names_out"):
    features = list(vectorizer.get_feature_names_out())
else:
    features = vectorizer.get_feature_names()

X_train, X_eval, y_train, y_eval = train_test_split(
    X, y, test_size=TEST_SPLIT, random_state=RANDOM_SEED)

lgs = LogisticRegression(class_weight='balanced')
lgs.fit(X_train, y_train)

predicted = lgs.predict(X_eval)
print("Accuracy: %f" % lgs.score(X_eval, y_eval))

# Vectorize the unseen sentences with the fitted vocabulary and predict.
testDataVecs = vectorizer.transform(test_samples)
test_predicted = lgs.predict(testDataVecs)
print(test_predicted)


In [None]:
def get_features(words, model, num_features):
    '''Average the word vectors of the words the model knows.

    :param words: iterable of tokens
    :param model: trained model exposing its keyed vectors as ``model.wv``
        (e.g. a gensim Word2Vec model)
    :param num_features: length of the word vectors
    :return: 1-D array with the mean vector of all in-vocabulary words, or
        all zeros when none of the words is in the vocabulary
    '''
    feature_vector = np.zeros((num_features), dtype=np.float32)

    keyed_vectors = model.wv
    # gensim 4 renamed `index2word` to `index_to_key`; support both.
    if hasattr(keyed_vectors, "index_to_key"):
        known_words = set(keyed_vectors.index_to_key)
    else:
        known_words = set(keyed_vectors.index2word)

    num_words = 0
    for w in words:
        if w in known_words:
            num_words += 1
            # Look vectors up through `model.wv`; indexing the model object
            # itself is not supported by gensim 4.
            feature_vector = np.add(feature_vector, keyed_vectors[w])

    if num_words == 0:
        # Nothing to average: return the zero vector instead of dividing by
        # zero, which would fill the result with NaN.
        return feature_vector

    return np.divide(feature_vector, num_words)

def get_dataset(reviews, model, num_features):
    '''Build the feature matrix for a collection of tokenized texts.

    :param reviews: iterable of token sequences, one per text
    :param model: model passed through to ``get_features``
    :param num_features: vector length passed through to ``get_features``
    :return: array with the ``get_features`` vector of each text stacked
        along a new first axis
    '''
    per_text_vectors = [
        get_features(tokens, model, num_features) for tokens in reviews
    ]
    return np.stack(per_text_vectors)

In [None]:
import inspect

from gensim.models import word2vec
from sklearn import metrics

# Word2Vec hyper-parameters; each is passed to the constructor below.
num_features = 5        # vector size
min_word_count = 1      # min_count
num_workers = 4         # workers
context = 5             # window
downsampling = 1e-3     # sample

# Whitespace-tokenize every training sentence.
sentences = [sample.split() for sample in samples]

print(sentences)

# gensim 4 renamed the `size` keyword to `vector_size`; pick whichever name
# the installed version's constructor accepts.
w2v_params = inspect.signature(word2vec.Word2Vec.__init__).parameters
size_kwarg = "vector_size" if "vector_size" in w2v_params else "size"
model = word2vec.Word2Vec(sentences, workers=num_workers,
                          min_count=min_word_count, window=context,
                          sample=downsampling, **{size_kwarg: num_features})

# One averaged word vector per training sentence.
test_data_vecs = get_dataset(sentences, model, num_features)

X = test_data_vecs
# `targets` holds one single-element list per sample; flatten the resulting
# column vector to the 1-D label array scikit-learn expects for `y`.
y = np.array(targets).ravel()

print(X)

X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=TEST_SPLIT, random_state=RANDOM_SEED)

lgs = LogisticRegression(class_weight='balanced')
lgs.fit(X_train, y_train)

predicted = lgs.predict(X_test)

# ROC curve from the predicted probability of class 1.
fpr, tpr, _ = metrics.roc_curve(y_test, (lgs.predict_proba(X_test)[:, 1]))
auc = metrics.auc(fpr, tpr)

print("------------")
print("Accuracy: %f" % lgs.score(X_test, y_test))  # checking the accuracy
print("Precision: %f" % metrics.precision_score(y_test, predicted))
print("Recall: %f" % metrics.recall_score(y_test, predicted))
print("F1-Score: %f" % metrics.f1_score(y_test, predicted))
print("AUC: %f" % auc)
