1. 다층 퍼셉트론(MultiLayer Perceptron, MLP)

In [None]:
# Even without knowing concepts such as RNNs (recurrent neural networks) or
# distributed representations, NLP tasks can be practiced using only a
# feed-forward neural network (FFNN).

2. Keras :: texts_to_matrix() 이해

In [None]:
import numpy as np
from tensorflow.keras.preprocessing.text import Tokenizer

In [None]:
# Toy corpus used to illustrate how the Tokenizer indexes words.
texts = [
    '먹고 싶은 사과',
    '먹고 싶은 바나나',
    '길고 노란 바나나 바나나',
    '저는 과일이 좋아요',
]

# Fit the tokenizer on the corpus, then show its word_index mapping.
tokenizer = Tokenizer()
tokenizer.fit_on_texts(texts)
word_to_index = tokenizer.word_index
print(word_to_index)

In [None]:
# 'count' mode produces a document-term matrix (DTM).
count_matrix = tokenizer.texts_to_matrix(texts, mode='count')
print(count_matrix)

# Mind the indexing when reading the matrix:
#   word indices start at 1,
#   matrix indices start at 0.



In [None]:
# 'binary' mode only records whether a word occurs, not how often.
binary_matrix = tokenizer.texts_to_matrix(texts, mode='binary')
print(binary_matrix)

In [None]:
# 'tfidf' mode is available as well. It is very similar to sklearn's
# TfidfVectorizer, but the formula is slightly different.
tfidf_matrix = tokenizer.texts_to_matrix(texts, mode='tfidf')
print(tfidf_matrix.round(3))

In [None]:
# 'freq' mode, rounded to three decimals for display.
freq_matrix = tokenizer.texts_to_matrix(texts, mode='freq')
print(freq_matrix.round(3))


### 3. Twenty Newsgroups example to explain NN

In [None]:
import pandas as pd
from sklearn.datasets import fetch_20newsgroups
import matplotlib.pyplot as plt
import seaborn as sns
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.utils import to_categorical

In [None]:
# dataset :: 20 subjects (newsgroups)
# NOTE(review): 18,846 looks like the size of the full corpus (subset='all');
# only the training split is loaded here — confirm with len(news_data.data).

news_data = fetch_20newsgroups(subset = 'train') # the `subset` parameter selects the split: 'train', 'test', or 'all'

In [None]:
# Show which fields the loaded dataset object exposes.
dataset_keys = news_data.keys()
print(dataset_keys)

In [None]:
# Number of documents in the loaded (training) data.
print(f'train_data_obs:{len(news_data.data)}')

# Number of distinct subjects.
print(f'num of subject:{len(news_data.target_names)}')

# Names of the subjects.
print(f'Subject:{news_data.target_names}')

In [None]:
# Label of the first sample. It is read from the data rather than hard-coded,
# so the lookup below stays correct if the loaded sample order changes.
first_label = news_data.target[0]
print('1st label: {0}'.format(first_label))

# Map that integer label back to its subject name.
print('label {0}`s subject: {1}'.format(first_label, news_data.target_names[first_label]))

In [None]:
# Peek at the raw text of the first document.
print(news_data.data[0])

# Collect the texts and their integer labels into a single DataFrame.
news_df = pd.DataFrame({'email': news_data.data, 'target': news_data.target})
news_df.head()

In [None]:
# Print a summary of the DataFrame.
news_df.info()

In [None]:
# Missing-value check 1: number of missing entries per column.
# Printed explicitly: a notebook cell only echoes its last bare expression,
# so without print() this result would be computed and silently discarded.
print(news_df.isna().sum())

# Missing-value check 2: whether any entry at all is missing.
print(news_df.isna().values.any())

# Number of distinct emails and distinct targets (duplicates excluded).
print('중복을 제외한 샘플의 수 : {0}'.format(news_df['email'].nunique()))
print('중복을 제외한 주제의 수 : {0}'.format(news_df['target'].nunique()))

In [None]:
# Bar chart of how many samples carry each target label.
target_counts = news_df['target'].value_counts()
target_counts.plot(kind='bar');

In [None]:
# Same information as a table: number of emails per target label.
per_target_counts = news_df.groupby(by='target').count()
per_target_counts = per_target_counts.reset_index()
per_target_counts.rename(columns={'email': 'count'})

In [None]:
# Load the test split.
newsdata_test = fetch_20newsgroups(subset='test', shuffle=True)

# Training texts/labels come from the DataFrame; test texts/labels come
# straight from the loaded test split.
train_email, train_label = news_df['email'], news_df['target']
test_email, test_label = newsdata_test.data, newsdata_test.target

In [None]:
# Number of target classes used for one-hot encoding and the output layer.
num_classes = 20
# Vocabulary size handed to the Tokenizer (num_words) and the model input.
vocab_size = 10_000

In [None]:
# Keras Tokenizer indices are assigned based on word frequency.
def preparation_data(train, test, mode):
    """Vectorize train and test texts with a Tokenizer fitted on train.

    Args:
        train: Texts used to fit the tokenizer; also converted to a matrix.
        test: Texts converted with the same fitted tokenizer.
        mode: Forwarded to ``texts_to_matrix`` as its ``mode`` argument.

    Returns:
        A tuple ``(X_train, X_test, index_word)``: the two matrices and the
        tokenizer's ``index_word`` mapping.
    """
    # The tokenizer is limited to the module-level vocab_size.
    text_tokenizer = Tokenizer(num_words=vocab_size)
    text_tokenizer.fit_on_texts(train)

    # Convert both splits with the same fitted tokenizer (train first).
    X_train, X_test = (
        text_tokenizer.texts_to_matrix(corpus, mode=mode)
        for corpus in (train, test)
    )
    return X_train, X_test, text_tokenizer.index_word

In [None]:
# Vectorize both splits in binary mode.
X_train, X_test, index_to_word = preparation_data(train_email, test_email, mode='binary')

# One-hot encode the integer labels of both splits.
y_train, y_test = (
    to_categorical(labels, num_classes) for labels in (train_label, test_label)
)

In [None]:
# Shapes of the feature matrices and one-hot label arrays for each split.
print('train_sample_size:{0}'.format(X_train.shape))
print('train_sample_label_size:{0}'.format(y_train.shape))
# X_test holds the test features; it was previously mislabeled as the
# label size, duplicating the label of the next line.
print('test_sample_size:{0}'.format(X_test.shape))
print('test_sample_label_size:{0}'.format(y_test.shape))

In [None]:
# Display the index -> word mapping returned by preparation_data.
index_to_word

In [None]:
# Look up the words stored at index 1 and index 9999 of the mapping.
print(f'빈도수 상위 1번 단어:{index_to_word[1]}')
print(f'빈도수 상위 9999번 단어:{index_to_word[9999]}')

4. MLP 설계를 통한 Text 분류

In [None]:
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Dropout

In [None]:
# The model is wrapped in a function so it can be rebuilt and retrained from
# scratch for different inputs.
def fit_and_eval(X_train, y_train, X_test, y_test):
    """Train a small MLP classifier and return its accuracy on the test data.

    The network is Dense(256, relu) -> Dropout(0.5) -> Dense(128, relu) ->
    Dropout(0.5) -> Dense(softmax). It is trained for 10 epochs with batch
    size 128, using the last 10% of the training data for validation.

    Args:
        X_train: 2-D feature matrix for training (samples x features).
        y_train: 2-D one-hot label matrix for training (samples x classes).
        X_test: Feature matrix for evaluation, same width as ``X_train``.
        y_test: One-hot label matrix for evaluation, same width as ``y_train``.

    Returns:
        The 'acc' metric reported by ``model.evaluate`` on the test data.
    """
    # Take layer sizes from the data instead of module-level globals, so the
    # model always matches the matrices it is actually given.
    n_features = X_train.shape[1]
    n_classes = y_train.shape[1]

    model = Sequential()
    model.add(Dense(256, input_shape=(n_features,), activation='relu'))
    model.add(Dropout(0.5))
    model.add(Dense(128, activation='relu'))
    model.add(Dropout(0.5))
    model.add(Dense(n_classes, activation='softmax'))

    model.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['acc'])
    model.fit(X_train, y_train, batch_size=128, epochs=10, verbose=1, validation_split=0.1)

    # evaluate() returns [loss, acc] for the metrics compiled above.
    score = model.evaluate(X_test, y_test, batch_size=128, verbose=0)

    return score[1]

In [None]:
# The four texts_to_matrix modes to compare.
modes = ['binary', 'count', 'tfidf', 'freq']

# Re-vectorize the texts in each mode, train a fresh model, and report its
# test accuracy. The one-hot labels are the same for every mode.
for mode in modes:
    X_train, X_test, _ = preparation_data(train_email, test_email, mode)
    score = fit_and_eval(X_train, y_train, X_test, y_test)
    print(f'{mode} mode`s test accuracy:', score)

In [None]:
# end of file 