## NLP steps
> Pre-processing -> tokenizer (dictionary) -> texts_to_sequences -> padding -> embedding -> modeling
> Models: ML (RandomForest) and LSTM

# NL Classification by ML (RandomForest)


In [None]:
# Install the Nanum Hangul fonts so that matplotlib can render Korean text on Colab.
#
!sudo apt-get install -y fonts-nanum
!sudo fc-cache -fv
!rm ~/.cache/matplotlib -rf

# The last command deletes matplotlib's cache directory (presumably so the font list
# is rebuilt with the newly installed fonts).
# Restart the Colab runtime after running this cell.

In [None]:
import matplotlib.pyplot as plt

# Make NanumBarunGothic the default matplotlib font family so Korean labels render.
plt.rc('font', family='NanumBarunGothic')

In [None]:
#import
import numpy as np
import pandas as pd

In [None]:
# Read the AI-HUB emotion corpus (CSV hosted on GitHub) and preview the first rows.
CORPUS_URL = 'https://github.com/ohgzone/file1/raw/main/aihub_coupus.csv'
final_data = pd.read_csv(CORPUS_URL)
final_data.head()

In [None]:
# Show up to five rows whose sentence contains anything other than Hangul syllables
# or spaces (special characters, English letters, digits).
non_hangul_mask = final_data['문장'].str.contains('[^가-힣 ]')
final_data[non_hangul_mask].values[:5]

In [None]:
# Remove special characters, English letters and digits: keep only Hangul syllables and spaces.
# regex=True must be explicit. pandas >= 2.0 treats the pattern as a literal string by
# default, which would leave the text unchanged; older versions only emit a FutureWarning.
final_data['문장'] = final_data['문장'].str.replace('[^가-힣 ]', '', regex=True)

  final_data['문장'] = final_data['문장'].str.replace('[^가-힣 ]','')


In [None]:
# Confirm that no sentence still contains special characters, English letters or digits.
# Summing the boolean mask yields the number of offending rows (expected: 0). Summing the
# matching sentences themselves would concatenate strings whenever a match remained.
final_data['문장'].str.contains('[^가-힣 ]').sum()

0

In [None]:
# Preview the first rows after the character clean-up.
final_data.head()

In [None]:
# Trim leading and trailing whitespace from every sentence, then preview the last rows.
stripped_sentences = final_data['문장'].str.strip()
final_data['문장'] = stripped_sentences
final_data.tail()

In [None]:
# Count missing values per column.
final_data.isnull().sum()

감정    0
문장    0
dtype: int64

In [None]:
# Count rows whose sentence repeats an earlier row.
final_data.duplicated(subset=['문장']).sum()

56

In [None]:
# Drop rows whose sentence duplicates an earlier one (final_data is modified in place),
# then print the frame summary to confirm the new row count.
final_data.drop_duplicates(subset=['문장'], inplace=True)
final_data.info()

In [None]:
# Class balance: number of sentences per emotion label ('감정').
final_data['감정'].value_counts()

불안    9311
분노    9149
상처    9130
슬픔    9118
당황    8747
기쁨    6119
Name: 감정, dtype: int64

In [None]:
# Bar chart of the per-emotion sentence counts.
final_data['감정'].value_counts().plot(kind='bar')

In [None]:
#
# Label encoding: replace the emotion strings with integer codes.
# NOTE: this overwrites the '감정' column, so every later cell that reads '감정' sees
# integers; use `le.classes_` / `le.inverse_transform` to recover the emotion names.
from sklearn.preprocessing import LabelEncoder

le = LabelEncoder()
final_data['감정'] = le.fit_transform(final_data['감정'])
le.classes_

array(['기쁨', '당황', '분노', '불안', '상처', '슬픔'], dtype=object)

In [None]:
# Preview the last rows to check the encoded '감정' column.
final_data.tail()

In [None]:
# Separate the model inputs (sentences) from the targets (encoded emotions).
features, labels = final_data['문장'].values, final_data['감정'].values

features.shape, labels.shape

((51574,), (51574,))

In [None]:
# Peek at the first three sentences.
features[:3]

In [None]:
# Character-length statistics of the sentences: longest sentence and mean length.
sentence_lengths = [len(sentence) for sentence in features]
print(f'string Max :{max(sentence_lengths)}')
print(f'string Avg :{sum(sentence_lengths) / len(features)}')

string Max :152
string Avg :33.91709000659247


In [None]:
import matplotlib.pyplot as plt
# Histogram of sentence lengths in characters (the original note says they cluster around 30~40).
sample_lengths = list(map(len, features))
plt.hist(sample_lengths, bins=50)
plt.xlabel('length of samples')
plt.ylabel('number of samples')
plt.show()

In [None]:
from sklearn.model_selection import train_test_split

# Stratified 80/20 train/test split with a fixed seed.
split_parts = train_test_split(
    features, labels,
    test_size=0.2,
    stratify=labels,
    random_state=41,
)
x_train, x_test, y_train, y_test = split_parts
x_train.shape, x_test.shape, y_train.shape, y_test.shape

((41259,), (10315,), (41259,), (10315,))

In [None]:
# First two training sentences with their labels.
x_train[:2], y_train[:2]

In [None]:
# Corpus -> TF-IDF
from sklearn.feature_extraction.text import TfidfVectorizer

# Fit the vectorizer on the training sentences only, then reuse the fitted
# vectorizer to transform the test sentences.
tfidf = TfidfVectorizer()
x_train_v = tfidf.fit_transform(x_train)
x_test_v = tfidf.transform(x_test)

In [None]:
# Print the TF-IDF values of the words in each training line.
print(x_train_v)

In [None]:
# Shape of the training TF-IDF matrix: 41259 lines x 47366 words
x_train_v.shape

(41259, 47366)

In [None]:
# ML modeling: random forest on the TF-IDF features
# (the original single-threaded run took about 4 minutes on Colab).
from sklearn.ensemble import RandomForestClassifier

# random_state makes the reported score reproducible across runs;
# n_jobs=-1 builds the trees on all available CPU cores.
rfc = RandomForestClassifier(random_state=41, n_jobs=-1)
rfc.fit(x_train_v, y_train)
# Mean accuracy on the held-out test set.
rfc.score(x_test_v, y_test)

0.49801260300533207

In [None]:
# Inspect the TF-IDF vector of the first test sentence.
# How to read an entry such as:  (0, 44327)  0.241660101642553
#   0                 : line index (first line)
#   44327             : number the word is mapped to
#   0.241660101642553 : TF-IDF value calculated for that word

print(f'1st TF-IDF of valid data : {x_test_v[0]}')

In [None]:
# Map the first test vector back to the words it contains.
print(f'Reverse of st TF-IDF of valid data : {tfidf.inverse_transform(x_test_v[:1])}')

Reverse of st TF-IDF of valid data : [array(['하는', '친구들은', '일을', '어려워', '사실', '만나기가', '때문에', '든다는', '돈이'],
      dtype='<U22')]


In [None]:
# Predict the class of the first test sentence with the random forest and
# decode the integer class back to its emotion name via the LabelEncoder.
predict = rfc.predict(x_test_v[:1])
predict, le.inverse_transform(predict)

(array([4]), array(['상처'], dtype=object))

# Classification by LSTM


In [None]:
## Manual label encoding
# Distinct values of '감정', in value_counts() order.
# NOTE(review): when the notebook is run top to bottom, '감정' has already been replaced
# by LabelEncoder integer codes, so these values are integers rather than emotion names.
list1 = final_data['감정'].value_counts().index.values
list1

array(['불안', '분노', '상처', '슬픔', '당황', '기쁨'], dtype=object)

In [None]:
# Label <-> class mapping.
# label2class is keyed by the values actually stored in final_data['감정'], so it can be
# applied to that column directly. On a fresh top-to-bottom run those values are
# LabelEncoder integer codes (the column was overwritten during label encoding), so they
# are decoded with `le` before being stored in class2label; otherwise class2label would
# return integer codes instead of emotion names.
if np.issubdtype(np.asarray(list1).dtype, np.number):
  class_names = le.inverse_transform(list1)
else:
  class_names = list1

label2class = {la: cl for cl, la in enumerate(list1)}
class2label = {cl: name for cl, name in enumerate(class_names)}

print(label2class)
print(class2label)

{'불안': 0, '분노': 1, '상처': 2, '슬픔': 3, '당황': 4, '기쁨': 5}
{0: '불안', 1: '분노', 2: '상처', 3: '슬픔', 4: '당황', 5: '기쁨'}


In [None]:
# Add a 'label' column holding the class id that label2class assigns to each '감정' value.
final_data['label'] = final_data['감정'].map(label2class)

In [None]:
# Preview the last rows to check the new 'label' column.
final_data.tail()

In [None]:
# Inputs (sentences) and targets (manually mapped class ids) for the LSTM model.
features, labels = final_data['문장'].values, final_data['label'].values

features.shape, labels.shape

((51574,), (51574,))

In [None]:
# Peek at the first three sentences.
features[:3]

In [None]:
import matplotlib.pyplot as plt
# Histogram of sentence lengths (characters per sample).
lengths_per_sample = [len(text) for text in features]
plt.hist(lengths_per_sample, bins=50)
plt.xlabel('length of samples')
plt.ylabel('number of samples')
plt.show()

In [None]:
# Train / test split (stratified 80/20, fixed seed) on the manually mapped labels.
from sklearn.model_selection import train_test_split

split_kwargs = dict(test_size=0.2, stratify=labels, random_state=41)
x_train, x_test, y_train, y_test = train_test_split(features, labels, **split_kwargs)
x_train.shape, x_test.shape, y_train.shape, y_test.shape

((41259,), (10315,), (41259,), (10315,))

In [None]:
# First two training sentences with their class ids.
# Class id -> emotion mapping as printed in the original run:
# {0: '불안', 1: '분노', 2: '상처', 3: '슬픔', 4: '당황', 5: '기쁨'}

x_train[:2], y_train[:2]

(array(['아르바이트만 하다가 취업하려니 거부감 들어', '혼자가 편하다고 한 게 후회돼'], dtype=object),
 array([1, 4]))

In [None]:
#
# Tokenizing

import tensorflow as tf
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences

In [None]:
# Tokenizer: build the word vocabulary from the training sentences only (fit_on_texts).
tokenizer = Tokenizer()
tokenizer.fit_on_texts(x_train)

In [None]:
# The vocabulary holds tens of thousands of entries, so show only a small sample of each
# mapping instead of dumping the full dictionaries into the notebook output.
PREVIEW_SIZE = 10

# word - index mapping
print(list(tokenizer.word_index.items())[:PREVIEW_SIZE])

# index - word mapping
print(list(tokenizer.index_word.items())[:PREVIEW_SIZE])

# word frequency
print(list(tokenizer.word_counts.items())[:PREVIEW_SIZE])

In [None]:
# Total number of distinct words in the fitted vocabulary (47,646 in the original run).
max_words = len(tokenizer.index_word)
#print(max_words)

In [None]:
#
# texts_to_sequences: convert each sentence into a sequence of word indices,
# using the vocabulary fitted on the training data.
#
x_train_seq = tokenizer.texts_to_sequences(x_train)
x_test_seq = tokenizer.texts_to_sequences(x_test)

In [None]:
# Number of sequences after conversion; should match the split sizes (41259 train, 10315 test).
print(len(x_train_seq), len(x_test_seq))

41259 10315


In [None]:
# Compare the first three training sentences with their index sequences.
print(x_train[0:3])
print(x_train_seq[0:3])

In [None]:
# Padding
#
# Length (in tokens) of the longest training sequence; 38 in the original run.
max(len(line) for line in x_train_seq)

38

In [None]:
# Bring every sequence to the same length (38, the longest training sequence).

x_train_pad, x_test_pad = (
    pad_sequences(sequences, maxlen=38)
    for sequences in (x_train_seq, x_test_seq)
)

In [None]:
# Inspect the first padded training sequence.
x_train_pad[:1]

In [None]:
# Shapes of the padded arrays: (number of sentences, padded sequence length).
x_train_pad.shape, x_test_pad.shape

((41259, 38), (10315, 38))

In [None]:
# LSTM Modeling
#
from tensorflow.keras.layers import Dense, Flatten, Conv1D, MaxPool2D
from tensorflow.keras.layers import Embedding, Bidirectional, LSTM, SimpleRNN, GRU
from tensorflow.keras.models import Sequential
from tensorflow.keras.callbacks import EarlyStopping, ModelCheckpoint

In [None]:
# Hyperparameters
# The sizes are derived from the fitted tokenizer and the padded data instead of
# hard-coding the values seen in one run (47646 words, length 38): a stale vocabulary
# size no longer matches the word indices as soon as the corpus or the cleaning changes.

max_words = len(tokenizer.word_index) + 1   # vocabulary size + 1 for the padding index 0
max_len = x_train_pad.shape[1]              # padded sequence length
embedding_dim = 32                          # embedding dimension - arbitrary choice

In [None]:
model = Sequential()
# Embedding layer: converts each word index into an embedding_dim-dimensional vector.
model.add(Embedding(max_words, embedding_dim, input_length=max_len))

In [None]:
# LSTM model: the LSTM returns its output for every time step (return_sequences=True);
# the outputs are flattened and passed through two dense layers to a 6-way softmax.
model.add(LSTM(16, return_sequences=True))
model.add(Flatten())
model.add(Dense(128, activation='swish'))
model.add(Dense(32, activation='swish'))
model.add(Dense(6, activation='softmax'))

# compile
# The sparse loss is used because the targets are integer class ids, not one-hot vectors.
# `metrics` is passed as a list: newer Keras versions reject a bare string here.
model.compile(loss = 'sparse_categorical_crossentropy',
              optimizer = 'adam',
              metrics = ['accuracy'])
model.summary()

In [None]:
# callbacks
# Stop once val_loss has not improved for 10 epochs and roll the model back to the
# weights of the best epoch. Without restore_best_weights the model keeps the weights of
# the last epoch, and those are what the later evaluation and prediction would use.
es = EarlyStopping(monitor='val_loss', patience=10, verbose=1, restore_best_weights=True)
# Additionally save the model to disk whenever val_loss improves.
checkpoint_path = 'tmp_checkpoint.ckpt'
cp = ModelCheckpoint(checkpoint_path, monitor='val_loss', verbose=1, save_best_only=True)

In [None]:
# fit: up to 50 epochs with 20% of the training data held out for validation;
# the validation loss drives both callbacks.
history = model.fit(x_train_pad, y_train, epochs=50, batch_size=512,
                      validation_split=0.2, verbose =1, callbacks=[es, cp])

In [None]:
# Training vs. validation accuracy per epoch.
epochs = range(1, len(history.history['accuracy']) + 1)
plt.plot(epochs, history.history['accuracy'])
plt.plot(epochs, history.history['val_accuracy'])
plt.title('model accuracy')
plt.ylabel('accuracy')   # the plotted curves are accuracy, not loss
plt.xlabel('epoch')
plt.legend(['train', 'valid'])
plt.show()

In [None]:
# Loss and accuracy on the held-out test set.
model.evaluate(x_test_pad, y_test)



[4.395216464996338, 0.41667473316192627]

In [None]:
#
# Prediction: show the first test sentence and its padded index sequence.

print(f'String : {x_test[0]}')
print(f'Sequence : {x_test_pad[0]}')

문자열 : 눈 수술을 했더니 시력이 좋아졌어
Sequence : [   0    0    0    0    0    0    0    0    0    0    0    0    0    0
    0    0    0    0    0    0    0    0    0    0    0    0    0    0
    0    0    0    0    0 1954  412  493 2450  961]


In [None]:
# Run the model on the first test sentence; the softmax output has one score per class.
predict = model.predict(x_test_pad[:1])



In [None]:
# Compare the true and the predicted emotion for the first test sentence.
true_class = y_test[0]
predicted_class = np.argmax(predict)
print(f'True : {class2label[true_class]}')
print(f'Predict : {class2label[predicted_class]}')

True : 기쁨
Predict : 불안
