## 데이터 불러오기

In [None]:
# 라이브러리
from matplotlib import rcParams, pyplot as plt
import numpy as np
import os
import pandas as pd
from pathlib import Path
import re
from sklearn.metrics import accuracy_score, log_loss
from sklearn.model_selection import StratifiedKFold
import tensorflow as tf
from tensorflow.keras import Sequential
from tensorflow.keras.layers import Dense, Embedding, LSTM, GlobalMaxPooling1D, Conv1D, Dropout, Bidirectional
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.callbacks import EarlyStopping, ReduceLROnPlateau
from tensorflow.keras.utils import plot_model, to_categorical
from tensorflow.keras.optimizers import Adam
import warnings 
warnings.filterwarnings(action='ignore')

In [None]:
# Set the working directory
# NOTE(review): `os` is already imported in the library cell; this re-import is redundant but harmless.
import os
os.chdir('/content/Writing_Style')

In [None]:
# Load the files: training set, test set and the sample submission template (all read as UTF-8)
train = pd.read_csv('/content/train.csv', encoding = 'utf-8')
test = pd.read_csv('/content/test_x.csv', encoding = 'utf-8')
sample_submission = pd.read_csv('/content/sample_submission.csv', encoding = 'utf-8')

In [None]:
# Take a look at the train data
train

Unnamed: 0,index,text,author
0,0,"He was almost choking. There was so much, so m...",3
1,1,"“Your sister asked for it, I suppose?”",2
2,2,"She was engaged one day as she walked, in per...",1
3,3,"The captain was in the porch, keeping himself ...",4
4,4,"“Have mercy, gentlemen!” odin flung up his han...",3
...,...,...,...
54874,54874,"“Is that you, Mr. Smith?” odin whispered. “I h...",2
54875,54875,"I told my plan to the captain, and between us ...",4
54876,54876,"""Your sincere well-wisher, friend, and sister...",1
54877,54877,“Then you wanted me to lend you money?”,3


In [None]:
# Take a look at the test data
test

Unnamed: 0,index,text
0,0,“Not at all. I think she is one of the most ch...
1,1,"""No,"" replied he, with sudden consciousness, ""..."
2,2,As the lady had stated her intention of scream...
3,3,“And then suddenly in the silence I heard a so...
4,4,His conviction remained unchanged. So far as I...
...,...,...
19612,19612,"At the end of another day or two, odin growing..."
19613,19613,"All afternoon we sat together, mostly in silen..."
19614,19614,"odin, having carried his thanks to odin, proc..."
19615,19615,"Soon after this, upon odin's leaving the room,..."


In [None]:
# Take a look at the sample submission frame
sample_submission

Unnamed: 0,index,0,1,2,3,4
0,0,0,0,0,0,0
1,1,0,0,0,0,0
2,2,0,0,0,0,0
3,3,0,0,0,0,0
4,4,0,0,0,0,0
...,...,...,...,...,...,...
19612,19612,0,0,0,0,0
19613,19613,0,0,0,0,0
19614,19614,0,0,0,0,0
19615,19615,0,0,0,0,0


## 전처리

In [None]:
# Pattern matching every character that is NOT an ASCII letter, a digit or a space
_NON_ALNUM_SPACE = re.compile(r'[^A-Za-z0-9 ]')


def alpha_num(text):
    """Return *text* with punctuation and other symbols removed.

    Only ASCII letters, digits and the space character are kept; every other
    character is deleted (not replaced by a space), so "well-wisher" becomes
    "wellwisher".
    """
    return _NON_ALNUM_SPACE.sub('', text)

# Strip punctuation from the training text column (each row is passed through alpha_num)
train['text']=train['text'].apply(alpha_num)

In [None]:
# Confirm that the punctuation has been removed.
train

Unnamed: 0,index,text,author
0,0,He was almost choking There was so much so muc...,3
1,1,Your sister asked for it I suppose,2
2,2,She was engaged one day as she walked in peru...,1
3,3,The captain was in the porch keeping himself c...,4
4,4,Have mercy gentlemen odin flung up his hands D...,3
...,...,...,...
54874,54874,Is that you Mr Smith odin whispered I hardly d...,2
54875,54875,I told my plan to the captain and between us w...,4
54876,54876,Your sincere wellwisher friend and sister LUC...,1
54877,54877,Then you wanted me to lend you money,3


In [None]:
# Stop words
# NOTE(review): the entries containing an apostrophe (e.g. "he's", "i'm") can never
# match text that has already gone through alpha_num, because alpha_num deletes
# apostrophes -- confirm whether these contractions were meant to be filtered.
stopwords = [ "a", "about", "above", "after", "again", "against", "all", "am", "an", "and", "any", "are", "as", 
             "at", "be", "because", "been", "before", "being", "below", "between", "both", "but", "by", "could", 
             "did", "do", "does", "doing", "down", "during", "each", "few", "for", "from", "further", "had", "has", 
             "have", "having", "he", "he'd", "he'll", "he's", "her", "here", "here's", "hers", "herself", "him", "himself", 
             "his", "how", "how's", "i", "i'd", "i'll", "i'm", "i've", "if", "in", "into", "is", "it", "it's", "its", "itself", 
             "let's", "me", "more", "most", "my", "myself", "nor", "of", "on", "once", "only", "or", "other", "ought", "our", "ours", 
             "ourselves", "out", "over", "own", "same", "she", "she'd", "she'll", "she's", "should", "so", "some", "such", "than", "that", 
             "that's", "the", "their", "theirs", "them", "themselves", "then", "there", "there's", "these", "they", "they'd", "they'll", 
             "they're", "they've", "this", "those", "through", "to", "too", "under", "until", "up", "very", "was", "we", "we'd", "we'll", 
             "we're", "we've", "were", "what", "what's", "when", "when's", "where", "where's", "which", "while", "who", "who's", "whom", 
             "why", "why's", "with", "would", "you", "you'd", "you'll", "you're", "you've", "your", "yours", "yourself", "yourselves" ]

# Frozen snapshot of the list above: membership tests become O(1) per token
# instead of a linear scan of the list. Rebuild it if `stopwords` is edited later.
_STOPWORD_SET = frozenset(stopwords)


def remove_stopwords(text):
    """Return *text* with every stop word removed.

    The text is split on whitespace; a token is dropped when its lowercase
    form is in the stop-word list. Surviving tokens keep their original
    casing and are re-joined with single spaces, so an empty or
    all-stop-word input yields an empty string.
    """
    # str.split() with no argument already discards surrounding whitespace,
    # so the tokens need no additional strip().
    return " ".join(
        token for token in text.split() if token.lower() not in _STOPWORD_SET
    )

In [None]:
# Apply the preprocessing to both frames:
# lowercase -> strip punctuation (alpha_num) -> drop stop words (remove_stopwords)
for frame in (train, test):
    lowered = frame['text'].str.lower()
    without_symbols = lowered.apply(alpha_num)
    frame['text'] = without_symbols.apply(remove_stopwords)

In [None]:
# Pull the model inputs (text) and labels (author) out of the DataFrames as arrays
X_train = train['text'].values
Y_train = train['author'].values

X_test = test['text'].values
# Sanity check: one text per row in each set, and one label per training text
print(X_train.shape, X_test.shape, Y_train.shape)

(54879,) (19617,) (54879,)


In [None]:
# Display the preprocessed training texts
X_train

array(['almost choking much much wanted say strange exclamations came lips pole gazed fixedly bundle notes hand looked odin evident perplexity',
       'sister asked suppose',
       'engaged one day walked perusing janes last letter dwelling passages proved jane not written spirits instead surprised mr odin saw looking odin meeting putting away letter immediately forcing smile said',
       ..., 'sincere wellwisher friend sister lucy odin',
       'wanted lend money', 'certainly not occurred said yes like'],
      dtype=object)

### 토크나이징

In [None]:
# Use the Keras Tokenizer
tokenizer = Tokenizer()
tokenizer.fit_on_texts(X_train)  # the vocabulary is built from the training texts only
word_index = tokenizer.word_index  # word -> integer index mapping
vocab_size = len(word_index) + 1  # +1 because Keras word indices start at 1 (index 0 is left for padding)

## 벡터화

In [None]:
# Convert the texts to integer sequences with Keras texts_to_sequences
train_sequences = tokenizer.texts_to_sequences(X_train)
train_max_len=max(len(l) for l in train_sequences)  # length of the longest training sequence

test_sequences = tokenizer.texts_to_sequences(X_test)
test_max_len=max(len(l) for l in test_sequences)  # length of the longest test sequence

print(train_max_len, test_max_len)

211 199


In [None]:
# Pad the sequences to the maximum (training) length
padding_type='post'  # padding is appended after the tokens

train_padded = pad_sequences(train_sequences, padding=padding_type, maxlen=train_max_len)
# The test set is padded to train_max_len as well, so both arrays have the same width
test_padded = pad_sequences(test_sequences, padding=padding_type, maxlen=train_max_len)
print(train_padded.shape, test_padded.shape)

(54879, 211) (19617, 211)


## 임베딩

> word2vec & glove 두 방식으로 임베딩



* glove 임베딩 기법

In [None]:
# Read the pre-trained GloVe vectors into a dict: word -> float32 vector.
# Each line of the file is "<word> <v1> <v2> ...", separated by whitespace.
embedding_dict = dict()

# `with` guarantees the file is closed, even if parsing a line fails.
# (The previous `f.close` had no parentheses, so the file was never closed.)
with open('/content/glove.6B.100d.txt', encoding='utf8') as f:
    for line in f:
        word_vector = line.split()
        word = word_vector[0]
        word_vector_arr = np.asarray(word_vector[1:], dtype='float32')
        embedding_dict[word] = word_vector_arr

# Row i holds the GloVe vector of the word whose tokenizer index is i.
# Words missing from GloVe (and the unused row 0) stay all-zero.
embedding_matrix = np.zeros((vocab_size, 100))

for word, i in word_index.items():
    temp = embedding_dict.get(word)
    if temp is not None:
        embedding_matrix[i] = temp

### k-fold 교차검증

In [None]:
# k-fold cross-validation
# Stratified K-fold first looks at the label distribution of the original data, then splits the
# training and validation sets so that each of them keeps that same distribution.
n_fold = 5  # number of folds
n_class = 5  # number of output classes (width of the softmax layer and of the prediction arrays)
seed = 42  # fixed seed so the shuffled splits are reproducible
cross_validation = StratifiedKFold(n_splits=n_fold, shuffle=True, random_state=seed)

## 모델 학습

> glove + CNN 예측 모델

activation = swish



In [None]:
def get_model():
    """Build and compile the GloVe + CNN classifier.

    Architecture: embedding initialised from ``embedding_matrix`` -> dropout ->
    three strided 1-D convolutions (swish) -> global max pooling -> dense
    (swish) -> dropout -> softmax over ``n_class`` classes.

    Returns:
        A compiled Keras ``Sequential`` model (categorical cross-entropy
        loss, Adam optimizer with learning rate 0.007).
    """
    model = Sequential()

    # Embedding layer starts from the pre-trained GloVe weights
    model.add(Embedding(vocab_size, 100, weights=[embedding_matrix], input_length=train_max_len))
    model.add(Dropout(.5))

    # Three identical convolution layers; stride 3 shortens the sequence at each step
    for _ in range(3):
        model.add(Conv1D(128, 7, padding="valid", activation="swish", strides=3))

    model.add(GlobalMaxPooling1D())
    model.add(Dense(128, activation='swish'))
    model.add(Dropout(.5))
    model.add(Dense(n_class, activation='softmax'))

    # compile model
    optimizer = Adam(learning_rate=.007)
    model.compile(loss='categorical_crossentropy', optimizer=optimizer)
    return model

In [None]:
# Out-of-fold predictions for every training row, and test predictions averaged over the folds
p_validation = np.zeros((train_padded.shape[0], n_class))
p_test = np.zeros((test_padded.shape[0], n_class))
for i, (i_train, i_validation) in enumerate(cross_validation.split(train_padded, Y_train), 1):

    print(f'\n training model for 교차검증 # {i}번째 \n')

    # Stop once val_loss has failed to improve by at least 0.001 for 3 epochs,
    # then roll back to the weights of the best epoch.
    es = EarlyStopping(monitor='val_loss', min_delta=0.001, patience=3,
                       verbose=1, mode='min', baseline=None, restore_best_weights=True)

    print(f'학습인덱스 : {i_train} | 학습데이터에 사용되는 데이터 : {len(i_train)}')
    print(f'검증인덱스 : {i_validation} | 검증에 사용하는 데이터 : {len(i_validation)}\n')
    print('-' * 90)

    # A fresh model per fold, so no weights leak from one fold into the next
    class_model = get_model()

    # One-hot encode the labels. num_classes is passed explicitly so the target
    # width always equals the n_class-wide softmax output, instead of being
    # inferred from the largest label that happens to be in the slice.
    y_fold_train = to_categorical(Y_train[i_train], num_classes=n_class)
    y_fold_validation = to_categorical(Y_train[i_validation], num_classes=n_class)

    history = class_model.fit(train_padded[i_train],
            y_fold_train,
            validation_data=(train_padded[i_validation], y_fold_validation),
            epochs=10,
            batch_size=512,
            callbacks=[es])

    # Each training row is predicted exactly once, by the fold that held it out
    p_validation[i_validation, :] = class_model.predict(train_padded[i_validation])
    # Accumulate the mean of the per-fold test predictions
    p_test += class_model.predict(test_padded) / n_fold
    print('-' * 90)


 training model for 교차검증 # 1번째 

학습인덱스 : [    0     1     2 ... 54876 54877 54878] | 학습데이터에 사용되는 데이터 : 43903
검증인덱스 : [    9    12    16 ... 54850 54855 54860] | 검증에 사용하는 데이터 : 10976

------------------------------------------------------------------------------------------
Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Restoring model weights from the end of the best epoch.
Epoch 00007: early stopping
------------------------------------------------------------------------------------------

 training model for 교차검증 # 2번째 

학습인덱스 : [    0     1     2 ... 54874 54877 54878] | 학습데이터에 사용되는 데이터 : 43903
검증인덱스 : [    3     5    10 ... 54869 54875 54876] | 검증에 사용하는 데이터 : 10976

------------------------------------------------------------------------------------------
Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Restoring model weights from the end of the best epoch.
Epoch 00008: early stopping
--------------------------

In [None]:
# Score the out-of-fold predictions: accuracy on the arg-max class, log loss on the probabilities
print(f'Accuracy (CV): {accuracy_score(Y_train, np.argmax(p_validation, axis=1)) * 100:8.4f}%')
print(f'Log Loss (CV): {log_loss(pd.get_dummies(Y_train), p_validation):8.4f}')

Accuracy (CV):  72.4284%
Log Loss (CV):   0.7640


In [None]:
# model summary (of the model trained in the last fold)
# summary() prints the table itself and returns None, so wrapping it in
# print() only added a stray "None" line after the table.
class_model.summary()

Model: "sequential_4"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_4 (Embedding)      (None, 211, 100)          4713700   
_________________________________________________________________
dropout_8 (Dropout)          (None, 211, 100)          0         
_________________________________________________________________
conv1d_12 (Conv1D)           (None, 69, 128)           89728     
_________________________________________________________________
conv1d_13 (Conv1D)           (None, 21, 128)           114816    
_________________________________________________________________
conv1d_14 (Conv1D)           (None, 5, 128)            114816    
_________________________________________________________________
global_max_pooling1d_4 (Glob (None, 128)               0         
_________________________________________________________________
dense_8 (Dense)              (None, 128)              

## 모델 학습_2

> glove + Bi-LSTM 예측 모델



In [None]:
# Training hyperparameters (learning rate, number of epochs)
lr = 0.007
epoch_val = 10


def get_model():
    """Build and compile the GloVe + Bi-LSTM classifier.

    Architecture: embedding initialised from ``embedding_matrix`` -> two
    stacked bidirectional LSTM layers (64 units per direction) -> softmax
    over ``n_class`` classes.

    Returns:
        A compiled Keras ``Sequential`` model (categorical cross-entropy
        loss, Adam optimizer with learning rate ``lr``).
    """
    model = Sequential()

    # Embedding layer starts from the pre-trained GloVe weights
    model.add(Embedding(vocab_size, 100, weights=[embedding_matrix], input_length=train_max_len))

    # The first recurrent layer returns the full sequence so the second one can consume it
    model.add(Bidirectional(LSTM(64, return_sequences=True)))
    model.add(Bidirectional(LSTM(64)))

    model.add(Dense(n_class, activation='softmax'))

    # compile model
    optimizer = Adam(learning_rate=lr)
    model.compile(loss='categorical_crossentropy', optimizer=optimizer)
    return model

In [None]:
# Out-of-fold predictions for every training row, and test predictions averaged over the folds
p_validation = np.zeros((train_padded.shape[0], n_class))
p_test = np.zeros((test_padded.shape[0], n_class))
for i, (i_train, i_validation) in enumerate(cross_validation.split(train_padded, Y_train), 1):

    print(f'\n training model for 교차검증 # {i}번째 \n')

    # Stop once val_loss has failed to improve by at least 0.001 for 3 epochs,
    # then roll back to the weights of the best epoch.
    es = EarlyStopping(monitor='val_loss', min_delta=0.001, patience=3,
                       verbose=1, mode='min', baseline=None, restore_best_weights=True)

    print(f'학습인덱스 : {i_train} | 학습데이터에 사용되는 데이터 : {len(i_train)}')
    print(f'검증인덱스 : {i_validation} | 검증에 사용하는 데이터 : {len(i_validation)}\n')
    print('-' * 90)

    # A fresh model per fold, so no weights leak from one fold into the next
    class_model = get_model()

    # One-hot encode the labels. num_classes is passed explicitly so the target
    # width always equals the n_class-wide softmax output, instead of being
    # inferred from the largest label that happens to be in the slice.
    y_fold_train = to_categorical(Y_train[i_train], num_classes=n_class)
    y_fold_validation = to_categorical(Y_train[i_validation], num_classes=n_class)

    history = class_model.fit(train_padded[i_train],
            y_fold_train,
            validation_data=(train_padded[i_validation], y_fold_validation),
            epochs=epoch_val,
            batch_size=512,
            callbacks=[es])

    # Each training row is predicted exactly once, by the fold that held it out
    p_validation[i_validation, :] = class_model.predict(train_padded[i_validation])
    # Accumulate the mean of the per-fold test predictions
    p_test += class_model.predict(test_padded) / n_fold
    print('-' * 90)


 training model for 교차검증 # 1번째 

학습인덱스 : [    0     1     2 ... 54876 54877 54878] | 학습데이터에 사용되는 데이터 : 43903
검증인덱스 : [    9    12    16 ... 54850 54855 54860] | 검증에 사용하는 데이터 : 10976

------------------------------------------------------------------------------------------
Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Restoring model weights from the end of the best epoch.
Epoch 00005: early stopping
------------------------------------------------------------------------------------------

 training model for 교차검증 # 2번째 

학습인덱스 : [    0     1     2 ... 54874 54877 54878] | 학습데이터에 사용되는 데이터 : 43903
검증인덱스 : [    3     5    10 ... 54869 54875 54876] | 검증에 사용하는 데이터 : 10976

------------------------------------------------------------------------------------------
Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Restoring model weights from the end of the best epoch.
Epoch 00005: early stopping
---------------------------------------------------------------------------------

In [None]:
# Score the out-of-fold predictions: accuracy on the arg-max class, log loss on the probabilities
print(f'Accuracy : {accuracy_score(Y_train, np.argmax(p_validation, axis=1)) * 100:8.4f}%')
print(f'Log Loss : {log_loss(pd.get_dummies(Y_train), p_validation):8.4f}')

Accuracy :  74.1960%
Log Loss :   0.7032


In [None]:
# model summary (of the model trained in the last fold)
# summary() prints the table itself and returns None, so wrapping it in
# print() only added a stray "None" line after the table.
class_model.summary()

Model: "sequential_4"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_4 (Embedding)      (None, 211, 100)          4713700   
_________________________________________________________________
bidirectional_8 (Bidirection (None, 211, 128)          84480     
_________________________________________________________________
bidirectional_9 (Bidirection (None, 128)               98816     
_________________________________________________________________
dense_4 (Dense)              (None, 5)                 645       
Total params: 4,897,641
Trainable params: 4,897,641
Non-trainable params: 0
_________________________________________________________________
None


# glove + CNN 모델 예측 결과

> Accuracy (CV):  72.4284%

> Log Loss (CV):   0.7640





# glove + BiLSTM 모델 예측 결과


> Accuracy :  74.1960%

> Log Loss :   0.7032







### glove로 임베딩하여 각각의 예측 모델에 모델링한 결과,

### CNN보다 BiLSTM 모델의 정확도가 약 1.8%p 높았고, Log Loss는 약 0.06 낮았습니다.

### 그 결과, glove로 임베딩한 상황에서 BiLSTM 모델이 더욱 적합한 것을 알 수 있었습니다. 

