<a href="https://colab.research.google.com/github/im-hjk/dankook_dacon/blob/master/d_201103_keras.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Library imports / settings

In [None]:
# Reload imported modules automatically before each cell runs (autoreload mode 2).
%reload_ext autoreload
%autoreload 2
# Render matplotlib figures inline in the notebook output.
%matplotlib inline

In [None]:
# Mount Google Drive at /content/drive; the data paths used later live under this mount point.
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [None]:
from matplotlib import pyplot as plt
from matplotlib import rcParams
import seaborn as sns

import io
from pathlib import Path

import pandas as pd
import warnings
import numpy as np
import tensorflow as tf
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.preprocessing.text import Tokenizer
import re

In [None]:
# Global display settings for figures and DataFrames.
rcParams['figure.figsize'] = (16, 8)
plt.style.use('fivethirtyeight')
# Use the fully qualified option name: the bare 'max_columns' pattern matches
# more than one option on pandas versions that also register
# 'styler.render.max_columns', and set_option then raises OptionError.
pd.set_option('display.max_columns', 100)
pd.set_option("display.precision", 4)
# Silence every warning for the rest of the session (this hides deprecation notices too).
warnings.simplefilter('ignore')

# Loading data

In [None]:
# Project root on the mounted Google Drive and its three working folders:
# raw input features, intermediate artifacts, and final results.
main_path = Path('/content/drive/My Drive/Colab Notebooks/dacon_author')
feedMe_dir, medium_dir, result_dir = (
    main_path / folder for folder in ('feed_me', 'medium', 'result')
)

# Locations of the train set, the test set and the submission template.
trn_file, tst_file, smp_file = (
    feedMe_dir / csv_name
    for csv_name in ('train.csv', 'test_x.csv', 'sample_submission.csv')
)

# Read all three CSV files as UTF-8.
trn, tst, sub = (
    pd.read_csv(csv_path, encoding='utf-8')
    for csv_path in (trn_file, tst_file, smp_file)
)

# Names used to build the output file name.
algo_name, feature_name = 'keras', 'raw'
model_name = f'{algo_name}_{feature_name}'
sub_file = result_dir / f'{model_name}.csv'

# Preprocessing

In [None]:
# Remove punctuation and symbols.
def alpha_num(text):
    """Delete every character that is not an ASCII letter, a digit or a space.

    Args:
        text: Input string.

    Returns:
        ``text`` with all other characters removed.  Nothing is substituted
        for a removed character, so e.g. ``"he's"`` becomes ``"hes"``.
    """
    return re.sub(r'[^A-Za-z0-9 ]', '', text)

# The function is applied to both `trn` and `tst` in the preprocessing cell
# further down, after lowercasing.  A train-only call used to live here; it ran
# before lowercasing (unlike the test set), duplicated the later call, and
# mutated `trn` again every time this cell was re-run, so it was removed.

In [None]:
# Stop-word removal.
stopwords = [ "a", "about", "above", "after", "again", "against", "all", "am", "an", "and", "any", "are", "as", 
             "at", "be", "because", "been", "before", "being", "below", "between", "both", "but", "by", "could", 
             "did", "do", "does", "doing", "down", "during", "each", "few", "for", "from", "further", "had", "has", 
             "have", "having", "he", "he'd", "he'll", "he's", "her", "here", "here's", "hers", "herself", "him", "himself", 
             "his", "how", "how's", "i", "i'd", "i'll", "i'm", "i've", "if", "in", "into", "is", "it", "it's", "its", "itself", 
             "let's", "me", "more", "most", "my", "myself", "nor", "of", "on", "once", "only", "or", "other", "ought", "our", "ours", 
             "ourselves", "out", "over", "own", "same", "she", "she'd", "she'll", "she's", "should", "so", "some", "such", "than", "that", 
             "that's", "the", "their", "theirs", "them", "themselves", "then", "there", "there's", "these", "they", "they'd", "they'll", 
             "they're", "they've", "this", "those", "through", "to", "too", "under", "until", "up", "very", "was", "we", "we'd", "we'll", 
             "we're", "we've", "were", "what", "what's", "when", "when's", "where", "where's", "which", "while", "who", "who's", "whom", 
             "why", "why's", "with", "would", "you", "you'd", "you'll", "you're", "you've", "your", "yours", "yourself", "yourselves" ]

# Hash-based copy of the list, built once, so each token lookup is O(1)
# instead of a linear scan.  It is a snapshot: rebuild it (re-run this cell)
# if `stopwords` is edited afterwards.
_STOPWORD_SET = frozenset(stopwords)


def remove_stopwords(text):
    """Drop stop words from a whitespace-separated string.

    Tokens are compared case-insensitively against the stop-word list, and
    the surviving tokens keep their original casing.

    Args:
        text: Input string.

    Returns:
        The remaining tokens joined by single spaces ("" if none remain).

    Note:
        The preprocessing cell runs ``alpha_num`` before this function, which
        strips apostrophes; contraction entries such as "he's" therefore never
        match text that went through that pipeline.
    """
    # str.split() with no argument already discards surrounding whitespace,
    # so no extra strip() per token is needed.
    return " ".join(
        token for token in text.split() if token.lower() not in _STOPWORD_SET
    )
             

In [None]:
# Apply the preprocessing pipeline to both frames:
# lowercase -> strip non-alphanumeric characters -> drop stop words.
for frame in (trn, tst):
    lowered = frame['text'].str.lower()
    frame['text'] = lowered.apply(alpha_num).apply(remove_stopwords)

In [None]:
# Extract the model inputs (texts) and the labels (authors) as NumPy arrays.
X_train = np.array(trn['text'].tolist())
X_test = np.array(tst['text'].tolist())
y_train = np.array(trn['author'].tolist())

# Modeling

In [None]:
# Hyper-parameters.
vocab_size = 20_000      # Tokenizer `num_words` and Embedding input dimension
embedding_dim = 16       # Embedding output dimension
max_length = 500         # length every sequence is padded/truncated to
padding_type = 'post'    # `padding` argument passed to pad_sequences
# An out-of-vocabulary token is currently not configured:
# oov_tok = "<OOV>"

In [None]:
# Fit the tokenizer on the training texts only; the test texts are encoded later with this same vocabulary.
tokenizer = Tokenizer(num_words = vocab_size)  # oov_token is left unset (see the commented-out oov_tok setting)
tokenizer.fit_on_texts(X_train)
# Word -> integer index mapping built by the fit above.
word_index = tokenizer.word_index

In [None]:
# Convert the texts to integer sequences, then pad them to a common length.
train_sequences, test_sequences = (
    tokenizer.texts_to_sequences(texts) for texts in (X_train, X_test)
)
train_padded, test_padded = (
    pad_sequences(seqs, maxlen=max_length, padding=padding_type)
    for seqs in (train_sequences, test_sequences)
)

In [None]:
# Lightweight NLP model: embedding -> average over the sequence -> small dense head.
model = tf.keras.Sequential()
model.add(tf.keras.layers.Embedding(vocab_size, embedding_dim, input_length=max_length))
model.add(tf.keras.layers.GlobalAveragePooling1D())
model.add(tf.keras.layers.Dense(24, activation='relu'))
# Five softmax outputs, matching the five class columns of the submission file.
model.add(tf.keras.layers.Dense(5, activation='softmax'))

In [None]:
# Compile the model; the sparse loss is used because the labels are passed as
# a plain array of class values rather than one-hot vectors.
model.compile(loss='sparse_categorical_crossentropy',
              optimizer='adam',
              metrics=['accuracy'])

# Model summary.  summary() prints the table itself and returns None, so
# wrapping it in print() only appended a stray "None" line to the output.
model.summary()

Model: "sequential"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding (Embedding)        (None, 500, 16)           320000    
_________________________________________________________________
global_average_pooling1d (Gl (None, 16)                0         
_________________________________________________________________
dense (Dense)                (None, 24)                408       
_________________________________________________________________
dense_1 (Dense)              (None, 5)                 125       
Total params: 320,533
Trainable params: 320,533
Non-trainable params: 0
_________________________________________________________________
None


In [None]:
# Train the model, holding out 20% of the training data for validation.
num_epochs = 20
history = model.fit(
    x=train_padded,
    y=y_train,
    validation_split=0.2,
    epochs=num_epochs,
    verbose=2,
)

Epoch 1/20
1372/1372 - 8s - loss: 1.5660 - accuracy: 0.2780 - val_loss: 1.5515 - val_accuracy: 0.2812
Epoch 2/20
1372/1372 - 8s - loss: 1.4248 - accuracy: 0.3999 - val_loss: 1.2760 - val_accuracy: 0.5221
Epoch 3/20
1372/1372 - 8s - loss: 1.1641 - accuracy: 0.5421 - val_loss: 1.1101 - val_accuracy: 0.5579
Epoch 4/20
1372/1372 - 12s - loss: 1.0340 - accuracy: 0.5885 - val_loss: 1.0310 - val_accuracy: 0.5925
Epoch 5/20
1372/1372 - 8s - loss: 0.9492 - accuracy: 0.6218 - val_loss: 0.9778 - val_accuracy: 0.6096
Epoch 6/20
1372/1372 - 8s - loss: 0.8836 - accuracy: 0.6508 - val_loss: 0.9350 - val_accuracy: 0.6324
Epoch 7/20
1372/1372 - 8s - loss: 0.8237 - accuracy: 0.6824 - val_loss: 0.9032 - val_accuracy: 0.6471
Epoch 8/20
1372/1372 - 8s - loss: 0.7701 - accuracy: 0.7067 - val_loss: 0.8893 - val_accuracy: 0.6495
Epoch 9/20
1372/1372 - 8s - loss: 0.7267 - accuracy: 0.7274 - val_loss: 0.8568 - val_accuracy: 0.6673
Epoch 10/20
1372/1372 - 8s - loss: 0.6848 - accuracy: 0.7446 - val_loss: 0.8508 -

In [None]:
# Predict class probabilities for the test set.
# `Sequential.predict_proba` is deprecated (TensorFlow's own warning says to
# use `model.predict()` instead).  The model ends in a softmax layer, so
# predict() already returns one probability per class.
pred = model.predict(test_padded)

Instructions for updating:
Please use `model.predict()` instead.


# Submission file

In [None]:
# Write the predicted probabilities into the class columns '0'..'4' of the submission frame.
class_cols = [str(label) for label in range(5)]
sub[class_cols] = pred

In [None]:
# Create the output directory first: to_csv does not create missing parent
# directories, and nothing earlier in the notebook creates `result_dir`.
sub_file.parent.mkdir(parents=True, exist_ok=True)
sub.to_csv(sub_file, index = False, encoding = 'utf-8')