In [1]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [96]:
import pandas as pd
import warnings 
warnings.filterwarnings(action='ignore')
import numpy as np
import re
import nltk
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from sklearn.model_selection import train_test_split
import tensorflow as tf
import gensim

In [3]:
cd '/content/drive/MyDrive/DCC/data'

/content/drive/MyDrive/DCC/data


In [20]:
#파일 불러오기
train = pd.read_csv('nlp/train.csv', encoding = 'utf-8')
test = pd.read_csv('nlp/test_x.csv', encoding = 'utf-8')
submission = pd.read_csv('nlp/sample_submission.csv', encoding = 'utf-8')

In [5]:
train

Unnamed: 0,index,text,author
0,0,"He was almost choking. There was so much, so m...",3
1,1,"“Your sister asked for it, I suppose?”",2
2,2,"She was engaged one day as she walked, in per...",1
3,3,"The captain was in the porch, keeping himself ...",4
4,4,"“Have mercy, gentlemen!” odin flung up his han...",3
...,...,...,...
54874,54874,"“Is that you, Mr. Smith?” odin whispered. “I h...",2
54875,54875,"I told my plan to the captain, and between us ...",4
54876,54876,"""Your sincere well-wisher, friend, and sister...",1
54877,54877,“Then you wanted me to lend you money?”,3


In [6]:
test

Unnamed: 0,index,text
0,0,“Not at all. I think she is one of the most ch...
1,1,"""No,"" replied he, with sudden consciousness, ""..."
2,2,As the lady had stated her intention of scream...
3,3,“And then suddenly in the silence I heard a so...
4,4,His conviction remained unchanged. So far as I...
...,...,...
19612,19612,"At the end of another day or two, odin growing..."
19613,19613,"All afternoon we sat together, mostly in silen..."
19614,19614,"odin, having carried his thanks to odin, proc..."
19615,19615,"Soon after this, upon odin's leaving the room,..."


In [21]:
#부호를 제거해주는 함수
def alpha_num(text):
    return re.sub(r'[^A-Za-z0-9 ]', '', text)

train['text']=train['text'].apply(alpha_num)
test['text']=test['text'].apply(alpha_num)

In [104]:
nltk.download("stopwords")
#불용어 처리
def stop_words(text):
    r = []
    stop_words = list(set(stopwords.words('english')))
    word = text.split(' ')
    for w in word:
      if w not in stop_words:
        r.append(w)    
    return ' '.join(r)

train['text_stopword'] = train['text'].apply(stop_words)
test['text_stopword']=test['text'].apply(stop_words)

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [None]:
#불용어의 갯수

In [None]:
#tfidf 적용


In [73]:
X_train = train['text_stopword']
X_test = test['text_stopword']
y_train = train['author']

In [70]:
num_class = train['author'].nunique()

> 문장의 길이를 동일하게 맞춰주어야 합니다. 문장의 길이를 맞춰주기 위해 부족한 길이만큼 0을 채워넣게 되는데 우리는 이것을 Padding라고 부릅니다.

In [98]:
#파라미터 설정
embedding_dim = 16
max_length = 212
padding_type='post'

In [99]:
tokenizer = Tokenizer()
tokenizer.fit_on_texts(X_train)
vocab_size = len(tokenizer.word_index) + 1 #모델에 알려줄 vocabulary의 크기 계산    
word_index = tokenizer.word_index

In [100]:
#데이터를 sequence로 변환해주고 padding 해줍니다.
train_sequences = tokenizer.texts_to_sequences(X_train)
train_padded = pad_sequences(train_sequences, padding=padding_type, maxlen=max_length)

test_sequences = tokenizer.texts_to_sequences(X_test)
test_padded = pad_sequences(test_sequences, padding=padding_type, maxlen=max_length)

In [29]:
# X_train, X_val, y_train, y_val = train_test_split(X, y, test_size = 0.2, shuffle = True, random_state = 2021)

In [101]:
model = tf.keras.Sequential([
  tf.keras.layers.Embedding(vocab_size, embedding_dim, input_length= max_len),
  tf.keras.layers.GlobalAveragePooling1D(),
  tf.keras.layers.Dense(24, activation='relu'),
  tf.keras.layers.Dense(num_class, activation='softmax')
])

model.compile(
    loss='categorical_crossentropy',
    optimizer='adam',
    metrics=['accuracy'])

# model summary
print(model.summary())

Model: "sequential_10"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_5 (Embedding)      (None, 500, 16)           755984    
_________________________________________________________________
global_average_pooling1d_5 ( (None, 16)                0         
_________________________________________________________________
dense_21 (Dense)             (None, 24)                408       
_________________________________________________________________
dense_22 (Dense)             (None, 5)                 125       
Total params: 756,517
Trainable params: 756,517
Non-trainable params: 0
_________________________________________________________________
None


In [102]:
history = model.fit(
    train_padded, y_train, epochs=10,
    verbose=2, validation_split=0.2
    )

Epoch 1/10
1372/1372 - 14s - loss: 1.5342 - accuracy: 0.3178 - val_loss: 1.4022 - val_accuracy: 0.4611
Epoch 2/10
1372/1372 - 13s - loss: 1.2220 - accuracy: 0.4955 - val_loss: 1.1380 - val_accuracy: 0.5210
Epoch 3/10
1372/1372 - 13s - loss: 1.0520 - accuracy: 0.5637 - val_loss: 1.0505 - val_accuracy: 0.5661
Epoch 4/10
1372/1372 - 12s - loss: 0.9171 - accuracy: 0.6461 - val_loss: 0.9611 - val_accuracy: 0.6248
Epoch 5/10
1372/1372 - 13s - loss: 0.7714 - accuracy: 0.7224 - val_loss: 0.8507 - val_accuracy: 0.6748
Epoch 6/10
1372/1372 - 13s - loss: 0.6612 - accuracy: 0.7666 - val_loss: 0.7834 - val_accuracy: 0.7205
Epoch 7/10
1372/1372 - 13s - loss: 0.5829 - accuracy: 0.7989 - val_loss: 0.7376 - val_accuracy: 0.7346
Epoch 8/10
1372/1372 - 12s - loss: 0.5261 - accuracy: 0.8177 - val_loss: 0.7274 - val_accuracy: 0.7378
Epoch 9/10
1372/1372 - 13s - loss: 0.4798 - accuracy: 0.8337 - val_loss: 0.7077 - val_accuracy: 0.7469
Epoch 10/10
1372/1372 - 13s - loss: 0.4410 - accuracy: 0.8483 - val_loss:

In [103]:
# predict values
pred = model.predict(test_padded)
submission[['0','1','2','3','4']] = pred
submission.to_csv('./nlp/submission_2.csv', index = False, encoding = 'utf-8')

