In [1]:
import pandas as pd
import numpy as np #to read table
from sklearn.model_selection import train_test_split #to split validation dataset
from nltk.tokenize import word_tokenize
from tensorflow.keras.preprocessing.text import Tokenizer #to tokenize sentence
from tensorflow.keras.preprocessing.sequence import pad_sequences #to add padding
from keras.utils.np_utils import to_categorical #to make one hot vector
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, Conv1D, GlobalMaxPooling1D, Dense #to make 1D CNN model

In [2]:
#read table (both files are comma-separated; only 'Sentence' and 'Category' are used below)
train_data = pd.read_csv('train_final.csv')
test_data = pd.read_csv('eval_final_open.csv')

#leave only alphabet and spaces
#regex=True must be explicit: pandas >= 2.0 treats the pattern as a literal string by
#default, which would silently leave every sentence uncleaned
non_alpha_pattern = "[^a-zA-Z ]"
train_data['Sentence'] = train_data['Sentence'].str.replace(non_alpha_pattern, "", regex=True)
test_data['Sentence'] = test_data['Sentence'].str.replace(non_alpha_pattern, "", regex=True)

#tokenization of train dataset and test dataset (one list of word tokens per sentence)
X_train = [word_tokenize(sentence) for sentence in train_data['Sentence']]
X_test = [word_tokenize(sentence) for sentence in test_data['Sentence']]

#vocabulary size used by the tokenizer and by the embedding layer of the model
vocabulary = 15000

#the word index is fitted on the training sentences only; the test sentences are
#converted with that same index
tokenizer = Tokenizer(num_words = vocabulary)
tokenizer.fit_on_texts(X_train)

X_train = tokenizer.texts_to_sequences(X_train)
X_test = tokenizer.texts_to_sequences(X_test)

#assign label of train dataset
y_train = np.array(train_data['Category'])

#number of classes taken from the full label array, before splitting.
#to_categorical infers the one-hot width from the largest label it is given, so encoding
#the train and validation labels separately could produce matrices of different widths
#when one split happens not to contain the highest class.
#assumes labels are non-negative integers 0..K-1 (required by to_categorical) - TODO confirm
num_classes = int(y_train.max()) + 1

#split train dataset and validation dataset
#shuffle=False keeps file order, so the last validation_size rows become the validation set
validation_size = 1000
X_train, X_val, y_train, y_val = train_test_split(
    X_train, y_train, train_size = len(X_train) - validation_size, shuffle = False)

#padding: every sequence is padded/truncated to the same fixed length
padding = 35
X_train = pad_sequences(X_train, maxlen = padding)
X_val = pad_sequences(X_val, maxlen = padding)
X_test = pad_sequences(X_test, maxlen = padding)

#one hot encoding with a fixed width shared by both splits
y_train = to_categorical(y_train, num_classes = num_classes)
y_val = to_categorical(y_val, num_classes = num_classes)

  train_data['Sentence'] = train_data['Sentence'].str.replace("[^a-zA-Z ]","")
  test_data['Sentence'] = test_data['Sentence'].str.replace("[^a-zA-Z ]","")


In [3]:
#make 1D CNN model: embedding -> 1D convolution -> global max pooling -> dense classifier
model = Sequential([
    Embedding(input_dim=vocabulary, output_dim=256),
    Conv1D(filters=256, kernel_size=3, padding='valid', activation='relu'),
    GlobalMaxPooling1D(),
    Dense(units=128, activation='relu'),
    Dense(units=5, activation='softmax'),  #one output unit per class
])

#learning
model.compile(
    optimizer='adam',
    loss='categorical_crossentropy',  #labels are one-hot vectors
    metrics=['acc'],
)
#kept as the last expression so the returned History object is shown as the cell output
model.fit(
    X_train,
    y_train,
    epochs=10,
    validation_data=(X_val, y_val),
)

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


<tensorflow.python.keras.callbacks.History at 0x1b2b6d35b50>

In [4]:
#make test label to submit
#the predicted label of each test row is the index of its highest score
y_temp = model.predict(X_test)
y_test = np.argmax(y_temp, axis=1)

#export as csv file: one label per line
#fmt='%d' writes the labels as plain integers; the savetxt default ('%.18e') would write
#them in scientific notation such as 1.000000000000000000e+00
y_test = np.reshape(y_test,(-1,1))
np.savetxt('sub_label.csv',y_test,fmt='%d',delimiter=",")