In [1]:
import numpy as np
import pandas as pd

#Load the raw train and test splits from CSV files in the working directory
train_data = pd.read_csv(filepath_or_buffer="train.csv")
test_data = pd.read_csv(filepath_or_buffer="test.csv")

In [2]:
#Remove punctuation with a regex (only letters, digits, '!' and '?' are kept)

import re
import nltk
def get_clean_tokens(sentence):
    """Tokenize ``sentence`` with NLTK and strip unwanted characters from each token.

    Within every token, each run of characters other than ASCII letters,
    digits, '!' and '?' is deleted. Tokens that end up empty are dropped.

    Returns the remaining token strings as a list, in their original order.
    """
    cleaned_tokens = []
    for raw_token in nltk.word_tokenize(sentence):
        stripped = re.sub('[^0-9A-Za-z!?]+', '', raw_token)
        #Skip tokens that consisted only of removed characters
        if stripped:
            cleaned_tokens.append(stripped)
    return cleaned_tokens

In [3]:
#Preprocess for word2vec
#Some sources say there is no need to remove stopwords for word2vec

import copy
def tokenize_sentences(df):
    """Split each row's text into sentences and tokenize every sentence.

    The 'text' column is lower-cased, split on '.', and each piece is passed
    through ``get_clean_tokens``. Sentences that yield no tokens are removed.

    Returns a new dataframe with the columns 'id', 'label' and
    'tokenized_sentences' (a list of token lists per row). The input
    dataframe is not modified.
    """
    #Work on a deep copy so the caller's dataframe is left untouched
    frame = copy.deepcopy(df)
    frame['text'] = frame['text'].str.lower()
    frame['sentences'] = frame['text'].str.split('.')

    tokenized_column = []
    for sentence_list in frame.sentences:
        tokenized = [get_clean_tokens(sentence) for sentence in sentence_list]
        #Keep only sentences that still contain at least one token
        tokenized_column.append([tokens for tokens in tokenized if tokens])
    frame['tokenized_sentences'] = tokenized_column

    return frame[['id','label','tokenized_sentences']]
    
#Apply the same sentence tokenization to both splits
train = tokenize_sentences(df=train_data)
test = tokenize_sentences(df=test_data)

In [4]:
#Train word2vec model

from gensim.models import Word2Vec
#Flatten the per-document sentence lists into one corpus of token lists
train_sentences = [
    tokens
    for document_sentences in train.tokenized_sentences
    for tokens in document_sentences
]

#Skip-gram (sg=1) trained with negative sampling (hs=0, negative=5)
W2Vmodel = Word2Vec(
    sentences=train_sentences,
    size=200,
    window=6,
    min_count=3,
    sample=1e-3,
    sg=1,
    hs=0,
    negative=5,
    iter=6,
    workers=4,
)

In [5]:
#Preprocess for keras

from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
from keras.utils import to_categorical
from sklearn.model_selection import train_test_split

#Split trainset into train and evaluation, proportion of eva is 0.2.
#A fixed random_state makes the split, and therefore the reported validation
#scores, reproducible across kernel restarts.
SPLIT_SEED = 42
X_train, X_eva, y_train, y_eva = train_test_split(
    train_data['text'], train_data['label'], test_size=0.2, random_state=SPLIT_SEED)


#Set a number which is larger than vocab to keep all useful information
NUM_WORDS = 80000
#keras.preprocessing.text.Tokenizer is different from nltk.word_tokenize: it turns
#each text into a sequence of integer word indices.
tokenizer = Tokenizer(num_words=NUM_WORDS,filters='!"#$%&()*+,-./:;<=>?@[\\]^_`{|}~\t\n\'', lower=True)
#Fit on the training split only so the evaluation texts do not leak into the vocabulary
tokenizer.fit_on_texts(X_train)
X_train_sequences = tokenizer.texts_to_sequences(X_train)
X_eva_sequences = tokenizer.texts_to_sequences(X_eva)
word_index = tokenizer.word_index


#Make labels to be one-hot for keras input.
#The one-hot width is derived once from the full training labels; without an
#explicit num_classes, to_categorical infers it per call (max label + 1), so
#train and evaluation could end up with different widths if a split happened
#to miss the highest label.
NUM_CLASSES = int(train_data['label'].max()) + 1
y_train = to_categorical(np.asarray(y_train), num_classes=NUM_CLASSES)
y_val = to_categorical(np.asarray(y_eva), num_classes=NUM_CLASSES)
#Make input to be same length: evaluation is padded/truncated to the training length
X_train = pad_sequences(X_train_sequences)
X_val = pad_sequences(X_eva_sequences, maxlen=X_train.shape[1])

print('Shape of X train and X validation tensor:', X_train.shape,X_val.shape)
print('Shape of label train and validation tensor:', y_train.shape,y_val.shape)

Using TensorFlow backend.


Shape of X train and X validation tensor: (12800, 2039) (3200, 2039)
Shape of label train and validation tensor: (12800, 6) (3200, 6)


Note: the labels in the original training set range from 1 to 5, but `to_categorical` one-hot encodes them into 6 columns (class indices 0–5). Since no sample in train.csv is labeled 0, column 0 is always zero and I believe its influence can be ignored.

In [6]:
#Build the embedding layer of NN

import gensim
from gensim.models import Word2Vec
from gensim.utils import simple_preprocess
from gensim.models.keyedvectors import KeyedVectors

#Reuse the word2vec vectors trained in the earlier cell
word_vectors = W2Vmodel.wv
EMBEDDING_DIM=200 #The same as word2vec features

#The matrix has one row per index the tokenizer can emit, capped at NUM_WORDS
vocabulary_size=min(len(word_index)+1,NUM_WORDS)
#Rows start as zeros; any index never assigned below keeps a zero vector
embedding_matrix = np.zeros((vocabulary_size, EMBEDDING_DIM))

#Give embedding matrix value according to word2vec model
for word, i in word_index.items():
    #word_index holds every word seen while fitting, including ranks beyond
    #NUM_WORDS; those rows do not exist in the matrix, so skip them instead
    #of raising IndexError.
    if i >= vocabulary_size:
        continue
    try:
        embedding_matrix[i] = word_vectors[word]
    except KeyError:
        #Words missing from the word2vec vocabulary get a random vector
        #drawn from N(0, 0.25)
        embedding_matrix[i] = np.random.normal(0,np.sqrt(0.25),EMBEDDING_DIM)

del word_vectors

from keras.layers import Embedding
#Embedding layer initialised from the word2vec-based matrix and fine-tuned during training
embedding_layer = Embedding(
    input_dim=vocabulary_size,
    output_dim=EMBEDDING_DIM,
    weights=[embedding_matrix],
    trainable=True,
)

In [7]:
#Build other layers of NN

from keras.models import Model
from keras.layers import Input, Dense, Embedding, Conv2D, MaxPooling2D, Dropout, concatenate
from keras.layers.core import Reshape, Flatten
from keras.callbacks import EarlyStopping
from keras.optimizers import Adam
from keras.models import Model
from keras import regularizers
sequence_length = X_train.shape[1]
filter_sizes = [3,4,5]
num_filters = 100
drop = 0.5
#Width of the softmax output follows the one-hot label matrix
num_classes = y_train.shape[1]

inputs = Input(shape=(sequence_length,))
embedding = embedding_layer(inputs)
#Add a trailing channel axis so the embedded text can be fed to Conv2D
reshape = Reshape((sequence_length,EMBEDDING_DIM,1))(embedding)

maxpool=[]
for filter_size in filter_sizes:
    #Each kernel spans the full embedding width and `filter_size` consecutive
    #positions, so it slides along the sequence axis only.
    conv_temp = Conv2D(num_filters, (filter_size, EMBEDDING_DIM),activation='relu',kernel_regularizer=regularizers.l2(0.01))(reshape)
    #The pooling window covers the whole convolution output, keeping a single
    #maximum per filter (max-over-time pooling).
    maxpool_temp = MaxPooling2D((sequence_length - filter_size + 1, 1), strides=(1,1))(conv_temp)
    maxpool.append(maxpool_temp)

#Join the pooled features of all filter sizes and flatten them into one vector per sample
merged_tensor = concatenate(maxpool, axis=1)
flatten = Flatten()(merged_tensor)
#Dropout randomly disables part of the features during training to reduce overfitting.
dropout = Dropout(drop)(flatten)
#The dense softmax layer turns the pooled features into class probabilities
output = Dense(units=num_classes, activation='softmax', kernel_regularizer=regularizers.l2(0.01))(dropout)

model = Model(inputs, output)

In [8]:
#Sanity check: shapes of the validation inputs and one-hot labels
print(*(split.shape for split in (X_val, y_val)))

(3200, 2039) (3200, 6)


In [9]:
#Train CNN model

model.compile(
    optimizer=Adam(lr=1e-3),
    loss='categorical_crossentropy',
    metrics=['acc'],
)
#EarlyStopping watches the validation loss; all its other settings are left at their defaults
model.fit(
    X_train,
    y_train,
    validation_data=(X_val, y_val),
    epochs=5,
    verbose=1,
    callbacks=[EarlyStopping(monitor='val_loss')],
)

Train on 12800 samples, validate on 3200 samples
Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5


<keras.callbacks.History at 0x1a393b7ac8>

In [10]:
#Predict

#Encode the test texts with the already-fitted tokenizer and pad them to the
#same sequence length as the training input
X_test_sequences = tokenizer.texts_to_sequences(test_data.text)
X_test = pad_sequences(X_test_sequences, maxlen=X_train.shape[1])
y_pred=model.predict(X_test)

#The predicted label is the index of the largest softmax output per row
y_pred_label = y_pred.argmax(axis=1)
df_sub = pd.DataFrame({'id': test_data.id, 'pred': y_pred_label}, columns=['id', 'pred'])
df_sub.to_csv("deep_new2.csv", index=False)


Reference:

https://arxiv.org/pdf/1408.5882.pdf

https://www.kaggle.com/vukglisovic/classification-combining-lda-and-word2vec

https://www.kaggle.com/marijakekic/cnn-in-keras-with-pretrained-word2vec-weights/notebook

https://www.kaggle.com/moghazy/beginner-s-guide-to-cnns-with-keras-99-8
