In [2]:
import re
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt

import keras
from keras import optimizers
from keras import backend as K
from keras import regularizers
from keras.models import Sequential
from keras.layers import Dense, Activation, Dropout, Flatten
from keras.layers import Embedding, Conv1D, MaxPooling1D, GlobalMaxPooling1D 
from keras.utils import plot_model
from keras.preprocessing import sequence
from keras.preprocessing.text import Tokenizer
from keras.callbacks import EarlyStopping
import nltk
nltk.download('stopwords')
from tqdm import tqdm
import os, re, csv, math, codecs
from sklearn.preprocessing import LabelBinarizer,StandardScaler
sns.set_style("whitegrid")
np.random.seed(0)


  import pandas.util.testing as tm
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\amakr\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [3]:
def clean_str(string):
    """
    Tokenization/string cleaning for all datasets except for SST.
    Every dataset is lower cased except for TREC
    """
    string = re.sub(r"[^A-Za-z0-9(),!?\'\`]", " ", string)     
    string = re.sub(r",", " ", string) 
    string = re.sub(r"!", " ", string) 
    string = re.sub(r"\(", " ", string) 
    string = re.sub(r"\)", " ", string) 
    string = re.sub(r"\?", " ", string) 
    string = re.sub(r"\s{2,}", " ", string)   
    string = re.sub(" \d+", " ", string)
    return  string.strip().lower()

In [4]:
TRAIN_FILEPATH = "../Translated/cleaned/train.csv"
TEST_FILEPATH = "../Translated/cleaned/test.csv"
SS_FILEPATH = "../data/SampleSubmission.csv"
VECTORS_FILEPATH = ""
train = pd.read_csv(TRAIN_FILEPATH)
test = pd.read_csv(TEST_FILEPATH)
ss = pd.read_csv(SS_FILEPATH)

In [5]:
train["Text"] =train.Text.apply(lambda x: clean_str(x))
test["Text"] =test.Text.apply(lambda x: clean_str(x))


In [6]:
corpus = pd.concat([train.Text, test.Text],axis = 0,ignore_index = True)

In [7]:
from sklearn.feature_extraction.text import CountVectorizer
vectorizer = CountVectorizer()
X = vectorizer.fit_transform(corpus)

In [8]:
mat = X.toarray()
X_train = pd.DataFrame(mat[:train.shape[0]])
X_test = pd.DataFrame(mat[train.shape[0]:])

In [9]:
# X_train
X_train['Label'] = train['Label']
X_train['ID'] = train['ID']
X_test['ID'] = test['ID']
features = X_train.columns.to_list()[:-2]
lb = LabelBinarizer()
y_train_b = lb.fit_transform(X_train['Label']) 

In [10]:
batch_size = 64 
num_epochs =  50

#model parameters
num_filters = 64 
embed_dim = 500 
weight_decay = 1e-3
num_classes = train.Label.unique().shape[0]
early_stopping = EarlyStopping(monitor='val_accuracy', min_delta=0.01, patience=6, verbose=1)
callbacks_list = [early_stopping]

In [15]:
print("training CNN ...")
model = Sequential()
# model.add(keras.Input(shape=(X_train.shape[1]-2,1)))
model.add(Embedding(len(features), 20, trainable=True))
# Conv1D(filters=1, kernel_size=10 ,strides=10,     
#                   ,kernel_initializer= 'uniform',      
#                   activation= 'relu')
model.add(Conv1D(num_filters, 12, activation='relu', padding='same'))
model.add(MaxPooling1D(2))
model.add(Conv1D(num_filters, 10, activation='relu', padding='same'))
model.add(GlobalMaxPooling1D())
model.add(Dropout(0.2))
model.add(Dense(512, activation='relu', kernel_regularizer=regularizers.l2(weight_decay)))
model.add(Dense(256, activation='relu', kernel_regularizer=regularizers.l2(weight_decay)))
model.add(Dense(128, activation='relu', kernel_regularizer=regularizers.l2(weight_decay)))
model.add(Dense(64, activation='relu', kernel_regularizer=regularizers.l2(weight_decay)))

model.add(Dense(32, activation='relu', kernel_regularizer=regularizers.l2(weight_decay)))
model.add(Dense(num_classes, activation='sigmoid'))  #multi-label (k-hot encoding)

adam = optimizers.Adam(lr=0.001, beta_1=0.9, beta_2=0.999, epsilon=1e-08, decay=0.0)
model.compile(loss='binary_crossentropy', optimizer=adam, metrics=['accuracy'])
model.summary()

training CNN ...
Model: "sequential_1"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_1 (Embedding)      (None, None, 20)          1268780   
_________________________________________________________________
conv1d_1 (Conv1D)            (None, None, 64)          15424     
_________________________________________________________________
max_pooling1d_1 (MaxPooling1 (None, None, 64)          0         
_________________________________________________________________
conv1d_2 (Conv1D)            (None, None, 64)          41024     
_________________________________________________________________
global_max_pooling1d (Global (None, 64)                0         
_________________________________________________________________
dropout (Dropout)            (None, 64)                0         
_________________________________________________________________
dense_6 (Dense)              (None, 5

In [16]:
hist = model.fit(X_train[features], y_train_b, batch_size=batch_size, epochs=num_epochs,shuffle = True, callbacks=callbacks_list, validation_split=0.1, verbose=2)

Epoch 1/50
21/21 - 288s - loss: 1.1714 - accuracy: 0.0224 - val_loss: 0.7946 - val_accuracy: 0.0903
Epoch 2/50
21/21 - 281s - loss: 0.6060 - accuracy: 0.0898 - val_loss: 0.4546 - val_accuracy: 0.0764
Epoch 3/50
21/21 - 274s - loss: 0.4026 - accuracy: 0.1672 - val_loss: 0.3474 - val_accuracy: 0.1944
Epoch 4/50
21/21 - 274s - loss: 0.3236 - accuracy: 0.1943 - val_loss: 0.2922 - val_accuracy: 0.1944
Epoch 5/50
21/21 - 272s - loss: 0.2811 - accuracy: 0.1943 - val_loss: 0.2646 - val_accuracy: 0.1944
Epoch 6/50
21/21 - 271s - loss: 0.2564 - accuracy: 0.1943 - val_loss: 0.2464 - val_accuracy: 0.1944
Epoch 7/50


KeyboardInterrupt: 

In [13]:
X_train.shape

(1436, 63441)