In [42]:
import numpy as np 
import pandas as pd 

from sklearn.metrics import  classification_report
from sklearn.preprocessing import LabelEncoder
from keras.preprocessing.text import Tokenizer
from keras.models import Sequential
from keras.layers import Dense, Dropout, Embedding,  Conv1D, MaxPooling1D, GlobalAveragePooling1D,  Flatten, GRU, SpatialDropout1D, Bidirectional
from tensorflow.keras.optimizers import Adam
from keras.regularizers import l2
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.utils import to_categorical
from tensorflow.keras import  regularizers
from tensorflow.keras.layers import BatchNormalization
from sklearn.model_selection import train_test_split

import warnings
warnings.filterwarnings('ignore')
import matplotlib.pyplot as plt


In [31]:
# Load the training and test splits from CSV files in the working directory.
train, test = (pd.read_csv(csv_name) for csv_name in ("train.csv", "test.csv"))

In [32]:
# The comment text is the model input; the "label" column is the target.
X_train = train.comment
X_test = test.comment
y_train = train["label"]
y_test = test["label"]

# Learn the label -> integer mapping from the training labels ONLY.
label_encoder = LabelEncoder()
y_train = label_encoder.fit_transform(y_train)

# Fix the one-hot width from the fitted classes so both splits share it.
num_classes = len(label_encoder.classes_)
y_train = to_categorical(y_train, num_classes=num_classes)

# Re-use the fitted mapping for the test labels. Re-fitting here (as the
# previous fit_transform call did) could assign different integers to the
# same label whenever the set of labels in test differs from train, which
# silently corrupts every downstream metric. transform() instead raises on
# a label that was never seen during training.
y_test = label_encoder.transform(y_test)
y_test = to_categorical(y_test, num_classes=num_classes)

In [33]:
# Every comment is later padded / truncated to this many tokens.
max_len = 100

# Fit a tokenizer on the training comments to measure the vocabulary.
tokenizer = Tokenizer()
tokenizer.fit_on_texts(X_train)

# +1 leaves room for index 0, which the Keras Tokenizer never assigns to a word.
max_words = 1 + len(tokenizer.word_index)


In [34]:
# Training / embedding hyperparameters
batch_size = 128        # passed to model.fit() as the mini-batch size
embedding_size = 100    # output dimension given to the Embedding layers

# Convolution hyperparameters
# NOTE(review): the model cells visible in this notebook pass literal values to
# Conv1D / MaxPooling1D instead of reading these three names - confirm whether
# they are still needed.
filters = 64
kernel_size = 5
pool_size = 4

In [35]:
# Convert the raw comments into lists of integer word indices.
# The tokenizer is fitted on the training comments only, then applied to both
# splits; the right-hand side is evaluated in full before either name is rebound.
tokenizer = Tokenizer(num_words=max_words)
tokenizer.fit_on_texts(list(X_train))
X_train, X_test = (
    tokenizer.texts_to_sequences(X_train),
    tokenizer.texts_to_sequences(X_test),
)


In [36]:
# Bring every token sequence to exactly max_len entries so the splits become
# rectangular arrays, then show the resulting shapes as a sanity check.
X_train, X_test = (
    pad_sequences(sequences, maxlen=max_len) for sequences in (X_train, X_test)
)
print(X_train.shape, X_test.shape)


(11320, 100) (2831, 100)


In [37]:
def _plot_metric_panel(position, train_values, val_values, title, ylabel, pad):
    """Draw one training-vs-validation curve in subplot `position` of the current figure."""
    plt.subplot(position)

    plt.plot(train_values)
    plt.plot(val_values)

    plt.title(title, fontsize=17)
    plt.xlabel('Epoch', fontsize=15)
    plt.ylabel(ylabel, fontsize=15)
    # Limit the y-axis to the observed range, widened by `pad` on both sides.
    plt.ylim(min(min(train_values), min(val_values)) - pad,
             max(max(train_values), max(val_values)) + pad)
    plt.legend(['Training', 'Validation'], loc='upper left')
    plt.grid()


def plot_performance(history=None, figure_directory=None, ylim_pad=(0, 0)):
    """Plot accuracy and loss curves for training and validation side by side.

    Parameters
    ----------
    history : object with a ``history`` mapping
        Must expose ``history.history`` containing the keys ``'accuracy'``,
        ``'val_accuracy'``, ``'loss'`` and ``'val_loss'`` (in this notebook,
        the value returned by ``model.fit`` with validation data).
    figure_directory : str, optional
        When truthy, the figure is also saved to ``figure_directory + "/history"``.
    ylim_pad : sequence of two numbers, optional
        Extra margin added below/above the observed range on the y-axis;
        element 0 applies to the accuracy panel, element 1 to the loss panel.

    Raises
    ------
    ValueError
        If ``history`` is not supplied.
    """
    if history is None:
        # Fail with a clear message instead of "'NoneType' has no attribute 'history'".
        raise ValueError("plot_performance() requires a training history object")

    # Read all four series up front so a missing key fails before any drawing.
    metrics = history.history
    accuracy, val_accuracy = metrics['accuracy'], metrics['val_accuracy']
    loss, val_loss = metrics['loss'], metrics['val_loss']

    plt.figure(figsize=(20, 5))

    _plot_metric_panel(121, accuracy, val_accuracy,
                       'Model Accuracy\n', 'Accuracy', ylim_pad[0])
    _plot_metric_panel(122, loss, val_loss,
                       'Model Loss\n', 'Loss', ylim_pad[1])

    if figure_directory:
        plt.savefig(figure_directory+"/history")

    plt.show()

## CNN-Based Model

In [40]:
# CNN classifier: embedding -> Conv1D -> global average pooling -> dense head.
model1 = Sequential()
model1.add(Embedding(max_words, embedding_size, input_length=max_len))
model1.add(Dropout(0.5))
model1.add(Conv1D(256, kernel_size=3, padding='same', activation='relu', strides=1))
model1.add(GlobalAveragePooling1D())
model1.add(Dropout(0.5))
# NOTE(review): no activation is passed to this Dense layer - confirm that a
# linear layer ahead of BatchNormalization is intended.
model1.add(Dense(
    units=128,
    kernel_regularizer=regularizers.l1_l2(l1=1e-5, l2=1e-4),
    bias_regularizer=regularizers.l2(1e-4),
    activity_regularizer=regularizers.l2(1e-5)
))
model1.add(BatchNormalization())
model1.add(Dropout(0.5))
model1.add(Dense(2, activation='softmax'))
model1.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy'])
model1.summary()

# Hold out 10% of the training data for validation. The split is bound to new
# names (X_fit / y_fit) so that X_train / y_train stay intact: previously they
# were overwritten with the 90% subset, so re-running this cell kept shrinking
# the training set.
X_fit, X_val, y_fit, y_val = train_test_split(
    X_train, y_train, test_size=0.1, random_state=42
)
history3 = model1.fit(
    X_fit, y_fit,
    validation_data=(X_val, y_val),
    epochs=2,
    batch_size=batch_size,
    verbose=1,
)

# Generate class-probability predictions on the test set
Y_pred = model1.predict(X_test)
# Softmax probabilities -> predicted class index; one-hot targets -> true class index
Y_pred_labels = np.argmax(Y_pred, axis=1)
Y_true_labels = np.argmax(y_test, axis=1)

# Print classification report
print(classification_report(Y_true_labels, Y_pred_labels))


# Visualization
plot_performance(history=history3)

## CNN + GRU

In [44]:
# CNN + GRU classifier: two Conv1D/MaxPooling1D stages feed a bidirectional GRU,
# whose per-timestep outputs are flattened into a small dense head.
model4 = Sequential()
model4.add(Embedding(max_words, embedding_size, input_length=max_len))
model4.add(Conv1D(256, kernel_size=3, padding='same', activation='relu'))
model4.add(MaxPooling1D(pool_size=2))
model4.add(Conv1D(256, kernel_size=3, padding='same', activation='relu'))
model4.add(MaxPooling1D(pool_size=2))
model4.add(Dropout(0.25))
model4.add(SpatialDropout1D(0.25))
model4.add(Bidirectional(GRU(256, return_sequences=True)))
model4.add(Dropout(0.5))
model4.add(Flatten())
model4.add(Dense(32, activation='relu', kernel_regularizer=l2(0.001)))
model4.add(Dropout(0.5))
model4.add(Dense(2, activation='softmax', kernel_regularizer=l2(0.001)))
# `learning_rate` is the supported keyword; the old `lr` alias is deprecated
# and rejected by newer Keras releases.
model4.compile(
    loss='categorical_crossentropy',
    optimizer=Adam(learning_rate=0.01),
    metrics=['accuracy'],
)
model4.summary()


# Hold out 10% of the training data for validation. The split is bound to new
# names (X_fit / y_fit) so that X_train / y_train are not overwritten with the
# 90% subset; re-running this cell therefore no longer shrinks the training set.
X_fit, X_val, y_fit, y_val = train_test_split(
    X_train, y_train, test_size=0.1, random_state=42
)
history3 = model4.fit(
    X_fit, y_fit,
    validation_data=(X_val, y_val),
    epochs=1,
    batch_size=batch_size,
    verbose=1,
)

# Generate class-probability predictions on the test set
Y_pred = model4.predict(X_test)
# Softmax probabilities -> predicted class index; one-hot targets -> true class index
Y_pred_labels = np.argmax(Y_pred, axis=1)
Y_true_labels = np.argmax(y_test, axis=1)

# Print classification report
print(classification_report(Y_true_labels, Y_pred_labels))

# Visualization
plot_performance(history=history3)