## CNN - pre-trained W2V

In [None]:
# Import the necessary libraries
import numpy as np
import pandas as pd
import gensim
from keras.models import Sequential
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from keras.preprocessing.text import Tokenizer
from keras_preprocessing.sequence import pad_sequences
from sklearn.model_selection import GridSearchCV
from keras.wrappers.scikit_learn import KerasClassifier
from keras.layers import Conv1D, GlobalMaxPooling1D, Dense, Embedding, AveragePooling1D, GlobalAveragePooling1D, Dropout
from gensim.models import word2vec
from gensim.models.word2vec import Word2Vec
import spacy
import string
from keras.callbacks import EarlyStopping, ModelCheckpoint
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix, ConfusionMatrixDisplay
import matplotlib.pyplot as plt

In [None]:
# Load the pre-trained Word2Vec vectors from the binary word2vec-format file
from gensim.models.word2vec import Word2Vec
from gensim.models import KeyedVectors
w2v_model = KeyedVectors.load_word2vec_format(
    fname='/datasets/drive/GoogleNews-vectors-negative300.bin',
    binary=True,
)

In [None]:
# Read balanced.csv into a DataFrame; later cells use its
# 'clean_text' and 'condition_label' columns.
data = pd.read_csv(filepath_or_buffer='balanced.csv')

In [None]:
# Replace the condition labels with integer class ids. The fitted encoder is
# kept so that predicted ids can be mapped back to label names afterwards.
label_encoder = LabelEncoder().fit(data['condition_label'])
data['condition_label'] = label_encoder.transform(data['condition_label'])

In [None]:
# Convert each cleaned text into a sequence of word indices, then bring every
# sequence to the same fixed length.
max_sequence_length = 250

tokenizer = Tokenizer()
tokenizer.fit_on_texts(data['clean_text'])  # learn the vocabulary

sequences = tokenizer.texts_to_sequences(data['clean_text'])
X_padded = pad_sequences(
    sequences,
    maxlen=max_sequence_length,
)

In [None]:
# Create embedding matrix using Word2Vec.
# Row i holds the pre-trained vector of the word whose tokenizer index is i.
# Rows for words missing from the Word2Vec vocabulary stay all-zero, as does
# any row index that word_index never assigns.
embedding_dim = 300
word_index = tokenizer.word_index
# One extra row so that the largest word index is still a valid row.
# (Keras Tokenizer indices presumably start at 1, leaving row 0 for padding.)
num_words = len(word_index) + 1
embedding_matrix = np.zeros((num_words, embedding_dim))
for word, i in word_index.items():
    # No upper-bound check on i: num_words is derived from word_index itself,
    # so every index already fits inside the matrix.
    if word in w2v_model:
        embedding_matrix[i] = w2v_model[word]

In [None]:
# Hold out 20% of the samples for testing; the fixed seed keeps the split
# reproducible between runs.
X_train, X_test, y_train, y_test = train_test_split(
    X_padded,
    data['condition_label'],
    random_state=12,
    test_size=0.2,
)

In [None]:
# Define CNN model
def create_model(embedding_dim=300, filters=128, kernel_size=5, dense_units=128, dropout_rate=0.5, num_classes=4):
    """Build and compile the 1-D CNN text classifier.

    The network is: frozen embedding layer initialised from the module-level
    ``embedding_matrix`` -> Conv1D -> global average pooling -> Dense ->
    Dropout -> softmax output.

    Args:
        embedding_dim: Width of the embedding vectors. Must equal the number
            of columns of ``embedding_matrix``, since that matrix is loaded
            as the layer's weights.
        filters: Number of convolution filters.
        kernel_size: Width of the convolution window, in tokens.
        dense_units: Size of the hidden dense layer.
        dropout_rate: Dropout rate applied after the hidden dense layer.
        num_classes: Number of output classes (size of the softmax layer).
            Defaults to 4, the value that was previously hard-coded.

    Returns:
        A compiled Keras ``Sequential`` model that uses sparse categorical
        cross-entropy (integer class ids as targets) and reports accuracy.
    """
    model = Sequential()
    # trainable=False keeps the pre-trained vectors fixed during training.
    model.add(Embedding(input_dim=num_words, output_dim=embedding_dim, weights=[embedding_matrix], input_length=max_sequence_length, trainable=False))
    model.add(Conv1D(filters, kernel_size, activation='relu'))
    model.add(GlobalAveragePooling1D())
    model.add(Dense(dense_units, activation='relu'))
    model.add(Dropout(dropout_rate))
    model.add(Dense(num_classes, activation='softmax'))
    model.compile(optimizer='adam', loss='sparse_categorical_crossentropy', metrics=['accuracy'])
    return model

# Wrap the builder so scikit-learn's model-selection tools can drive it.
model = KerasClassifier(build_fn=create_model, verbose=0)

In [None]:
# Candidate values for each tunable argument of create_model; the grid search
# tries every combination of them.
param_grid = dict(
    filters=[64, 128, 256],
    kernel_size=[3, 5, 7],
    dense_units=[64, 128, 256],
    dropout_rate=[0.3, 0.5, 0.7],
)

In [None]:
# Use GridSearchCV to define best hyperparameters

# Upper bound on training epochs per fit. Without an explicit `epochs`, Keras
# trains for a single epoch, so early stopping with patience=3 could never
# trigger; EarlyStopping now decides where training actually ends.
max_epochs = 20

es = EarlyStopping(monitor='val_loss', mode='min', verbose=1, patience=3)
# NOTE(review): this single checkpoint callback is shared by every fit that
# GridSearchCV runs (all candidates, all folds, and the final refit), so
# best_model.h5 is not guaranteed to hold the weights of best_estimator_ -
# confirm this is what is wanted before relying on the file.
mc = ModelCheckpoint('best_model.h5', monitor='val_accuracy', mode='max', save_best_only=True, verbose=1)

grid = GridSearchCV(estimator=model, param_grid=param_grid, cv=3)
# validation_split carves 10% off each training fold so that the callbacks
# have val_loss / val_accuracy to monitor.
grid_result = grid.fit(X_train, y_train, validation_split=0.1, epochs=max_epochs, callbacks=[es, mc])

print("Best Parameters: ", grid_result.best_params_)
print("Best Accuracy: ", grid_result.best_score_)

In [None]:
# Make predictions on the test data
# Take the underlying Keras model of the best estimator found by the search.
best_model = grid_result.best_estimator_.model
loss, accuracy = best_model.evaluate(X_test, np.array(y_test))
print(f"Test Loss: {loss}, Test Accuracy: {accuracy}")

# Feed predict() the same 2-D array of padded word indices that evaluate()
# received above. The model's first layer is an Embedding over index
# sequences, so no extra trailing axis must be added to the input.
y_pred = best_model.predict(X_test)
# Softmax output -> index of the most probable class for each sample.
y_pred_indices = np.argmax(y_pred, axis=1)

# Map integer class ids back to the original label names for reporting.
y_test_classes = label_encoder.inverse_transform(y_test)
y_pred_classes = label_encoder.inverse_transform(y_pred_indices)

In [None]:
# Print the per-class classification report for the best model's
# test-set predictions.
print(classification_report(y_true=y_test_classes, y_pred=y_pred_classes))

In [None]:
# Define and display confusion matrix
def plot_confusion_matrix(y_test, y_pred, labels):
    """Draw the confusion matrix of ``y_pred`` against ``y_test``.

    The matrix is computed with the classes ordered as in ``labels`` and is
    drawn on a new 6x6 figure using the "binary" colour map, without a
    colour bar. The function updates the global matplotlib rcParams (font
    size and monospace font) and returns nothing.

    Args:
        y_test: True class labels.
        y_pred: Predicted class labels.
        labels: Class labels, used both to order the matrix and as the
            tick labels of the plot.
    """
    matrix = confusion_matrix(y_test, y_pred, labels=labels)
    display = ConfusionMatrixDisplay(confusion_matrix=matrix, display_labels=labels)
    _, axis = plt.subplots(figsize=(6, 6))
    font_settings = {'font.size': 13, 'font.monospace': 'Computer Modern Typewriter'}
    plt.rcParams.update(font_settings)
    display.plot(ax=axis, cmap="binary", colorbar=False)


plot_confusion_matrix(y_test_classes, y_pred_classes, labels=label_encoder.classes_)