## CNN - Bag of Words model

In [None]:
# Import the necessary libraries
import numpy as np
import pandas as pd
from keras.models import Sequential
from keras.layers import Conv1D, GlobalMaxPooling1D, Dense, Dropout, GlobalAveragePooling1D
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import GridSearchCV
from keras.wrappers.scikit_learn import KerasClassifier
from keras.callbacks import EarlyStopping, ModelCheckpoint
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix, ConfusionMatrixDisplay
import matplotlib.pyplot as plt

In [None]:
# Read the dataset CSV (path is relative to the working directory) into a DataFrame
data = pd.read_csv(filepath_or_buffer='balanced.csv')

In [None]:
# Encode the class labels as integer ids; the fitted encoder is kept so the
# ids can be mapped back to the original labels later.
label_encoder = LabelEncoder()
encoded_labels = label_encoder.fit_transform(data['condition_label'])
data['condition_label'] = encoded_labels  # overwrites the original column

In [None]:
# Create Bag-of-Words representation
features = 500  # upper bound on the vocabulary size
vectorizer = CountVectorizer(max_features=features)
X_bow = vectorizer.fit_transform(data['clean_text']).toarray()
# max_features is only a cap: a corpus with fewer distinct tokens yields a
# narrower matrix. Keep `features` in sync with the real width so the model's
# input shape always matches the data.
features = X_bow.shape[1]

In [None]:
# Hold out 20% of the samples for testing; the fixed seed makes the split repeatable
X_train, X_test, y_train, y_test = train_test_split(
    X_bow,
    data['condition_label'],
    test_size=0.2,
    random_state=12,
)

In [None]:
# Reshape the target labels into column vectors of shape (n_samples, 1).
# np.asarray accepts both the pandas Series coming out of the split and an
# ndarray left by a previous run of this cell, so the cell can be re-executed
# without raising AttributeError on `.values`.
y_train = np.asarray(y_train).reshape(-1, 1)
y_test = np.asarray(y_test).reshape(-1, 1)

In [None]:
# Define CNN model
def create_model(filters=128, kernel_size=5, dense_units=128, dropout_rate=0.5,
                 num_classes=4, input_length=None):
    """Build and compile a 1-D CNN classifier over Bag-of-Words count vectors.

    Args:
        filters: Number of output filters of the Conv1D layer.
        kernel_size: Width of the convolution window.
        dense_units: Number of units in the hidden dense layer.
        dropout_rate: Fraction of hidden units dropped during training.
        num_classes: Size of the softmax output layer. The default of 4
            matches the previously hard-coded value.
        input_length: Length of each input vector. When None, the
            module-level ``features`` value is used, as before.

    Returns:
        A compiled ``Sequential`` model (Adam optimizer, sparse categorical
        cross-entropy loss, accuracy metric). Inputs must carry a trailing
        channel axis, i.e. shape ``(input_length, 1)`` per sample.
    """
    if input_length is None:
        # Resolved at call time so the model follows the current global value.
        input_length = features

    model = Sequential()
    model.add(Conv1D(filters, kernel_size, activation='relu', input_shape=(input_length, 1)))
    # Collapse the sequence axis by averaging each filter's activations.
    model.add(GlobalAveragePooling1D())
    model.add(Dense(dense_units, activation='relu'))
    model.add(Dropout(dropout_rate))
    model.add(Dense(num_classes, activation='softmax'))

    # The sparse loss takes integer class ids directly (no one-hot encoding).
    model.compile(optimizer='adam', loss='sparse_categorical_crossentropy', metrics=['accuracy'])
    return model

# scikit-learn compatible wrapper so the model can be used with GridSearchCV
model_wrapper = KerasClassifier(build_fn=create_model, verbose=0)

In [None]:
# Full hyperparameter search space for the CNN (3 candidate values per parameter)
param_grid = dict(
    filters=[64, 128, 256],
    kernel_size=[3, 5, 7],
    dense_units=[64, 128, 256],
    dropout_rate=[0.3, 0.5, 0.7],
)

In [None]:
# Single-point grid: one fixed value per hyperparameter, so a grid search over
# it evaluates exactly one configuration.
best_param_grid = dict(
    filters=[256],
    kernel_size=[5],
    dense_units=[256],
    dropout_rate=[0.3],
)

In [None]:
# Fit a 3-fold grid search over best_param_grid with early stopping and checkpointing.
# Stop a training run once val_loss has not improved for 3 consecutive epochs.
es = EarlyStopping(
    monitor='val_loss',
    mode='min',
    patience=3,
    verbose=1,
)
# Write the model to disk whenever val_accuracy reaches a new maximum.
# NOTE(review): the same callback instance and file are shared by every fit
# the search performs — confirm which run 'best_model.h5' ends up holding.
mc = ModelCheckpoint(
    'best_model.h5',
    monitor='val_accuracy',
    mode='max',
    save_best_only=True,
    verbose=1,
)

# Add a trailing channel axis: (samples, features) -> (samples, features, 1)
X_train_3d = X_train[..., np.newaxis]

grid = GridSearchCV(
    estimator=model_wrapper,
    param_grid=best_param_grid,
    cv=3,
    verbose=1,
)
# Extra keyword arguments are handed on to the estimator's fit call.
grid_result = grid.fit(
    X_train_3d,
    y_train,
    callbacks=[es, mc],
    validation_split=0.1,
)

print("Best Parameters: ", grid_result.best_params_)
print("Best Accuracy: ", grid_result.best_score_)

In [None]:
# Evaluate the best estimator's underlying Keras model on the held-out test set
best_model = grid_result.best_estimator_.model

# Add the trailing channel axis once and reuse it for evaluate and predict.
X_test_3d = X_test[:, :, np.newaxis]

loss, accuracy = best_model.evaluate(X_test_3d, y_test)
print(f"Test Loss (Best Model): {loss}, Test Accuracy (Best Model): {accuracy}")

# Softmax outputs -> predicted integer class id per sample
y_pred = best_model.predict(X_test_3d)
y_pred_classes = np.argmax(y_pred, axis=1)

# Map integer ids back to the original labels. y_test may be an (n, 1) column
# at this point; flatten it so the encoder receives the 1-D array it expects
# instead of emitting a column-vector conversion warning.
y_test_classes = label_encoder.inverse_transform(np.ravel(y_test))
y_pred_classes = label_encoder.inverse_transform(y_pred_classes)

In [None]:
# Build the text classification report for the best model's test predictions, then print it
report = classification_report(y_test_classes, y_pred_classes)
print(report)

In [None]:
# Define and display confusion matrix
def plot_confusion_matrix(y_test, y_pred, labels):
    """Draw the confusion matrix of ``y_pred`` against ``y_test``.

    Args:
        y_test: True labels.
        y_pred: Predicted labels.
        labels: Labels that fix both the row/column order of the matrix
            and the tick labels on the plot.

    Returns:
        None. The matrix is drawn on a new 6x6 figure. As a side effect the
        global matplotlib rcParams (font size and monospace font) are updated.
    """
    matrix = confusion_matrix(y_test, y_pred, labels=labels)
    cm_display = ConfusionMatrixDisplay(
        confusion_matrix=matrix,
        display_labels=labels,
    )
    _, axis = plt.subplots(figsize=(6, 6))
    font_settings = {'font.size': 13, 'font.monospace': 'Computer Modern Typewriter'}
    plt.rcParams.update(font_settings)
    # Greyscale colormap, no colorbar
    cm_display.plot(ax=axis, cmap="binary", colorbar=False)


plot_confusion_matrix(
    y_test_classes,
    y_pred_classes,
    labels=label_encoder.classes_,
)