In [None]:
import pandas as pd
import numpy as np
import tensorflow as tf
from tensorflow import keras
from tensorflow.keras import layers
import numpy as np
import pandas as pd
import json
import os

In [None]:
# Read in files
# Mount Google Drive first: every input below lives under /content/drive/MyDrive.
from google.colab import drive
drive.mount('/content/drive')

# Training split: embedding matrix (.npy) plus its accompanying table (.csv)
train_embeddings = np.load('/content/drive/MyDrive/train_bert_embeddings.npy')
train_labels = pd.read_csv("/content/drive/MyDrive/train_with_bert_embeddings.csv")

# Validation split
valid_embeddings = np.load('/content/drive/MyDrive/valid_bert_embeddings.npy')
valid_labels = pd.read_csv("/content/drive/MyDrive/valid_with_bert_embeddings.csv")

# Test split
test_embeddings = np.load('/content/drive/MyDrive/test_bert_embeddings.npy')
test_labels = pd.read_csv("/content/drive/MyDrive/test_with_bert_embeddings.csv")




Mounted at /content/drive


In [None]:
# Build the feature matrix (X) and label vector (Y) for each split.
# For every split: the 'embedding_index' column (cast to int) selects rows of
# the embedding matrix, and the 'label' column (cast to int) is the target.

# Training split
train_indices = train_labels['embedding_index'].values.astype(int)
train_X = train_embeddings[train_indices]
train_Y = train_labels['label'].values.astype(int)

# Validation split
valid_indices = valid_labels['embedding_index'].values.astype(int)
valid_X = valid_embeddings[valid_indices]
valid_Y = valid_labels['label'].values.astype(int)

# Test split
test_indices = test_labels['embedding_index'].values.astype(int)
test_X = test_embeddings[test_indices]
test_Y = test_labels['label'].values.astype(int)

# Print shapes to verify that X and Y line up row for row
print(f"train_X shape: {train_X.shape}, train_Y shape: {train_Y.shape}")
print(f"valid_X shape: {valid_X.shape}, valid_Y shape: {valid_Y.shape}")
print(f"test_X shape: {test_X.shape}, test_Y shape: {test_Y.shape}")

train_X shape: (133999, 768), train_Y shape: (133999,)
valid_X shape: (17223, 768), valid_Y shape: (17223,)
test_X shape: (17063, 768), test_Y shape: (17063,)


In [None]:
# L2 normalisation: rescale every embedding row to unit Euclidean norm.
#
# The transformer is fitted on the training split only; the validation and
# test splits are only transformed. Normalizer works row by row and learns
# nothing from the data, so the numbers are the same as with fit_transform,
# but keeping "fit on train, transform elsewhere" makes all held-out splits go
# through the same path and stays leak-free if a stateful scaler is swapped in.

from sklearn.preprocessing import Normalizer

normalizer = Normalizer(norm='l2')
train_X_normalised = normalizer.fit_transform(train_X)
valid_X_normalised = normalizer.transform(valid_X)
test_X_normalised = normalizer.transform(test_X)


In [None]:
# Baseline: linear-kernel SVM trained on the L2-normalised training embeddings.
# random_state is pinned (same seed as the SMOTE step) because LinearSVC's
# dual solver shuffles the data; without a seed the fitted model, and the
# scores reported below, can change between runs.
from sklearn.svm import LinearSVC
svm = LinearSVC(random_state=42)
svm.fit(train_X_normalised, train_Y)


In [None]:
# Evaluate the fitted LinearSVC on the validation split
from sklearn.metrics import classification_report, accuracy_score, confusion_matrix

# Predicted class for every validation row
valid_Y_pred = svm.predict(valid_X_normalised)

# Compute the metrics first, then report them
valid_accuracy = accuracy_score(valid_Y, valid_Y_pred)
valid_report = classification_report(valid_Y, valid_Y_pred)

print("Accuracy:", valid_accuracy)
print("Classification report:\n", valid_report)


Accuracy: 0.43650931893398365
Classification report:
               precision    recall  f1-score   support

           0       0.51      0.63      0.56      4137
           1       0.37      0.37      0.37      3493
           2       0.39      0.25      0.30      2919
           3       0.45      0.49      0.47      3671
           4       0.41      0.37      0.39      3003

    accuracy                           0.44     17223
   macro avg       0.42      0.42      0.42     17223
weighted avg       0.43      0.44      0.43     17223



In [None]:
# Try SMOTE to fix the class imbalance: oversample every class up to the size
# of the largest class.
from imblearn.over_sampling import SMOTE
import numpy as np

# Check the number of samples in each class before SMOTE
class_counts = np.bincount(train_Y)
print("Class counts before SMOTE:", class_counts)

# Target size = size of the largest class, wherever it sits in label order.
# (Hard-coding class_counts[0] only works while class 0 is the majority:
# SMOTE can only add samples and rejects a target below a class's current size.)
majority_count = int(class_counts.max())

# Labels with zero samples are left out; SMOTE cannot synthesise a class
# that does not occur in the data.
sampling_targets = {
    label: majority_count
    for label, count in enumerate(class_counts)
    if count > 0
}

# Initialize SMOTE (seeded so the synthetic samples are reproducible)
smote = SMOTE(sampling_strategy=sampling_targets, random_state=42)

# Apply SMOTE to create synthetic samples
# NOTE(review): SMOTE builds new rows by interpolating between neighbouring
# samples, so the synthetic rows are presumably no longer exactly unit-norm even
# though the inputs were L2-normalised — consider re-normalising train_X_smote.
train_X_smote, train_Y_smote = smote.fit_resample(train_X_normalised, train_Y)

# Check the new class counts after SMOTE
class_counts_smote = np.bincount(train_Y_smote)
print("Class counts after SMOTE:", class_counts_smote)


Class counts before SMOTE: [31384 27748 23281 27744 23842]
Class counts after SMOTE: [31384 31384 31384 31384 31384]


In [None]:
# Linear-kernel SVM again, this time trained on the SMOTE-balanced training set.
# random_state is pinned (same seed as the SMOTE step) because LinearSVC's
# dual solver shuffles the data; without a seed the fitted model, and the
# scores reported below, can change between runs.
from sklearn.svm import LinearSVC
svm = LinearSVC(random_state=42)
svm.fit(train_X_smote, train_Y_smote)

In [None]:
# Evaluate the most recently fitted LinearSVC on the (un-resampled) validation split
from sklearn.metrics import classification_report, accuracy_score, confusion_matrix

# Predicted class for every validation row
valid_Y_pred = svm.predict(valid_X_normalised)

# Compute the metrics first, then report them
valid_accuracy = accuracy_score(valid_Y, valid_Y_pred)
valid_report = classification_report(valid_Y, valid_Y_pred)

print("Accuracy:", valid_accuracy)
print("Classification report:\n", valid_report)

Accuracy: 0.4369738140858155
Classification report:
               precision    recall  f1-score   support

           0       0.56      0.58      0.57      4137
           1       0.39      0.28      0.32      3493
           2       0.36      0.32      0.34      2919
           3       0.45      0.51      0.48      3671
           4       0.38      0.45      0.41      3003

    accuracy                           0.44     17223
   macro avg       0.42      0.43      0.42     17223
weighted avg       0.43      0.44      0.43     17223



In [None]:
# Hyper-parameter search for the linear SVM with 5-fold cross-validation.
#
# SMOTE runs INSIDE the pipeline and the search is fitted on the original
# (normalised, un-resampled) training data. Cross-validating on data that was
# oversampled beforehand puts synthetic points, interpolated from training-fold
# rows, into the held-out folds, which leaks information and biases the CV
# score. With the pipeline, SMOTE is fitted on each training fold only and the
# held-out fold contains real samples only. The sampler is skipped at predict
# time, so grid_search.predict(...) and grid_search.best_estimator_.predict(...)
# accept plain normalised embeddings as before.
#
# Cost note: SMOTE is re-run for every fit (candidates x folds), so this cell
# takes longer than searching on a pre-resampled matrix.
from imblearn.over_sampling import SMOTE
from imblearn.pipeline import Pipeline
from sklearn.metrics import classification_report, accuracy_score
from sklearn.model_selection import GridSearchCV
from sklearn.svm import LinearSVC

# Define parameter grid; the "svc__" prefix routes each value to the
# LinearSVC step of the pipeline.
param_grid = {
    'svc__C': [0.1, 1, 10, 100],  # Regularization parameter
    'svc__max_iter': [1000, 2000, 3000],  # Max number of iterations
    'svc__tol': [1e-4, 1e-3, 1e-2]  # Tolerance for stopping criteria
}

# Initialize the model (seeded so repeated runs give the same fit)
linear_svc = LinearSVC(random_state=42)

# Oversample every class except the largest up to the largest class's size,
# then fit the SVM on the balanced fold.
svm_pipeline = Pipeline([
    ('smote', SMOTE(sampling_strategy='not majority', random_state=42)),
    ('svc', linear_svc),
])

# Set up GridSearchCV with 5-fold cross-validation
grid_search = GridSearchCV(svm_pipeline, param_grid, cv=5, n_jobs=-1)

# Fit GridSearchCV to the un-resampled training data (SMOTE happens per fold)
grid_search.fit(train_X_normalised, train_Y)

# Print best hyperparameters and score from GridSearchCV
# (parameter names are reported with their "svc__" pipeline prefix)
print("Best Parameters:", grid_search.best_params_)
print("Best Cross-validation Score:", grid_search.best_score_)

# Evaluate the model on validation set
valid_Y_pred = grid_search.predict(valid_X_normalised)

# Print accuracy and classification report for validation set
print("Validation Accuracy:", accuracy_score(valid_Y, valid_Y_pred))
print("Classification Report:\n", classification_report(valid_Y, valid_Y_pred))


Best Parameters: {'C': 1, 'max_iter': 1000, 'tol': 0.0001}
Best Cross-validation Score: 0.43095845016568957
Validation Accuracy: 0.4369738140858155
Classification Report:
               precision    recall  f1-score   support

           0       0.56      0.58      0.57      4137
           1       0.39      0.28      0.32      3493
           2       0.36      0.32      0.34      2919
           3       0.45      0.51      0.48      3671
           4       0.38      0.45      0.41      3003

    accuracy                           0.44     17223
   macro avg       0.42      0.43      0.42     17223
weighted avg       0.43      0.44      0.43     17223



In [None]:
# Final check on the held-out test split
from sklearn.metrics import classification_report, accuracy_score

# Predict on the test set using the best estimator found by the grid search
best_model = grid_search.best_estimator_
test_Y_pred = best_model.predict(test_X_normalised)

# Compute the metrics first, then report them
test_accuracy = accuracy_score(test_Y, test_Y_pred)
test_report = classification_report(test_Y, test_Y_pred)

print("Test Accuracy:", test_accuracy)
print("Classification Report:\n", test_report)



Test Accuracy: 0.4323975854187423
Classification Report:
               precision    recall  f1-score   support

           0       0.54      0.58      0.56      3934
           1       0.38      0.26      0.31      3546
           2       0.36      0.35      0.35      2911
           3       0.45      0.52      0.48      3619
           4       0.37      0.42      0.40      3053

    accuracy                           0.43     17063
   macro avg       0.42      0.43      0.42     17063
weighted avg       0.43      0.43      0.43     17063

