**Binary Classifiers - Machine Learning and Foundations - Final Assignment**

***Importing required libraries***

In [None]:
from google.colab import drive
drive.mount('/content/drive')

In [None]:
import pandas as pd
import re
import string
import nltk
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from collections import Counter
from sklearn.model_selection import train_test_split, GridSearchCV, cross_val_score, KFold
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC
from sklearn.metrics import classification_report, confusion_matrix, ConfusionMatrixDisplay, accuracy_score, f1_score
from imblearn.over_sampling import SMOTE
from sklearn.manifold import TSNE
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import TensorDataset, DataLoader
import pickle

***Setting SEED to Ensure Reproducibility***

In [None]:
# Seed and downloads
SEED = 42
nltk.download('stopwords')
nltk.download('wordnet')
nltk.download('punkt')
np.random.seed(SEED)
torch.manual_seed(SEED)

In [None]:
# Load and preprocess dataset
df = pd.read_csv('/content/drive/MyDrive/ML Final/2.csv')
df = df.drop(columns=['Unnamed: 0', 'authors', 'link', 'date'])
print(df.head())
df['combined_text'] = df['headline'] + " " + df['short_description']
df.drop(columns=['headline', 'short_description'], inplace=True)
df = df.dropna()

Dropped authors, link and date as they won't be relevant to extract features from.

Function to clean the text data and prepare it for analysis.

In [None]:
# Clean text
stop_words = set(stopwords.words('english'))
lemmatizer = WordNetLemmatizer()
def clean_text(text):
    text = text.lower()
    text = re.sub(r"[^a-zA-Z?.!,¿]+|http\S+", " ", text)
    text = ''.join([char for char in text if char not in string.punctuation])
    words = [word for word in text.split() if word not in stop_words]
    words = [lemmatizer.lemmatize(word) for word in words]
    return " ".join(words)

In [None]:
df['cleaned_text'] = df['combined_text'].apply(clean_text)

The category distribution below shows an imbalance in our dataset where more articles are in the 'ENTERTAINMENT' category than in 'STYLE.' This means that the STYLE category is underrepresented in our dataset, this might lead to errors in classifying the articles from that category.

In [None]:
# Plot category distribution
def plot_category_distribution(data):
    plt.figure(figsize=(10, 5))
    sns.countplot(data=data, x='category')
    plt.title('Number of Articles in Each Category')
    plt.xlabel('Category')
    plt.ylabel('Number of Articles')
    plt.show()

plot_category_distribution(df)

Plotting the most frequent words from both categories.

In [None]:
# Plot most common words for each category
def plot_most_common_words(data, category, num_words=20):
    category_data = data[data['category'] == category]
    all_words = [word for text in category_data['cleaned_text'] for word in text.split()]
    word_freq = Counter(all_words).most_common(num_words)
    words, counts = zip(*word_freq)
    plt.figure(figsize=(10, 6))
    sns.barplot(x=counts, y=words, hue=words, palette='viridis')
    plt.title(f'Most Common Words in {category}')
    plt.xlabel('Frequency')
    plt.ylabel('Words')
    plt.show()

In [None]:
categories = df['category'].unique()
for category in categories:
    plot_most_common_words(df, category)

Mapping the categorical variables to numbers for training purposes.

In [None]:
# Prepare data for training
df_cleaned = df[['category', 'cleaned_text']].dropna()
category_mapping = {'ENTERTAINMENT': 0, 'STYLE': 1}
df_cleaned['category'] = df_cleaned['category'].map(category_mapping)

***Splitting the Data***:
Our original dataset has been split into train, valid and test datasets in the ratio 70:15:15.

In [None]:
train_data, test_data = train_test_split(df_cleaned, test_size=0.3, random_state=SEED, stratify=df_cleaned['category'])
valid_data, test_data = train_test_split(test_data, test_size=0.5, random_state=SEED, stratify=test_data['category'])

In [None]:
# Save split datasets
train_data.to_csv('train.csv', index=False)
valid_data.to_csv('valid.csv', index=False)
test_data.to_csv('test.csv', index=False)

In [None]:
# Load split datasets
train = pd.read_csv('train.csv')
valid = pd.read_csv('valid.csv')

Using TfIDF vectoriser to extract features from the text data. 'n_grams' is used to ensure that uni-grams and bi-grams are taken into consideration while extracting features. max_features is set to 5000 after trying a varied feature range while training models and 5000 was found to be the optimum feature size for the best F1 score.

**F1-Score:** is taken to be our primary metric as this takes into consideration both precision and recall when calculated. This ensures we acknowledge all the misclassifications when trying to improve the model performance.

In [None]:
# Vectorize text data
tfidf_vectorizer = TfidfVectorizer(max_features=5000, ngram_range=(1,2))
x_train_vectorized = tfidf_vectorizer.fit_transform(train['cleaned_text'])
x_valid_vectorized = tfidf_vectorizer.transform(valid['cleaned_text'])
y_train = train['category']
y_valid = valid['category']

In [None]:
n_samples = 1000
indices = np.random.choice(range(x_train_vectorized.shape[0]), size=n_samples, replace=False)
x_subset = x_train_vectorized[indices]
y_subset = y_train.iloc[indices]
tsne = TSNE(n_components=2, verbose=1, perplexity=40, n_iter=300)
tsne_results = tsne.fit_transform(x_subset.toarray())
plt.figure(figsize=(12, 8))
scatter = plt.scatter(tsne_results[:, 0], tsne_results[:, 1], c=y_subset, cmap='viridis', alpha=0.6)
plt.colorbar(scatter)
plt.title('t-SNE visualization of Text Data')
plt.xlabel('Component 1')
plt.ylabel('Component 2')
plt.show()

It is observed from the above TSNE plot that there is a feature overlap between the two categories. Hence, it is not suggested that we use linear models. Therefor a decision has been made to use **Random Forest Classifier** and **Support Vector Machine**.

Training **Random Forest Classifier**

In [None]:
def train_rf(x_train, y_train):
    rf = RandomForestClassifier(random_state=SEED)
    rf.fit(x_train, y_train)
    return rf

In [None]:
rf = train_rf(x_train_vectorized, y_train)

Training **Support Vector Machine**

In [None]:
def train_svm(x_train, y_train):
    svm = SVC(kernel='linear', random_state=SEED)
    svm.fit(x_train, y_train)
    return svm

Here we used linear kernel as it was giving the highest F1 Score, but we have to see as we go through the process.

In [None]:
svm = train_svm(x_train_vectorized, y_train)

**End-to-end Deep Learning Model**
All the parameters below were set to achieve the highest initial F1 score and will be changed if necessary.

In [None]:
# Define neural network classifier
class TextClassifier(nn.Module):
    def __init__(self, input_dim, output_dim):
        super(TextClassifier, self).__init__()
        self.fc1 = nn.Linear(input_dim, 1024)
        self.fc2 = nn.Linear(1024, 512)
        self.fc3 = nn.Linear(512, 256)
        self.fc4 = nn.Linear(256, output_dim)
        self.relu = nn.ReLU()
        self.dropout = nn.Dropout(0.2)

    def forward(self, x):
        x = self.dropout(self.relu(self.fc1(x)))
        x = self.dropout(self.relu(self.fc2(x)))
        x = self.dropout(self.relu(self.fc3(x)))
        x = self.fc4(x)
        return x

In [None]:
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
x_train_tensor = torch.tensor(x_train_vectorized.toarray(), dtype=torch.float32)
y_train_tensor = torch.tensor(y_train.values, dtype=torch.long)
x_valid_tensor = torch.tensor(x_valid_vectorized.toarray(), dtype=torch.float32)
y_valid_tensor = torch.tensor(y_valid.values, dtype=torch.long)

train_dataset = TensorDataset(x_train_tensor, y_train_tensor)
train_loader = DataLoader(train_dataset, batch_size=64, shuffle=True)

In [None]:
def train_nn(model, criterion, optimizer, train_loader, num_epochs):
    model.train()
    train_losses = []
    train_accuracies = []

    for epoch in range(1, num_epochs + 1):
        epoch_loss = 0.0
        epoch_true = 0
        epoch_total = 0

        for inputs, labels in train_loader:
            inputs, labels = inputs.to(device), labels.to(device)
            optimizer.zero_grad()
            outputs = model(inputs)
            loss = criterion(outputs, labels)
            loss.backward()
            optimizer.step()

            epoch_loss += loss.item()
            _, pred = torch.max(outputs, dim=1)
            epoch_true += torch.sum(pred == labels).item()
            epoch_total += labels.size(0)

        train_accuracy = 100 * epoch_true / epoch_total
        train_losses.append(epoch_loss)
        train_accuracies.append(train_accuracy)
        print(f"Epoch {epoch}/{num_epochs} finished: train_loss = {epoch_loss:.4f}, train_accuracy = {train_accuracy:.2f}%")

    return model

In [None]:
input_dim = x_train_vectorized.shape[1]
categories = 2
model = TextClassifier(input_dim, categories).to(device)
criterion = nn.CrossEntropyLoss()
optimizer = optim.Adam(model.parameters(), lr=0.001)
trained_model = train_nn(model, criterion, optimizer, train_loader, num_epochs=15)

In [None]:
# Evaluate models
def evaluate_model(model, x_train, y_train, x_valid, y_valid, model_name):
    y_train_pred = model.predict(x_train)
    train_accuracy = accuracy_score(y_train, y_train_pred)
    train_f1 = f1_score(y_train, y_train_pred)
    y_valid_pred = model.predict(x_valid)
    valid_accuracy = accuracy_score(y_valid, y_valid_pred)
    valid_f1 = f1_score(y_valid, y_valid_pred)
    print(f"{model_name} Performance:")
    print(f"Training Accuracy: {train_accuracy:.4f}, Training F1 Score: {train_f1:.4f}")
    print(f"Validation Accuracy: {valid_accuracy:.4f}, Validation F1 Score: {valid_f1:.4f}")
    print("Classification Report (Validation Set):")
    print(classification_report(y_valid, y_valid_pred))
    cnf_matrix = confusion_matrix(y_valid, y_valid_pred)
    disp = ConfusionMatrixDisplay(confusion_matrix=cnf_matrix)
    disp.plot(cmap='Blues_r')
    plt.title(f'Confusion Matrix ({model_name} - Validation Set)')
    plt.show()
    return train_accuracy, train_f1, valid_accuracy, valid_f1

In [None]:
# Evaluate standard Random Forest
rf_train_acc, rf_train_f1, rf_valid_acc, rf_valid_f1 = evaluate_model(rf, x_train_vectorized, y_train, x_valid_vectorized, y_valid, "Random Forest")

In [None]:
# Evaluate SVM
svm_train_acc, svm_train_f1, svm_valid_acc, svm_valid_f1 = evaluate_model(svm, x_train_vectorized, y_train, x_valid_vectorized, y_valid, "SVM")

In [None]:
# Evaluate neural network
def evaluate_nn(model, train_loader, x_valid_tensor, y_valid_tensor):
    model.eval()
    y_train_true = []
    y_train_pred = []
    for inputs, labels in train_loader:
        inputs, labels = inputs.to(device), labels.to(device)
        outputs = model(inputs)
        _, preds = torch.max(outputs, dim=1)
        y_train_true.extend(labels.cpu().numpy())
        y_train_pred.extend(preds.cpu().numpy())

    train_accuracy = accuracy_score(y_train_true, y_train_pred)
    train_f1 = f1_score(y_train_true, y_train_pred)

    with torch.no_grad():
        x_valid_tensor = x_valid_tensor.to(device)
        y_valid_tensor = y_valid_tensor.to(device)
        outputs = model(x_valid_tensor)
        _, y_valid_pred = torch.max(outputs, dim=1)

    valid_accuracy = accuracy_score(y_valid_tensor.cpu(), y_valid_pred.cpu())
    valid_f1 = f1_score(y_valid_tensor.cpu(), y_valid_pred.cpu())

    print(f"Neural Network Performance:")
    print(f"Training Accuracy: {train_accuracy:.4f}, Training F1 Score: {train_f1:.4f}")
    print(f"Validation Accuracy: {valid_accuracy:.4f}, Validation F1 Score: {valid_f1:.4f}")
    print("Classification Report (Validation Set):")
    print(classification_report(y_valid_tensor.cpu(), y_valid_pred.cpu()))
    cnf_matrix = confusion_matrix(y_valid_tensor.cpu(), y_valid_pred.cpu())
    disp = ConfusionMatrixDisplay(confusion_matrix=cnf_matrix)
    disp.plot(cmap='Blues_r')
    plt.title('Confusion Matrix (Neural Network - Validation Set)')
    plt.show()

    return train_accuracy, train_f1, valid_accuracy, valid_f1

In [None]:
# Evaluate Neural Network
nn_train_acc, nn_train_f1, nn_valid_acc, nn_valid_f1 = evaluate_nn(trained_model, train_loader, x_valid_tensor, y_valid_tensor)

From the above evaluation observations, Random Forest Classifier yielded the highest F1 score on validation data, i.e., 79.39%. SVM F1 score = 78.51% and Deep Learning Model = 75.60%

In [None]:
# Error analysis
def error_analysis(model, x_data, y_data, data_text, model_name):
    predictions = model.predict(x_data)
    errors = np.where(predictions != y_data)[0]
    print(f"Error Analysis for {model_name}: {len(errors)} misclassifications out of {len(y_data)} samples")

    if len(errors) > 5:
        sample_errors = np.random.choice(errors, 5, replace=False)
    else:
        sample_errors = errors

    for i in sample_errors:
        print(f"\nIndex: {i}, Predicted: {predictions[i]}, Actual: {y_data[i]}")
        print(f"Text: {data_text.iloc[i]}")
    return errors

In [None]:
rf_errors = error_analysis(rf, x_valid_vectorized, y_valid, valid['cleaned_text'], "Random Forest")

In [None]:
svm_errors = error_analysis(svm, x_valid_vectorized, y_valid, valid['cleaned_text'], "SVM")

In [None]:
def nn_error_analysis(model, loader, y_data, data_text, model_name):
    model.eval()
    predictions, actuals = [], []
    text_indices = []
    for batch, (inputs, labels) in enumerate(loader):
        inputs, labels = inputs.to(device), labels.to(device)
        outputs = model(inputs)
        _, preds = torch.max(outputs, dim=1)
        predictions.extend(preds.cpu().numpy())
        actuals.extend(labels.cpu().numpy())
        text_indices.extend(range(batch * loader.batch_size, batch * loader.batch_size + labels.size(0)))

    errors = np.where(np.array(predictions) != np.array(actuals))[0]
    print(f"Error Analysis for {model_name}: {len(errors)} misclassifications out of {len(actuals)} samples")

    if len(errors) > 5:
        sample_errors = np.random.choice(errors, 5, replace=False)
    else:
        sample_errors = errors

    for i in sample_errors:
        idx = text_indices[i]
        print(f"\nIndex: {idx}, Predicted: {predictions[i]}, Actual: {actuals[i]}")
        print(f"Text: {data_text.iloc[idx]}")
    return errors

In [None]:
# Prepare DataLoader for validation set
valid_dataset = TensorDataset(x_valid_tensor, y_valid_tensor)
valid_loader = DataLoader(valid_dataset, batch_size=64, shuffle=False)

In [None]:
# Perform error analysis on validation set for Neural Network
nn_errors_valid = nn_error_analysis(trained_model, valid_loader, y_valid, valid['cleaned_text'], "Neural Network - Validation")

From the above error analysis it is evident that all the models are misclassifying articles from STYLE more than articles from ENTERTAINMENT. This must be because of the underrepresentation of STYLE category in the original data.

To try and negate this issue of underrepresentation, a couple of approaches were adopted.

**Weighted Samples**

In [None]:
# Refined models with class_weight='balanced'
def refined_train_rf(x_train, y_train):
    refined_rf = RandomForestClassifier(n_estimators=1000, class_weight='balanced', random_state=SEED)
    refined_rf.fit(x_train, y_train)
    return refined_rf

In [None]:
refined_rf = refined_train_rf(x_train_vectorized, y_train)

In [None]:
def refined_train_svm(x_train, y_train):
    refined_svm = SVC(kernel='linear', class_weight='balanced', random_state=SEED)
    refined_svm.fit(x_train, y_train)
    return refined_svm

In [None]:
refined_svm = refined_train_svm(x_train_vectorized, y_train)

In [None]:
# Evaluate refined models
refined_rf_train_acc, refined_rf_train_f1, refined_rf_valid_acc, refined_rf_valid_f1 = evaluate_model(refined_rf, x_train_vectorized, y_train, x_valid_vectorized, y_valid, "Refined Random Forest")

In [None]:
refined_svm_train_acc, refined_svm_train_f1, refined_svm_valid_acc, refined_svm_valid_f1 = evaluate_model(refined_svm, x_train_vectorized, y_train, x_valid_vectorized, y_valid, "Refined SVM")

In [None]:
# Neural network with weighted sampler
from torch.utils.data import WeightedRandomSampler

def create_sampler(y_train):
    class_counts = np.bincount(y_train)
    class_weights = 1. / class_counts
    sample_weights = class_weights[y_train]
    sampler = WeightedRandomSampler(weights=sample_weights, num_samples=len(sample_weights), replacement=True)
    return sampler

train_sampler = create_sampler(y_train_tensor.numpy())
train_dataset = TensorDataset(x_train_tensor, y_train_tensor)
train_loader = DataLoader(train_dataset, batch_size=64, shuffle=False, sampler=train_sampler)

refined_model = TextClassifier(input_dim, categories).to(device)
optimizer = optim.Adam(refined_model.parameters(), lr=0.001)
refined_trained_model = train_nn(refined_model, criterion, optimizer, train_loader, num_epochs=10)

In [None]:
refined_nn_train_acc, refined_nn_train_f1, refined_nn_valid_acc, refined_nn_valid_f1 = evaluate_nn(refined_trained_model, train_loader, x_valid_tensor, y_valid_tensor)

Weighted samples did not prove to be useful as the F1 scores were worse compared to before. But the deep learning model's performance has improved noticeably from F1 Score 75.6% to 77.37%

Below is a combination of **Synthetic Minority Oversampling Technique (SMOTE)** and **Hyperparameter Tuning**

In [None]:
# SMOTE for imbalanced data
smote = SMOTE(random_state=SEED)
x_train_smote, y_train_smote = smote.fit_resample(x_train_vectorized, y_train)

In [None]:
param_grid_rf = {
    'n_estimators': [300, 500],
    'min_samples_split': [2, 5, 10],
    'class_weight': ['balanced']
}
rf = RandomForestClassifier(random_state=SEED)

grid_search_rf = GridSearchCV(estimator=rf, param_grid=param_grid_rf, cv=5, verbose=2, scoring='accuracy', n_jobs=-1)

refined_rf_smote = grid_search_rf.fit(x_train_smote, y_train_smote)

In [None]:
# Evaluate Random Forest with SMOTE
refined_rf_smote_train_acc, refined_rf_smote_train_f1, refined_rf_smote_valid_acc, refined_rf_smote_valid_f1 = evaluate_model(refined_rf_smote, x_train_smote, y_train_smote, x_valid_vectorized, y_valid, "Refined Random Forest with SMOTE")

In [None]:
param_grid_svm = {
    'C': [0.1, 1, 10],
    'kernel': ['linear', 'rbf', 'sigmoid'],
    'class_weight': ['balanced']
}

# Initialize the SVM classifier
svm = SVC(random_state=SEED)

# Create the GridSearchCV object for SVM
grid_search_svm = GridSearchCV(estimator=svm, param_grid=param_grid_svm, cv=5, verbose=2, scoring='accuracy', n_jobs=-1)

# Fit the grid search to the data
svm_smote = grid_search_svm.fit(x_train_smote, y_train_smote)

In [None]:
# Evaluate SVM with SMote
svm_smote_train_acc, svm_smote_train_f1, svm_smote_valid_acc, svm_smote_valid_f1 = evaluate_model(svm_smote, x_train_smote, y_train_smote, x_valid_vectorized, y_valid, "SVM with SMOTE")

In [None]:
# Convert SMOTE data to tensors for Neural Network training
x_train_smote_tensor = torch.tensor(x_train_smote.toarray(), dtype=torch.float32)
y_train_smote_tensor = torch.tensor(y_train_smote, dtype=torch.long)

# DataLoader with SMOTE data
train_dataset_smote = TensorDataset(x_train_smote_tensor, y_train_smote_tensor)
train_loader_smote = DataLoader(train_dataset_smote, batch_size=64, shuffle=True)

nn_smote_model = TextClassifier(input_dim=x_train_smote_tensor.shape[1], output_dim=len(np.unique(y_train_smote))).to(device)
optimizer = optim.Adam(nn_smote_model.parameters(), lr=0.001)
criterion = nn.CrossEntropyLoss()
trained_nn_smote = train_nn(nn_smote_model, criterion, optimizer, train_loader_smote, num_epochs=10)
smote_nn_train_acc, smote_nn_train_f1, smote_nn_valid_acc, smote_nn_valid_f1 = evaluate_nn(trained_nn_smote, train_loader, x_valid_tensor, y_valid_tensor)

Upon performing this, we can see that the F1 score increased for both the Random Forest Classifier and SVM. Now, the SVM turned out to be a better performer than the other two models with an F1 score of 78.36%

**Saving the models**

In [None]:
best_rf_model = grid_search_rf.best_estimator_

with open('best_rf_model.pkl', 'wb') as file:
    pickle.dump(best_rf_model, file)

In [None]:
best_svm_model = grid_search_svm.best_estimator_

with open('best_svm_model.pkl', 'wb') as file:
    pickle.dump(best_svm_model, file)

In [None]:
torch.save(nn_smote_model.state_dict(), 'nn_smote_model.pth')

In [None]:
with open('best_rf_model.pkl', 'rb') as file:
    loaded_rf_model = pickle.load(file)

with open('best_svm_model.pkl', 'rb') as file:
    loaded_svm_model = pickle.load(file)

**Combining the train and validation sets to perform cross validation on models**

In [None]:
# Merge train and validation datasets
train_val_combined = pd.concat([train_data, valid_data])

# Preprocess text data as previously done
X_train_val = tfidf_vectorizer.transform(train_val_combined['cleaned_text'])
y_train_val = train_val_combined['category']

# Perform cross-validation on the loaded RF and SVM models
rf_scores = cross_val_score(loaded_rf_model, X_train_val, y_train_val, cv=5, scoring='accuracy')
svm_scores = cross_val_score(loaded_svm_model, X_train_val, y_train_val, cv=5, scoring='accuracy')

print("Random Forest average cross-validation score:", np.mean(rf_scores))
print("SVM average cross-validation score:", np.mean(svm_scores))

In [None]:
model = TextClassifier(input_dim, categories)
model.load_state_dict(torch.load('nn_smote_model.pth'))
model.to(device)
model.eval()
X_train_val_vectorized = tfidf_vectorizer.transform(train_val_combined['cleaned_text'])
X_train_val_tensor = torch.tensor(X_train_val_vectorized.toarray(), dtype=torch.float32)
y_train_val_tensor = torch.tensor(y_train_val.values, dtype=torch.long)
full_dataset = TensorDataset(X_train_val_tensor, y_train_val_tensor)
full_loader = DataLoader(full_dataset, batch_size=64, shuffle=False)

*KFold Cross Validation for Deep Learning Model*

In [None]:
def evaluate_kfold_nn(model, test_loader):
    model.eval()
    total = 0
    correct = 0
    y_true = []
    y_pred = []
    with torch.no_grad():
        for inputs, labels in test_loader:
            inputs, labels = inputs.to(device), labels.to(device)
            outputs = model(inputs)
            _, predicted = torch.max(outputs.data, 1)
            total += labels.size(0)
            correct += (predicted == labels).sum().item()
            y_true.extend(labels.cpu().numpy())
            y_pred.extend(predicted.cpu().numpy())

    accuracy = correct / total
    f1 = f1_score(y_true, y_pred, average='weighted')
    return accuracy, f1

In [None]:
def cross_validate_nn(model, dataset, k_folds=5):
    kf = KFold(n_splits=k_folds, shuffle=True, random_state=SEED)
    accuracies = []
    f1_scores = []

    for train_idx, test_idx in kf.split(dataset):
        # Create data loaders for the current fold
        test_subsampler = torch.utils.data.SubsetRandomSampler(test_idx)
        test_loader = DataLoader(dataset, batch_size=64, sampler=test_subsampler)

        # Evaluate the model
        model.eval()  # Ensure the model is in evaluation mode
        accuracy, f1 = evaluate_kfold_nn(model, test_loader)  # Adjust 'evaluate_nn' to handle a single loader
        accuracies.append(accuracy)
        f1_scores.append(f1)

    print(f"Average Accuracy: {np.mean(accuracies)}")
    print(f"Average F1 Score: {np.mean(f1_scores)}")

# Execute the cross-validation
cross_validate_nn(model, full_dataset)

After merging the train and valid datasets and performing cross validation, the deep-learning model has a higher Average Cross Validation F1 score of 98.4%.

Taking the high CV F1 Score into consideration, the deep learning model was evaluated with the test dataset below.

In [None]:
model = TextClassifier(input_dim, categories)
model.load_state_dict(torch.load('nn_smote_model.pth'))
model.to(device)
model.eval()

In [None]:
X_test_vectorized = tfidf_vectorizer.transform(test_data['cleaned_text'])
X_test_tensor = torch.tensor(X_test_vectorized.toarray(), dtype=torch.float32)
y_test_tensor = torch.tensor(test_data['category'].values, dtype=torch.long)

# Create a DataLoader for the test set
test_dataset = TensorDataset(X_test_tensor, y_test_tensor)
test_loader = DataLoader(test_dataset, batch_size=64, shuffle=False)

In [None]:
def test_evaluate_nn(model, loader):
    model.eval()
    correct = 0
    total = 0
    y_true = []
    y_pred = []

    with torch.no_grad():
        for inputs, labels in loader:
            inputs, labels = inputs.to(device), labels.to(device)
            outputs = model(inputs)
            _, predicted = torch.max(outputs.data, 1)
            total += labels.size(0)
            correct += (predicted == labels).sum().item()
            y_true.extend(labels.cpu().numpy())
            y_pred.extend(predicted.cpu().numpy())

    accuracy = correct / total
    f1 = f1_score(y_true, y_pred, average='weighted')
    print(f"Accuracy: {accuracy:.4f}")
    print(f"F1 Score: {f1:.4f}")
    print("Classification Report:")
    print(classification_report(y_true, y_pred))
    return accuracy, f1

In [None]:
accuracy, f1 = test_evaluate_nn(model, test_loader)

We achieved an F1 score of 91.15% without retraining it with the combined dataset of train and valid.

**Training the model on Combined Dataset and evaluating its performance on Test data**

In [None]:
train_val_dataset = TensorDataset(X_train_val_tensor, y_train_val_tensor)
train_val_loader = DataLoader(train_val_dataset, batch_size=64, shuffle=True)
model = TextClassifier(input_dim, categories)
model.to(device)
model.train()
criterion = nn.CrossEntropyLoss()
optimizer = optim.Adam(model.parameters(), lr=0.001)
train_nn(model, criterion, optimizer, train_val_loader, num_epochs=15)

In [None]:
accuracy, f1 = test_evaluate_nn(model, test_loader)

The final F1-score also improved from **91.15%** to **91.67%** compared to the model without training on the combined dataset.