In [None]:
# Colab-only setup: attach Google Drive and expose the notebook folder to `import`.
import sys

from google.colab import drive

# Mount point under which the Drive contents appear inside the Colab VM.
drive.mount('/content/drive')

# Let Python resolve modules stored alongside the notebooks.
sys.path.append('/content/drive/MyDrive/Colab_Notebooks')

Mounted at /content/drive


# **Task 1.1 - TF-IDF + ANN**

# Small dataset

In [None]:
import torch
import torch.nn as nn
import torch.optim as optim
import torch.nn.functional as F
from torch.utils.data import TensorDataset, DataLoader
import numpy as np
from matplotlib import pyplot as plt
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
import nltk

# Download the NLTK resources used below: the 'punkt_tab' tokenizer data and
# the stop-word corpus. Each resource only needs to be requested once.
nltk.download('punkt_tab')
nltk.download('stopwords')
from nltk.corpus import stopwords
from nltk import word_tokenize
from sklearn.metrics import accuracy_score, confusion_matrix, precision_score, recall_score, classification_report

def preprocess_pandas(data, columns):
    """Clean the review text in ``data`` and return it as a new DataFrame.

    The 'Sentence' column is lower-cased, stripped of e-mail addresses,
    dotted number groups (IP addresses), punctuation and digits, then
    tokenised with NLTK and filtered against the English stop-word list.

    Args:
        data: DataFrame providing the 'Sentence', 'Class' and 'index' columns.
            It is read only; the cleaned text is written to the result.
        columns: Column names (and their order) of the returned DataFrame.

    Returns:
        A new DataFrame with one row per input row, carrying the original
        'index' and 'Class' values and the cleaned, space-joined 'Sentence'.
    """
    # Work on a derived Series so the caller's frame is not modified.
    # Raw strings keep the regex escapes (\. \w \d) from being read as
    # (invalid) Python string escapes.
    sentences = data['Sentence'].str.lower()
    sentences = sentences.replace(r'[a-zA-Z0-9-_.]+@[a-zA-Z0-9-_.]+', '', regex=True)                      # remove emails
    sentences = sentences.replace(r'((25[0-5]|2[0-4][0-9]|[01]?[0-9][0-9]?)(\.|$)){4}', '', regex=True)    # remove IP addresses
    sentences = sentences.str.replace(r'[^\w\s]', '', regex=True)                                          # remove special characters
    sentences = sentences.replace(r'\d', '', regex=True)                                                   # remove numbers

    # Build the stop-word set once; looking it up per token re-reads the
    # corpus list and makes every membership test a linear scan.
    english_stopwords = frozenset(stopwords.words('english'))

    # Collect rows in a list and build the frame in one go; growing a
    # DataFrame row by row with .loc is quadratic in the number of rows.
    records = []
    for row_index, label, sentence in zip(data['index'], data['Class'], sentences):
        kept_tokens = [w for w in word_tokenize(sentence) if w not in english_stopwords]
        records.append({
            "index": row_index,
            "Class": label,
            "Sentence": " ".join(kept_tokens),
        })
    return pd.DataFrame(records, columns=columns)

if __name__ == "__main__":
    # Load the tab-separated review file (no header row) and name its columns.
    data = pd.read_csv("/content/drive/MyDrive/Colab_Notebooks/amazon_cells_labelled.txt",
                       delimiter='\t', header=None)
    data.columns = ['Sentence', 'Class']
    data['index'] = data.index  # keep the original row number as a column
    columns = ['index', 'Class', 'Sentence']

    # Clean the raw text (see preprocess_pandas).
    data_preprocessed = preprocess_pandas(data, columns)

    # First split: 80% training, 20% held out.
    train_sentences, temp_sentences, train_labels, temp_labels = train_test_split(
        data_preprocessed['Sentence'].values.astype('U'),
        data_preprocessed['Class'].values.astype('int32'),
        test_size=0.2,
        random_state=0,
        shuffle=True
    )

    # Second split: halve the held-out part into validation and test
    # (10% of the full dataset each).
    validation_sentences, test_sentences, validation_labels, test_labels = train_test_split(
        temp_sentences,
        temp_labels,
        test_size=0.5,
        random_state=0,
        shuffle=True
    )

    # TF-IDF over unigrams and bigrams. The vocabulary and IDF weights are
    # fitted on the training sentences only, so nothing leaks from the
    # validation/test sets.
    word_vectorizer = TfidfVectorizer(
        analyzer='word',
        ngram_range=(1, 2),
        max_features=30000,
        max_df=0.5,
        use_idf=True,
        norm='l2'
    )

    # Cast to float32 while the matrix is still sparse, then densify with
    # .toarray(), which returns a plain ndarray. .todense() would return the
    # legacy np.matrix type and force an extra full-size copy.
    train_data = word_vectorizer.fit_transform(train_sentences).astype(np.float32).toarray()
    validation_data = word_vectorizer.transform(validation_sentences).astype(np.float32).toarray()
    test_data = word_vectorizer.transform(test_sentences).astype(np.float32).toarray()

    # Features become float32 tensors; labels become int64 (long) tensors,
    # the target dtype nn.CrossEntropyLoss expects for class indices.
    train_x_tensor = torch.from_numpy(train_data)
    train_y_tensor = torch.as_tensor(train_labels, dtype=torch.long)

    validation_x_tensor = torch.from_numpy(validation_data)
    validation_y_tensor = torch.as_tensor(validation_labels, dtype=torch.long)

    test_x_tensor = torch.from_numpy(test_data)
    test_y_tensor = torch.as_tensor(test_labels, dtype=torch.long)

    # Wrap the tensors for mini-batch iteration.
    batch_size = 32

    train_dataset = TensorDataset(train_x_tensor, train_y_tensor)
    validation_dataset = TensorDataset(validation_x_tensor, validation_y_tensor)
    test_dataset = TensorDataset(test_x_tensor, test_y_tensor)

    # Only the training data is reshuffled every epoch.
    train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)
    validation_loader = DataLoader(validation_dataset, batch_size=batch_size, shuffle=False)
    test_loader = DataLoader(test_dataset, batch_size=batch_size, shuffle=False)

[nltk_data] Downloading package punkt_tab to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt_tab.zip.
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.
[nltk_data] Downloading package punkt_tab to /root/nltk_data...
[nltk_data]   Package punkt_tab is already up-to-date!


In [None]:
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import TensorDataset, DataLoader

# Run on CUDA when PyTorch can see a GPU; otherwise stay on the CPU.
_backend_name = "cuda" if torch.cuda.is_available() else "cpu"
device = torch.device(_backend_name)
print(f"Using device: {device}")

# Feed-forward classifier with a single hidden layer
class SimpleANN(nn.Module):
    """Two-layer perceptron: Linear -> ReLU -> Linear.

    Args:
        input_dim: Number of input features per sample.
        hidden_dim: Width of the hidden layer.
        output_dim: Number of output logits (one per class).
    """

    def __init__(self, input_dim, hidden_dim, output_dim):
        super().__init__()
        self.fc1 = nn.Linear(input_dim, hidden_dim)
        self.relu = nn.ReLU()
        self.fc2 = nn.Linear(hidden_dim, output_dim)

    def forward(self, x):
        """Return the raw (unnormalised) class logits for the batch ``x``."""
        hidden = self.relu(self.fc1(x))
        return self.fc2(hidden)

# Set model parameters
input_dim = train_x_tensor.shape[1]  # number of TF-IDF features
hidden_dim = 20                      # experiment with this value as needed
output_dim = 2                       # 0 (negative) or 1 (positive)

# Instantiate the model, loss function, and optimizer, and move the model to the device
model = SimpleANN(input_dim, hidden_dim, output_dim).to(device)
criterion = nn.CrossEntropyLoss()  # takes raw logits and integer class labels
optimizer = optim.Adam(model.parameters(), lr=0.001)

# Early Stopping Parameters
early_stopping_patience = 3  # number of epochs with no improvement after which training will be stopped
best_val_loss = float('inf')
epochs_no_improve = 0

# Train the model
num_epochs = 50  # upper bound; early stopping usually ends training sooner
for epoch in range(num_epochs):
    model.train()
    epoch_train_loss = 0.0
    total_train = 0
    for batch_x, batch_y in train_loader:
        # Move the batch to the same device as the model
        batch_x = batch_x.to(device)
        batch_y = batch_y.to(device)

        optimizer.zero_grad()
        outputs = model(batch_x)
        loss = criterion(outputs, batch_y)
        loss.backward()
        optimizer.step()

        # The criterion returns the mean over the batch. Weight it by the
        # batch size so a short final batch does not skew the epoch average.
        epoch_train_loss += loss.item() * batch_y.size(0)
        total_train += batch_y.size(0)

    avg_train_loss = epoch_train_loss / total_train

    # Validation phase (monitoring performance; no gradient tracking)
    model.eval()
    epoch_val_loss = 0.0
    correct_val = 0
    total_val = 0
    with torch.no_grad():
        for val_x, val_y in validation_loader:
            val_x = val_x.to(device)
            val_y = val_y.to(device)

            outputs = model(val_x)
            loss = criterion(outputs, val_y)
            epoch_val_loss += loss.item() * val_y.size(0)
            predicted = outputs.argmax(dim=1)  # index of the largest logit
            total_val += val_y.size(0)
            correct_val += (predicted == val_y).sum().item()

    avg_val_loss = epoch_val_loss / total_val
    val_accuracy = 100 * correct_val / total_val

    print(f'Epoch [{epoch+1}/{num_epochs}] -> '
          f'Train Loss: {avg_train_loss:.4f} | Val Loss: {avg_val_loss:.4f} | '
          f'Val Accuracy: {val_accuracy:.2f}%')

    # Early Stopping and Checkpointing:
    if avg_val_loss < best_val_loss:
        best_val_loss = avg_val_loss
        epochs_no_improve = 0
        # Save model checkpoint when validation loss improves
        torch.save(model.state_dict(), 'best_model.pt')
        print("Model improved. Checkpoint saved.")
    else:
        epochs_no_improve += 1
        if epochs_no_improve >= early_stopping_patience:
            print("Early stopping triggered: Validation loss has not improved for "
                  f"{early_stopping_patience} consecutive epochs.")
            break  # Stop training if no improvement

# Load the best model checkpoint for testing. map_location keeps the load
# working even if the checkpoint was written from a different device.
model.load_state_dict(torch.load('best_model.pt', map_location=device))
model.eval()
correct_test = 0
total_test = 0
with torch.no_grad():
    for test_x, test_y in test_loader:
        test_x = test_x.to(device)
        test_y = test_y.to(device)

        outputs = model(test_x)
        predicted = outputs.argmax(dim=1)
        total_test += test_y.size(0)
        correct_test += (predicted == test_y).sum().item()

test_accuracy = 100 * correct_test / total_test
print(f'Test Accuracy: {test_accuracy:.2f}%')



Using device: cpu
Epoch [1/50] -> Train Loss: 0.6936 | Val Loss: 0.6972 | Val Accuracy: 46.00%
Model improved. Checkpoint saved.
Epoch [2/50] -> Train Loss: 0.6808 | Val Loss: 0.6828 | Val Accuracy: 59.00%
Model improved. Checkpoint saved.
Epoch [3/50] -> Train Loss: 0.6494 | Val Loss: 0.6556 | Val Accuracy: 83.00%
Model improved. Checkpoint saved.
Epoch [4/50] -> Train Loss: 0.5916 | Val Loss: 0.6190 | Val Accuracy: 77.00%
Model improved. Checkpoint saved.
Epoch [5/50] -> Train Loss: 0.5125 | Val Loss: 0.5739 | Val Accuracy: 78.00%
Model improved. Checkpoint saved.
Epoch [6/50] -> Train Loss: 0.4229 | Val Loss: 0.5324 | Val Accuracy: 80.00%
Model improved. Checkpoint saved.
Epoch [7/50] -> Train Loss: 0.3370 | Val Loss: 0.4935 | Val Accuracy: 79.00%
Model improved. Checkpoint saved.
Epoch [8/50] -> Train Loss: 0.2636 | Val Loss: 0.4612 | Val Accuracy: 82.00%
Model improved. Checkpoint saved.
Epoch [9/50] -> Train Loss: 0.2059 | Val Loss: 0.4385 | Val Accuracy: 81.00%
Model improved. C

In [None]:
# Explicit test: examine a single review and its prediction

# Define a sample review (or use one from your dataset)
sample_review = "This product exceeded my expectations and works great!"

# Run the sample through the SAME cleaning pipeline as the training data
# (lower-casing, punctuation/digit removal, stop-word filtering). Lower-casing
# alone leaves stop words in the text, so its bigrams would not line up with
# the vocabulary the vectorizer was fitted on.
# 'index' and 'Class' are placeholders required by preprocess_pandas.
sample_frame = pd.DataFrame({'index': [0], 'Class': [0], 'Sentence': [sample_review]})
sample_review_clean = preprocess_pandas(
    sample_frame, ['index', 'Class', 'Sentence'])['Sentence'].iloc[0]

# Vectorize the sample using the already-fitted TF-IDF vectorizer
sample_vector = word_vectorizer.transform([sample_review_clean]).toarray()

# Convert the vector to a float32 tensor on the same device as the model
sample_tensor = torch.from_numpy(sample_vector).float().to(device)

# Set the model to evaluation mode and compute the prediction
model.eval()
with torch.no_grad():
    output = model(sample_tensor)
    predicted_label = output.argmax(dim=1)

# Print the input and its predicted class
print("Input review:", sample_review)
print("Predicted class:", predicted_label.item())



Input review: This product exceeded my expectations and works great!
Predicted class: 1


In [None]:
# Explicit test: examine a single review and its prediction

# Define a sample review (or use one from your dataset)
sample_review = "I completely detest this product, despite its cheap price."

# Run the sample through the SAME cleaning pipeline as the training data
# (lower-casing, punctuation/digit removal, stop-word filtering). Lower-casing
# alone leaves stop words in the text, so its bigrams would not line up with
# the vocabulary the vectorizer was fitted on.
# 'index' and 'Class' are placeholders required by preprocess_pandas.
sample_frame = pd.DataFrame({'index': [0], 'Class': [0], 'Sentence': [sample_review]})
sample_review_clean = preprocess_pandas(
    sample_frame, ['index', 'Class', 'Sentence'])['Sentence'].iloc[0]

# Vectorize the sample using the already-fitted TF-IDF vectorizer
sample_vector = word_vectorizer.transform([sample_review_clean]).toarray()

# Convert the vector to a float32 tensor on the same device as the model
sample_tensor = torch.from_numpy(sample_vector).float().to(device)

# Set the model to evaluation mode and compute the prediction
model.eval()
with torch.no_grad():
    output = model(sample_tensor)
    predicted_label = output.argmax(dim=1)

# Print the input and its predicted class
print("Input review:", sample_review)
print("Predicted class:", predicted_label.item())


Input review: I completely detest this product, despite its cheap price.
Predicted class: 0


# Large dataset

In [None]:
import torch
import torch.nn as nn
import torch.optim as optim
import torch.nn.functional as F
from torch.utils.data import TensorDataset, DataLoader
import numpy as np
from matplotlib import pyplot as plt
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
import nltk

# Download the NLTK resources used below: the 'punkt_tab' tokenizer data and
# the stop-word corpus. Each resource only needs to be requested once.
nltk.download('punkt_tab')
nltk.download('stopwords')
from nltk.corpus import stopwords
from nltk import word_tokenize
from sklearn.metrics import accuracy_score, confusion_matrix, precision_score, recall_score, classification_report

def preprocess_pandas(data, columns):
    """Clean the review text in ``data`` and return it as a new DataFrame.

    The 'Sentence' column is lower-cased, stripped of e-mail addresses,
    dotted number groups (IP addresses), punctuation and digits, then
    tokenised with NLTK and filtered against the English stop-word list.

    Args:
        data: DataFrame providing the 'Sentence', 'Class' and 'index' columns.
            It is read only; the cleaned text is written to the result.
        columns: Column names (and their order) of the returned DataFrame.

    Returns:
        A new DataFrame with one row per input row, carrying the original
        'index' and 'Class' values and the cleaned, space-joined 'Sentence'.
    """
    # Work on a derived Series so the caller's frame is not modified.
    # Raw strings keep the regex escapes (\. \w \d) from being read as
    # (invalid) Python string escapes.
    sentences = data['Sentence'].str.lower()
    sentences = sentences.replace(r'[a-zA-Z0-9-_.]+@[a-zA-Z0-9-_.]+', '', regex=True)                      # remove emails
    sentences = sentences.replace(r'((25[0-5]|2[0-4][0-9]|[01]?[0-9][0-9]?)(\.|$)){4}', '', regex=True)    # remove IP addresses
    sentences = sentences.str.replace(r'[^\w\s]', '', regex=True)                                          # remove special characters
    sentences = sentences.replace(r'\d', '', regex=True)                                                   # remove numbers

    # Build the stop-word set once; looking it up per token re-reads the
    # corpus list and makes every membership test a linear scan. On the
    # 25K-row dataset this dominates the running time.
    english_stopwords = frozenset(stopwords.words('english'))

    # Collect rows in a list and build the frame in one go; growing a
    # DataFrame row by row with .loc is quadratic in the number of rows.
    records = []
    for row_index, label, sentence in zip(data['index'], data['Class'], sentences):
        kept_tokens = [w for w in word_tokenize(sentence) if w not in english_stopwords]
        records.append({
            "index": row_index,
            "Class": label,
            "Sentence": " ".join(kept_tokens),
        })
    return pd.DataFrame(records, columns=columns)

if __name__ == "__main__":
    # Load the tab-separated review file (no header row) and name its columns.
    data = pd.read_csv("/content/drive/MyDrive/Colab_Notebooks/amazon_cells_labelled_LARGE_25K.txt",
                       delimiter='\t', header=None)
    data.columns = ['Sentence', 'Class']
    data['index'] = data.index  # keep the original row number as a column
    columns = ['index', 'Class', 'Sentence']

    # Clean the raw text (see preprocess_pandas).
    data_preprocessed = preprocess_pandas(data, columns)

    # First split: 60% training, 40% held out.
    train_sentences, temp_sentences, train_labels, temp_labels = train_test_split(
        data_preprocessed['Sentence'].values.astype('U'),
        data_preprocessed['Class'].values.astype('int32'),
        test_size=0.4,
        random_state=0,
        shuffle=True
    )

    # Second split: halve the held-out part into validation and test
    # (20% of the full dataset each).
    validation_sentences, test_sentences, validation_labels, test_labels = train_test_split(
        temp_sentences,
        temp_labels,
        test_size=0.5,
        random_state=0,
        shuffle=True
    )

    # TF-IDF over unigrams and bigrams. The vocabulary and IDF weights are
    # fitted on the training sentences only, so nothing leaks from the
    # validation/test sets.
    word_vectorizer = TfidfVectorizer(
        analyzer='word',
        ngram_range=(1, 2),
        max_features=30000,
        max_df=0.5,
        use_idf=True,
        norm='l2'
    )

    # Cast to float32 while the matrix is still sparse, then densify with
    # .toarray(), which returns a plain ndarray. .todense() would return the
    # legacy np.matrix type in float64 and force an extra full-size copy,
    # which is expensive for a matrix with up to 30000 columns.
    train_data = word_vectorizer.fit_transform(train_sentences).astype(np.float32).toarray()
    validation_data = word_vectorizer.transform(validation_sentences).astype(np.float32).toarray()
    test_data = word_vectorizer.transform(test_sentences).astype(np.float32).toarray()

    # Features become float32 tensors; labels become int64 (long) tensors,
    # the target dtype nn.CrossEntropyLoss expects for class indices.
    train_x_tensor = torch.from_numpy(train_data)
    train_y_tensor = torch.as_tensor(train_labels, dtype=torch.long)

    validation_x_tensor = torch.from_numpy(validation_data)
    validation_y_tensor = torch.as_tensor(validation_labels, dtype=torch.long)

    test_x_tensor = torch.from_numpy(test_data)
    test_y_tensor = torch.as_tensor(test_labels, dtype=torch.long)

    # Wrap the tensors for mini-batch iteration.
    batch_size = 32

    train_dataset = TensorDataset(train_x_tensor, train_y_tensor)
    validation_dataset = TensorDataset(validation_x_tensor, validation_y_tensor)
    test_dataset = TensorDataset(test_x_tensor, test_y_tensor)

    # Only the training data is reshuffled every epoch.
    train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)
    validation_loader = DataLoader(validation_dataset, batch_size=batch_size, shuffle=False)
    test_loader = DataLoader(test_dataset, batch_size=batch_size, shuffle=False)

[nltk_data] Downloading package punkt_tab to /root/nltk_data...
[nltk_data]   Package punkt_tab is already up-to-date!
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt_tab to /root/nltk_data...
[nltk_data]   Package punkt_tab is already up-to-date!


In [None]:
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import TensorDataset, DataLoader

# Run on CUDA when PyTorch can see a GPU; otherwise stay on the CPU.
_backend_name = "cuda" if torch.cuda.is_available() else "cpu"
device = torch.device(_backend_name)
print(f"Using device: {device}")

class SimpleANN(nn.Module):
    """One-hidden-layer classifier with batch normalisation and dropout.

    Layer order: Linear -> BatchNorm1d -> ReLU -> Dropout -> Linear.

    Args:
        input_dim: Number of input features per sample.
        hidden_dim: Width of the hidden layer.
        output_dim: Number of output logits (one per class).
        dropout_rate: Probability of zeroing a hidden activation in training.
    """

    def __init__(self, input_dim, hidden_dim, output_dim, dropout_rate=0.5):
        super().__init__()
        self.fc1 = nn.Linear(input_dim, hidden_dim)
        self.bn1 = nn.BatchNorm1d(hidden_dim)
        self.relu = nn.ReLU()
        self.dropout = nn.Dropout(dropout_rate)
        self.fc2 = nn.Linear(hidden_dim, output_dim)

    def forward(self, x):
        """Return the raw (unnormalised) class logits for the batch ``x``."""
        normalised = self.bn1(self.fc1(x))
        activated = self.dropout(self.relu(normalised))
        return self.fc2(activated)

# Size the network from the TF-IDF feature matrix (train_x_tensor must already be defined)
input_dim = train_x_tensor.shape[1]
hidden_dim = 20
output_dim = 2

model = SimpleANN(input_dim, hidden_dim, output_dim, dropout_rate=0.5).to(device)
criterion = nn.CrossEntropyLoss()  # takes raw logits and integer class labels
optimizer = optim.Adam(model.parameters(), lr=0.001, weight_decay=1e-4)
# Halve the learning rate when the validation loss plateaus. The `verbose`
# flag is deprecated in recent PyTorch releases, so learning-rate changes are
# reported manually inside the training loop instead.
# NOTE(review): with scheduler patience=2 and early_stopping_patience=3 the
# first LR reduction tends to coincide with the early stop; consider raising
# early_stopping_patience if the reduced rate should get a chance to help.
scheduler = optim.lr_scheduler.ReduceLROnPlateau(optimizer, mode='min', factor=0.5, patience=2)


# Early Stopping Parameters
early_stopping_patience = 3  # number of epochs with no improvement after which training will be stopped
best_val_loss = float('inf')
epochs_no_improve = 0

# Train the model
num_epochs = 100  # upper bound; early stopping usually ends training sooner
for epoch in range(num_epochs):
    model.train()
    epoch_train_loss = 0.0
    total_train = 0
    for batch_x, batch_y in train_loader:
        # Move the batch to the same device as the model
        batch_x = batch_x.to(device)
        batch_y = batch_y.to(device)

        optimizer.zero_grad()
        outputs = model(batch_x)
        loss = criterion(outputs, batch_y)
        loss.backward()
        optimizer.step()

        # The criterion returns the mean over the batch. Weight it by the
        # batch size so a short final batch does not skew the epoch average.
        epoch_train_loss += loss.item() * batch_y.size(0)
        total_train += batch_y.size(0)

    avg_train_loss = epoch_train_loss / total_train

    # Validation phase (monitoring performance; no gradient tracking)
    model.eval()
    epoch_val_loss = 0.0
    correct_val = 0
    total_val = 0
    with torch.no_grad():
        for val_x, val_y in validation_loader:
            val_x = val_x.to(device)
            val_y = val_y.to(device)

            outputs = model(val_x)
            loss = criterion(outputs, val_y)
            epoch_val_loss += loss.item() * val_y.size(0)
            predicted = outputs.argmax(dim=1)  # index of the largest logit
            total_val += val_y.size(0)
            correct_val += (predicted == val_y).sum().item()

    avg_val_loss = epoch_val_loss / total_val
    val_accuracy = 100 * correct_val / total_val

    print(f'Epoch [{epoch+1}/{num_epochs}] -> '
          f'Train Loss: {avg_train_loss:.4f} | Val Loss: {avg_val_loss:.4f} | '
          f'Val Accuracy: {val_accuracy:.2f}%')

    # Feed the monitored metric to the scheduler; without this call the
    # scheduler never changes the learning rate.
    previous_lr = optimizer.param_groups[0]['lr']
    scheduler.step(avg_val_loss)
    current_lr = optimizer.param_groups[0]['lr']
    if current_lr < previous_lr:
        print(f"Learning rate reduced from {previous_lr:.2e} to {current_lr:.2e}.")

    # Early Stopping and Checkpointing:
    if avg_val_loss < best_val_loss:
        best_val_loss = avg_val_loss
        epochs_no_improve = 0
        # Save model checkpoint when validation loss improves
        torch.save(model.state_dict(), 'best_model.pt')
        print("Model improved. Checkpoint saved.")
    else:
        epochs_no_improve += 1
        if epochs_no_improve >= early_stopping_patience:
            print("Early stopping triggered: Validation loss has not improved for "
                  f"{early_stopping_patience} consecutive epochs.")
            break  # Stop training if no improvement

# Load the best model checkpoint for testing. map_location keeps the load
# working even if the checkpoint was written from a different device.
model.load_state_dict(torch.load('best_model.pt', map_location=device))
model.eval()
correct_test = 0
total_test = 0
with torch.no_grad():
    for test_x, test_y in test_loader:
        test_x = test_x.to(device)
        test_y = test_y.to(device)

        outputs = model(test_x)
        predicted = outputs.argmax(dim=1)
        total_test += test_y.size(0)
        correct_test += (predicted == test_y).sum().item()

test_accuracy = 100 * correct_test / total_test
print(f'Test Accuracy: {test_accuracy:.2f}%')



Using device: cpu




Epoch [1/100] -> Train Loss: 0.4540 | Val Loss: 0.3349 | Val Accuracy: 84.94%
Model improved. Checkpoint saved.
Epoch [2/100] -> Train Loss: 0.2775 | Val Loss: 0.3229 | Val Accuracy: 85.76%
Model improved. Checkpoint saved.
Epoch [3/100] -> Train Loss: 0.1842 | Val Loss: 0.3535 | Val Accuracy: 84.98%
Epoch [4/100] -> Train Loss: 0.1395 | Val Loss: 0.3845 | Val Accuracy: 84.88%
Epoch [5/100] -> Train Loss: 0.1119 | Val Loss: 0.4207 | Val Accuracy: 85.14%
Early stopping triggered: Validation loss has not improved for 3 consecutive epochs.
Test Accuracy: 85.44%


In [None]:
# Explicit test: examine a single review and its prediction

# Define a sample review (or use one from your dataset)
sample_review = "This product exceeded my expectations and works great!"

# Run the sample through the SAME cleaning pipeline as the training data
# (lower-casing, punctuation/digit removal, stop-word filtering). Lower-casing
# alone leaves stop words in the text, so its bigrams would not line up with
# the vocabulary the vectorizer was fitted on.
# 'index' and 'Class' are placeholders required by preprocess_pandas.
sample_frame = pd.DataFrame({'index': [0], 'Class': [0], 'Sentence': [sample_review]})
sample_review_clean = preprocess_pandas(
    sample_frame, ['index', 'Class', 'Sentence'])['Sentence'].iloc[0]

# Vectorize the sample using the already-fitted TF-IDF vectorizer
sample_vector = word_vectorizer.transform([sample_review_clean]).toarray()

# Convert the vector to a float32 tensor on the same device as the model
sample_tensor = torch.from_numpy(sample_vector).float().to(device)

# Set the model to evaluation mode and compute the prediction
model.eval()
with torch.no_grad():
    output = model(sample_tensor)
    predicted_label = output.argmax(dim=1)

# Print the input and its predicted class
print("Input review:", sample_review)
print("Predicted class:", predicted_label.item())



Input review: This product exceeded my expectations and works great!
Predicted class: 1


In [None]:
# Explicit test: examine a single review and its prediction

# Define a sample review (or use one from your dataset)
sample_review = "I completely detest this product, despite its cheap price."

# Run the sample through the SAME cleaning pipeline as the training data
# (lower-casing, punctuation/digit removal, stop-word filtering). Lower-casing
# alone leaves stop words in the text, so its bigrams would not line up with
# the vocabulary the vectorizer was fitted on.
# 'index' and 'Class' are placeholders required by preprocess_pandas.
sample_frame = pd.DataFrame({'index': [0], 'Class': [0], 'Sentence': [sample_review]})
sample_review_clean = preprocess_pandas(
    sample_frame, ['index', 'Class', 'Sentence'])['Sentence'].iloc[0]

# Vectorize the sample using the already-fitted TF-IDF vectorizer
sample_vector = word_vectorizer.transform([sample_review_clean]).toarray()

# Convert the vector to a float32 tensor on the same device as the model
sample_tensor = torch.from_numpy(sample_vector).float().to(device)

# Set the model to evaluation mode and compute the prediction
model.eval()
with torch.no_grad():
    output = model(sample_tensor)
    predicted_label = output.argmax(dim=1)

# Print the input and its predicted class
print("Input review:", sample_review)
print("Predicted class:", predicted_label.item())


Input review: I completely detest this product, despite its cheap price.
Predicted class: 0


# **Task 1.2 - Transformer implementation**

# Small dataset

In [None]:
import torch
from torch.utils.data import TensorDataset, DataLoader
import pandas as pd
from sklearn.model_selection import train_test_split
from transformers import AutoTokenizer
import nltk
nltk.download('punkt')
nltk.download('stopwords')
nltk.download('punkt_tab')
from nltk import word_tokenize
from nltk.corpus import stopwords

# Preprocessing function (adjust as needed)
def preprocess_pandas(data, columns):
    """Clean the review text in ``data`` and return it as a new DataFrame.

    The 'Sentence' column is lower-cased, stripped of e-mail addresses,
    dotted number groups (IP addresses), punctuation and digits, then
    tokenised with NLTK and filtered against the English stop-word list.

    Args:
        data: DataFrame providing the 'Sentence', 'Class' and 'index' columns.
            It is read only; the cleaned text is written to the result.
        columns: Column names (and their order) of the returned DataFrame.

    Returns:
        A new DataFrame with one row per input row, carrying the original
        'index' and 'Class' values and the cleaned, space-joined 'Sentence'.
    """
    # Work on a derived Series so the caller's frame is not modified.
    # Raw strings keep the regex escapes (\. \w \d) from being read as
    # (invalid) Python string escapes.
    sentences = data['Sentence'].str.lower()
    sentences = sentences.replace(r'[a-zA-Z0-9-_.]+@[a-zA-Z0-9-_.]+', '', regex=True)                      # remove emails
    sentences = sentences.replace(r'((25[0-5]|2[0-4][0-9]|[01]?[0-9][0-9]?)(\.|$)){4}', '', regex=True)    # remove IP addresses
    sentences = sentences.str.replace(r'[^\w\s]', '', regex=True)                                          # remove special characters
    sentences = sentences.replace(r'\d', '', regex=True)                                                   # remove numbers

    # Build the stop-word set once; looking it up per token re-reads the
    # corpus list and makes every membership test a linear scan.
    english_stopwords = frozenset(stopwords.words('english'))

    # Collect rows in a list and build the frame in one go; growing a
    # DataFrame row by row with .loc is quadratic in the number of rows.
    records = []
    for row_index, label, sentence in zip(data['index'], data['Class'], sentences):
        kept_tokens = [w for w in word_tokenize(sentence) if w not in english_stopwords]
        records.append({
            "index": row_index,
            "Class": label,
            "Sentence": " ".join(kept_tokens),
        })
    return pd.DataFrame(records, columns=columns)

# Read the raw reviews (tab-separated, no header row) and clean the text.
columns = ['index', 'Class', 'Sentence']
data = pd.read_csv(
    "/content/drive/MyDrive/Colab_Notebooks/amazon_cells_labelled.txt",
    delimiter='\t',
    header=None,
)
data.columns = ['Sentence', 'Class']
data['index'] = data.index
data = preprocess_pandas(data, columns)

# Hold out 20% of the rows; the remaining 80% is the training set.
train_texts, temp_texts, train_labels, temp_labels = train_test_split(
    data['Sentence'].values.astype('U'),
    data['Class'].values.astype('int32'),
    test_size=0.20,
    random_state=0,
    shuffle=True,
)

# Halve the held-out rows: 10% validation and 10% test overall.
val_texts, test_texts, val_labels, test_labels = train_test_split(
    temp_texts,
    temp_labels,
    test_size=0.5,
    random_state=0,
    shuffle=True,
)

# Sub-word tokenizer matching the pretrained BERT checkpoint.
tokenizer = AutoTokenizer.from_pretrained('bert-base-uncased')

# Encode each split separately: pad within the split, truncate over-long
# inputs and return PyTorch tensors.
encoded_train = tokenizer(
    list(train_texts), padding=True, truncation=True, return_tensors='pt'
)
encoded_val = tokenizer(
    list(val_texts), padding=True, truncation=True, return_tensors='pt'
)
encoded_test = tokenizer(
    list(test_texts), padding=True, truncation=True, return_tensors='pt'
)

# Class labels as int64 tensors.
train_y_tensor = torch.tensor(train_labels, dtype=torch.long)
val_y_tensor = torch.tensor(val_labels, dtype=torch.long)
test_y_tensor = torch.tensor(test_labels, dtype=torch.long)

# Each dataset item is (input_ids, attention_mask, label).
train_dataset = TensorDataset(
    encoded_train['input_ids'], encoded_train['attention_mask'], train_y_tensor
)
val_dataset = TensorDataset(
    encoded_val['input_ids'], encoded_val['attention_mask'], val_y_tensor
)
test_dataset = TensorDataset(
    encoded_test['input_ids'], encoded_test['attention_mask'], test_y_tensor
)

# Mini-batch iterators; only the training split is reshuffled each epoch.
train_loader = DataLoader(train_dataset, batch_size=32, shuffle=True)
val_loader = DataLoader(val_dataset, batch_size=32, shuffle=False)
test_loader = DataLoader(test_dataset, batch_size=32, shuffle=False)


[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt_tab to /root/nltk_data...
[nltk_data]   Package punkt_tab is already up-to-date!


In [None]:
import torch
import torch.nn as nn
import torch.optim as optim
from transformers import AutoTokenizer, AutoModel

# Define a Transformer-based classifier model
class TransformerClassifier(nn.Module):
    """Pretrained transformer encoder followed by a small feed-forward head.

    Args:
        transformer_model_name: Checkpoint name passed to
            ``AutoModel.from_pretrained``.
        hidden_dim: Width of the hidden layer in the classification head.
        output_dim: Number of logits produced per input.
    """

    def __init__(self, transformer_model_name, hidden_dim, output_dim):
        super().__init__()
        # Pretrained encoder (e.g., BERT)
        self.transformer = AutoModel.from_pretrained(transformer_model_name)
        # Head: encoder width -> hidden_dim -> output_dim
        encoder_width = self.transformer.config.hidden_size
        self.fc1 = nn.Linear(encoder_width, hidden_dim)
        self.relu = nn.ReLU()
        self.fc2 = nn.Linear(hidden_dim, output_dim)

    def forward(self, input_ids, attention_mask):
        """Return the head's logits for a batch of tokenized inputs."""
        encoded = self.transformer(input_ids=input_ids, attention_mask=attention_mask)
        # The hidden state at position 0 (the [CLS] token) stands in for the
        # whole sentence.
        sentence_vector = encoded.last_hidden_state[:, 0, :]
        return self.fc2(self.relu(self.fc1(sentence_vector)))

# Set model parameters
hidden_dim = 20          # You can experiment with this value
output_dim = 2           # Binary classification: 0 (negative), 1 (positive)

# Instantiate the model
model = TransformerClassifier('bert-base-uncased', hidden_dim, output_dim)

# Check if GPU is available and move the model to GPU if possible
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)

# Use a smaller learning rate for fine-tuning transformers
criterion = nn.CrossEntropyLoss()
optimizer = optim.Adam(model.parameters(), lr=2e-5)

# ---- Early Stopping Configuration ----
early_stopping_patience = 3  # Number of epochs with no improvement after which training will be stopped
best_val_loss = float('inf')
epochs_no_improve = 0
checkpoint_path = 'best_model_transformer.pt'

num_epochs = 10  # Maximum number of epochs to train


def _evaluate(model, loader, criterion, device):
    """Score `model` on every batch of `loader` with gradients disabled.

    Puts the model in eval mode. The loss is weighted by batch size so a
    short final batch does not count as much as a full one.

    Returns:
        Tuple ``(mean loss per sample, accuracy in percent)``.
    """
    model.eval()
    loss_sum = 0.0
    correct = 0
    total = 0
    with torch.no_grad():
        for input_ids, attention_mask, labels in loader:
            input_ids = input_ids.to(device)
            attention_mask = attention_mask.to(device)
            labels = labels.to(device)

            outputs = model(input_ids, attention_mask)
            batch_size = labels.size(0)
            loss_sum += criterion(outputs, labels).item() * batch_size
            _, predicted = torch.max(outputs, 1)
            correct += (predicted == labels).sum().item()
            total += batch_size
    return loss_sum / total, 100 * correct / total


# ---- Training Loop ----
for epoch in range(num_epochs):
    model.train()
    train_loss_sum = 0.0
    train_samples = 0
    for input_ids, attention_mask, labels in train_loader:
        # Move the batch to the same device as the model
        input_ids = input_ids.to(device)
        attention_mask = attention_mask.to(device)
        labels = labels.to(device)

        optimizer.zero_grad()       # Reset gradients
        outputs = model(input_ids, attention_mask)
        loss = criterion(outputs, labels)
        loss.backward()
        optimizer.step()

        # Weight by batch size so the reported value is a per-sample mean
        train_loss_sum += loss.item() * labels.size(0)
        train_samples += labels.size(0)

    avg_train_loss = train_loss_sum / train_samples

    # ---- Validation Phase ----
    avg_val_loss, val_accuracy = _evaluate(model, val_loader, criterion, device)

    print(f'Epoch [{epoch+1}/{num_epochs}] -> Train Loss: {avg_train_loss:.4f} | '
          f'Val Loss: {avg_val_loss:.4f} | Val Accuracy: {val_accuracy:.2f}%')

    # ---- Early Stopping Check ----
    if avg_val_loss < best_val_loss:
        best_val_loss = avg_val_loss
        epochs_no_improve = 0
        # Save the current best model checkpoint
        torch.save(model.state_dict(), checkpoint_path)
        print("Validation loss improved, saving model checkpoint.")
    else:
        epochs_no_improve += 1
        print(f"No improvement in validation loss for {epochs_no_improve} consecutive epoch(s).")
        if epochs_no_improve >= early_stopping_patience:
            print("Early stopping triggered. Stopping training.")
            break

# ---- Load Best Model & Test Evaluation ----
# map_location keeps the load working when the checkpoint was written on a
# different device than the one currently in use (e.g. GPU -> CPU runtime).
model.load_state_dict(torch.load(checkpoint_path, map_location=device))
_, test_accuracy = _evaluate(model, test_loader, criterion, device)
print(f'Test Accuracy: {test_accuracy:.2f}%')



Epoch [1/10] -> Train Loss: 0.6602 | Val Loss: 0.5437 | Val Accuracy: 86.00%
Validation loss improved, saving model checkpoint.
Epoch [2/10] -> Train Loss: 0.4406 | Val Loss: 0.3435 | Val Accuracy: 83.00%
Validation loss improved, saving model checkpoint.
Epoch [3/10] -> Train Loss: 0.2677 | Val Loss: 0.2817 | Val Accuracy: 85.00%
Validation loss improved, saving model checkpoint.
Epoch [4/10] -> Train Loss: 0.1925 | Val Loss: 0.3518 | Val Accuracy: 82.00%
No improvement in validation loss for 1 consecutive epoch(s).
Epoch [5/10] -> Train Loss: 0.1560 | Val Loss: 0.3617 | Val Accuracy: 81.00%
No improvement in validation loss for 2 consecutive epoch(s).
Epoch [6/10] -> Train Loss: 0.0964 | Val Loss: 0.4897 | Val Accuracy: 84.00%
No improvement in validation loss for 3 consecutive epoch(s).
Early stopping triggered. Stopping training.
Test Accuracy: 87.00%


In [None]:
# Explicit test: examine a single review and its prediction

# Define a sample review (or use one from your dataset)
sample_sentence = "This product exceeded my expectations and works great!"

# Clean the sentence with preprocess_pandas so the model is queried with text
# shaped like its training input. 'index' and 'Class' are placeholder values
# that preprocess_pandas requires but that are not used for prediction.
# NOTE(review): assumes the small-dataset training cell cleaned its sentences
# with preprocess_pandas, as the large-dataset cell does — confirm.
sample_frame = pd.DataFrame({'index': [0], 'Class': [0], 'Sentence': [sample_sentence]})
cleaned_sentence = preprocess_pandas(sample_frame, ['index', 'Class', 'Sentence'])['Sentence'].iloc[0]

# Set the model to evaluation mode
model.eval()

# Tokenize the cleaned sentence (using the same tokenizer as during training)
encoded_sample = tokenizer(cleaned_sentence, padding=True, truncation=True, return_tensors='pt')

# Move the tokenized inputs to the device (GPU if available)
encoded_sample = {key: val.to(device) for key, val in encoded_sample.items()}

# Pass the tokenized input through the model to get predictions
with torch.no_grad():
    outputs = model(encoded_sample['input_ids'], encoded_sample['attention_mask'])
    # Get the predicted class (0 for negative, 1 for positive)
    _, predicted = torch.max(outputs, 1)

# Extract the predicted class value
predicted_class = predicted.item()

print("Input sentence:", sample_sentence)
print("Predicted class:", predicted_class)



Input sentence: This product exceeded my expectations and works great!
Predicted class: 1


In [None]:
# Define a sample review sentence
sample_sentence = "I completely detest this product, despite its cheap price."

# Clean the sentence with preprocess_pandas so the model is queried with text
# shaped like its training input. 'index' and 'Class' are placeholder values
# that preprocess_pandas requires but that are not used for prediction.
# NOTE(review): assumes the small-dataset training cell cleaned its sentences
# with preprocess_pandas, as the large-dataset cell does — confirm.
sample_frame = pd.DataFrame({'index': [0], 'Class': [0], 'Sentence': [sample_sentence]})
cleaned_sentence = preprocess_pandas(sample_frame, ['index', 'Class', 'Sentence'])['Sentence'].iloc[0]

# Set the model to evaluation mode
model.eval()

# Tokenize the cleaned sentence (using the same tokenizer as during training)
encoded_sample = tokenizer(cleaned_sentence, padding=True, truncation=True, return_tensors='pt')

# Move the tokenized inputs to the device (GPU if available)
encoded_sample = {key: val.to(device) for key, val in encoded_sample.items()}

# Pass the tokenized input through the model to get predictions
with torch.no_grad():
    outputs = model(encoded_sample['input_ids'], encoded_sample['attention_mask'])
    # Get the predicted class (0 for negative, 1 for positive)
    _, predicted = torch.max(outputs, 1)

# Extract the predicted class value
predicted_class = predicted.item()

print("Input sentence:", sample_sentence)
print("Predicted class:", predicted_class)


Input sentence: I completely detest this product, despite cheap price.
Predicted class: 0


# Large dataset

In [None]:
import torch
from torch.utils.data import TensorDataset, DataLoader
import pandas as pd
from sklearn.model_selection import train_test_split
from transformers import AutoTokenizer
import nltk
nltk.download('punkt')
nltk.download('stopwords')
nltk.download('punkt_tab')
from nltk import word_tokenize
from nltk.corpus import stopwords

# Preprocessing function (adjust as needed)
def preprocess_pandas(data, columns):
    """Clean the review sentences and return them in a new DataFrame.

    Sentences are lower-cased and stripped of e-mail addresses, IP-like number
    groups, punctuation and digits. They are then tokenized with NLTK and
    filtered against the English stop-word list.

    Args:
        data: DataFrame with 'Sentence', 'Class' and 'index' columns. Its
            'Sentence' column is overwritten in place with the cleaned text
            (the state before stop-word removal).
        columns: Column names, in order, of the returned DataFrame.

    Returns:
        A new DataFrame with one row per input row, where 'Sentence' holds the
        remaining tokens joined by single spaces.
    """
    # Raw strings keep the regex escapes (\., \w, \s, \d) from being read as
    # invalid Python string escapes.
    cleaned = data['Sentence'].str.lower()
    cleaned = cleaned.replace(r'[a-zA-Z0-9-_.]+@[a-zA-Z0-9-_.]+', '', regex=True)                     # remove emails
    cleaned = cleaned.replace(r'((25[0-5]|2[0-4][0-9]|[01]?[0-9][0-9]?)(\.|$)){4}', '', regex=True)   # remove IP addresses
    cleaned = cleaned.str.replace(r'[^\w\s]', '', regex=True)                                          # remove special characters
    cleaned = cleaned.replace(r'\d', '', regex=True)                                                   # remove numbers
    data['Sentence'] = cleaned

    # Load the stop-word list once and use a set for O(1) membership tests,
    # instead of re-reading the list for every token.
    stop_words = set(stopwords.words('english'))

    # Collect rows first and build the frame in one go; growing a DataFrame
    # row by row copies it on every insertion.
    records = []
    for row_index, label, sentence in zip(data['index'], data['Class'], data['Sentence']):
        kept_tokens = [w for w in word_tokenize(sentence) if w not in stop_words]
        records.append({
            "index": row_index,
            "Class": label,
            "Sentence": " ".join(kept_tokens)
        })
    return pd.DataFrame(records, columns=columns)

# Read the tab-separated review file (no header row) and clean the sentences.
dataset_path = "/content/drive/MyDrive/Colab_Notebooks/amazon_cells_labelled_LARGE_25K.txt"
data = pd.read_csv(dataset_path, delimiter='\t', header=None)
data.columns = ['Sentence', 'Class']
data['index'] = data.index
columns = ['index', 'Class', 'Sentence']
data = preprocess_pandas(data, columns)

all_sentences = data['Sentence'].values.astype('U')
all_labels = data['Class'].values.astype('int32')

# 60% of the rows go to training; the remaining 40% are held back ...
train_texts, temp_texts, train_labels, temp_labels = train_test_split(
    all_sentences,
    all_labels,
    test_size=0.40,
    random_state=0,
    shuffle=True,
)

# ... and then halved into validation and test (20% of all rows each).
val_texts, test_texts, val_labels, test_labels = train_test_split(
    temp_texts,
    temp_labels,
    test_size=0.5,
    random_state=0,
    shuffle=True,
)

# Load the tokenizer that belongs to the 'bert-base-uncased' checkpoint.
tokenizer = AutoTokenizer.from_pretrained('bert-base-uncased')

# All three splits are encoded with the same options: padding and truncation
# enabled, PyTorch tensors returned.
encode_options = dict(padding=True, truncation=True, return_tensors='pt')
encoded_train = tokenizer(list(train_texts), **encode_options)
encoded_val = tokenizer(list(val_texts), **encode_options)
encoded_test = tokenizer(list(test_texts), **encode_options)

# Class labels as int64 tensors.
train_y_tensor = torch.tensor(train_labels).long()
val_y_tensor = torch.tensor(val_labels).long()
test_y_tensor = torch.tensor(test_labels).long()

# Each dataset item is an (input_ids, attention_mask, label) triple.
train_dataset = TensorDataset(
    encoded_train['input_ids'], encoded_train['attention_mask'], train_y_tensor
)
val_dataset = TensorDataset(
    encoded_val['input_ids'], encoded_val['attention_mask'], val_y_tensor
)
test_dataset = TensorDataset(
    encoded_test['input_ids'], encoded_test['attention_mask'], test_y_tensor
)

# Only the training data is reshuffled; validation and test keep a fixed order.
batch_size = 32
train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)
val_loader = DataLoader(val_dataset, batch_size=batch_size, shuffle=False)
test_loader = DataLoader(test_dataset, batch_size=batch_size, shuffle=False)


[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt_tab to /root/nltk_data...
[nltk_data]   Package punkt_tab is already up-to-date!


In [None]:
import torch
import torch.nn as nn
import torch.optim as optim
from transformers import AutoTokenizer, AutoModel, get_linear_schedule_with_warmup

class TransformerClassifier(nn.Module):
    """Pretrained transformer encoder with a dropout-regularised feed-forward head.

    Args:
        transformer_model_name: Checkpoint name passed to
            ``AutoModel.from_pretrained``.
        hidden_dim: Width of the hidden layer in the classification head.
        output_dim: Number of logits produced per input.
        dropout_rate: Drop probability of the shared dropout layer.
    """

    def __init__(self, transformer_model_name, hidden_dim, output_dim, dropout_rate=0.3):
        super().__init__()
        self.transformer = AutoModel.from_pretrained(transformer_model_name)
        self.dropout = nn.Dropout(dropout_rate)
        encoder_width = self.transformer.config.hidden_size
        self.fc1 = nn.Linear(encoder_width, hidden_dim)
        self.relu = nn.ReLU()
        self.fc2 = nn.Linear(hidden_dim, output_dim)

    def forward(self, input_ids, attention_mask):
        """Return the head's logits for a batch of tokenized inputs."""
        encoded = self.transformer(input_ids=input_ids, attention_mask=attention_mask)
        # The hidden state at position 0 (the [CLS] token) stands in for the
        # whole sentence.
        sentence_vector = encoded.last_hidden_state[:, 0, :]
        # Dropout is applied both before the head and between its two layers.
        hidden = self.relu(self.fc1(self.dropout(sentence_vector)))
        return self.fc2(self.dropout(hidden))

# Set model parameters
hidden_dim = 20          # Experiment with other sizes as needed
output_dim = 2           # Binary classification: 0 (negative), 1 (positive)
model = TransformerClassifier('bert-base-uncased', hidden_dim, output_dim, dropout_rate=0.3)

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)

criterion = nn.CrossEntropyLoss()
# AdamW applies weight decay directly to the weights (decoupled). Plain Adam
# treats `weight_decay` as an L2 term added to the gradient, which is then
# rescaled by the adaptive step and does not regularise as intended.
optimizer = optim.AdamW(model.parameters(), lr=2e-5, weight_decay=1e-2)

num_epochs = 10
total_steps = len(train_loader) * num_epochs
# Linear warm-up over the first 10% of steps, then linear decay
scheduler = get_linear_schedule_with_warmup(optimizer,
                                            num_warmup_steps=int(0.1 * total_steps),
                                            num_training_steps=total_steps)

# ---- Early Stopping Configuration ----
early_stopping_patience = 3  # Number of epochs with no improvement after which training will be stopped
best_val_loss = float('inf')
epochs_no_improve = 0
freeze_epochs = 2  # Freeze transformer layers for first few epochs
checkpoint_path = 'best_model_transformer.pt'


def _evaluate(model, loader, criterion, device):
    """Score `model` on every batch of `loader` with gradients disabled.

    Puts the model in eval mode. The loss is weighted by batch size so a
    short final batch does not count as much as a full one.

    Returns:
        Tuple ``(mean loss per sample, accuracy in percent)``.
    """
    model.eval()
    loss_sum = 0.0
    correct = 0
    total = 0
    with torch.no_grad():
        for input_ids, attention_mask, labels in loader:
            input_ids = input_ids.to(device)
            attention_mask = attention_mask.to(device)
            labels = labels.to(device)

            outputs = model(input_ids, attention_mask)
            batch_size = labels.size(0)
            loss_sum += criterion(outputs, labels).item() * batch_size
            _, predicted = torch.max(outputs, 1)
            correct += (predicted == labels).sum().item()
            total += batch_size
    return loss_sum / total, 100 * correct / total


for epoch in range(num_epochs):
    # The encoder is frozen for the first `freeze_epochs` epochs so that only
    # the classification head trains, then unfrozen. The flags only need to be
    # touched at those two boundaries.
    if epoch in (0, freeze_epochs):
        train_encoder = epoch >= freeze_epochs
        for param in model.transformer.parameters():
            param.requires_grad = train_encoder

    model.train()
    train_loss_sum = 0.0
    train_samples = 0
    for input_ids, attention_mask, labels in train_loader:
        # Move the batch to the same device as the model
        input_ids = input_ids.to(device)
        attention_mask = attention_mask.to(device)
        labels = labels.to(device)

        optimizer.zero_grad()       # Reset gradients
        outputs = model(input_ids, attention_mask)
        loss = criterion(outputs, labels)
        loss.backward()
        optimizer.step()
        scheduler.step()  # Update learning rate (the schedule is per step, not per epoch)

        # Weight by batch size so the reported value is a per-sample mean
        train_loss_sum += loss.item() * labels.size(0)
        train_samples += labels.size(0)

    avg_train_loss = train_loss_sum / train_samples

    # ---- Validation Phase ----
    avg_val_loss, val_accuracy = _evaluate(model, val_loader, criterion, device)

    print(f'Epoch [{epoch+1}/{num_epochs}] -> Train Loss: {avg_train_loss:.4f} | '
          f'Val Loss: {avg_val_loss:.4f} | Val Accuracy: {val_accuracy:.2f}%')

    # ---- Early Stopping Check ----
    if avg_val_loss < best_val_loss:
        best_val_loss = avg_val_loss
        epochs_no_improve = 0
        # Save the current best model checkpoint
        torch.save(model.state_dict(), checkpoint_path)
        print("Validation loss improved, saving model checkpoint.")
    else:
        epochs_no_improve += 1
        print(f"No improvement in validation loss for {epochs_no_improve} consecutive epoch(s).")
        if epochs_no_improve >= early_stopping_patience:
            print("Early stopping triggered. Stopping training.")
            break

# ---- Load Best Model & Test Evaluation ----
# map_location keeps the load working when the checkpoint was written on a
# different device than the one currently in use (e.g. GPU -> CPU runtime).
model.load_state_dict(torch.load(checkpoint_path, map_location=device))
_, test_accuracy = _evaluate(model, test_loader, criterion, device)
print(f'Test Accuracy: {test_accuracy:.2f}%')



Epoch [1/10] -> Train Loss: 0.6929 | Val Loss: 0.6670 | Val Accuracy: 61.54%
Validation loss improved, saving model checkpoint.
Epoch [2/10] -> Train Loss: 0.6684 | Val Loss: 0.6476 | Val Accuracy: 61.54%
Validation loss improved, saving model checkpoint.
Epoch [3/10] -> Train Loss: 0.4158 | Val Loss: 0.3170 | Val Accuracy: 86.82%
Validation loss improved, saving model checkpoint.
Epoch [4/10] -> Train Loss: 0.3150 | Val Loss: 0.3086 | Val Accuracy: 87.28%
Validation loss improved, saving model checkpoint.
Epoch [5/10] -> Train Loss: 0.2741 | Val Loss: 0.3035 | Val Accuracy: 87.62%
Validation loss improved, saving model checkpoint.
Epoch [6/10] -> Train Loss: 0.2481 | Val Loss: 0.3154 | Val Accuracy: 87.92%
No improvement in validation loss for 1 consecutive epoch(s).
Epoch [7/10] -> Train Loss: 0.2223 | Val Loss: 0.3322 | Val Accuracy: 87.40%
No improvement in validation loss for 2 consecutive epoch(s).
Epoch [8/10] -> Train Loss: 0.2003 | Val Loss: 0.3425 | Val Accuracy: 87.56%
No im

In [None]:
# Explicit test: examine a single review and its prediction

# Define a sample review (or use one from your dataset)
sample_sentence = "This product exceeded my expectations and works great!"

# The training sentences were cleaned with preprocess_pandas before being
# tokenized, so the sample goes through the same cleaning; otherwise the model
# is queried with text unlike what it was fine-tuned on. 'index' and 'Class'
# are placeholder values that preprocess_pandas requires.
sample_frame = pd.DataFrame({'index': [0], 'Class': [0], 'Sentence': [sample_sentence]})
cleaned_sentence = preprocess_pandas(sample_frame, ['index', 'Class', 'Sentence'])['Sentence'].iloc[0]

# Set the model to evaluation mode
model.eval()

# Tokenize the cleaned sentence (using the same tokenizer as during training)
encoded_sample = tokenizer(cleaned_sentence, padding=True, truncation=True, return_tensors='pt')

# Move the tokenized inputs to the device (GPU if available)
encoded_sample = {key: val.to(device) for key, val in encoded_sample.items()}

# Pass the tokenized input through the model to get predictions
with torch.no_grad():
    outputs = model(encoded_sample['input_ids'], encoded_sample['attention_mask'])
    # Get the predicted class (0 for negative, 1 for positive)
    _, predicted = torch.max(outputs, 1)

# Extract the predicted class value
predicted_class = predicted.item()

print("Input sentence:", sample_sentence)
print("Predicted class:", predicted_class)



Input sentence: This product exceeded my expectations and works great!
Predicted class: 1


In [None]:
# Explicit test: examine a single review and its prediction

# Define a sample review (or use one from your dataset)
sample_sentence = "I'm very disappointed with the product."

# The training sentences were cleaned with preprocess_pandas before being
# tokenized, so the sample goes through the same cleaning; otherwise the model
# is queried with text unlike what it was fine-tuned on. 'index' and 'Class'
# are placeholder values that preprocess_pandas requires.
sample_frame = pd.DataFrame({'index': [0], 'Class': [0], 'Sentence': [sample_sentence]})
cleaned_sentence = preprocess_pandas(sample_frame, ['index', 'Class', 'Sentence'])['Sentence'].iloc[0]

# Set the model to evaluation mode
model.eval()

# Tokenize the cleaned sentence (using the same tokenizer as during training)
encoded_sample = tokenizer(cleaned_sentence, padding=True, truncation=True, return_tensors='pt')

# Move the tokenized inputs to the device (GPU if available)
encoded_sample = {key: val.to(device) for key, val in encoded_sample.items()}

# Pass the tokenized input through the model to get predictions
with torch.no_grad():
    outputs = model(encoded_sample['input_ids'], encoded_sample['attention_mask'])
    # Get the predicted class (0 for negative, 1 for positive)
    _, predicted = torch.max(outputs, 1)

# Extract the predicted class value
predicted_class = predicted.item()

print("Input sentence:", sample_sentence)
print("Predicted class:", predicted_class)



Input sentence: I'm very disappointed with the product.
Predicted class: 0


In [None]:
# Define a sample review sentence
sample_sentence = "I completely detest this product, despite its cheap price."

# The training sentences were cleaned with preprocess_pandas before being
# tokenized, so the sample goes through the same cleaning; otherwise the model
# is queried with text unlike what it was fine-tuned on. 'index' and 'Class'
# are placeholder values that preprocess_pandas requires.
sample_frame = pd.DataFrame({'index': [0], 'Class': [0], 'Sentence': [sample_sentence]})
cleaned_sentence = preprocess_pandas(sample_frame, ['index', 'Class', 'Sentence'])['Sentence'].iloc[0]

# Set the model to evaluation mode
model.eval()

# Tokenize the cleaned sentence (using the same tokenizer as during training)
encoded_sample = tokenizer(cleaned_sentence, padding=True, truncation=True, return_tensors='pt')

# Move the tokenized inputs to the device (GPU if available)
encoded_sample = {key: val.to(device) for key, val in encoded_sample.items()}

# Pass the tokenized input through the model to get predictions
with torch.no_grad():
    outputs = model(encoded_sample['input_ids'], encoded_sample['attention_mask'])
    # Get the predicted class (0 for negative, 1 for positive)
    _, predicted = torch.max(outputs, 1)

# Extract the predicted class value
predicted_class = predicted.item()

print("Input sentence:", sample_sentence)
print("Predicted class:", predicted_class)


Input sentence: Very cheap product, and works okay.
Predicted class: 1


# **Task 1.3 Comparison**

*Compare the performance of the two models and explain in which scenarios you would prefer one over the other.*

The transformer implementation demonstrates superior performance in terms of both convergence speed and accuracy: it converges in fewer epochs and generally achieves better accuracy on the test set. However, the difference in accuracy between the transformer and the TF-IDF+ANN model is relatively modest for this particular dataset, while the computational cost of the transformer is significantly higher. Therefore, if computational resources are limited, the TF-IDF+ANN model is preferable. On the other hand, if achieving higher accuracy is crucial and the sequential order of words substantially influences the neural network’s performance, the transformer model is the better choice.

*How did the two models’ complexity, accuracy, and efficiency differ? Did one model outperform the other in specific scenarios or tasks? If so, why?*

The TF-IDF+ANN implementation is considerably less complex and more efficient. This efficiency is most noticeable during CPU-based training: 40 epochs using 20 neurons in the hidden layers can be completed in just a few seconds, whereas the transformer implementation—also using a configuration involving 20 neurons—requires 20 minutes to complete only 6 epochs. Despite these differences in training time, both models achieve similar levels of accuracy.

Moreover, one significant advantage of the transformer model is its ability to attain reasonable accuracy with less data. For instance, when training on a small dataset, the TF-IDF+ANN model reached an accuracy of 82% on the test set, while the transformer achieved 87%—a level of performance comparable to that attained on a larger dataset (approximately 87.3% accuracy). Consequently, if the available training data is limited, the transformer model is likely to provide more reliable performance. Conversely, when ample training data is available but computational resources are constrained, the TF-IDF+ANN model can deliver satisfactory results with greater efficiency for simpler tasks.

*What insights did you obtain concerning data amount to train? Embedding utilized? Architectural choices made?*

Data amount:
TF-IDF+ANN: Requires more data since it uses shallow, non-contextual features.
Transformers: Leverage pretrained contextual embeddings, often performing well with less training data.

Embedding utilized:
TF-IDF: Simple and interpretable, but lacks context and nuance.
Transformers: Provide rich, context-aware embeddings that capture linguistic features.

Architectural choices:
TF-IDF+ANN: A lightweight feedforward network with TF-IDF inputs is fast and resource-efficient but less effective on complex tasks.
Transformers: A deep, pretrained model (BERT) with a classification head is more computationally intensive yet delivers superior performance on nuanced tasks.





