In [2]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
import torch
from torch import nn
from torch.utils.data import DataLoader
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error, r2_score
from torch.utils.data import DataLoader, TensorDataset
from sklearn.preprocessing import LabelEncoder

# Load the movie-review dataset: column 0 is the review text, column 1 the 0/1 label.
df = pd.read_csv("./data/movie.csv")
x = df.iloc[:,0]
y = df.iloc[:,1]
print(df)

# Hold out 20% of the reviews for evaluation; the fixed random_state keeps the split repeatable.
x_train, x_test, y_train, y_test = train_test_split(x,y,test_size=0.2,random_state=42)

# Learn the TF-IDF vocabulary from the training reviews only, then reuse it for the test reviews
# so that no information from the test set leaks into the features.
vectorizer = TfidfVectorizer(stop_words="english", lowercase=True, norm="l1")
x_train_v = vectorizer.fit_transform(x_train)
x_test_v = vectorizer.transform(x_test)

# Cast to float32 while the matrices are still sparse, and only then densify. This avoids
# materialising a float64 dense copy of the very wide TF-IDF matrix; torch.from_numpy wraps the
# resulting array without another copy. The tensor values are the same as converting afterwards.
x_train_tensor = torch.from_numpy(x_train_v.astype(np.float32).toarray())
x_test_tensor = torch.from_numpy(x_test_v.astype(np.float32).toarray())

y_train_tensor = torch.tensor(y_train.values, dtype=torch.long)  # Long dtype for classification
y_test_tensor = torch.tensor(y_test.values, dtype=torch.long)

# Report the class balance of the full dataset.
class_counts = df['label'].value_counts()
print(f"Class 0 count: {class_counts.get(0, 0)}")
print(f"Class 1 count: {class_counts.get(1, 0)}")

                                                    text  label
0      I grew up (b. 1965) watching and loving the Th...      0
1      When I put this movie in my DVD player, and sa...      0
2      Why do people who do not know what a particula...      0
3      Even though I have great interest in Biblical ...      0
4      Im a die hard Dads Army fan and nothing will e...      1
...                                                  ...    ...
39995  "Western Union" is something of a forgotten cl...      1
39996  This movie is an incredible piece of work. It ...      1
39997  My wife and I watched this movie because we pl...      0
39998  When I first watched Flatliners, I was amazed....      1
39999  Why would this film be so good, but only gross...      1

[40000 rows x 2 columns]
Class 0 count: 20019
Class 1 count: 19981


In [14]:
# Wrap the training tensors in a dataset and serve them as shuffled mini-batches.
batch_size = 32
train_dataset = TensorDataset(
    x_train_tensor,
    y_train_tensor,
)
trainloader = DataLoader(
    train_dataset,
    batch_size=batch_size,
    shuffle=True,
)

class MLP(nn.Module):
    """Fully connected classifier: input_size -> 128 -> 32 -> 8 -> num_classes.

    The first two hidden stages each apply Linear, ReLU, BatchNorm1d and
    Dropout(0.5), in that order. The third stage is Linear + ReLU, and a final
    Linear layer produces one raw score per class (no softmax is applied).

    Args:
        input_size: number of features in each input row.
        num_classes: number of output scores per row.
    """

    def __init__(self, input_size, num_classes):
        super().__init__()

        modules = []
        fan_in = input_size
        # Two regularised stages; modules are created in forward order so the
        # Sequential indices (and therefore state_dict keys) stay 0..10.
        for width in (128, 32):
            modules += [
                nn.Linear(fan_in, width),
                nn.ReLU(),
                nn.BatchNorm1d(width),
                nn.Dropout(0.5),
            ]
            fan_in = width

        # Narrow bottleneck followed by the output projection.
        modules += [nn.Linear(fan_in, 8), nn.ReLU(), nn.Linear(8, num_classes)]

        self.layers = nn.Sequential(*modules)

    def forward(self, x):
        """Run ``x`` through the layer stack and return the per-class scores."""
        scores = self.layers(x)
        return scores

# Seed PyTorch's global RNG so weight initialisation, mini-batch shuffling and dropout masks
# are repeatable when the notebook is re-run from a fresh kernel (same value as the split seed).
torch.manual_seed(42)

# Initialize model: one input unit per TF-IDF feature, two output scores (class 0 / class 1).
input_size = x_train_tensor.shape[1]
mlp = MLP(input_size, 2)

# Loss and optimizer
loss_function = nn.CrossEntropyLoss()  # For classification; expects raw scores and integer labels
optimizer = torch.optim.SGD(mlp.parameters(), lr=0.01, momentum=0.9)
# Multiply the learning rate by 0.1 after every 3 calls to scheduler.step().
# NOTE(review): if stepped once per epoch for 20 epochs this decays the rate from 1e-2 to 1e-8,
# so the later epochs barely update the weights - confirm this schedule is intended.
scheduler = torch.optim.lr_scheduler.StepLR(optimizer, step_size=3, gamma=0.1)

# Training loop
num_epochs = 20
epsilon = 1e-5          # smallest change in the running-average loss that counts as progress
patience_batches = 10   # consecutive stalled batches tolerated before an epoch is cut short
for epoch in range(num_epochs):
    print(f'Starting Epoch {epoch+1}')

    # Re-enter training mode every epoch: if this cell is re-run after the model was switched
    # to eval mode, dropout and batch-norm statistics would otherwise stay frozen.
    mlp.train()

    current_loss = 0.0
    batch_no_improve = 0
    # Start each epoch with a fresh reference so the first plateau comparison is not made
    # against the previous epoch's final running average.
    prev_loss = float('inf')
    batches_seen = 0

    for i, (inputs, targets) in enumerate(trainloader, 0):
        inputs, targets = inputs.float(), targets.long()  # Ensure correct types

        optimizer.zero_grad()
        outputs = mlp(inputs)

        loss = loss_function(outputs, targets)

        loss.backward()
        optimizer.step()

        batches_seen = i + 1
        current_loss += loss.item()
        avg_loss = current_loss / batches_seen  # running mean of the loss over this epoch

        if i % 10 == 0:
            print(f'Loss after mini-batch {i+1}: {avg_loss:.6f}')

        # Plateau check on the running average.
        # NOTE(review): the running mean moves by (loss - mean) / (i + 1) per batch, so its
        # changes shrink as the epoch progresses regardless of learning - consider comparing
        # a windowed or validation loss instead.
        if abs(prev_loss - avg_loss) < epsilon:
            batch_no_improve += 1
            if batch_no_improve >= patience_batches:
                print(f"Stopping epoch {epoch+1} early after {i+1} batches due to no improvement.")
                break
        else:
            batch_no_improve = 0

        prev_loss = avg_loss

    scheduler.step()  # Update learning rate

    # Use the explicit counter rather than the loop variable, which is undefined (or stale from
    # the previous epoch) when the loader yields no batches.
    print(f"Epoch {epoch+1} finished after {batches_seen} batches.")

print("Training has completed")

Starting Epoch 1
Loss after mini-batch 1: 0.724294
Loss after mini-batch 11: 0.651565
Loss after mini-batch 21: 0.626221
Loss after mini-batch 31: 0.597295
Loss after mini-batch 41: 0.585808
Loss after mini-batch 51: 0.575894
Loss after mini-batch 61: 0.565850
Loss after mini-batch 71: 0.552290
Loss after mini-batch 81: 0.540923
Loss after mini-batch 91: 0.538414
Loss after mini-batch 101: 0.527120
Loss after mini-batch 111: 0.516070
Loss after mini-batch 121: 0.507514
Loss after mini-batch 131: 0.503814
Loss after mini-batch 141: 0.500822
Loss after mini-batch 151: 0.493629
Loss after mini-batch 161: 0.487177
Loss after mini-batch 171: 0.483420
Loss after mini-batch 181: 0.479659
Loss after mini-batch 191: 0.474846
Loss after mini-batch 201: 0.469342
Loss after mini-batch 211: 0.463567
Loss after mini-batch 221: 0.463902
Loss after mini-batch 231: 0.459877
Loss after mini-batch 241: 0.456811
Loss after mini-batch 251: 0.455533
Loss after mini-batch 261: 0.451637
Loss after mini-batch 

In [16]:
# Switch the network to evaluation mode before scoring (this changes how the Dropout and
# BatchNorm1d layers behave). As the cell's last expression, the returned module is displayed.
mlp.eval() 

MLP(
  (layers): Sequential(
    (0): Linear(in_features=84479, out_features=128, bias=True)
    (1): ReLU()
    (2): BatchNorm1d(128, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
    (3): Dropout(p=0.5, inplace=False)
    (4): Linear(in_features=128, out_features=32, bias=True)
    (5): ReLU()
    (6): BatchNorm1d(32, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
    (7): Dropout(p=0.5, inplace=False)
    (8): Linear(in_features=32, out_features=8, bias=True)
    (9): ReLU()
    (10): Linear(in_features=8, out_features=2, bias=True)
  )
)

In [20]:
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
import seaborn as sns
import matplotlib.pyplot as plt
# Score the held-out reviews. Put the model in eval mode here rather than relying on another
# cell having done so, and disable autograd: no gradients are needed for inference, and
# tracking them for the whole test matrix in one forward pass wastes memory.
mlp.eval()
with torch.no_grad():
    outputs = mlp(x_test_tensor)  # raw class scores, shape [num_samples, num_classes]

# Get the index of the class with the highest score for each sample
predicted_labels = torch.argmax(outputs, dim=1)

# Convert predicted labels to numpy array
predicted_labels = predicted_labels.numpy()

# Convert true labels to numpy array
test_targets = y_test_tensor.numpy()

# Calculate accuracy
accuracy = accuracy_score(test_targets, predicted_labels)
print(f'Accuracy: {accuracy:.4f}')

# Print a detailed classification report (precision, recall, F1-score)
report = classification_report(test_targets, predicted_labels, target_names=["Class 0", "Class 1"])
print(report)

# Compute the confusion matrix (rows: true class, columns: predicted class)
cm = confusion_matrix(test_targets, predicted_labels)
print(cm)


Accuracy: 0.8846
              precision    recall  f1-score   support

     Class 0       0.89      0.88      0.88      3966
     Class 1       0.88      0.89      0.89      4034

    accuracy                           0.88      8000
   macro avg       0.88      0.88      0.88      8000
weighted avg       0.88      0.88      0.88      8000

[[3477  489]
 [ 434 3600]]
