In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
import torch
from torch import nn
from torch.utils.data import DataLoader
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error, r2_score
from torch.utils.data import DataLoader, TensorDataset
from sklearn.preprocessing import LabelEncoder

df = pd.read_csv("./data/movie.csv")
x = df.iloc[:,0]
y = df.iloc[:,1]
print(df)

x_train, x_test, y_train, y_test = train_test_split(x,y,test_size=0.2,random_state=42)
vectorizer = TfidfVectorizer(stop_words="english", lowercase=True, norm="l1")
x_train_v = vectorizer.fit_transform(x_train)
x_test_v = vectorizer.transform(x_test)


                                                    text  label
0      I grew up (b. 1965) watching and loving the Th...      0
1      When I put this movie in my DVD player, and sa...      0
2      Why do people who do not know what a particula...      0
3      Even though I have great interest in Biblical ...      0
4      Im a die hard Dads Army fan and nothing will e...      1
...                                                  ...    ...
39995  "Western Union" is something of a forgotten cl...      1
39996  This movie is an incredible piece of work. It ...      1
39997  My wife and I watched this movie because we pl...      0
39998  When I first watched Flatliners, I was amazed....      1
39999  Why would this film be so good, but only gross...      1

[40000 rows x 2 columns]


In [2]:
x_train_tensor = torch.tensor(x_train_v.toarray(), dtype=torch.float32)
x_test_tensor = torch.tensor(x_test_v.toarray(), dtype=torch.float32)

y_train_tensor = torch.tensor(y_train.values, dtype=torch.long)  # Long dtype for classification
y_test_tensor = torch.tensor(y_test.values, dtype=torch.long)

# Create DataLoader
batch_size = 32
train_dataset = TensorDataset(x_train_tensor, y_train_tensor)
trainloader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)

# Define MLP Model
class MLP(nn.Module):
    def __init__(self, input_size, num_classes):
        super().__init__()
        self.layers = nn.Sequential(
            nn.Linear(input_size, 128),
            nn.ReLU(),
            nn.Linear(128, 16),
            nn.ReLU(),
            nn.Linear(16, 8),
            nn.ReLU(),
            nn.Linear(8, 4),
            nn.ReLU(),
            nn.Linear(4, num_classes)
        )

    def forward(self, x):
        return self.layers(x)

# Initialize model
input_size = x_train_tensor.shape[1]  # Match TF-IDF feature size
mlp = MLP(input_size, 2)

# Define loss function and optimizer
loss_function = nn.CrossEntropyLoss()  # For classification
optimizer = torch.optim.SGD(mlp.parameters(), lr=0.01, momentum=0.9)


# Training loop with early stopping within an epoch
num_epochs = 5
epsilon = 1e-5  # Stopping threshold
prev_loss = float('inf')  # Initialize previous loss to a large value
patience_batches = 10  # Stop if no improvement in the last N batches

for epoch in range(num_epochs):
    print(f'Starting Epoch {epoch+1}')
    current_loss = 0.0
    batch_no_improve = 0  # Counter to track stagnation

    for i, (inputs, targets) in enumerate(trainloader, 0):
        inputs, targets = inputs.float(), targets.long()  # Ensure correct types

        optimizer.zero_grad()
        outputs = mlp(inputs)

        loss = loss_function(outputs, targets)

        loss.backward()
        optimizer.step()

        current_loss += loss.item()
        avg_loss = current_loss / (i + 1)  # Moving average loss

        if i % 10 == 0:  # Print loss every 10 batches
            print(f'Loss after mini-batch {i+1}: {avg_loss:.6f}')

        # Early stopping within an epoch (stop if loss stabilizes)
        if abs(prev_loss - avg_loss) < epsilon:
            batch_no_improve += 1
            if batch_no_improve >= patience_batches:
                print(f"Stopping epoch {epoch+1} early after {i+1} batches due to no improvement.")
                break
        else:
            batch_no_improve = 0  # Reset counter if loss improves

        prev_loss = avg_loss  # Update loss for next check

    print(f"Epoch {epoch+1} finished after {i+1} batches.")

print("Training has completed")


Starting Epoch 1
Loss after mini-batch 1: 0.724707
Loss after mini-batch 11: 0.699396
Loss after mini-batch 21: 0.702613
Loss after mini-batch 31: 0.701063
Loss after mini-batch 41: 0.700152
Loss after mini-batch 51: 0.698518
Loss after mini-batch 61: 0.697607
Loss after mini-batch 71: 0.697160
Loss after mini-batch 81: 0.696545
Loss after mini-batch 91: 0.695831
Loss after mini-batch 101: 0.695973
Loss after mini-batch 111: 0.695893
Loss after mini-batch 121: 0.695691
Loss after mini-batch 131: 0.695538
Loss after mini-batch 141: 0.695378
Loss after mini-batch 151: 0.695216
Loss after mini-batch 161: 0.695148
Loss after mini-batch 171: 0.695012
Loss after mini-batch 181: 0.694972
Loss after mini-batch 191: 0.694912
Loss after mini-batch 201: 0.694849
Loss after mini-batch 211: 0.694729
Loss after mini-batch 221: 0.694659
Loss after mini-batch 231: 0.694691
Loss after mini-batch 241: 0.694636
Loss after mini-batch 251: 0.694660
Loss after mini-batch 261: 0.694661
Loss after mini-batch 

In [5]:
mlp.eval() 

MLP(
  (layers): Sequential(
    (0): Linear(in_features=84479, out_features=32, bias=True)
    (1): ReLU()
    (2): Linear(in_features=32, out_features=16, bias=True)
    (3): ReLU()
    (4): Linear(in_features=16, out_features=8, bias=True)
    (5): ReLU()
    (6): Linear(in_features=8, out_features=4, bias=True)
    (7): ReLU()
    (8): Linear(in_features=4, out_features=2, bias=True)
  )
)

In [9]:
outputs = mlp(x_test_tensor)
predicted_labels = outputs.squeeze().tolist()

predicted_labels = np.array(predicted_labels)
test_targets = np.array(y_test_tensor)

print(predicted_labels)
print(test_targets)

[[-0.30407515 -0.32051724]
 [-0.30408213 -0.32052079]
 [-0.30404875 -0.32050344]
 ...
 [-0.30408004 -0.32052046]
 [-0.30407214 -0.32051522]
 [-0.30408233 -0.32052064]]
[0 1 0 ... 1 0 0]
