In [2]:
import re
from collections import Counter

import numpy as np
import pandas as pd
import torch
from sklearn.metrics import classification_report
from sklearn.model_selection import train_test_split
from torch import nn
from torch.nn.utils.rnn import pad_sequence
from torch.utils.data import Dataset, DataLoader

In [3]:
# Run on the GPU when CUDA is available; otherwise fall back to the CPU.
device = torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu')
device

device(type='cuda')

In [5]:
# Load the review dataset from JSON (presumably already cleaned, given the file name).
# NOTE(review): this is an absolute, machine-specific path; the notebook only runs where it exists.
df_reviews = pd.read_json('/SFS/project/ry/dp_sgteam/catherine/ada/dataset/cleaned_data.json')

# Report (rows, columns) and preview the first rows.
print(f"Dataset shape: {df_reviews.shape}")
df_reviews.head()

Dataset shape: (573913, 7)


Unnamed: 0,review_date,movie_id,user_id,is_spoiler,review_text,rating,review_summary
0,10 February 2006,tt0111161,ur1898687,1,oscar year shawshank redemption write direct f...,10,A classic piece of unforgettable film-making.
1,6 September 2000,tt0111161,ur0842118,1,shawshank redemption without doubt one brillia...,10,Simply amazing. The best film of the 90's.
2,3 August 2001,tt0111161,ur1285640,1,believe film best story ever tell film tell ti...,8,The best story ever told on film
3,1 September 2002,tt0111161,ur1003471,1,yes spoiler film emotional impact find hard wr...,10,Busy dying or busy living?
4,20 May 2004,tt0111161,ur0226855,1,heart extraordinary movie brilliant indelible ...,8,"Great story, wondrously told and acted"


In [6]:
# Drop rows missing the review text or the spoiler label (mutates df_reviews in place).
df_reviews.dropna(subset=['review_text', 'is_spoiler'], inplace=True)

# X stays a one-column DataFrame; y is the label column cast to int.
X = df_reviews[['review_text']]
y = df_reviews['is_spoiler'].astype(int)

In [12]:
# Hold out 20% of the rows for testing. stratify=y keeps the label ratio the
# same in both splits, and the fixed random_state makes the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X, y,
    test_size=0.2, 
    random_state=42, 
    stratify=y
)

print(f"Training set size: {len(X_train)}")
print(f"Testing set size: {len(X_test)}")

Training set size: 459130
Testing set size: 114783


In [13]:
# Class counts of the training labels, looked up below by label value (0 and 1).
n = y_train.value_counts()
# Negative-to-positive ratio, rounded to 2 decimals; passed later to
# BCEWithLogitsLoss as pos_weight.
pos_weight = round(n[0]/n[1],2)
# NOTE(review): stored as float64 while the model outputs are presumably
# float32 — confirm the dtype mix is intended.
pos_weight = torch.tensor(pos_weight, dtype=torch.float64)

print(f"Positive Weight: {pos_weight}")

Positive Weight: 2.8


In [14]:
# --- Build Vocabulary ---
def build_vocab(texts, min_freq=2):
    """Map every sufficiently frequent whitespace token to an integer id.

    Args:
        texts: Iterable of strings; each one is tokenised with ``str.split()``.
        min_freq: Minimum number of occurrences (summed over all texts) a
            token needs in order to receive its own id.

    Returns:
        dict mapping token -> id. ``'<pad>'`` is always 0 and ``'<unk>'`` is
        always 1; the remaining tokens get consecutive ids starting at 2, in
        order of first appearance.
    """
    word_counts = Counter()
    for text in texts:
        word_counts.update(text.split())

    # Special tokens:
    # <pad>: for padding short sentences
    # <unk>: for unknown words not in the vocabulary
    vocab = {'<pad>': 0, '<unk>': 1}
    for word, count in word_counts.items():
        # A literal '<pad>' / '<unk>' in the corpus must not overwrite the
        # reserved ids: doing so would leave len(vocab) unchanged and hand the
        # same id to the next word as well.
        if count >= min_freq and word not in vocab:
            vocab[word] = len(vocab)
    return vocab

# Build the vocabulary from the training split only, so test-set words stay unseen.
# With min_freq=1 every training token is kept, so '<unk>' can only arise for
# words that never occur in the training split.
vocab = build_vocab(X_train.review_text, min_freq=1)
print(f"Vocabulary size: {len(vocab)}")


Vocabulary size: 231082


In [15]:
class SpoilerDataset(Dataset):
    """Dataset that turns review strings into token-id tensors on access.

    Args:
        X: Positionally indexable (``.iloc``) collection of review strings.
        y: Positionally indexable (``.iloc``) collection of labels, aligned with ``X``.
        vocab: dict mapping token -> id; must contain the ``'<unk>'`` key.
    """

    def __init__(self, X, y, vocab):
        self.X = X
        self.y = y
        self.vocab = vocab

    def __len__(self):
        """Return the number of samples."""
        return len(self.X)

    def __getitem__(self, idx):
        """Return ``(token_ids, label)`` for the sample at position ``idx``.

        ``token_ids`` is a 1-D ``torch.long`` tensor with one id per
        whitespace token (out-of-vocabulary tokens map to ``'<unk>'``);
        ``label`` is a 0-D ``torch.float`` tensor.
        """
        text = self.X.iloc[idx]
        label = self.y.iloc[idx]

        unk_id = self.vocab['<unk>']
        token_ids = [self.vocab.get(word, unk_id) for word in text.split()]
        # An empty or whitespace-only review would give a zero-length tensor,
        # i.e. an all-padding row after batching. Emit one '<unk>' instead so
        # every sample has at least one real position.
        if not token_ids:
            token_ids = [unk_id]

        return torch.tensor(token_ids, dtype=torch.long), torch.tensor(label, dtype=torch.float)

# Wrap each split's review_text column and labels; both datasets share the
# vocabulary built from the training split.
train_dataset = SpoilerDataset(X_train.review_text, y_train, vocab)
test_dataset = SpoilerDataset(X_test.review_text, y_test, vocab)

# This custom collate function handles padding within each batch.
def collate_fn(batch):
    """Collate ``(token_ids, label)`` samples into one padded batch.

    Sequences are right-padded with the ``'<pad>'`` id from the module-level
    ``vocab`` up to the longest sequence in the batch.

    Returns:
        ``(padded_texts, labels)``: a ``[batch, max_len]`` id tensor and a
        ``[batch]`` label tensor.
    """
    sequences = [sample[0] for sample in batch]
    targets = [sample[1] for sample in batch]

    pad_id = vocab['<pad>']
    padded_texts = pad_sequence(sequences, batch_first=True, padding_value=pad_id)

    return padded_texts, torch.stack(targets)

# Only the training loader shuffles; both loaders pad per batch through collate_fn.
# batch_size=4 over the 459,130 training rows gives the 114,783 steps per
# epoch shown in the recorded progress bar.
train_loader = DataLoader(train_dataset, batch_size=4, shuffle=True, collate_fn=collate_fn)
test_loader = DataLoader(test_dataset, batch_size=4, shuffle=False, collate_fn=collate_fn)

In [16]:
class SpoilerRNN(nn.Module):
    """Embedding -> (bi)LSTM -> masked max-pool over time -> linear classifier.

    Args:
        vocab_size: Number of rows in the embedding table.
        embedding_dim: Size of each token embedding.
        hidden_dim: LSTM hidden size per direction.
        output_dim: Number of output logits per sample.
        n_layers: Number of stacked LSTM layers.
        bidirectional: Whether the LSTM runs in both directions.
        dropout: Dropout probability applied to the embeddings, between LSTM
            layers (only when ``n_layers > 1``) and to the pooled features.
        pad_idx: Token id used for padding; excluded from the pooling.
    """

    def __init__(self, vocab_size, embedding_dim, hidden_dim, output_dim, n_layers, bidirectional, dropout, pad_idx):
        super().__init__()

        self.embedding = nn.Embedding(vocab_size, embedding_dim, padding_idx=pad_idx)

        self.lstm = nn.LSTM(
            embedding_dim,
            hidden_dim,
            num_layers=n_layers,
            bidirectional=bidirectional,
            batch_first=True,
            # nn.LSTM only applies dropout between stacked layers.
            dropout=dropout if n_layers > 1 else 0
        )

        # The LSTM output width is hidden_dim per direction, so size the
        # classifier from the flag instead of assuming two directions.
        num_directions = 2 if bidirectional else 1
        self.fc = nn.Linear(hidden_dim * num_directions, output_dim)
        self.dropout = nn.Dropout(dropout)
        self.pad_idx = pad_idx

    def forward(self, text):
        """Return logits of shape ``[B, output_dim]`` for token ids ``text`` of shape ``[B, T]``."""
        # text: [B, T]
        embedded = self.dropout(self.embedding(text))  # [B, T, E]
        # No sequence packing is used, so the LSTM also steps over padded positions.
        output, _ = self.lstm(embedded)                # [B, T, D*H]

        # Mask out padding positions
        mask = (text != self.pad_idx).unsqueeze(-1)    # [B, T, 1]
        # Use max pooling over time with -inf for pads
        output_masked = output.masked_fill(~mask, float('-inf'))
        pooled, _ = torch.max(output_masked, dim=1)    # [B, D*H]

        # A row made only of padding pools to -inf everywhere, which would
        # turn into NaN in dropout / the linear layer. Use zeros for such rows.
        has_tokens = mask.any(dim=1)                   # [B, 1]
        pooled = pooled.masked_fill(~has_tokens, 0.0)

        pooled = self.dropout(pooled)
        logits = self.fc(pooled)                       # [B, output_dim]

        return logits

In [17]:
# --- Model shape (passed positionally to SpoilerRNN) ---
VOCAB_SIZE = len(vocab)
EMBEDDING_DIM = 128
HIDDEN_DIM = 256
# One logit per review (binary spoiler / not-spoiler).
OUTPUT_DIM = 1
N_LAYERS = 2
BIDIRECTIONAL = True
DROPOUT = 0.3
# --- Optimiser settings ---
LR = 0.01
WEIGHT_DECAY = 1e-5
# NOTE(review): no other cell in this notebook reads GRAD_CLIP.
GRAD_CLIP = 1.0
PAD_IDX = vocab['<pad>']

In [19]:
# Instantiate the network from the hyperparameter constants and move it to `device`.
model = SpoilerRNN(
    VOCAB_SIZE,
    EMBEDDING_DIM,
    HIDDEN_DIM,
    OUTPUT_DIM,
    N_LAYERS,
    BIDIRECTIONAL,
    DROPOUT,
    PAD_IDX
    ).to(device)

# Rebinds the module-level pos_weight to a copy on `device`.
pos_weight = pos_weight.to(device)
# Binary cross-entropy on raw logits; pos_weight scales the loss of positive samples.
criterion = torch.nn.BCEWithLogitsLoss(pos_weight = pos_weight).to(device)
# Plain SGD with L2 weight decay.
optimizer = torch.optim.SGD(model.parameters(), lr=LR, weight_decay = WEIGHT_DECAY)

In [20]:
from tqdm import tqdm

def train(model, train_loader, optimizer, criterion, grad_clip=None):
    """Train ``model`` for one pass over ``train_loader``.

    The epoch loop belongs to the caller, so this function performs exactly
    one pass per call and returns a number the caller can format (the
    training cells print it with ``:.3f``).

    Args:
        model: Module mapping a ``[B, T]`` id tensor to ``[B, 1]`` logits.
        train_loader: Sized iterable yielding ``(texts, labels)`` batches.
        optimizer: Optimiser already bound to ``model``'s parameters.
        criterion: Loss taking ``(logits [B], labels [B])``.
        grad_clip: Optional max gradient norm; when given, gradients are
            clipped to it before each optimiser step.

    Returns:
        float: mean batch loss over the pass (0.0 for an empty loader).
    """
    print("Training...")
    model.train()
    epoch_loss = 0.0
    for texts, labels in tqdm(train_loader):
        # Batches are moved to the module-level `device`.
        texts, labels = texts.to(device), labels.to(device)

        optimizer.zero_grad()
        predictions = model(texts).squeeze(1)
        loss = criterion(predictions, labels)
        loss.backward()
        if grad_clip is not None:
            torch.nn.utils.clip_grad_norm_(model.parameters(), grad_clip)
        optimizer.step()

        epoch_loss += loss.item()

    # Guard the division so an empty loader yields 0.0 instead of raising.
    return epoch_loss / max(len(train_loader), 1)

def evaluate(model, loader, criterion):
    """Score ``model`` on ``loader`` without updating it.

    Logits are turned into hard 0/1 predictions by rounding their sigmoid.

    Returns:
        ``(mean_batch_loss, report)`` where ``report`` is the text produced
        by ``classification_report`` for the two classes.
    """
    model.eval()
    running_loss = 0
    predicted, expected = [], []

    with torch.no_grad():
        for batch_texts, batch_labels in tqdm(loader):
            batch_texts = batch_texts.to(device)
            batch_labels = batch_labels.to(device)

            logits = model(batch_texts).squeeze(1)
            running_loss += criterion(logits, batch_labels).item()

            hard_preds = torch.sigmoid(logits).round()
            predicted.extend(hard_preds.cpu().numpy())
            expected.extend(batch_labels.cpu().numpy())

    report = classification_report(
        expected,
        predicted,
        target_names=['Not Spoiler', 'Spoiler'],
        zero_division=0,
    )
    return running_loss / len(loader), report

In [24]:
N_EPOCHS = 10

# One train() call followed by one evaluate() call per outer iteration.
for epoch in range(N_EPOCHS):
    # train_loss is formatted with :.3f below, so train() must return a number.
    train_loss = train(model, train_loader, optimizer, criterion)
    # NOTE(review): evaluate() runs on test_loader, so the "Val." loss printed
    # below is measured on the test split.
    valid_loss, report = evaluate(model, test_loader, criterion)
    
    # tqdm.write prints without breaking an active progress bar.
    tqdm.write(f"Epoch: {epoch+1:02}")
    tqdm.write(f"\tTrain Loss: {train_loss:.3f}")
    tqdm.write(f"\tVal. Loss: {valid_loss:.3f}")

Training...


 82%|███████████████████████████████████████████████████           | 94510/114783 [48:29<08:41, 38.86it/s]IOPub message rate exceeded.
The Jupyter server will temporarily stop sending output
to the client in order to avoid crashing it.
To change this limit, set the config variable
`--ServerApp.iopub_msg_rate_limit`.

Current values:
ServerApp.iopub_msg_rate_limit=1000.0 (msgs/sec)
ServerApp.rate_limit_window=3.0 (secs)

 79%|█████████████████████████████████████████████████             | 90867/114783 [38:30<10:12, 39.04it/s]IOPub message rate exceeded.
The Jupyter server will temporarily stop sending output
to the client in order to avoid crashing it.
To change this limit, set the config variable
`--ServerApp.iopub_msg_rate_limit`.

Current values:
ServerApp.iopub_msg_rate_limit=1000.0 (msgs/sec)
ServerApp.rate_limit_window=3.0 (secs)

100%|█████████████████████████████████████████████████████████████| 114783/114783 [48:50<00:00, 39.17it/s]
  7%|████▏                                   

In [26]:
# Show the classification report returned by the most recent evaluate() call.
print(report)

              precision    recall  f1-score   support

 Not Spoiler       0.85      0.78      0.82     84598
     Spoiler       0.51      0.62      0.56     30185

    accuracy                           0.74    114783
   macro avg       0.68      0.70      0.69    114783
weighted avg       0.76      0.74      0.75    114783



# IGNORE

In [18]:
def evaluate(model, val_loader, criterion, device=None, class_names=('Not Spoiler', 'Spoiler')):
    """
    Evaluates the model on the validation set on a single GPU.
    This function should only be called by the rank 0 process.

    This definition shares its name with the three-argument
    ``evaluate(model, loader, criterion)`` defined in another cell, so it
    accepts that call form too and returns the same ``(loss, report)`` pair.

    Args:
        model: Module mapping a ``[B, T]`` id tensor to ``[B, 1]`` logits.
        val_loader: Sized iterable yielding ``(inputs, labels)`` batches.
        criterion: Loss taking ``(logits [B], labels [B])``.
        device: Device (or CUDA index) the batches are moved to. When
            omitted, the device of the model's first parameter is used
            (CPU for a parameter-free model).
        class_names: Display names for label 0 and label 1 in the report.

    Returns:
        ``(avg_loss, report)``: mean batch loss (0.0 for an empty loader)
        and the classification-report text, which is also printed.
    """
    if device is None:
        first_param = next(model.parameters(), None)
        device = first_param.device if first_param is not None else torch.device('cpu')

    model.eval()
    total_loss = 0.0
    all_labels, all_predictions = [], []

    with torch.no_grad(): # No need to calculate gradients during evaluation
        for inputs, labels in val_loader:
            inputs = inputs.to(device)
            labels = labels.to(device)

            # [B, 1] -> [B] so the logits line up with the label vector.
            outputs = model(inputs).squeeze(1)
            loss = criterion(outputs, labels)

            total_loss += loss.item()
            # Single-logit binary output: threshold the sigmoid at 0.5.
            # (argmax over a size-1 dimension would always return class 0.)
            predicted = torch.round(torch.sigmoid(outputs))

            # Move to CPU and convert to numpy/list for scikit-learn
            all_labels.extend(labels.cpu().numpy())
            all_predictions.extend(predicted.cpu().numpy())

    avg_loss = total_loss / max(len(val_loader), 1)

    print(f"\n--- Validation Results ---")
    print(f"Average Loss: {avg_loss:.4f}")

    print("\nClassification Report:")
    report = classification_report(
        all_labels,
        all_predictions,
        target_names=list(class_names),
        zero_division=0,
    )
    print(report)

    print(f"--------------------------\n")

    return avg_loss, report

In [25]:
def main(rank, world_size):
    """Per-process entry point for DistributedDataParallel training.

    Each process trains a replica of ``SpoilerRNN`` on its shard of
    ``train_dataset``; rank 0 additionally prints the epoch loss and runs
    ``evaluate`` on the held-out split after every epoch.

    Args:
        rank: Index of this process; also used as its CUDA device index.
        world_size: Total number of processes taking part in training.
    """
    # Imported locally: the notebook's import cell does not provide these names.
    import torch.distributed as dist
    from torch.nn.parallel import DistributedDataParallel as DDP
    from torch.utils.data.distributed import DistributedSampler

    print(f"Running DDP on rank {rank}.")
    # NOTE(review): setup() / cleanup() are not defined in any visible cell;
    # they have to be provided before this function can run.
    setup(rank, world_size)

    try:
        epochs = 5
        batch_size = 4

        train_sampler = DistributedSampler(train_dataset, num_replicas=world_size, rank=rank, shuffle=True)
        # collate_fn pads the variable-length reviews; the default collate
        # cannot stack tensors of different lengths.
        train_loader = DataLoader(
            train_dataset,
            batch_size=batch_size,
            sampler=train_sampler,
            collate_fn=collate_fn,
        )

        val_loader = None
        if rank == 0:
            # The notebook defines no `valid_dataset`; the only held-out
            # split it builds is `test_dataset`.
            val_loader = DataLoader(
                test_dataset,
                batch_size=batch_size,
                shuffle=False,
                collate_fn=collate_fn,
            )

        model = SpoilerRNN(
            VOCAB_SIZE,
            EMBEDDING_DIM,
            HIDDEN_DIM,
            OUTPUT_DIM,
            N_LAYERS,
            BIDIRECTIONAL,
            DROPOUT,
            PAD_IDX
            ).to(rank)

        ddp_model = DDP(model, device_ids=[rank])

        # Use a distinct local name: assigning to `pos_weight` here would make
        # it a local variable and the read on the right-hand side would raise
        # UnboundLocalError.
        rank_pos_weight = pos_weight.to(rank)
        criterion = torch.nn.BCEWithLogitsLoss(pos_weight=rank_pos_weight).to(rank)
        optimizer = torch.optim.SGD(ddp_model.parameters(), lr=0.01)

        for epoch in range(epochs):
            # Gives the sampler the epoch number so shuffling differs per epoch.
            train_sampler.set_epoch(epoch)
            # evaluate() switches the wrapped module to eval mode, so training
            # mode is restored at the start of every epoch.
            ddp_model.train()
            running_loss = 0.0

            for inputs, labels in tqdm(train_loader):
                inputs = inputs.to(rank)
                labels = labels.to(rank)

                optimizer.zero_grad()
                outputs = ddp_model(inputs).squeeze(1)
                loss = criterion(outputs, labels)
                loss.backward()
                optimizer.step()

                running_loss += loss.item()

            if rank == 0:
                avg_epoch_loss = running_loss / len(train_loader)
                print(f"Epoch {epoch+1}/{epochs}, Training Loss: {avg_epoch_loss:.4f}")
                # Evaluate the unwrapped module on rank 0 only.
                evaluate(ddp_model.module, val_loader, criterion, rank)

            # Keep the other ranks waiting while rank 0 evaluates.
            dist.barrier()
    finally:
        # Always tear the process group down, even if training raised.
        cleanup()

In [26]:
import os
# NOTE(review): presumably set to avoid an MKL / GNU OpenMP threading-layer
# conflict in worker processes — confirm. numpy was already imported at the
# top of the notebook, so this assignment happens after that first import.
os.environ["MKL_THREADING_LAYER"] = "GNU"
import numpy as np

In [28]:
# NOTE(review): `mp` is not imported in any visible cell (presumably
# torch.multiprocessing — confirm).
# NOTE(review): the output recorded for this cell shows the spawned workers
# failing with "Can't get attribute 'main' on <module '__main__'>"; `main`
# likely has to live in an importable .py module for spawned workers to find it.
if __name__ == "__main__":
    # One worker per visible GPU; `or 1` keeps a single worker when none is reported.
    world_size = torch.cuda.device_count() or 1
    # spawn passes the process index as the first argument (rank), followed by `args`.
    mp.spawn(
        main,
        args=(world_size,),
        nprocs=world_size,
        join=True
    )

Traceback (most recent call last):
  File "<string>", line 1, in <module>
  File "/home/kangcat/miniconda3/envs/documentation/lib/python3.11/multiprocessing/spawn.py", line 122, in spawn_main
    exitcode = _main(fd, parent_sentinel)
               ^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/home/kangcat/miniconda3/envs/documentation/lib/python3.11/multiprocessing/spawn.py", line 132, in _main
    self = reduction.pickle.load(from_parent)
           ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
AttributeError: Can't get attribute 'main' on <module '__main__' (built-in)>
Traceback (most recent call last):
  File "<string>", line 1, in <module>
  File "/home/kangcat/miniconda3/envs/documentation/lib/python3.11/multiprocessing/spawn.py", line 122, in spawn_main
    exitcode = _main(fd, parent_sentinel)
               ^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/home/kangcat/miniconda3/envs/documentation/lib/python3.11/multiprocessing/spawn.py", line 132, in _main
    self = reduction.pickle.load(from_parent)
       

ProcessExitedException: process 1 terminated with exit code 1

In [None]:
N_EPOCHS = 10

print("Training..")
for epoch in range(N_EPOCHS):
    train_loss = train(model, train_loader, optimizer, criterion)
    # The notebook builds no `valid_loader`; score on test_loader, the same
    # loader the executed training cell passes to evaluate().
    valid_loss, report = evaluate(model, test_loader, criterion)

    # `rank` only exists as a parameter of main(), so it is not printed here.
    print(f'Epoch: {epoch+1:02}')
    print(f'\tTrain Loss: {train_loss:.3f}')
    print(f'\t Val. Loss: {valid_loss:.3f}')

In [None]:
# Final scoring pass on the test loader; print the per-class report.
test_loss, final_report = evaluate(model, test_loader, criterion)
print(final_report)

In [None]:
# Display the loss returned by the most recent evaluate() call in the training loop.
valid_loss

In [None]:
# Display the value returned by the most recent train() call.
train_loss

In [None]:
# Display the classification report from the most recent evaluate() call in the training loop.
report

In [None]:
from tqdm import tqdm

def train(model, train_loader, optimizer, criterion, grad_clip=None):
    """Train ``model`` for one pass over ``train_loader``.

    The epoch loop belongs to the caller, so this function performs exactly
    one pass per call and returns a number the caller can format (the
    training cells print it with ``:.3f``).

    Args:
        model: Module mapping a ``[B, T]`` id tensor to ``[B, 1]`` logits.
        train_loader: Sized iterable yielding ``(texts, labels)`` batches.
        optimizer: Optimiser already bound to ``model``'s parameters.
        criterion: Loss taking ``(logits [B], labels [B])``.
        grad_clip: Optional max gradient norm; when given, gradients are
            clipped to it before each optimiser step.

    Returns:
        float: mean batch loss over the pass (0.0 for an empty loader).
    """
    print("Training...")
    model.train()
    epoch_loss = 0.0
    for texts, labels in tqdm(train_loader):
        # Batches are moved to the module-level `device`.
        texts, labels = texts.to(device), labels.to(device)

        optimizer.zero_grad()
        predictions = model(texts).squeeze(1)
        loss = criterion(predictions, labels)
        loss.backward()
        if grad_clip is not None:
            torch.nn.utils.clip_grad_norm_(model.parameters(), grad_clip)
        optimizer.step()

        epoch_loss += loss.item()

    # Guard the division so an empty loader yields 0.0 instead of raising.
    return epoch_loss / max(len(train_loader), 1)

def evaluate(model, loader, criterion):
    """Score ``model`` on ``loader`` without updating it.

    Logits are turned into hard 0/1 predictions by rounding their sigmoid.

    Returns:
        ``(mean_batch_loss, report)`` where ``report`` is the text produced
        by ``classification_report`` for the two classes.
    """
    model.eval()
    running_loss = 0
    predicted, expected = [], []

    with torch.no_grad():
        for batch_texts, batch_labels in tqdm(loader):
            batch_texts = batch_texts.to(device)
            batch_labels = batch_labels.to(device)

            logits = model(batch_texts).squeeze(1)
            running_loss += criterion(logits, batch_labels).item()

            hard_preds = torch.sigmoid(logits).round()
            predicted.extend(hard_preds.cpu().numpy())
            expected.extend(batch_labels.cpu().numpy())

    report = classification_report(
        expected,
        predicted,
        target_names=['Not Spoiler', 'Spoiler'],
        zero_division=0,
    )
    return running_loss / len(loader), report
