In [1]:
import pandas as pd
import numpy as np
import re
from collections import Counter
from sentence_transformers import SentenceTransformer, util

import torch
from torch.utils.data import Dataset, DataLoader
from torch.nn.utils.rnn import pad_sequence

from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report
import json

  from .autonotebook import tqdm as notebook_tqdm
  warn(


### Preparing dataset
1. user_id
2. review_text
3. plot_synopsis
4. review_text embeddings
5. cosine similarity
6. ngram vector
7. review text length
   

In [2]:
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
device

device(type='cuda')

In [3]:
directory_path = "/SFS/project/ry/dp_sgteam/catherine/ada/dataset"
df_reviews = pd.read_json(f"{directory_path}/cleaned_data.json")
df_reviews['label'] = df_reviews['is_spoiler'].astype(int)

df_movies = pd.read_json(f"{directory_path}/IMDB_movie_details.json", lines=True)

print(f"Dataset shape: {df_reviews.shape}")
print(f"Dataset shape: {df_movies.shape}")
df_reviews.head()

Dataset shape: (573913, 8)
Dataset shape: (1572, 7)


Unnamed: 0,review_date,movie_id,user_id,is_spoiler,review_text,rating,review_summary,label
0,10 February 2006,tt0111161,ur1898687,1,oscar year shawshank redemption write direct f...,10,A classic piece of unforgettable film-making.,1
1,6 September 2000,tt0111161,ur0842118,1,shawshank redemption without doubt one brillia...,10,Simply amazing. The best film of the 90's.,1
2,3 August 2001,tt0111161,ur1285640,1,believe film best story ever tell film tell ti...,8,The best story ever told on film,1
3,1 September 2002,tt0111161,ur1003471,1,yes spoiler film emotional impact find hard wr...,10,Busy dying or busy living?,1
4,20 May 2004,tt0111161,ur0226855,1,heart extraordinary movie brilliant indelible ...,8,"Great story, wondrously told and acted",1


In [4]:
df = pd.merge(df_reviews, df_movies, on='movie_id', how='left')

print("Data loaded and merged successfully.")
df['label'] = df['is_spoiler'].astype(int)
df['user_id'] = df['user_id'].str[2:].astype(int)

df = df[['user_id', 'review_text', 'plot_synopsis', 'label']]
df['review_text_length'] = df['review_text'].str.len()

df.head()

Data loaded and merged successfully.


Unnamed: 0,user_id,review_text,plot_synopsis,label,review_text_length
0,1898687,oscar year shawshank redemption write direct f...,"In 1947, Andy Dufresne (Tim Robbins), a banker...",1,2776
1,842118,shawshank redemption without doubt one brillia...,"In 1947, Andy Dufresne (Tim Robbins), a banker...",1,709
2,1285640,believe film best story ever tell film tell ti...,"In 1947, Andy Dufresne (Tim Robbins), a banker...",1,962
3,1003471,yes spoiler film emotional impact find hard wr...,"In 1947, Andy Dufresne (Tim Robbins), a banker...",1,2148
4,226855,heart extraordinary movie brilliant indelible ...,"In 1947, Andy Dufresne (Tim Robbins), a banker...",1,2735


In [5]:
train_df, test_df = train_test_split(
    df,
    test_size=0.2,
    stratify=df['label'],
    random_state=42
)
print(f"Data split into train ({len(train_df)}) and test ({len(test_df)}) sets.")

Data split into train (459130) and test (114783) sets.


In [6]:
for obj in (train_df, test_df):
    obj.reset_index(drop=True, inplace=True)

In [7]:
model = SentenceTransformer('all-MiniLM-L6-v2', device=device)

In [8]:
def embedd_to_vector(text):
    print("Encoding texts...")
    embeddings = model.encode(text.fillna('').tolist(), 
                              convert_to_tensor=True, show_progress_bar=True, device=device)
    print("Encoded.")
    return embeddings.cpu().numpy().tolist()

def concat_to_df(main_df, embedd_list, column_name):
    embedd_df = pd.DataFrame(embedd_list)
    num_dims = len(embedd_df.columns)
    new_names = [f'{column_name}_{i}' for i in range(num_dims)]
    embedd_df.columns = new_names
    df = pd.concat([main_df, embedd_df], axis = 1)
    print(f"Finish concatenate: with dataframe {len(main_df)} and list {len(embedd_df)}")
    return df

In [9]:
train_review_embeddings = embedd_to_vector(train_df['review_text'])
train_synopsis_embeddings = embedd_to_vector(train_df['plot_synopsis'])

train_df = concat_to_df(train_df, train_review_embeddings, "dim")

Encoding texts...


Batches: 100%|█████████████████████████████████████████████████████████████████| 14348/14348 [02:37<00:00, 91.08it/s]


Encoded.
Encoding texts...


Batches: 100%|█████████████████████████████████████████████████████████████████| 14348/14348 [08:49<00:00, 27.08it/s]


Encoded.
Finish concatenate: with dataframe 459130 and list 459130


In [10]:
test_review_embeddings = embedd_to_vector(test_df['review_text'])
test_synopsis_embeddings = embedd_to_vector(test_df['plot_synopsis'])

test_df = concat_to_df(test_df, test_review_embeddings, "dim")

Encoding texts...


Batches: 100%|██████████████████████████████████████████████████████████████████| 3587/3587 [00:28<00:00, 127.17it/s]


Encoded.
Encoding texts...


Batches: 100%|███████████████████████████████████████████████████████████████████| 3587/3587 [01:38<00:00, 36.48it/s]


Encoded.
Finish concatenate: with dataframe 114783 and list 114783


In [11]:
print("Calculating similarities for training set...")
sim_scores_train = util.pairwise_cos_sim(train_review_embeddings, train_synopsis_embeddings)
train_df['sim_score_synopsis_review'] = sim_scores_train.flatten()

print("Calculating similarities for testing set...")
sim_scores_test = util.pairwise_cos_sim(test_review_embeddings, test_synopsis_embeddings)
test_df['sim_score_synopsis_review'] = sim_scores_test.flatten()

Calculating similarities for training set...
Calculating similarities for testing set...


In [12]:
# N-gram
with open("/SFS/project/ry/dp_sgteam/catherine/ada/dataset/ngram_vocab_list.json", "r") as f:
    ngram_vocab_list = json.load(f)

ngram_to_idx = {ngram: idx for idx, ngram in enumerate(ngram_vocab_list)}
vocab_size = len(ngram_vocab_list)

def build_token_set(text):
    tokens = re.findall(r"[A-Za-z]+", text.lower())
    return tokens

def ngram_vector_for_text(text, ngram_to_idx, vocab_size):
    tokens = build_token_set(text)
    token_set = set(tokens)
    bigram_set = set(" ".join(pair) for pair in zip(tokens, tokens[1:]))

    vec = [0] * vocab_size
    for ng, idx in ngram_to_idx.items():
        if " " in ng:
            if ng in bigram_set:
                vec[idx] = 1
        else: 
            if ng in token_set:
                vec[idx] = 1
    return vec


In [13]:
train_ngram_df = [ngram_vector_for_text(t, ngram_to_idx, vocab_size) for t in train_df['review_text']]
train_df_ngram = concat_to_df(train_df, train_ngram_df, "ngram")

test_ngram_df = [ngram_vector_for_text(t, ngram_to_idx, vocab_size) for t in test_df['review_text']]
test_df_ngram = concat_to_df(test_df, test_ngram_df, "ngram")

Finish concatenate: with dataframe 459130 and list 459130
Finish concatenate: with dataframe 114783 and list 114783


### Storing the dataset

In [14]:
train_df.to_json(f"{directory_path}/train_data.json",  orient="records", indent=4)
test_df.to_json(f"{directory_path}/test_data.json",  orient="records", indent=4)

### Model

In [15]:
train_xg = train_df_ngram.drop(columns= ['review_text', 'plot_synopsis'])
test_xg = test_df_ngram.drop(columns= ['review_text', 'plot_synopsis'])
train_xg.head()

Unnamed: 0,user_id,label,review_text_length,dim_0,dim_1,dim_2,dim_3,dim_4,dim_5,dim_6,...,ngram_442,ngram_443,ngram_444,ngram_445,ngram_446,ngram_447,ngram_448,ngram_449,ngram_450,ngram_451
0,5291991,1,1821,-0.005757,-0.051729,-0.02155,-0.005745,-0.074771,0.078937,-0.069202,...,0,0,0,0,0,0,0,0,0,0
1,48053412,1,829,-0.002652,-0.056541,0.058831,-0.003095,0.043479,0.030117,0.054024,...,0,0,0,0,0,0,0,0,0,0
2,28438054,1,829,-0.017446,-0.091489,0.014513,-0.028227,-0.026655,0.060746,0.043278,...,0,0,0,0,0,0,0,0,0,0
3,35553121,0,182,-0.081114,0.030047,0.042871,0.093194,-0.001539,-0.004182,0.031489,...,0,0,0,0,0,0,0,0,0,0
4,14069613,0,1333,-0.051064,-0.072159,-0.044826,-0.0128,-0.006169,0.065184,0.030398,...,0,0,0,0,0,0,0,0,0,0


In [16]:
X_train = train_xg.drop(columns=["label"])
y_train = train_xg["label"]

X_test = test_xg.drop(columns=["label"])
y_test = test_xg["label"]

In [17]:
# no need to run this code, can lose context
svd = TruncatedSVD(n_components = 100, random_state = 42)

X_train_svd = svd.fit_transform(X_train)
X_test_svd = svd.transform(X_test)

In [18]:
n = y_train.value_counts()
pos_weight = round(n[0]/n[1],2)

print(f"Positive Weight: {pos_weight}")

Positive Weight: 2.8


In [20]:
import pandas as pd
import numpy as np
from xgboost import XGBClassifier
from sklearn.model_selection import GridSearchCV, StratifiedKFold
from sklearn.metrics import roc_auc_score, accuracy_score, f1_score, recall_score, classification_report

clf = XGBClassifier(
    objective="binary:logistic",
    eval_metric="auc",
    tree_method="hist",          # Use "gpu_hist" if you have a GPU
    random_state=42,
    n_jobs=-1,
    scale_pos_weight=pos_weight
)

param_grid = {
    'n_estimators': [100, 300, 500],
    'learning_rate': [0.01, 0.05, 0.1],
    'max_depth': [4, 6, 8],
    'subsample': [0.7, 0.8],
    'colsample_bytree': [0.7, 0.8]
}

cv = StratifiedKFold(n_splits=3, shuffle=True, random_state=42)
grid_search = GridSearchCV(
    estimator=clf,
    param_grid=param_grid,
    scoring='recall',
    cv=cv,
    verbose=2,
    n_jobs=-1
)

print("Starting GridSearchCV...")
grid_search.fit(X_train, y_train)

# 5. Analyze the Results
print("\n--- GridSearchCV Results ---")
print("Best Parameters Found: ", grid_search.best_params_)
print("Best Recall Score (on cross-validation): ", grid_search.best_score_)

best_clf = grid_search.best_estimator_

print("\n--- Evaluation on Test Set using Best Estimator ---")
proba = best_clf.predict_proba(X_test)[:, 1]
pred = (proba >= 0.5).astype(int)

print("AUC:", roc_auc_score(y_test, proba))
print("Accuracy:", accuracy_score(y_test, pred))
print("F1 Score:", f1_score(y_test, pred))
print("Recall:", recall_score(y_test, pred)) # Most important metric for this search

print("\nClassification Report on Test Set:")
print(classification_report(y_test, pred))

Starting GridSearchCV...
Fitting 3 folds for each of 108 candidates, totalling 324 fits




[CV] END colsample_bytree=0.7, learning_rate=0.01, max_depth=4, n_estimators=100, subsample=0.8; total time= 3.2min
[CV] END colsample_bytree=0.7, learning_rate=0.01, max_depth=4, n_estimators=500, subsample=0.7; total time=10.4min
[CV] END colsample_bytree=0.7, learning_rate=0.01, max_depth=6, n_estimators=100, subsample=0.8; total time= 4.1min
[CV] END colsample_bytree=0.7, learning_rate=0.01, max_depth=6, n_estimators=300, subsample=0.8; total time= 9.2min
[CV] END colsample_bytree=0.7, learning_rate=0.01, max_depth=8, n_estimators=100, subsample=0.7; total time= 5.2min
[CV] END colsample_bytree=0.7, learning_rate=0.01, max_depth=8, n_estimators=100, subsample=0.7; total time= 5.3min
[CV] END colsample_bytree=0.7, learning_rate=0.01, max_depth=8, n_estimators=300, subsample=0.7; total time=12.6min
[CV] END colsample_bytree=0.7, learning_rate=0.01, max_depth=8, n_estimators=500, subsample=0.7; total time=19.1min
[CV] END colsample_bytree=0.7, learning_rate=0.05, max_depth=4, n_estima

In [21]:
importances = pd.Series(best_clf.feature_importances_, index=X_train.columns).sort_values(ascending=False)
print("\nTop 20 Feature Importances (from best model):")
print(importances.head(20))


Top 20 Feature Importances (from best model):
ngram_0                      0.077707
review_text_length           0.074029
dim_319                      0.066200
user_id                      0.058513
dim_223                      0.056795
dim_127                      0.029801
dim_187                      0.026517
dim_49                       0.024938
dim_298                      0.022573
dim_53                       0.020818
dim_92                       0.019412
dim_111                      0.017276
sim_score_synopsis_review    0.016807
dim_244                      0.014500
dim_173                      0.014371
dim_193                      0.014094
dim_46                       0.011644
dim_132                      0.011226
dim_139                      0.011040
dim_217                      0.010838
dtype: float32


In [22]:
import numpy as np
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.metrics import roc_auc_score, accuracy_score, f1_score
from xgboost import XGBClassifier

clf = XGBClassifier(
    n_estimators=500,
    learning_rate=0.05,
    max_depth=6,
    subsample=0.8,
    colsample_bytree=0.8,
    reg_lambda=1.0,
    reg_alpha=0.0,
    min_child_weight=1,
    objective="binary:logistic",
    eval_metric="auc",
    tree_method="hist",          # use "gpu_hist" if you have a GPU
    random_state=42,
    n_jobs=-1,
    scale_pos_weight = pos_weight
)

print(f"Type of y_train: {y_train.dtype}")
print(f"Type of y_test: {y_test.dtype}")

clf.fit(
    X_train, y_train,
    eval_set=[(X_test, y_test)],
    verbose=False
)

# Evaluation
proba = clf.predict_proba(X_test)[:, 1]
pred = (proba >= 0.5).astype(int)
print("AUC:", roc_auc_score(y_test, proba))
print("Accuracy:", accuracy_score(y_test, pred))
print("F1:", f1_score(y_test, pred))


Type of y_train: int64
Type of y_test: int64
[CV] END colsample_bytree=0.7, learning_rate=0.01, max_depth=4, n_estimators=100, subsample=0.8; total time= 3.0min
[CV] END colsample_bytree=0.7, learning_rate=0.01, max_depth=4, n_estimators=300, subsample=0.8; total time= 6.4min
[CV] END colsample_bytree=0.7, learning_rate=0.01, max_depth=4, n_estimators=500, subsample=0.8; total time=10.1min
[CV] END colsample_bytree=0.7, learning_rate=0.01, max_depth=6, n_estimators=500, subsample=0.7; total time=13.8min
[CV] END colsample_bytree=0.7, learning_rate=0.01, max_depth=8, n_estimators=100, subsample=0.8; total time= 5.4min
[CV] END colsample_bytree=0.7, learning_rate=0.01, max_depth=8, n_estimators=300, subsample=0.7; total time=12.6min
[CV] END colsample_bytree=0.7, learning_rate=0.01, max_depth=8, n_estimators=500, subsample=0.8; total time=18.6min
[CV] END colsample_bytree=0.7, learning_rate=0.05, max_depth=4, n_estimators=500, subsample=0.8; total time= 8.4min
[CV] END colsample_bytree=0

In [23]:
print(classification_report(y_test, pred, target_names = ['Spoiler', 'Not Spoiler']))

              precision    recall  f1-score   support

     Spoiler       0.87      0.70      0.78     84598
 Not Spoiler       0.46      0.70      0.55     30185

    accuracy                           0.70    114783
   macro avg       0.66      0.70      0.67    114783
weighted avg       0.76      0.70      0.72    114783

