In [1]:
import pandas as pd
import numpy as np
import re
from collections import Counter
from sentence_transformers import SentenceTransformer, util

import torch
from torch.utils.data import Dataset, DataLoader
from torch.nn.utils.rnn import pad_sequence
import os
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report
import json

  from .autonotebook import tqdm as notebook_tqdm
  warn(


In [5]:
from dotenv import load_dotenv
load_dotenv()

True

In [6]:
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
device

device(type='cuda')

In [14]:
directory_path = os.environ["DIR_PATH"]

df = pd.read_json(f"{directory_path}/dataset/cleaned_data.json")
df['label'] = df['is_spoiler'].astype(int)

print(f"Dataset shape: {df.shape}")
df.head()

Dataset shape: (573913, 7)


Unnamed: 0,user_id,movie_id,review_text,is_spoiler,plot_synopsis,plot_summary,label
0,ur1898687,tt0111161,oscar year shawshank redemption write direct f...,1,andy dufresne tim robbins banker maine convict...,chronicle experience formerly successful banke...,1
1,ur0842118,tt0111161,shawshank redemption without doubt one brillia...,1,andy dufresne tim robbins banker maine convict...,chronicle experience formerly successful banke...,1
2,ur1285640,tt0111161,believe film best story ever tell film tell ti...,1,andy dufresne tim robbins banker maine convict...,chronicle experience formerly successful banke...,1
3,ur1003471,tt0111161,yes spoiler film emotional impact find hard wr...,1,andy dufresne tim robbins banker maine convict...,chronicle experience formerly successful banke...,1
4,ur0226855,tt0111161,heart extraordinary movie brilliant indelible ...,1,andy dufresne tim robbins banker maine convict...,chronicle experience formerly successful banke...,1


In [35]:
df = df[['review_text', 'label']]

df.head()

Unnamed: 0,review_text,label
0,oscar year shawshank redemption write direct f...,1
1,shawshank redemption without doubt one brillia...,1
2,believe film best story ever tell film tell ti...,1
3,yes spoiler film emotional impact find hard wr...,1
4,heart extraordinary movie brilliant indelible ...,1


In [36]:
train_df, test_df = train_test_split(
    df,
    test_size=0.2,
    stratify=df['label'],
    random_state=42
)
print(f"Data split into train ({len(train_df)}) and test ({len(test_df)}) sets.")

Data split into train (459130) and test (114783) sets.


In [37]:
for obj in (train_df, test_df):
    obj.reset_index(drop=True, inplace=True)

In [38]:
model = SentenceTransformer('all-MiniLM-L6-v2', device=device)

In [39]:
def embedd_to_vector(text):
    print("Encoding texts...")
    embeddings = model.encode(text.fillna('').tolist(), 
                              convert_to_tensor=True, show_progress_bar=True, device=device)
    print("Encoded.")
    return embeddings.cpu().numpy().tolist()

def concat_to_df(main_df, embedd_list, column_name):
    embedd_df = pd.DataFrame(embedd_list)
    num_dims = len(embedd_df.columns)
    new_names = [f'{column_name}_{i}' for i in range(num_dims)]
    embedd_df.columns = new_names
    df = pd.concat([main_df, embedd_df], axis = 1)
    print(f"Finish concatenate: with dataframe {len(main_df)} and list {len(embedd_df)}")
    return df

In [41]:
train_review_embeddings = embedd_to_vector(train_df['review_text'])

train_df = concat_to_df(train_df, train_review_embeddings, "dim")

Encoding texts...


Batches: 100%|██████████████████████████████████████████████████████████████████████████████| 14348/14348 [01:28<00:00, 161.73it/s]


Encoded.
Finish concatenate: with dataframe 459130 and list 459130


In [42]:
test_review_embeddings = embedd_to_vector(test_df['review_text'])

test_df = concat_to_df(test_df, test_review_embeddings, "dim")

Encoding texts...


Batches: 100%|████████████████████████████████████████████████████████████████████████████████| 3587/3587 [00:21<00:00, 169.30it/s]


Encoded.
Finish concatenate: with dataframe 114783 and list 114783


### Model

In [43]:
train_xg = train_df.drop(columns= ['review_text'])
test_xg = test_df.drop(columns= ['review_text'])
train_xg.head()

Unnamed: 0,label,dim_0,dim_1,dim_2,dim_3,dim_4,dim_5,dim_6,dim_7,dim_8,...,dim_374,dim_375,dim_376,dim_377,dim_378,dim_379,dim_380,dim_381,dim_382,dim_383
0,1,-0.005757,-0.051729,-0.02155,-0.005745,-0.074771,0.078937,-0.069202,-0.060529,-0.002981,...,-0.034809,0.01688,-0.05537,0.037996,0.061832,0.005916,0.155962,-0.01376,0.004469,0.077324
1,1,-0.002652,-0.056541,0.058831,-0.003095,0.043479,0.030117,0.054024,-0.034403,-0.04325,...,0.059259,-0.005599,0.075528,0.079341,-0.026007,-0.008913,0.128236,-0.015391,0.019221,-0.026087
2,1,-0.017446,-0.091489,0.014513,-0.028227,-0.026655,0.060746,0.043278,-0.024781,0.048334,...,0.057191,0.033866,-0.01203,0.145682,0.003819,-0.071792,0.034811,-0.00887,-0.03989,-0.029504
3,0,-0.081114,0.030047,0.042871,0.093194,-0.001539,-0.004182,0.031489,-0.020019,0.104828,...,0.07166,-0.003618,0.006736,0.04733,-0.101248,0.018851,0.084173,0.028484,0.019516,-0.022237
4,0,-0.051064,-0.072159,-0.044826,-0.0128,-0.006169,0.065184,0.030398,-0.024782,-0.047542,...,0.066275,0.027673,-0.025575,0.043466,0.070013,0.032838,0.061937,0.0325,0.004026,-0.054742


In [44]:
X_train = train_xg.drop(columns=["label"])
y_train = train_xg["label"]

X_test = test_xg.drop(columns=["label"])
y_test = test_xg["label"]

In [45]:
n = y_train.value_counts()
pos_weight = round(n[0]/n[1],2)

print(f"Positive Weight: {pos_weight}")

Positive Weight: 2.8


In [46]:

clf = XGBClassifier(
    n_estimators=300,
    learning_rate=0.01,
    max_depth=4,
    subsample=0.7,
    colsample_bytree=0.5,
    reg_lambda=1.0,
    reg_alpha=0.0,
    min_child_weight=1,
    objective="binary:logistic",
    eval_metric="auc",
    tree_method="hist",
    random_state=42,
    n_jobs=-1,
    scale_pos_weight = 2.8,
    device = "cuda"
)

clf.fit(
    X_train, y_train,
    eval_set=[(X_test, y_test)],
    verbose=False
)

# Evaluation
proba = clf.predict_proba(X_test)[:, 1]
pred = (proba >= 0.5).astype(int)
print("Classification Report on test set:")
print(classification_report(y_test, pred))

Classification Report on test set:
              precision    recall  f1-score   support

           0       0.84      0.63      0.72     84598
           1       0.39      0.66      0.49     30185

    accuracy                           0.64    114783
   macro avg       0.61      0.65      0.60    114783
weighted avg       0.72      0.64      0.66    114783



In [51]:
from sklearn.decomposition import PCA
pca = PCA(n_components=125, random_state=42)
X_train_pca = pca.fit_transform(X_train)
X_test_pca = pca.fit_transform(X_test)

In [52]:
clf = XGBClassifier(
    n_estimators=300,
    learning_rate=0.01,
    max_depth=4,
    subsample=0.7,
    colsample_bytree=0.5,
    reg_lambda=1.0,
    reg_alpha=0.0,
    min_child_weight=1,
    objective="binary:logistic",
    eval_metric="auc",
    tree_method="hist",
    random_state=42,
    n_jobs=-1,
    scale_pos_weight = 2.8,
    device = "cuda"
)

clf.fit(
    X_train_pca, y_train,
    eval_set=[(X_test_pca, y_test)],
    verbose=False
)

# Evaluation
proba = clf.predict_proba(X_test_pca)[:, 1]
pred = (proba >= 0.5).astype(int)
print("Classification Report on test set:")
print(classification_report(y_test, pred))

Classification Report on test set:
              precision    recall  f1-score   support

           0       0.82      0.65      0.73     84598
           1       0.38      0.61      0.47     30185

    accuracy                           0.64    114783
   macro avg       0.60      0.63      0.60    114783
weighted avg       0.71      0.64      0.66    114783



In [48]:
from sklearn.model_selection import GridSearchCV, StratifiedKFold
from sklearn.metrics import roc_auc_score, accuracy_score, f1_score, recall_score, classification_report
from xgboost import XGBClassifier

clf = XGBClassifier(
    objective="binary:logistic",
    eval_metric="auc",
    tree_method="hist",          # Use "gpu_hist" if you have a GPU
    random_state=42,
    n_jobs=-1,
    scale_pos_weight=pos_weight
)

param_grid = {
    'n_estimators': [100, 300, 500],
    'learning_rate': [0.01, 0.05, 0.1],
    'max_depth': [4, 6, 8],
    'subsample': [0.7, 0.8],
    'colsample_bytree': [0.7, 0.8]
}

cv = StratifiedKFold(n_splits=3, shuffle=True, random_state=42)
grid_search = GridSearchCV(
    estimator=clf,
    param_grid=param_grid,
    scoring='recall',
    cv=cv,
    verbose=2,
    n_jobs=-1
)

print("Starting GridSearchCV...")
grid_search.fit(X_train, y_train)

# 5. Analyze the Results
print("\n--- GridSearchCV Results ---")
print("Best Parameters Found: ", grid_search.best_params_)
print("Best Recall Score (on cross-validation): ", grid_search.best_score_)

best_clf = grid_search.best_estimator_

print("\n--- Evaluation on Test Set using Best Estimator ---")
proba = best_clf.predict_proba(X_test)[:, 1]
pred = (proba >= 0.5).astype(int)

print("AUC:", roc_auc_score(y_test, proba))
print("Accuracy:", accuracy_score(y_test, pred))
print("F1 Score:", f1_score(y_test, pred))
print("Recall:", recall_score(y_test, pred)) # Most important metric for this search

print("\nClassification Report on Test Set:")
print(classification_report(y_test, pred))

Starting GridSearchCV...
Fitting 3 folds for each of 108 candidates, totalling 324 fits




KeyboardInterrupt: 

In [29]:
print(classification_report(y_test, pred, target_names = ['Not Spoiler', 'Spoiler']))

              precision    recall  f1-score   support

     Spoiler       0.86      0.60      0.71     84598
 Not Spoiler       0.40      0.73      0.51     30185

    accuracy                           0.64    114783
   macro avg       0.63      0.67      0.61    114783
weighted avg       0.74      0.64      0.66    114783



In [30]:
importances = pd.Series(clf.feature_importances_, index=X_train.columns).sort_values(ascending=False)
print("\nTop 20 Feature Importances (from best model):")
print(importances.head(20))

NotFittedError: need to call fit or load_model beforehand

[CV] END colsample_bytree=0.7, learning_rate=0.01, max_depth=4, n_estimators=100, subsample=0.8; total time= 1.9min
[CV] END colsample_bytree=0.7, learning_rate=0.01, max_depth=4, n_estimators=500, subsample=0.7; total time= 5.5min
[CV] END colsample_bytree=0.7, learning_rate=0.01, max_depth=6, n_estimators=100, subsample=0.7; total time= 2.3min
[CV] END colsample_bytree=0.7, learning_rate=0.01, max_depth=6, n_estimators=300, subsample=0.8; total time= 5.3min
[CV] END colsample_bytree=0.7, learning_rate=0.01, max_depth=6, n_estimators=500, subsample=0.8; total time= 7.8min
[CV] END colsample_bytree=0.7, learning_rate=0.01, max_depth=8, n_estimators=300, subsample=0.8; total time= 7.7min
[CV] END colsample_bytree=0.7, learning_rate=0.05, max_depth=4, n_estimators=100, subsample=0.7; total time= 1.7min
[CV] END colsample_bytree=0.7, learning_rate=0.05, max_depth=4, n_estimators=100, subsample=0.7; total time= 1.7min
[CV] END colsample_bytree=0.7, learning_rate=0.05, max_depth=4, n_estima