In [1]:
import pandas as pd
import numpy as np
import re
from collections import Counter
from sentence_transformers import SentenceTransformer, util

import torch
import torch.nn as nn
from torch.utils.data import Dataset, DataLoader
from torch.nn.utils.rnn import pad_sequence

from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report
import json

  from .autonotebook import tqdm as notebook_tqdm
  warn(


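### Preparing dataset
The modelling dataset is built from the following fields (plus the `label` target):
1. `user_id`
2. `review_text`
3. `plot_synopsis`
4. `review_text` embeddings
5. cosine similarity between the review and synopsis embeddings
6. n-gram vector
7. `review_text` length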
   

In [2]:
# Run on the GPU whenever PyTorch can see one; otherwise stay on the CPU.
device = torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu')
device

device(type='cuda')

In [17]:
# Folder that holds every input file read by this notebook.
directory_path = "/SFS/project/ry/dp_sgteam/catherine/ada/dataset"

# Reviews are read as a regular JSON document; `label` is the integer form of
# the `is_spoiler` column.
df_reviews = pd.read_json(f"{directory_path}/cleaned_data.json").assign(
    label=lambda frame: frame['is_spoiler'].astype(int)
)

# Movie details are stored one JSON object per line, hence lines=True.
df_movies = pd.read_json(f"{directory_path}/IMDB_movie_details.json", lines=True)

# First shape printed is the reviews frame, second is the movies frame.
print(f"Dataset shape: {df_reviews.shape}")
print(f"Dataset shape: {df_movies.shape}")
df_reviews.head(5)

Dataset shape: (573913, 8)
Dataset shape: (1572, 7)


Unnamed: 0,review_date,movie_id,user_id,is_spoiler,review_text,rating,review_summary,label
0,10 February 2006,tt0111161,ur1898687,1,oscar year shawshank redemption write direct f...,10,A classic piece of unforgettable film-making.,1
1,6 September 2000,tt0111161,ur0842118,1,shawshank redemption without doubt one brillia...,10,Simply amazing. The best film of the 90's.,1
2,3 August 2001,tt0111161,ur1285640,1,believe film best story ever tell film tell ti...,8,The best story ever told on film,1
3,1 September 2002,tt0111161,ur1003471,1,yes spoiler film emotional impact find hard wr...,10,Busy dying or busy living?,1
4,20 May 2004,tt0111161,ur0226855,1,heart extraordinary movie brilliant indelible ...,8,"Great story, wondrously told and acted",1


In [4]:
# Left-join the movie details onto the reviews: every review keeps its row,
# and reviews without a matching movie get missing values in the movie columns.
df = pd.merge(df_reviews, df_movies, on='movie_id', how='left')

# A left merge silently multiplies review rows when `movie_id` is repeated in
# df_movies. Fail loudly instead of training on duplicated reviews.
if len(df) != len(df_reviews):
    raise ValueError(
        f"Merge changed the number of reviews ({len(df_reviews)} -> {len(df)}); "
        "movie_id is not unique in df_movies."
    )

print("Data loaded and merged successfully.")
df['label'] = df['is_spoiler'].astype(int)
# Drop the first two characters of user_id and parse the remainder as an integer
# (the recorded sample shows ids such as "ur1898687").
df['user_id'] = df['user_id'].str[2:].astype(int)

# Keep only the columns used downstream, then add the review length in characters.
df = df[['user_id', 'review_text', 'plot_synopsis', 'label']]
df['review_text_length'] = df['review_text'].str.len()

df.head()

Data loaded and merged successfully.


Unnamed: 0,user_id,review_text,plot_synopsis,label,review_text_length
0,1898687,oscar year shawshank redemption write direct f...,"In 1947, Andy Dufresne (Tim Robbins), a banker...",1,2776
1,842118,shawshank redemption without doubt one brillia...,"In 1947, Andy Dufresne (Tim Robbins), a banker...",1,709
2,1285640,believe film best story ever tell film tell ti...,"In 1947, Andy Dufresne (Tim Robbins), a banker...",1,962
3,1003471,yes spoiler film emotional impact find hard wr...,"In 1947, Andy Dufresne (Tim Robbins), a banker...",1,2148
4,226855,heart extraordinary movie brilliant indelible ...,"In 1947, Andy Dufresne (Tim Robbins), a banker...",1,2735


In [5]:
# Hold out 20% of the rows for testing, keeping the label proportions equal in
# both parts; the fixed seed makes the split repeatable.
train_df, test_df = train_test_split(df, stratify=df['label'], test_size=0.2, random_state=42)
print(f"Data split into train ({len(train_df)}) and test ({len(test_df)}) sets.")

Data split into train (459130) and test (114783) sets.


In [7]:
# Give both splits a fresh 0..n-1 index, modifying the frames in place.
# The later column-wise concatenation of feature frames lines rows up by index.
train_df.reset_index(drop=True, inplace=True)
test_df.reset_index(drop=True, inplace=True)

In [8]:
# Sentence encoder used for both the review text and the plot synopsis,
# placed on the device selected earlier.
model = SentenceTransformer(
    'all-MiniLM-L6-v2',
    device=device,
)

HTTP Error 500 thrown while requesting HEAD https://huggingface.co/sentence-transformers/all-MiniLM-L6-v2/resolve/main/./modules.json
Retrying in 1s [Retry 1/5].


In [9]:
def embedd_to_vector(text):
    """Encode a column of texts with the notebook-level sentence encoder.

    Args:
        text: pandas Series of strings; missing entries are encoded as ''.

    Returns:
        The embeddings as nested Python lists (moved to CPU first), one entry
        per element of ``text``.

    Relies on the globals ``model`` and ``device`` defined in earlier cells.
    """
    print("Encoding texts...")
    sentences = text.fillna('').tolist()
    encoded = model.encode(
        sentences,
        convert_to_tensor=True,
        show_progress_bar=True,
        device=device,
    )
    print("Encoded.")
    as_array = encoded.cpu().numpy()
    return as_array.tolist()

def concat_to_df(main_df, embedd_list, column_name):
    """Append per-row vectors to ``main_df`` as ``<column_name>_<i>`` columns.

    Rows are matched by position: row ``k`` of ``embedd_list`` is attached to
    row ``k`` of ``main_df``, whatever index ``main_df`` carries.

    Args:
        main_df: DataFrame to extend.
        embedd_list: Anything ``pd.DataFrame`` accepts as row-major data (for
            example a list of equal-length vectors, or a DataFrame), with one
            row per row of ``main_df``.
        column_name: Prefix of the generated column names.

    Returns:
        A new DataFrame holding ``main_df``'s columns followed by the generated
        ones, indexed like ``main_df``.

    Raises:
        ValueError: If ``embedd_list`` and ``main_df`` have different row counts.
    """
    embedd_df = pd.DataFrame(embedd_list)
    if len(embedd_df) != len(main_df):
        raise ValueError(
            f"Cannot concatenate: dataframe has {len(main_df)} rows "
            f"but list has {len(embedd_df)}"
        )

    embedd_df.columns = [f'{column_name}_{i}' for i in range(len(embedd_df.columns))]
    # pd.concat(axis=1) aligns on index labels. Without this, a main_df whose
    # index is not 0..n-1 (e.g. fresh out of train_test_split) would produce
    # misaligned rows padded with NaN instead of a side-by-side join.
    embedd_df.index = main_df.index

    df = pd.concat([main_df, embedd_df], axis=1)
    print(f"Finish concatenate: with dataframe {len(main_df)} and list {len(embedd_df)}")
    return df

In [10]:
# Encode the review text first, then the plot synopsis, for every training row.
train_review_embeddings, train_synopsis_embeddings = (
    embedd_to_vector(train_df[column]) for column in ('review_text', 'plot_synopsis')
)

# Only the review embeddings become feature columns (dim_0, dim_1, ...).
train_df = concat_to_df(
    main_df=train_df,
    embedd_list=train_review_embeddings,
    column_name="dim",
)

Encoding texts...


Batches: 100%|██████████████████████████████████████████████████████| 14348/14348 [02:40<00:00, 89.49it/s]


Encoded.
Encoding texts...


Batches: 100%|██████████████████████████████████████████████████████| 14348/14348 [09:49<00:00, 24.33it/s]


Encoded.
Finish concatenate: with dataframe 459130 and list 459130


In [12]:
# Encode the review text first, then the plot synopsis, for every test row.
test_review_embeddings, test_synopsis_embeddings = (
    embedd_to_vector(test_df[column]) for column in ('review_text', 'plot_synopsis')
)

# Only the review embeddings become feature columns (dim_0, dim_1, ...).
test_df = concat_to_df(
    main_df=test_df,
    embedd_list=test_review_embeddings,
    column_name="dim",
)

Encoding texts...


Batches: 100%|████████████████████████████████████████████████████████| 3587/3587 [00:39<00:00, 89.76it/s]


Encoded.
Encoding texts...


Batches: 100%|████████████████████████████████████████████████████████| 3587/3587 [02:44<00:00, 21.85it/s]


Encoded.
Finish concatenate: with dataframe 114783 and list 114783


In [13]:
# One cosine similarity per row: review embedding k against synopsis embedding k.
print("Calculating similarities for training set...")
sim_scores_train = util.pairwise_cos_sim(
    train_review_embeddings,
    train_synopsis_embeddings,
)
train_df['sim_score_synopsis_review'] = torch.flatten(sim_scores_train)

print("Calculating similarities for testing set...")
sim_scores_test = util.pairwise_cos_sim(
    test_review_embeddings,
    test_synopsis_embeddings,
)
test_df['sim_score_synopsis_review'] = torch.flatten(sim_scores_test)

Calculating similarities for training set...
Calculating similarities for testing set...


In [14]:
# N-gram
# Read the n-gram vocabulary from disk; an entry's position in the file is its
# column position in the indicator vectors built below.
with open("/SFS/project/ry/dp_sgteam/catherine/ada/dataset/ngram_vocab_list.json", "r") as f:
    ngram_vocab_list = json.loads(f.read())

vocab_size = len(ngram_vocab_list)
ngram_to_idx = dict(zip(ngram_vocab_list, range(vocab_size)))

def build_token_set(text):
    """Split ``text`` into lowercase alphabetic tokens.

    Despite the name, the result is a list: tokens keep their order of
    appearance and duplicates are preserved. Anything that is not an ASCII
    letter acts as a separator.
    """
    lowered = text.lower()
    return [match.group(0) for match in re.finditer(r"[A-Za-z]+", lowered)]

def ngram_vector_for_text(text, ngram_to_idx, vocab_size):
    """Build a binary presence vector of vocabulary n-grams for ``text``.

    Args:
        text: String to analyse; tokenised with ``build_token_set``.
        ngram_to_idx: Mapping from n-gram to its position in the vector.
            Entries containing a space are matched against the text's
            adjacent-token pairs, all others against its single tokens.
        vocab_size: Length of the returned vector.

    Returns:
        A list of ``vocab_size`` integers: 1 where the n-gram occurs in
        ``text``, otherwise 0.
    """
    words = build_token_set(text)
    unigrams = set(words)
    bigrams = {f"{left} {right}" for left, right in zip(words, words[1:])}

    vec = [0] * vocab_size
    for ngram, position in ngram_to_idx.items():
        # Pick the pool that can contain this entry, then test membership once.
        candidates = bigrams if " " in ngram else unigrams
        if ngram in candidates:
            vec[position] = 1
    return vec


In [15]:
# Turn every review into its binary n-gram indicator vector and append the
# vectors to the split as ngram_0, ngram_1, ... columns.
train_ngram_df = pd.DataFrame([ngram_vector_for_text(t, ngram_to_idx, vocab_size) for t in train_df['review_text']])
train_df = concat_to_df(train_df, train_ngram_df, "ngram")

test_ngram_df = pd.DataFrame([ngram_vector_for_text(t, ngram_to_idx, vocab_size) for t in test_df['review_text']])
# The result must be bound back to test_df. It used to go to an otherwise unused
# `valid_df`, so the test split was left without the ngram_* columns that the
# training split has.
test_df = concat_to_df(test_df, test_ngram_df, "ngram")

Finish concatenate: with dataframe 459130 and list 459130
Finish concatenate: with dataframe 114783 and list 114783


### Storing the dataset

In [18]:
# Persist both engineered splits as JSON arrays of records.
# Written without indent=4: pretty-printing puts every field of every record on
# its own line, which inflates file size and write time for frames this wide
# (the recorded run of this cell ended in KeyboardInterrupt). Compact output
# parses to exactly the same data.
train_df.to_json(f"{directory_path}/train_data.json", orient="records")
test_df.to_json(f"{directory_path}/test_data.json", orient="records")

KeyboardInterrupt: 

### Model

In [24]:
# Model input: the training frame without its two raw-text columns.
df_xg = train_df.drop(['review_text', 'plot_synopsis'], axis='columns')

df_xg.head()

Unnamed: 0,user_id,label,review_text_length,dim_0,dim_1,dim_2,dim_3,dim_4,dim_5,dim_6,...,ngram_442,ngram_443,ngram_444,ngram_445,ngram_446,ngram_447,ngram_448,ngram_449,ngram_450,ngram_451
0,5291991,1,1821,-0.005757,-0.051729,-0.02155,-0.005745,-0.074771,0.078937,-0.069202,...,0,0,0,0,0,0,0,0,0,0
1,48053412,1,829,-0.002652,-0.056541,0.058831,-0.003095,0.043479,0.030117,0.054024,...,0,0,0,0,0,0,0,0,0,0
2,28438054,1,829,-0.017446,-0.091489,0.014513,-0.028227,-0.026655,0.060746,0.043278,...,0,0,0,0,0,0,0,0,0,0
3,35553121,0,182,-0.081114,0.030047,0.042871,0.093194,-0.001539,-0.004182,0.031489,...,0,0,0,0,0,0,0,0,0,0
4,14069613,0,1333,-0.051064,-0.072159,-0.044826,-0.0128,-0.006169,0.065184,0.030398,...,0,0,0,0,0,0,0,0,0,0


In [31]:
import numpy as np
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.metrics import roc_auc_score, accuracy_score, f1_score
from xgboost import XGBClassifier

# Target is `label`; every other column of df_xg is used as a feature.
y = df_xg["label"]
X = df_xg.drop("label", axis=1)

# Stratified 80/20 split of the training data into fit and validation parts.
X_train, X_valid, y_train, y_valid = train_test_split(
    X, y, stratify=y, test_size=0.2, random_state=42
)

clf = XGBClassifier(
    # Task definition
    objective="binary:logistic",
    eval_metric="auc",
    # Extra weight on the positive (spoiler) class; presumably chosen close to
    # the not-spoiler/spoiler ratio of the data - TODO confirm.
    scale_pos_weight=2.8,
    # Boosting schedule
    n_estimators=500,
    learning_rate=0.05,
    # Tree shape
    max_depth=6,
    min_child_weight=1,
    # Row / column sampling per tree
    subsample=0.8,
    colsample_bytree=0.8,
    # Regularisation
    reg_lambda=1.0,
    reg_alpha=0.0,
    # Runtime
    tree_method="gpu_hist",          # GPU histogram tree builder; needs a CUDA device (use "hist" on CPU-only machines)
    random_state=42,
    n_jobs=-1,
)

# The validation part is passed as eval_set for metric tracking only.
clf.fit(X_train, y_train, eval_set=[(X_valid, y_valid)], verbose=False)

# Evaluation: probability of the positive class, thresholded at 0.5.
proba = clf.predict_proba(X_valid)[:, 1]
pred = (proba >= 0.5).astype(int)
print("AUC:", roc_auc_score(y_valid, proba))
print("Accuracy:", accuracy_score(y_valid, pred))
print("F1:", f1_score(y_valid, pred))

# Feature importance as reported by the fitted model, largest first.
importances = pd.Series(clf.feature_importances_, index=X.columns)
importances = importances.sort_values(ascending=False)
print(importances.head(20))


AUC: 0.7811561100505529
Accuracy: 0.7038529392546773
F1: 0.5526272496956536
ngram_0                      0.087770
dim_319                      0.034076
review_text_length           0.028817
user_id                      0.019276
dim_223                      0.011733
dim_49                       0.008892
dim_187                      0.008151
dim_298                      0.006987
dim_92                       0.006019
sim_score_synopsis_review    0.006016
dim_111                      0.005069
ngram_10                     0.005048
dim_127                      0.004980
dim_331                      0.004912
ngram_115                    0.004893
dim_139                      0.004565
ngram_162                    0.004483
dim_244                      0.004433
ngram_8                      0.004386
dim_217                      0.004234
dtype: float32


In [33]:
# Per-class precision / recall / F1 on the validation part (label 0 -> 'Not Spoiler', 1 -> 'Spoiler').
print(
    classification_report(
        y_valid,
        pred,
        target_names=['Not Spoiler', 'Spoiler'],
        zero_division=0,
    )
)

              precision    recall  f1-score   support

 Not Spoiler       0.87      0.71      0.78     67678
     Spoiler       0.46      0.70      0.55     24148

    accuracy                           0.70     91826
   macro avg       0.66      0.70      0.67     91826
weighted avg       0.76      0.70      0.72     91826



In [28]:
# Show the predicted positive-class (spoiler) probabilities for the validation rows.
proba

array([0.21465172, 0.08334256, 0.24616382, ..., 0.31994042, 0.00211977,
       0.45506656], dtype=float32)