In [1]:
import pandas as pd
import numpy as np
import re
from collections import Counter
from sentence_transformers import SentenceTransformer, util

import torch
import torch.nn as nn
from torch.utils.data import Dataset, DataLoader
from torch.nn.utils.rnn import pad_sequence

from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report
import json

  from .autonotebook import tqdm as notebook_tqdm
  warn(


In [2]:
# Run on the GPU when PyTorch can see one, otherwise stay on the CPU.
device = torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu')
device

device(type='cuda')

In [3]:
# Reviews are stored as JSON lines (one record per line). The boolean
# `is_spoiler` flag is turned into an integer `label` column.
df_reviews = pd.read_json(
    '/SFS/project/ry/dp_sgteam/catherine/ada/dataset/IMDB_reviews.json',
    lines=True,
)
df_reviews['label'] = df_reviews['is_spoiler'].astype(int)

# Movie details, read from a JSON-lines file as well.
df_movies = pd.read_json(
    '/SFS/project/ry/dp_sgteam/catherine/ada/dataset/IMDB_movie_details.json',
    lines=True,
)

# First line printed is the reviews frame, second line is the movies frame.
print(f"Dataset shape: {df_reviews.shape}")
print(f"Dataset shape: {df_movies.shape}")
df_reviews.head()

Dataset shape: (573913, 8)
Dataset shape: (1572, 7)


Unnamed: 0,review_date,movie_id,user_id,is_spoiler,review_text,rating,review_summary,label
0,10 February 2006,tt0111161,ur1898687,True,"In its Oscar year, Shawshank Redemption (writt...",10,A classic piece of unforgettable film-making.,1
1,6 September 2000,tt0111161,ur0842118,True,The Shawshank Redemption is without a doubt on...,10,Simply amazing. The best film of the 90's.,1
2,3 August 2001,tt0111161,ur1285640,True,I believe that this film is the best story eve...,8,The best story ever told on film,1
3,1 September 2002,tt0111161,ur1003471,True,"**Yes, there are SPOILERS here**This film has ...",10,Busy dying or busy living?,1
4,20 May 2004,tt0111161,ur0226855,True,At the heart of this extraordinary movie is a ...,8,"Great story, wondrously told and acted",1


In [4]:
# Attach the movie details to every review. `validate` makes the merge fail
# loudly if a movie_id appears more than once in df_movies, which would
# otherwise silently duplicate review rows.
df = pd.merge(df_reviews, df_movies, on='movie_id', how='left', validate='many_to_one')

print("Data loaded and merged successfully.")
df['label'] = df['is_spoiler'].astype(int)
# Drop the first two characters of each user id (e.g. 'ur1898687') and keep
# the remainder as an integer.
df['user_id'] = df['user_id'].str[2:].astype(int)

# Take an explicit copy of the selected columns so that the column added
# below is written to an independent frame, not to a selection of the
# merged one (avoids chained-assignment warnings/ambiguity).
df = df[['user_id', 'review_text', 'plot_synopsis', 'label']].copy()
df['review_text_length'] = df['review_text'].str.len()

df.head()

Data loaded and merged successfully.


Unnamed: 0,user_id,review_text,plot_synopsis,label,review_text_length
0,1898687,"In its Oscar year, Shawshank Redemption (writt...","In 1947, Andy Dufresne (Tim Robbins), a banker...",1,4751
1,842118,The Shawshank Redemption is without a doubt on...,"In 1947, Andy Dufresne (Tim Robbins), a banker...",1,1218
2,1285640,I believe that this film is the best story eve...,"In 1947, Andy Dufresne (Tim Robbins), a banker...",1,1470
3,1003471,"**Yes, there are SPOILERS here**This film has ...","In 1947, Andy Dufresne (Tim Robbins), a banker...",1,4096
4,226855,At the heart of this extraordinary movie is a ...,"In 1947, Andy Dufresne (Tim Robbins), a banker...",1,4632


In [14]:
# Hold out 20% of the rows as the test set. The split is stratified on the
# label and uses a fixed random_state so it is repeatable.
train_df, test_df = train_test_split(df, stratify=df['label'], random_state=42, test_size=0.2)
print(f"Data split into train ({len(train_df)}) and test ({len(test_df)}) sets.")

Data split into train (459130) and test (114783) sets.


In [15]:
# Carve a validation set (20%) out of the current training rows, again
# stratified on the label with a fixed random_state.
# NOTE: `train_df` is rebound here, so re-running this cell splits the
# already-reduced training set once more.
train_df, valid_df = train_test_split(train_df, stratify=train_df['label'], random_state=42, test_size=0.2)
print(f"Data split into train ({len(train_df)}) and valid ({len(valid_df)}) sets.")

Data split into train (367304) and valid (91826) sets.


In [16]:
# Give each split a fresh 0..n-1 index, discarding the row labels inherited
# from the merged frame.
train_df, valid_df, test_df = (
    frame.reset_index(drop=True) for frame in (train_df, valid_df, test_df)
)

In [17]:
# Sentence encoder used by the embedding helper below, created on `device`.
model = SentenceTransformer(model_name_or_path='all-MiniLM-L6-v2', device=device)

HTTP Error 500 thrown while requesting HEAD https://huggingface.co/sentence-transformers/all-MiniLM-L6-v2/resolve/main/./modules.json
Retrying in 1s [Retry 1/5].
HTTP Error 500 thrown while requesting HEAD https://huggingface.co/sentence-transformers/all-MiniLM-L6-v2/resolve/main/./modules.json
Retrying in 2s [Retry 2/5].
HTTP Error 500 thrown while requesting HEAD https://huggingface.co/sentence-transformers/all-MiniLM-L6-v2/resolve/main/./modules.json
Retrying in 4s [Retry 3/5].
HTTP Error 500 thrown while requesting HEAD https://huggingface.co/sentence-transformers/all-MiniLM-L6-v2/resolve/main/./modules.json
Retrying in 8s [Retry 4/5].
HTTP Error 500 thrown while requesting HEAD https://huggingface.co/sentence-transformers/all-MiniLM-L6-v2/resolve/main/./modules.json
Retrying in 8s [Retry 5/5].


In [18]:
def embedd_to_vector(text):
    """Encode a pandas Series of texts with the notebook-level ``model``.

    Missing entries are replaced by empty strings before encoding.

    Args:
        text: Series of strings (it must provide ``fillna`` and ``tolist``).

    Returns:
        A list with one embedding per input text, each embedding being a
        plain Python list of floats.
    """
    print("Encoding texts...")
    sentences = text.fillna('').tolist()
    encoded = model.encode(
        sentences,
        convert_to_tensor=True,
        show_progress_bar=True,
        device=device,
    )
    print("Encoded.")
    # Bring the tensor to host memory, then unwrap it into nested lists.
    as_array = encoded.cpu().numpy()
    return as_array.tolist()

def concat_to_df(main_df, embedd_list, column_name):
    """Append feature vectors to ``main_df`` as new columns, matched by position.

    Rows are paired positionally: the i-th vector goes with the i-th row of
    ``main_df``, whatever index labels ``main_df`` carries. (A plain
    ``pd.concat(axis=1)`` aligns on index labels instead, which pads with NaN
    and adds rows when ``main_df`` does not have a 0..n-1 index.)

    Args:
        main_df: DataFrame to extend. It is not modified.
        embedd_list: One feature vector per row of ``main_df``, in any form
            ``pd.DataFrame`` accepts (list of lists, 2-D array, DataFrame).
            It is not modified.
        column_name: Prefix for the new columns, which are named
            ``{column_name}_{i}``.

    Returns:
        A new DataFrame with the columns of ``main_df`` followed by the
        feature columns, carrying the index of ``main_df``.

    Raises:
        ValueError: If the number of vectors differs from the number of rows
            in ``main_df``.
    """
    embedd_df = pd.DataFrame(embedd_list)
    if len(embedd_df) != len(main_df):
        raise ValueError(
            f"Cannot concatenate: dataframe has {len(main_df)} rows "
            f"but {len(embedd_df)} vectors were given"
        )

    num_dims = len(embedd_df.columns)
    new_names = [f'{column_name}_{i}' for i in range(num_dims)]
    # set_axis returns a relabelled frame, so a DataFrame passed in as
    # `embedd_list` keeps its own column names and index.
    embedd_df = embedd_df.set_axis(new_names, axis=1).set_axis(main_df.index, axis=0)

    df = pd.concat([main_df, embedd_df], axis=1)
    print(f"Finish concatenate: with dataframe {len(main_df)} and list {len(embedd_df)}")
    return df

In [19]:
# Embed the review text first, then the plot synopsis, for the training rows.
train_review_embeddings, train_synopsis_embeddings = (
    embedd_to_vector(train_df[column]) for column in ('review_text', 'plot_synopsis')
)

# Only the review embeddings become dim_* feature columns.
train_df = concat_to_df(train_df, train_review_embeddings, "dim")

Encoding texts...


Batches: 100%|███████████████████████████████████████████████████████████████████████████████| 11479/11479 [03:53<00:00, 49.24it/s]


Encoded.
Encoding texts...


Batches: 100%|███████████████████████████████████████████████████████████████████████████████| 11479/11479 [06:39<00:00, 28.75it/s]


Encoded.
Finish concatenate: with dataframe 367304 and list 367304


In [20]:
# Embed the review text first, then the plot synopsis, for the validation rows.
valid_review_embeddings, valid_synopsis_embeddings = (
    embedd_to_vector(valid_df[column]) for column in ('review_text', 'plot_synopsis')
)

# Only the review embeddings become dim_* feature columns.
valid_df = concat_to_df(valid_df, valid_review_embeddings, "dim")

Encoding texts...


Batches: 100%|█████████████████████████████████████████████████████████████████████████████████| 2870/2870 [00:57<00:00, 49.86it/s]


Encoded.
Encoding texts...


Batches: 100%|█████████████████████████████████████████████████████████████████████████████████| 2870/2870 [01:42<00:00, 27.94it/s]


Encoded.
Finish concatenate: with dataframe 91826 and list 91826


In [21]:
# Embed the review text first, then the plot synopsis, for the test rows.
test_review_embeddings, test_synopsis_embeddings = (
    embedd_to_vector(test_df[column]) for column in ('review_text', 'plot_synopsis')
)

# Only the review embeddings become dim_* feature columns.
test_df = concat_to_df(test_df, test_review_embeddings, "dim")

Encoding texts...


Batches: 100%|█████████████████████████████████████████████████████████████████████████████████| 3587/3587 [01:11<00:00, 49.91it/s]


Encoded.
Encoding texts...


Batches: 100%|█████████████████████████████████████████████████████████████████████████████████| 3587/3587 [02:03<00:00, 28.93it/s]


Encoded.
Finish concatenate: with dataframe 114783 and list 114783


In [22]:
# Row-wise cosine similarity between each review embedding and the synopsis
# embedding on the same row. The score tensors are kept under their original
# names; what is stored in the DataFrame is an explicit NumPy copy, so the
# column does not depend on how pandas treats a torch tensor on assignment.
print("Calculating similarities for training set...")
sim_scores_train = util.pairwise_cos_sim(train_review_embeddings, train_synopsis_embeddings)
train_df['sim_score_synopsis_review'] = sim_scores_train.flatten().cpu().numpy()

print("Calculating similarities for validation set...")
sim_scores_valid = util.pairwise_cos_sim(valid_review_embeddings, valid_synopsis_embeddings)
valid_df['sim_score_synopsis_review'] = sim_scores_valid.flatten().cpu().numpy()

print("Calculating similarities for testing set...")
sim_scores_test = util.pairwise_cos_sim(test_review_embeddings, test_synopsis_embeddings)
test_df['sim_score_synopsis_review'] = sim_scores_test.flatten().cpu().numpy()

Calculating similarities for training set...
Calculating similarities for validation set...
Calculating similarities for testing set...


In [23]:
# N-gram vocabulary: a JSON list of n-gram strings. Each n-gram is mapped to
# its position in that list, which later serves as its feature index.
with open("/SFS/project/ry/dp_sgteam/catherine/ada/dataset/ngram_vocab_list.json", "r") as f:
    ngram_vocab_list = json.load(f)

vocab_size = len(ngram_vocab_list)
ngram_to_idx = dict(zip(ngram_vocab_list, range(vocab_size)))

def build_token_set(text):
    """Split ``text`` into lowercase tokens made of ASCII letters only.

    Despite the name, the result is a list: tokens appear in text order and
    repeats are kept. Digits, punctuation and whitespace act as separators.

    Args:
        text: String to tokenize.

    Returns:
        List of lowercase alphabetic tokens.
    """
    lowered = text.lower()
    return [match.group(0) for match in re.finditer(r"[A-Za-z]+", lowered)]

def ngram_vector_for_text(text, ngram_to_idx, vocab_size):
    """Build a 0/1 presence vector over the n-gram vocabulary for one text.

    A vocabulary entry containing a space is looked up among the text's
    adjacent-token pairs (joined by a single space); any other entry is
    looked up among the text's individual tokens.

    Args:
        text: String to featurize.
        ngram_to_idx: Mapping from n-gram string to its position in the vector.
        vocab_size: Length of the returned vector.

    Returns:
        List of ``vocab_size`` integers, 1 where the n-gram occurs in
        ``text`` and 0 elsewhere.
    """
    words = build_token_set(text)
    unigrams = set(words)
    bigrams = {" ".join(pair) for pair in zip(words, words[1:])}

    indicator = [0] * vocab_size
    for ngram, position in ngram_to_idx.items():
        # Entries with a space are matched against pairs, the rest against single tokens.
        candidates = bigrams if " " in ngram else unigrams
        if ngram in candidates:
            indicator[position] = 1
    return indicator


In [24]:
# 0/1 n-gram presence features for every review, appended to each split as
# ngram_* columns.
train_ngram_df = pd.DataFrame([ngram_vector_for_text(t, ngram_to_idx, vocab_size) for t in train_df['review_text']])
train_df = concat_to_df(train_df, train_ngram_df, "ngram")

valid_ngram_df = pd.DataFrame([ngram_vector_for_text(t, ngram_to_idx, vocab_size) for t in valid_df['review_text']])
valid_df = concat_to_df(valid_df, valid_ngram_df, "ngram")

test_ngram_df = pd.DataFrame([ngram_vector_for_text(t, ngram_to_idx, vocab_size) for t in test_df['review_text']])
# The result belongs to test_df. Binding it to valid_df would replace the
# validation rows with test rows and leave test_df without n-gram columns.
test_df = concat_to_df(test_df, test_ngram_df, "ngram")

Finish concatenate: with dataframe 367304 and list 367304
Finish concatenate: with dataframe 91826 and list 91826
Finish concatenate: with dataframe 114783 and list 114783


In [29]:
%pip install xgboost

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


Collecting xgboost
  Downloading xgboost-3.1.1-py3-none-manylinux_2_28_x86_64.whl.metadata (2.1 kB)
Collecting nvidia-nccl-cu12 (from xgboost)
  Downloading nvidia_nccl_cu12-2.28.7-py3-none-manylinux_2_18_x86_64.whl.metadata (2.0 kB)
Downloading xgboost-3.1.1-py3-none-manylinux_2_28_x86_64.whl (115.9 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m115.9/115.9 MB[0m [31m75.7 MB/s[0m eta [36m0:00:00[0m00:01[0m00:01[0m
[?25hDownloading nvidia_nccl_cu12-2.28.7-py3-none-manylinux_2_18_x86_64.whl (296.8 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m296.8/296.8 MB[0m [31m73.6 MB/s[0m eta [36m0:00:00[0m00:01[0m00:01[0m
[?25hInstalling collected packages: nvidia-nccl-cu12, xgboost
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m2/2[0m [xgboost]m1/2[0m [xgboost]
[1A[2KSuccessfully installed nvidia-nccl-cu12-2.28.7 xgboost-3.1.1
Note: you may need to restart the kernel to use updated packages.


In [27]:
# Persist each split as an indented JSON array with one object per row.
for split_frame, destination in (
    (train_df, "/SFS/project/ry/dp_sgteam/catherine/ada/dataset/train_data.json"),
    (valid_df, "/SFS/project/ry/dp_sgteam/catherine/ada/dataset/valid_data.json"),
    (test_df, "/SFS/project/ry/dp_sgteam/catherine/ada/dataset/test_data.json"),
):
    split_frame.to_json(destination, orient="records", indent=4)

### Model