In [1]:
import json
import pandas as pd
import matplotlib.pyplot as plt
import re
import numpy as np
from sklearn.model_selection import train_test_split
import kagglehub

# Download latest version
path = kagglehub.dataset_download("rmisra/imdb-spoiler-dataset")

print("Path to dataset files:", path)
movie_details = []
movie_reviews = []

def load_dataset(filepath: str):
    dataset = []
    with open(filepath, "r") as f:
        for i, line in enumerate(f):
            line = line.strip()
            if line:
                data = json.loads(line)
                dataset.append(data)
    return dataset

def duration_to_minutes(duration):
    hour_match = re.search(r'(\d+)\s*h', duration)
    min_match = re.search(r'(\d+)\s*min', duration)

    hours = 0
    minutes = 0

    if hour_match:
        hours = int(hour_match.group(1))
    if min_match:
        minutes = int(min_match.group(1))
        
    return (hours * 60) + minutes
    
from ast import literal_eval
import ast
# movie_details = pd.DataFrame(load_dataset("/Users/catherinekang/Documents/CS3244/project/data/IMDB_movie_details.json"))
movie_details = pd.DataFrame(load_dataset(path + "/IMDB_movie_details.json"))

movie_details['duration'] = movie_details['duration'].apply(duration_to_minutes)
movie_details['release_date'] = pd.to_datetime(movie_details['release_date'], errors='coerce')
movie_details['rating'] = pd.to_numeric(movie_details['rating'], errors='coerce')
movie_details['plot_synopsis'].replace("", np.nan, inplace=True)

movie_details.head()
movie_reviews = pd.DataFrame(load_dataset(path + "/IMDB_reviews.json"))

movie_reviews['review_date'] = pd.to_datetime(movie_reviews['review_date'], errors='coerce')
movie_reviews['is_spoiler'] = movie_reviews['is_spoiler'].astype(bool)
movie_reviews['rating'] = pd.to_numeric(movie_reviews['rating'], errors='coerce')
movie_reviews['review_id'] = np.arange(len(movie_reviews))

movie_reviews.head()
train_data, test_data = train_test_split(
    movie_reviews,
    test_size=0.2,
    stratify=movie_reviews['is_spoiler'], 
    random_state=42
)

Path to dataset files: /Users/isaacchin/.cache/kagglehub/datasets/rmisra/imdb-spoiler-dataset/versions/1


The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  movie_details['plot_synopsis'].replace("", np.nan, inplace=True)


In [2]:
train_data

Unnamed: 0,review_date,movie_id,user_id,is_spoiler,review_text,rating,review_summary,review_id
94625,2012-10-25,tt0033870,ur5291991,True,The Maltese Falcon film noir based on the nove...,10,A Classic Detective Thriller,94625
89585,2015-05-29,tt1392190,ur48053412,True,Water is a scarce resource so when it is dispe...,1,No plot spoilers,89585
187840,2014-02-26,tt1800241,ur28438054,True,I was really expecting a lot given all the hyp...,5,Most overrated film of 2013,187840
265694,2017-09-11,tt1396484,ur35553121,False,"This movie was just awful. It was not scary, t...",1,Waste of oxygen,265694
125407,2011-01-06,tt0840361,ur14069613,False,"""The Town"" is an excellent film that does have...",7,Entertaining Heist Film,125407
...,...,...,...,...,...,...,...,...
174403,2012-11-01,tt0401729,ur34579559,False,After all the hack job reviews when this film ...,9,Incredibly Impressed,174403
274593,2016-06-26,tt0144084,ur26136649,False,A film like American Psycho is one that attrac...,5,Underwhelming and Basic,274593
514599,2003-08-09,tt0112573,ur2572656,False,Braveheart is currently listed on the IMDB lis...,10,Some of Everything,514599
203500,2015-02-19,tt2911666,ur54376603,True,"Predictable, uninteresting, transparently all ...",3,Bad acting of bad script,203500


## Vectorisation

In [None]:
# Import necessary libraries for text vectorization and SMOTE
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.preprocessing import StandardScaler
from imblearn.over_sampling import SMOTE
import numpy as np
import pandas as pd
from scipy.sparse import hstack, csr_matrix

# Step 1: Text Vectorization using TF-IDF
print("Step 1: Vectorizing text data...")

# Initialize TF-IDF vectorizers for both review_text and review_summary
# Using different parameters to capture different aspects of the text
tfidf_review = TfidfVectorizer(
    max_features=5000,  # Limit vocabulary size for computational efficiency
    ngram_range=(1, 2),  # Include both unigrams and bigrams
    stop_words='english',
    min_df=5,  # Ignore terms that appear in less than 5 documents
    max_df=0.95,  # Ignore terms that appear in more than 95% of documents
    lowercase=True,
    strip_accents='unicode'
)

tfidf_summary = TfidfVectorizer(
    max_features=2000,  # Smaller vocabulary for summaries
    ngram_range=(1, 2),
    stop_words='english',
    min_df=3,
    max_df=0.9,
    lowercase=True,
    strip_accents='unicode'
)

# Handle missing values by filling with empty strings
train_data['review_text'] = train_data['review_text'].fillna('')
train_data['review_summary'] = train_data['review_summary'].fillna('')

# Fit and transform the text data
print("Vectorizing review_text...")
X_review_tfidf = tfidf_review.fit_transform(train_data['review_text'])

print("Vectorizing review_summary...")
X_summary_tfidf = tfidf_summary.fit_transform(train_data['review_summary'])

print(f"Review text TF-IDF shape: {X_review_tfidf.shape}")
print(f"Review summary TF-IDF shape: {X_summary_tfidf.shape}")

# Step 2: Merge review and summary texts, and include their vectorized forms, for model training input
print("\nStep 2: Preparing combined data for model training, including text and TF-IDF features...")

from scipy import sparse

# Construct the dataframe containing requested columns plus original texts
ml_input_df = train_data[['review_date', 'movie_id', 'user_id', 'is_spoiler', 'rating', 'review_id', 'review_text', 'review_summary']].copy()

# Also prepare the combined vectorized features (as before)
X_features = sparse.hstack([X_review_tfidf, X_summary_tfidf])

# Store the vectorized matrices in the dataframe for easy access (note: for efficiency, typically these are kept separate)
ml_input_df['X_review_tfidf'] = list(X_review_tfidf)
print(f"Combined vectorized features shape: {X_features.shape}")

Step 1: Vectorizing text data...
Vectorizing review_text...
Vectorizing review_summary...
Review text TF-IDF shape: (459130, 5000)
Review summary TF-IDF shape: (459130, 2000)

Step 2: Preparing combined data for model training, including text and TF-IDF features...
Combined vectorized features shape: (459130, 7000)

Step 2: Preparing original features and creating additional features...
Original dataset columns:
['review_date', 'movie_id', 'user_id', 'is_spoiler', 'review_text', 'rating', 'review_summary', 'review_id']

Additional features created:
- Review text length: count    459130.000000
mean       1460.233884
std        1126.151840
min          18.000000
25%         719.000000
50%        1052.000000
75%        1815.000000
max       14963.000000
Name: review_text_length, dtype: float64
- Word count: count    459130.000000
mean        258.799305
std         195.434762
min           1.000000
25%         131.000000
50%         189.000000
75%         321.000000
max        2675.000000


In [4]:
# Step 2: Prepare original features and create additional numerical features
print("\nStep 2: Preparing original features and creating additional features...")

# Remove previously engineered columns so the cell is idempotent when re-run
engineered_columns = [
    'review_text_length', 'review_summary_length', 'word_count',
    'user_spoiler_count', 'user_total_reviews', 'user_spoiler_rate',
    'movie_spoiler_count', 'movie_total_reviews', 'movie_spoiler_rate',
    'review_year', 'review_month', 'review_day_of_week'
]
train_data = train_data.drop(columns=[col for col in engineered_columns if col in train_data.columns], errors='ignore')

# First, let's see what original features we have
print("Original dataset columns:")
print(train_data.columns.tolist())

# Original features from the dataset (excluding text columns that we'll vectorize)
original_features = ['user_id', 'movie_id', 'review_date', 'rating', 'is_spoiler']

# Create additional derived features
# Review length features
train_data['review_text_length'] = train_data['review_text'].str.len()
train_data['review_summary_length'] = train_data['review_summary'].str.len()
train_data['word_count'] = train_data['review_text'].str.split().str.len()

# User-based features (from your EDA)
user_stats = train_data.groupby('user_id')['is_spoiler'].agg(['sum', 'count']).reset_index()
user_stats.columns = ['user_id', 'user_spoiler_count', 'user_total_reviews']
user_stats['user_spoiler_rate'] = user_stats['user_spoiler_count'] / user_stats['user_total_reviews']

# Merge user features back to train_data
train_data = train_data.merge(user_stats, on='user_id', how='left')

# Movie-based features
movie_stats = train_data.groupby('movie_id')['is_spoiler'].agg(['sum', 'count']).reset_index()
movie_stats.columns = ['movie_id', 'movie_spoiler_count', 'movie_total_reviews']
movie_stats['movie_spoiler_rate'] = movie_stats['movie_spoiler_count'] / movie_stats['movie_total_reviews']

# Merge movie features back to train_data
train_data = train_data.merge(movie_stats, on='movie_id', how='left')

# Rating features
train_data['rating'] = pd.to_numeric(train_data['rating'], errors='coerce')
train_data['rating'] = train_data['rating'].fillna(train_data['rating'].median())

# Date features
train_data['review_date'] = pd.to_datetime(train_data['review_date'], errors='coerce')
train_data['review_year'] = train_data['review_date'].dt.year
train_data['review_month'] = train_data['review_date'].dt.month
train_data['review_day_of_week'] = train_data['review_date'].dt.dayofweek

print("\nAdditional features created:")
print(f"- Review text length: {train_data['review_text_length'].describe()}")
print(f"- Word count: {train_data['word_count'].describe()}")
print(f"- User spoiler rate: {train_data['user_spoiler_rate'].describe()}")
print(f"- Movie spoiler rate: {train_data['movie_spoiler_rate'].describe()}")


Step 2: Preparing original features and creating additional features...
Original dataset columns:
['review_date', 'movie_id', 'user_id', 'is_spoiler', 'review_text', 'rating', 'review_summary', 'review_id']

Additional features created:
- Review text length: count    459130.000000
mean       1460.233884
std        1126.151840
min          18.000000
25%         719.000000
50%        1052.000000
75%        1815.000000
max       14963.000000
Name: review_text_length, dtype: float64
- Word count: count    459130.000000
mean        258.799305
std         195.434762
min           1.000000
25%         131.000000
50%         189.000000
75%         321.000000
max        2675.000000
Name: word_count, dtype: float64
- User spoiler rate: count    459130.000000
mean          0.262973
std           0.378300
min           0.000000
25%           0.000000
50%           0.008197
75%           0.500000
max           1.000000
Name: user_spoiler_rate, dtype: float64
- Movie spoiler rate: count    459130.0

In [8]:
# Step 3: Prepare numerical features for combination
print("\nStep 3: Preparing numerical features...")

# Select numerical features
numerical_features = [
    'review_text_length',
    'review_summary_length',
    # 'word_count',
    # 'user_spoiler_rate',
    # 'movie_spoiler_rate',
    # 'rating',
    # 'review_year',
    # 'review_month',
    # 'review_day_of_week'
]

# Handle missing values in numerical features
for feature in numerical_features:
    train_data[feature] = train_data[feature].fillna(train_data[feature].median())

# Extract numerical features
X_numerical = train_data[numerical_features].values

# Standardize numerical features
scaler = StandardScaler()
X_numerical_scaled = scaler.fit_transform(X_numerical)

print(f"Numerical features shape: {X_numerical_scaled.shape}")
print(f"Numerical features: {numerical_features}")



Step 3: Preparing numerical features...
Numerical features shape: (459130, 2)
Numerical features: ['review_text_length', 'review_summary_length']


In [11]:
# Step 4: Combine all features into a single matrix
print("\nStep 4: Combining all features...")

# Convert numerical features to sparse matrix for efficient combination
X_numerical_sparse = csr_matrix(X_numerical_scaled)

# Combine TF-IDF features and numerical features
X_combined = hstack([X_review_tfidf, X_summary_tfidf, X_numerical_sparse])

print(f"Combined feature matrix shape: {X_combined.shape}")
print(f"Feature breakdown:")
print(f"- Review text TF-IDF: {X_review_tfidf.shape[1]} features")
print(f"- Review summary TF-IDF: {X_summary_tfidf.shape[1]} features")
print(f"- Numerical features: {X_numerical_scaled.shape[1]} features")
print(f"  * Original features: rating")
print(f"  * Derived features: review lengths, user/movie stats, date features")
print(f"- Total features: {X_combined.shape[1]}")

# Get target variable
y = train_data['is_spoiler'].values

print(f"\nTarget variable distribution:")
print(f"- Non-spoiler: {np.sum(y == False)} ({np.sum(y == False)/len(y)*100:.1f}%)")
print(f"- Spoiler: {np.sum(y == True)} ({np.sum(y == True)/len(y)*100:.1f}%)")

# Feature Summary
print("\n" + "="*60)
print("FEATURE COMBINATION SUMMARY")
print("="*60)
print("âœ… TEXT FEATURES (Vectorized):")
print(f"   - Review text TF-IDF: {X_review_tfidf.shape[1]} features")
print(f"   - Review summary TF-IDF: {X_summary_tfidf.shape[1]} features")
print("")
# print("âœ… ORIGINAL FEATURES (From dataset):")
# print("   - rating: User's rating of the movie (1-10)")
# print("")
print("âœ… DERIVED FEATURES (Engineered):")
print("   - review_text_length: Character count of review text")
print("   - review_summary_length: Character count of review summary") 
# print("   - word_count: Word count of review text")
# print("   - user_spoiler_rate: Historical spoiler rate of the user")
# print("   - movie_spoiler_rate: Historical spoiler rate for this movie")
# print("   - review_year: Year of review")
# print("   - review_month: Month of review")
# print("   - review_day_of_week: Day of week of review")
print("")
print(f"ðŸŽ¯ TOTAL FEATURES: {X_combined.shape[1]} (Ready for SMOTE)")
print("="*60)


Step 4: Combining all features...
Combined feature matrix shape: (459130, 7002)
Feature breakdown:
- Review text TF-IDF: 5000 features
- Review summary TF-IDF: 2000 features
- Numerical features: 2 features
  * Original features: rating
  * Derived features: review lengths, user/movie stats, date features
- Total features: 7002

Target variable distribution:
- Non-spoiler: 338391 (73.7%)
- Spoiler: 120739 (26.3%)

FEATURE COMBINATION SUMMARY
âœ… TEXT FEATURES (Vectorized):
   - Review text TF-IDF: 5000 features
   - Review summary TF-IDF: 2000 features

âœ… DERIVED FEATURES (Engineered):
   - review_text_length: Character count of review text
   - review_summary_length: Character count of review summary

ðŸŽ¯ TOTAL FEATURES: 7002 (Ready for SMOTE)
