In [29]:
import pandas as pd
from sklearn.decomposition import TruncatedSVD
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report
from sklearn.model_selection import train_test_split

In [30]:
import os
from dotenv import load_dotenv

# Load key/value pairs from a .env file into the process environment.
# Presumably this is what supplies DIR_PATH, which a later cell reads from
# os.environ -- TODO confirm the .env file defines it.
# Left as the cell's last expression, so its return value is displayed.
load_dotenv()

True

In [31]:
# The dataset root comes from the DIR_PATH environment variable; a missing
# variable raises KeyError here instead of producing a bogus path later.
directory_path = os.environ["DIR_PATH"]

# Read the cleaned review dump and discard rows missing either the text or
# the label. dropna returns a new frame, so nothing is mutated in place.
df_reviews = (
    pd.read_json(f"{directory_path}/dataset/cleaned_data.json")
    .dropna(subset=['review_text', 'is_spoiler'])
)

# X is kept as a single-column DataFrame (later cells select
# X_train['review_text']); y is the label column cast to int.
y = df_reviews['is_spoiler'].astype(int)
X = df_reviews.loc[:, ['review_text']]

In [32]:
# Hold out 20% of the rows for evaluation. The split is stratified on y and
# uses a fixed random_state so it is repeatable across runs.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, stratify=y, random_state=42, test_size=0.2
)

# Report both partition sizes, one per line.
print(
    f"Training set size: {len(X_train)}",
    f"Testing set size: {len(X_test)}",
    sep="\n",
)

Training set size: 459130
Testing set size: 114783


In [33]:
# TF-IDF over unigrams and bigrams with English stop words removed; the
# vocabulary is capped at 1000 features.
vectorizer = TfidfVectorizer(ngram_range=(1, 2), stop_words='english', max_features=1000)

# Fit on the training text only, then reuse the fitted vectorizer on the
# test text so no test-set statistics enter the features.
X_train_tfidf = vectorizer.fit_transform(X_train.loc[:, 'review_text'])
X_test_tfidf = vectorizer.transform(X_test.loc[:, 'review_text'])

print(
    f"Shape of TF-IDF matrices: Train = {X_train_tfidf.shape}, Test = {X_test_tfidf.shape}"
)

Shape of TF-IDF matrices: Train = (459130, 1000), Test = (114783, 1000)


In [34]:
# Baseline: logistic regression on the TF-IDF features, with balanced class
# weights and a fixed seed. fit() returns the estimator, so the call is chained.
model = LogisticRegression(
    random_state=42,
    class_weight='balanced',
    solver='liblinear',
).fit(X_train_tfidf, y_train)

print("Evaluating the model on the test set..")

# Predict on the held-out TF-IDF matrix.
y_pred = model.predict(X_test_tfidf)

print("Classification Report:")
print(
    classification_report(
        y_test,
        y_pred,
        target_names=['Not Spoiler', 'Spoiler'],
    )
)

Evaluating the model on the test set..
Classification Report:
              precision    recall  f1-score   support

 Not Spoiler       0.85      0.72      0.78     84598
     Spoiler       0.45      0.65      0.53     30185

    accuracy                           0.70    114783
   macro avg       0.65      0.68      0.66    114783
weighted avg       0.75      0.70      0.71    114783



In [35]:
# Dimensionality-reduction experiment: project the TF-IDF features onto 100
# truncated-SVD components, then retrain the same logistic-regression
# configuration on the reduced features for comparison.

# Fit the projection on the training matrix only and apply the fitted
# projection to the test matrix.
svd = TruncatedSVD(n_components=100, random_state=42)
X_train_svd = svd.fit_transform(X_train_tfidf)
X_test_svd = svd.transform(X_test_tfidf)

# NOTE: this rebinds `model` and `y_pred`, replacing whatever those names held
# before this cell ran. Re-run the cell that fits on X_train_tfidf if the
# TF-IDF-only classifier is needed again.
model = LogisticRegression(solver='liblinear', class_weight='balanced', random_state=42)
model.fit(X_train_svd, y_train)

print("Evaluating the model on the test set..")
y_pred = model.predict(X_test_svd)

print("Classification Report:")
print(classification_report(y_test, y_pred, target_names=['Not Spoiler', 'Spoiler']))

Evaluating the model on the test set..
Classification Report:
              precision    recall  f1-score   support

 Not Spoiler       0.84      0.68      0.75     84598
     Spoiler       0.42      0.64      0.50     30185

    accuracy                           0.67    114783
   macro avg       0.63      0.66      0.63    114783
weighted avg       0.73      0.67      0.69    114783

