In [12]:
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from imblearn.under_sampling import RandomUnderSampler
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report

In [13]:
import os
from dotenv import load_dotenv
# Load settings via python-dotenv before anything reads os.environ
# (the data-loading cell looks up DIR_PATH there).
# NOTE(review): presumably this reads a local .env file — confirm its location.
load_dotenv()

True

In [19]:
# Dataset folder, resolved from the DIR_PATH environment variable.
directory_path = f"{os.environ['DIR_PATH']}/dataset"

# Read the cleaned reviews, then keep only the rows that carry both the
# review text and the spoiler label.
df_reviews = pd.read_json(f"{directory_path}/cleaned_data.json")
df_reviews = df_reviews.dropna(subset=['review_text', 'is_spoiler'])

# Features stay a one-column DataFrame; the label is cast to integers.
y = df_reviews['is_spoiler'].astype(int)
X = df_reviews[['review_text']]

In [20]:
# Hold out 20% of the reviews for testing. The split is stratified on the
# label and seeded so it can be reproduced.
split_options = dict(test_size=0.2, random_state=42, stratify=y)
X_train, X_test, y_train, y_test = train_test_split(X, y, **split_options)

# Report how many reviews ended up in each partition.
print(f"Training set size: {len(X_train)}")
print(f"Testing set size: {len(X_test)}")

Training set size: 459130
Testing set size: 114783


In [21]:
# TF-IDF over unigrams and bigrams, English stop words removed, vocabulary
# capped at 1000 features.
vectorizer = TfidfVectorizer(
    ngram_range=(1, 2),
    stop_words='english',
    max_features=1000,
)

# The vectorizer is fitted on the training text only; the test text is
# transformed with that already-fitted vectorizer.
train_text = X_train['review_text']
test_text = X_test['review_text']
X_train_tfidf = vectorizer.fit_transform(train_text)
X_test_tfidf = vectorizer.transform(test_text)

print(f"Shape of TF-IDF matrices: Train = {X_train_tfidf.shape}, Test = {X_test_tfidf.shape}")

Shape of TF-IDF matrices: Train = (459130, 1000), Test = (114783, 1000)


In [22]:
# Seeded random undersampling of the TF-IDF training matrix; the test split
# is left untouched.
rus = RandomUnderSampler(random_state=42)
X_resampled, y_resampled = rus.fit_resample(X_train_tfidf, y_train)

# Show the per-class row counts after resampling.
print("\nAfter undersampling:")
class_counts = pd.Series(y_resampled).value_counts()
print(class_counts)


After undersampling:
is_spoiler
0    120739
1    120739
Name: count, dtype: int64


In [23]:
from sklearn.model_selection import GridSearchCV, StratifiedKFold

# Logistic-regression baseline: exhaustive search over regularisation
# strength, penalty type, solver and class weighting.
clf = LogisticRegression(random_state=42)

param_grid = dict(
    C=[0.01, 0.05, 0.1, 0.2, 0.3, 0.4, 0.5],
    penalty=['l1', 'l2'],
    solver=['liblinear', 'saga'],
    class_weight=[None, 'balanced'],
)

# Shuffled, seeded 5-fold stratified CV; candidates are ranked by recall.
cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)
grid_search = GridSearchCV(
    clf,
    param_grid,
    scoring='recall',
    cv=cv,
    n_jobs=-1,
    verbose=1,
)

print("Starting GridSearchCV...")
grid_search.fit(X_resampled, y_resampled)

print("\n--- GridSearchCV Results ---")
print("Best Parameters Found: ", grid_search.best_params_)
print("Best Recall Score (on cross-validation): ", grid_search.best_score_)

# best_estimator_ is the winning configuration as exposed by the search.
best_clf = grid_search.best_estimator_

print("Evaluating the model on the test set..")

# Score on the untouched test split (X_test_tfidf was not undersampled).
y_pred = best_clf.predict(X_test_tfidf)

print("Classification Report:")
report = classification_report(
    y_test,
    y_pred,
    target_names=['Not Spoiler', 'Spoiler'],
)
print(report)

Starting GridSearchCV...
Fitting 5 folds for each of 56 candidates, totalling 280 fits

--- GridSearchCV Results ---
Best Parameters Found:  {'C': 0.01, 'class_weight': None, 'penalty': 'l2', 'solver': 'liblinear'}
Best Recall Score (on cross-validation):  0.6525812002528081
Evaluating the model on the test set..
Classification Report:
              precision    recall  f1-score   support

 Not Spoiler       0.85      0.70      0.77     84598
     Spoiler       0.44      0.66      0.52     30185

    accuracy                           0.69    114783
   macro avg       0.64      0.68      0.65    114783
weighted avg       0.74      0.69      0.70    114783



In [25]:
from sklearn.model_selection import GridSearchCV, StratifiedKFold
from sklearn.metrics import classification_report
from xgboost import XGBClassifier

# Gradient-boosted tree model tuned on the undersampled TF-IDF training matrix
# (X_resampled, y_resampled) and evaluated on the untouched test split.
#
# n_jobs is deliberately 1 on the estimator: GridSearchCV below already runs
# its fits in parallel worker processes (n_jobs=-1). Letting every one of those
# fits also spawn one thread per core oversubscribes the CPU and slows the
# whole search down, so parallelism is applied at the grid-search level only.
clf = XGBClassifier(
    objective="binary:logistic",
    eval_metric="auc",
    tree_method="hist",
    random_state=42,
    n_jobs=1
)

# 3 * 3 * 3 * 2 * 2 = 108 candidates; with 3 folds that is 324 fits.
param_grid = {
    'n_estimators': [100, 300, 500],
    'learning_rate': [0.01, 0.05, 0.1],
    'max_depth': [4, 6, 8],
    'subsample': [0.7, 0.8],
    'colsample_bytree': [0.7, 0.8]
}

# Shuffled, seeded 3-fold stratified CV; candidates are ranked by recall.
cv = StratifiedKFold(n_splits=3, shuffle=True, random_state=42)
grid_search = GridSearchCV(
    estimator=clf,
    param_grid=param_grid,
    scoring='recall',
    cv=cv,
    verbose=2,
    n_jobs=-1
)

print("Starting GridSearchCV...")
grid_search.fit(X_resampled, y_resampled)

print("--- GridSearchCV Results ---")
print("Best Parameters Found: ", grid_search.best_params_)
print("Best Recall Score (on cross-validation): ", grid_search.best_score_)

# Predict on the TF-IDF test matrix (not the raw X_test DataFrame, whose
# object-dtype text column XGBoost rejects). The 0.5 cut-off on the
# positive-class probability is explicit so it can be tuned later.
best_clf = grid_search.best_estimator_
proba = best_clf.predict_proba(X_test_tfidf)[:, 1]
pred = (proba >= 0.5).astype(int)

print("Classification Report on Test Set:")
print(classification_report(y_test, pred))

Starting GridSearchCV...
Fitting 3 folds for each of 108 candidates, totalling 324 fits
[CV] END colsample_bytree=0.7, learning_rate=0.01, max_depth=4, n_estimators=300, subsample=0.7; total time= 1.3min
[CV] END colsample_bytree=0.7, learning_rate=0.01, max_depth=4, n_estimators=500, subsample=0.8; total time= 1.6min
[CV] END colsample_bytree=0.7, learning_rate=0.01, max_depth=6, n_estimators=300, subsample=0.7; total time= 2.5min
[CV] END colsample_bytree=0.7, learning_rate=0.01, max_depth=8, n_estimators=100, subsample=0.7; total time= 1.8min
[CV] END colsample_bytree=0.7, learning_rate=0.01, max_depth=8, n_estimators=100, subsample=0.8; total time= 1.8min
[CV] END colsample_bytree=0.7, learning_rate=0.01, max_depth=8, n_estimators=300, subsample=0.8; total time= 4.7min
[CV] END colsample_bytree=0.7, learning_rate=0.01, max_depth=8, n_estimators=500, subsample=0.8; total time= 7.4min
[CV] END colsample_bytree=0.7, learning_rate=0.05, max_depth=6, n_estimators=100, subsample=0.7; tot

ValueError: DataFrame.dtypes for data must be int, float, bool or category. When categorical type is supplied, the experimental DMatrix parameter`enable_categorical` must be set to `True`.  Invalid columns:review_text: object

In [29]:
# Re-run of the test-set evaluation for the current best_clf.
# Column 1 of predict_proba is taken as the positive-class probability and
# thresholded at 0.5 to obtain hard labels.
class_probabilities = best_clf.predict_proba(X_test_tfidf)
proba = class_probabilities[:, 1]
pred = (proba >= 0.5).astype(int)

print("Classification Report on Test Set:")
test_report = classification_report(y_test, pred)
print(test_report)

Classification Report on Test Set:
              precision    recall  f1-score   support

           0       0.85      0.71      0.77     84598
           1       0.44      0.66      0.53     30185

    accuracy                           0.69    114783
   macro avg       0.65      0.68      0.65    114783
weighted avg       0.75      0.69      0.71    114783

