In [3]:
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report

In [6]:
# Directory that holds cleaned_data.json.
# NOTE(review): hard-coded absolute path; consider moving it to a config cell
# so the notebook can run on another machine.
directory_path = "/SFS/project/ry/dp_sgteam/catherine/ada/dataset"

# Load the reviews and keep only rows that have both the text and the label.
# The filtered frame is assigned instead of using `inplace=True`, so the cell
# produces the same `df_reviews` every time it is re-run and never mutates a
# frame that another cell may already hold a reference to.
df_reviews = (
    pd.read_json(f"{directory_path}/cleaned_data.json")
    .dropna(subset=['review_text', 'is_spoiler'])
)

# Features are kept as a one-column DataFrame; the label is cast to int.
X = df_reviews[['review_text']]
y = df_reviews['is_spoiler'].astype(int)

In [13]:
# Preview the first five rows of the feature frame.
X.head(n=5)

Unnamed: 0,review_text
0,oscar year shawshank redemption write direct f...
1,shawshank redemption without doubt one brillia...
2,believe film best story ever tell film tell ti...
3,yes spoiler film emotional impact find hard wr...
4,heart extraordinary movie brilliant indelible ...


In [10]:
# Hold out 20% of the rows for testing. The split is seeded for
# reproducibility and stratified on the label `y`.
split_options = dict(test_size=0.2, random_state=42, stratify=y)
X_train, X_test, y_train, y_test = train_test_split(X, y, **split_options)

# Report how many rows ended up in each partition.
print(f"Training set size: {len(X_train)}")
print(f"Testing set size: {len(X_test)}")

Training set size: 459130
Testing set size: 114783


In [14]:
# TF-IDF over unigrams and bigrams, English stop words removed, vocabulary
# capped at 20,000 features.
vectorizer = TfidfVectorizer(ngram_range=(1, 2), stop_words='english', max_features=20000)

# Pull out the raw text column of each partition.
train_text = X_train['review_text']
test_text = X_test['review_text']

# The vocabulary and IDF weights are learned from the training text only;
# the test text is merely transformed with the fitted vectorizer.
X_train_tfidf = vectorizer.fit_transform(train_text)
X_test_tfidf = vectorizer.transform(test_text)

print(f"Shape of TF-IDF matrices: Train = {X_train_tfidf.shape}, Test = {X_test_tfidf.shape}")

Shape of TF-IDF matrices: Train = (459130, 20000), Test = (114783, 20000)


In [17]:
# Logistic regression on the TF-IDF features. `class_weight='balanced'`
# is requested so the two label classes are weighted during fitting.
# `fit` returns the estimator itself, so construction and training are chained.
model = LogisticRegression(
    solver='liblinear',
    class_weight='balanced',
    random_state=42,
).fit(X_train_tfidf, y_train)

print("Evaluating the model on the test set..")

# Predict labels for the held-out rows.
y_pred = model.predict(X_test_tfidf)

# Per-class precision / recall / F1 on the test set.
report = classification_report(
    y_test,
    y_pred,
    target_names=['Not Spoiler', 'Spoiler'],
)
print("Classification Report:")
print(report)

Evaluating the model on the test set..
Classification Report:
              precision    recall  f1-score   support

 Not Spoiler       0.86      0.74      0.79     84598
     Spoiler       0.47      0.66      0.55     30185

    accuracy                           0.72    114783
   macro avg       0.67      0.70      0.67    114783
weighted avg       0.76      0.72      0.73    114783

