In [161]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report
from sklearn.ensemble import RandomForestClassifier
import pickle
from reviewclassifierv2 import ReviewMTLPredictor

In [162]:
# Google Sheet file ID
file_id = "1xkgCgwFvHSIST_pRXB4zmKRGAqw3Yyti"

# Export as Excel (.xlsx)
url = f"https://docs.google.com/spreadsheets/d/{file_id}/export?format=xlsx"

df = pd.read_excel(url)

# Preview
print(df.head())

         locationName            category   reviewerName      publishedAtDate  \
0  Gardens by the Bay  Tourist attraction  Stylusmaestro  2021-12-29 10:13:13   
1  Gardens by the Bay  Tourist attraction      JoJo Chin  2020-03-21 09:06:31   
2  Gardens by the Bay  Tourist attraction     Theticus _  2020-03-03 00:12:25   
3  Gardens by the Bay  Tourist attraction        sky wda  2020-01-02 18:05:38   
4  Gardens by the Bay  Tourist attraction    Trúc Nguyễn  2020-01-01 12:22:38   

   rating                                         reviewText  \
0     0.5  For a weekday this is considered very crowded....   
1     1.0  Awsome view with breeze wind. Suitable for all...   
2     0.5  Great place to hang out and have picnic especi...   
3     1.0  Every years great place to catch firework on S...   
4     1.0        This is the 10 times I come here. Best view   

                                           imageUrls  reviewerNumberOfReviews  \
0  https://lh3.googleusercontent.com/geougc-cs/

In [163]:
def apply_rule_based(df):
    # Ensure publishedAtDate is datetime
    df["publishedAtDate"] = pd.to_datetime(df["publishedAtDate"], errors="coerce")
    
    # Start with all NaN (means "not yet decided")
    df["ruleQualityLabel"] = np.nan
    
    # Rule 1: Link → qualityLevel = 0
    df.loc[(df["hasLink"] == 1) | (df["isAdvertisement"] == 1), "ruleQualityLabel"] = 0

    
    # Rule 2: Spamming by same user at same location in short time
    df = df.sort_values(by=["reviewerName", "locationName", "publishedAtDate"])
    df["timeDiff"] = df.groupby(["reviewerName", "locationName"])["publishedAtDate"].diff().dt.total_seconds()
    
    # Suspicious if multiple reviews within 10 minutes (600 sec)
    spam_mask = df["timeDiff"].notna() & (df["timeDiff"] < 600)
    df.loc[spam_mask, "ruleQualityLabel"] = 0
    
    # Rule 3: Duplicate content (exact match across reviews)
    # Flag duplicates but do NOT delete them
    duplicate_mask = df.duplicated(subset=["reviewText"], keep=False)
    df.loc[duplicate_mask, "ruleQualityLabel"] = 0

    return df

In [164]:
X = df
y = df["qualityLevel"]

# train test split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42, stratify=y)

In [165]:
X_test = apply_rule_based(X_test)
rule_labels = X_test["ruleQualityLabel"]
X_test = X_test[X_test["ruleQualityLabel"] == 0]
print("Rule Based:")
print(classification_report(y_true=X_test["qualityLevel"], y_pred=X_test["ruleQualityLabel"]))

Rule Based:
              precision    recall  f1-score   support

           0       0.99      1.00      0.99       142
           1       0.00      0.00      0.00         1
           2       0.00      0.00      0.00         1

    accuracy                           0.99       144
   macro avg       0.33      0.33      0.33       144
weighted avg       0.97      0.99      0.98       144



  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


In [166]:
def final_quality_labels(rule_labels, rf_probs, dual_probs):
    """
    rule_labels: array of 0/1/2 from rule-based model
    rf_probs: n_samples x 3 array from RF predict_proba
    dual_probs: n_samples x 3 array from dual-head model predict_proba
    """
    n_samples = len(rf_probs)
    final = np.zeros(n_samples, dtype=int)

    rule_labels = np.array(rule_labels)
    rf_probs = np.array(rf_probs)
    dual_probs = np.array(dual_probs)
    
    # average the probabilities for samples where rule-based != 0
    avg_probs = (rf_probs + dual_probs) / 2

    # assign final labels
    for i in range(n_samples):
        if rule_labels[i] == 0:
            final[i] = 0
        else:
            final[i] = np.argmax(avg_probs[i])
    
    return final


In [167]:
# load rf
with open('random_forest_model.pkl', 'rb') as f:
    rf = pickle.load(f)

In [168]:
df = df.fillna(0)
X = df[["readabilityScore", "grammarSpellingScore", "sentimentRatingDiff", "reviewWordCount", "imageQualityScore"]]  
y = df["qualityLevel"]

# train test split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42, stratify=y)
rf_probs = rf.predict_proba(X_test) 


In [169]:
X = df["reviewText"]
y = df["qualityLevel"]
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42, stratify=y)

bundle_dir = "artifacts/review_mtl_v1"
clf = ReviewMTLPredictor.from_dir(bundle_dir)

pred_df = clf.predict(X_test.tolist())

In [170]:
dual_probs_quality = pred_df[["p_quality_low", "p_quality_medium", "p_quality_good"]].values.tolist()
dual_probs_relevance = pred_df[["p_relevance_irrel", "p_relevance_rel"]].values.tolist()

In [171]:
quality_labels = final_quality_labels(rule_labels, rf_probs, dual_probs_quality)
outcome = pd.DataFrame(X_test, columns=["reviewText"]) 

# Add the predicted labels
outcome["qualityLabel"] = quality_labels

print(outcome.head())

                                             reviewText  qualityLabel
3084                      Good quality drinks and food.             0
2984                                Seaside great view!             0
2260                                   Great experience             0
1979  2 hours since order and not received.\nHorribl...             1
1889  The interior is very beautiful. It was a bit s...             0


In [172]:
print(classification_report(y_true=y_test, y_pred=quality_labels))

              precision    recall  f1-score   support

           0       0.82      0.86      0.84       530
           1       0.72      0.62      0.66       334
           2       0.66      0.76      0.71       138

    accuracy                           0.77      1002
   macro avg       0.73      0.75      0.74      1002
weighted avg       0.77      0.77      0.76      1002



In [173]:
# load logistic regression
with open('logreg_model.pkl', 'rb') as f:
    logreg = pickle.load(f)

In [174]:
X = df[["categoryRelevanceScore", "imageRelevanceScore"]]
y = df["isRelevant"]
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42, stratify=y)
logreg_probs = logreg.predict_proba(X_test)

In [175]:
def final_relevance_labels(dual_probs, logreg_probs):
    dual_probs = np.array(dual_probs)  # from dual-head model
    logreg_probs = np.array(logreg_probs)  # from logistic regression

    # Average the probabilities of "1"
    avg_probs_1 = (dual_probs[:, 1] + logreg_probs[:, 1]) / 2

    # Convert to binary labels
    final_labels = (avg_probs_1 >= 0.5).astype(int)
    return final_labels

In [176]:
relevance_labels = final_relevance_labels(dual_probs_relevance, logreg_probs)

In [179]:
X = df["reviewText"]
y = df["isRelevant"]
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42, stratify=y)
outcome = pd.DataFrame(X_test, columns=["reviewText"]) 
outcome["relevanceLabel"] = relevance_labels
print(outcome.head())

                                             reviewText  relevanceLabel
973   The Pyramids of Giza arent just piles of stone...               1
1938  I have become a regular customer of this shop....               1
3296  We came to drink beyran soup for breakfast. We...               1
4613  The lakeside kayak rentals offered easy reserv...               1
2743  Lovely space with view of the river and downto...               1


In [178]:
print(classification_report(y_true=y_test, y_pred=relevance_labels))

              precision    recall  f1-score   support

           0       0.33      0.00      0.01       216
           1       0.78      1.00      0.88       786

    accuracy                           0.78      1002
   macro avg       0.56      0.50      0.44      1002
weighted avg       0.69      0.78      0.69      1002

