In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report
from sklearn.ensemble import RandomForestClassifier
import pickle

Pyarrow will become a required dependency of pandas in the next major release of pandas (pandas 3.0),
(to allow more performant data types, such as the Arrow string type, and better interoperability with other libraries)
but was not found to be installed on your system.
If this would cause problems for you,
please provide us feedback at https://github.com/pandas-dev/pandas/issues/54466
        
  import pandas as pd


In [2]:
# Google Sheet file ID
file_id = "1xkgCgwFvHSIST_pRXB4zmKRGAqw3Yyti"

# Export as Excel (.xlsx)
url = f"https://docs.google.com/spreadsheets/d/{file_id}/export?format=xlsx"

# Downloads the workbook over the network on every run of this cell.
df = pd.read_excel(url)

# Preview: a bare last expression uses the notebook's rich table display
# instead of the wrapped plain-text dump that print() produces for wide frames.
df.head()

         locationName            category   reviewerName      publishedAtDate  \
0  Gardens by the Bay  Tourist attraction  Stylusmaestro  2021-12-29 10:13:13   
1  Gardens by the Bay  Tourist attraction      JoJo Chin  2020-03-21 09:06:31   
2  Gardens by the Bay  Tourist attraction     Theticus _  2020-03-03 00:12:25   
3  Gardens by the Bay  Tourist attraction        sky wda  2020-01-02 18:05:38   
4  Gardens by the Bay  Tourist attraction    Trúc Nguyễn  2020-01-01 12:22:38   

   rating                                         reviewText  \
0     0.5  For a weekday this is considered very crowded....   
1     1.0  Awsome view with breeze wind. Suitable for all...   
2     0.5  Great place to hang out and have picnic especi...   
3     1.0  Every years great place to catch firework on S...   
4     1.0        This is the 10 times I come here. Best view   

                                           imageUrls  reviewerNumberOfReviews  \
0  https://lh3.googleusercontent.com/geougc-cs/

In [17]:
def apply_rule_based(df, spam_window_seconds=600):
    """Flag reviews as low quality (label 0) using three hand-written rules.

    A review is labelled 0 when any of the following holds:

    1. ``hasLink`` or ``isAdvertisement`` equals 1.
    2. It was posted less than ``spam_window_seconds`` after the previous
       review by the same ``reviewerName`` at the same ``locationName``.
    3. Its ``reviewText`` is identical to that of another row.

    Rows that no rule fires on keep ``NaN`` ("not yet decided").

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain ``publishedAtDate``, ``hasLink``, ``isAdvertisement``,
        ``reviewerName``, ``locationName`` and ``reviewText``.
    spam_window_seconds : float, default 600
        Maximum gap, in seconds, between two consecutive reviews by one
        reviewer at one location for the later one to count as spam.

    Returns
    -------
    pandas.DataFrame
        A copy of ``df`` in the same row order and with the same index, with
        ``publishedAtDate`` parsed to datetime (unparseable values become
        ``NaT``) and two added columns: ``ruleQualityLabel`` (0.0 or NaN) and
        ``timeDiff`` (seconds since the reviewer's previous review at that
        location, NaN for the first one).
    """
    # Work on a copy: the caller's frame (often a slice of a larger frame)
    # must not be left half-modified.
    labelled = df.copy()

    # Ensure publishedAtDate is datetime
    labelled["publishedAtDate"] = pd.to_datetime(labelled["publishedAtDate"], errors="coerce")

    # Start with all NaN (means "not yet decided")
    labelled["ruleQualityLabel"] = np.nan

    # Rule 1: contains a link or is an advertisement -> label 0
    link_or_ad = (labelled["hasLink"] == 1) | (labelled["isAdvertisement"] == 1)
    labelled.loc[link_or_ad, "ruleQualityLabel"] = 0

    # Rule 2: spamming by same user at same location in short time.
    # The gaps are computed on a chronologically sorted, position-indexed
    # helper frame and written back by position, so the returned frame keeps
    # the caller's row order even when the index has duplicate labels.
    group_keys = ["reviewerName", "locationName"]
    by_position = labelled[group_keys + ["publishedAtDate"]].reset_index(drop=True)
    chronological = by_position.sort_values(by=group_keys + ["publishedAtDate"])
    gaps = chronological.groupby(group_keys)["publishedAtDate"].diff().dt.total_seconds()
    labelled["timeDiff"] = gaps.reindex(by_position.index).to_numpy()

    spam_mask = labelled["timeDiff"].notna() & (labelled["timeDiff"] < spam_window_seconds)
    labelled.loc[spam_mask, "ruleQualityLabel"] = 0

    # Rule 3: duplicate content (exact match across reviews).
    # Duplicates are flagged, not deleted; keep=False marks every copy.
    # NOTE(review): pandas treats missing values as equal here, so two or more
    # rows with no reviewText are all flagged -- confirm that is intended.
    duplicate_mask = labelled.duplicated(subset=["reviewText"], keep=False)
    labelled.loc[duplicate_mask, "ruleQualityLabel"] = 0

    return labelled

In [20]:
# The whole frame is split (not just model features) because the rule-based
# step applied to X_test reads raw columns such as reviewer name and text.
y = df["qualityLevel"]
X = df

# Stratified 80/20 hold-out split with a fixed seed
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    stratify=y,
    random_state=42,
    test_size=0.2,
)

In [23]:
# Label the held-out rows with the rule-based filter. The result is bound to
# new names instead of overwriting X_test, so re-running this cell always
# starts from the untouched split rather than from already-filtered rows.
X_test_ruled = apply_rule_based(X_test)
rule_labels = X_test_ruled["ruleQualityLabel"]

# The rules only ever emit label 0, so the report covers flagged rows only:
# it shows how many flagged reviews truly have qualityLevel 0.
flagged = X_test_ruled[rule_labels == 0]
print("Rule Based:")
# zero_division=0 yields the same numbers as the default but without the
# undefined-metric warnings for classes that are never predicted.
print(
    classification_report(
        y_true=flagged["qualityLevel"],
        y_pred=flagged["ruleQualityLabel"],
        zero_division=0,
    )
)

Rule Based:
              precision    recall  f1-score   support

           0       0.99      1.00      1.00       133
           2       0.00      0.00      0.00         1

    accuracy                           0.99       134
   macro avg       0.50      0.50      0.50       134
weighted avg       0.99      0.99      0.99       134



  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


In [24]:
def final_quality_labels(rule_labels, rf_probs, dual_probs):
    """Combine the rule-based labels with the two probabilistic models.

    Rows the rules flagged (label 0) stay 0. Every other row, including rows
    whose rule label is NaN/undecided, gets the argmax of the mean of the two
    probability matrices.

    Parameters
    ----------
    rule_labels : array-like of shape (n_samples,)
        Rule-based labels; 0 means "flagged". Read strictly by position, so a
        pandas Series must already be in the same row order as the
        probability matrices -- its index labels are ignored.
    rf_probs : array-like of shape (n_samples, n_classes)
        Output of the random forest's ``predict_proba``.
    dual_probs : array-like of shape (n_samples, n_classes)
        Output of the dual-head model's ``predict_proba``.

    Returns
    -------
    numpy.ndarray of int, shape (n_samples,)
        Final label per row. For unflagged rows this is the column index of
        the highest averaged probability.
        NOTE(review): assumes column i corresponds to class label i -- confirm
        against the models' class ordering.

    Raises
    ------
    ValueError
        If the two probability matrices differ in shape or their row count
        does not match ``len(rule_labels)``.
    """
    # np.asarray drops any pandas index, making the lookup positional; with a
    # Series, rule_labels[i] would be a label lookup and pick the wrong row.
    rules = np.asarray(rule_labels, dtype=float)
    rf_arr = np.asarray(rf_probs, dtype=float)
    dual_arr = np.asarray(dual_probs, dtype=float)

    n_samples = rules.shape[0]
    if n_samples == 0:
        return np.zeros(0, dtype=int)

    if rf_arr.shape != dual_arr.shape:
        raise ValueError(
            f"rf_probs {rf_arr.shape} and dual_probs {dual_arr.shape} must have the same shape"
        )
    if rf_arr.ndim != 2 or rf_arr.shape[0] != n_samples:
        raise ValueError(
            f"expected probabilities of shape ({n_samples}, n_classes), got {rf_arr.shape}"
        )

    # Average the two models, then take the most likely class per row
    ensemble_choice = ((rf_arr + dual_arr) / 2).argmax(axis=1)

    # The rule-based verdict overrides the ensemble wherever it fired
    return np.where(rules == 0, 0, ensemble_choice).astype(int)


In [25]:
# load rf
# NOTE: pickle.load can execute arbitrary code while deserialising, so only
# load model files that come from a trusted source.
with open('random_forest_model.pkl', 'rb') as f:
    rf = pickle.load(f)

# Fail here, not several cells later, if the file holds something that cannot
# produce class probabilities.
if not hasattr(rf, "predict_proba"):
    raise TypeError(
        f"random_forest_model.pkl contains a {type(rf).__name__}, which has no predict_proba method"
    )

In [26]:
# Replace every missing value in the frame with 0 before building the features
df = df.fillna(0)

# Numeric inputs fed to the random forest
feature_columns = [
    "readabilityScore",
    "grammarSpellingScore",
    "sentimentRatingDiff",
    "reviewWordCount",
    "imageQualityScore",
]
y = df["qualityLevel"]
X = df[feature_columns]

# train test split (stratified 80/20, fixed seed)
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    stratify=y,
    random_state=42,
    test_size=0.2,
)

# Per-class probabilities for the held-out rows
rf_probs = rf.predict_proba(X_test)

In [None]:
# rule_labels is a pandas Series whose row order comes from the rule-based
# step, while rf_probs follows the row order of X_test. Put the labels into
# X_test's order and pass a plain array so the combination is positional.
assert set(rule_labels.index) == set(X_test.index), (
    "rule_labels and X_test must come from the same train/test split"
)
aligned_rule_labels = rule_labels.reindex(X_test.index).to_numpy()

# NOTE(review): dual_probs is not defined in any cell visible here -- it must
# be computed for the same X_test rows, in the same order, before this runs.
quality_labels = final_quality_labels(aligned_rule_labels, rf_probs, dual_probs)

NameError: name 'final_labels' is not defined