In [3]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report
import pickle

In [4]:
# ID of the Google Sheets document to download
file_id = "1xkgCgwFvHSIST_pRXB4zmKRGAqw3Yyti"

# Export endpoint that serves the document as an Excel (.xlsx) workbook
url = f"https://docs.google.com/spreadsheets/d/{file_id}/export?format=xlsx"

# pandas downloads the workbook from the URL and parses it into a DataFrame
df = pd.read_excel(url)

# Print the first five rows as a quick sanity check of the load
print(df.head(n=5))

         locationName            category   reviewerName      publishedAtDate  \
0  Gardens by the Bay  Tourist attraction  Stylusmaestro  2021-12-29 10:13:13   
1  Gardens by the Bay  Tourist attraction      JoJo Chin  2020-03-21 09:06:31   
2  Gardens by the Bay  Tourist attraction     Theticus _  2020-03-03 00:12:25   
3  Gardens by the Bay  Tourist attraction        sky wda  2020-01-02 18:05:38   
4  Gardens by the Bay  Tourist attraction    Trúc Nguyễn  2020-01-01 12:22:38   

   rating                                         reviewText  \
0     0.5  For a weekday this is considered very crowded....   
1     1.0  Awsome view with breeze wind. Suitable for all...   
2     0.5  Great place to hang out and have picnic especi...   
3     1.0  Every years great place to catch firework on S...   
4     1.0        This is the 10 times I come here. Best view   

                                           imageUrls  reviewerNumberOfReviews  \
0  https://lh3.googleusercontent.com/geougc-cs/

In [7]:
# -------- SELECT FEATURES & TARGET --------
feature_cols = ["categoryRelevanceScore", "imageRelevanceScore"]
target_col = "isRelevant"

# Fail early with a readable message if the loaded sheet lacks a needed column.
missing_cols = [col for col in feature_cols + [target_col] if col not in df.columns]
if missing_cols:
    raise KeyError(f"Columns missing from the loaded sheet: {missing_cols}")

# Work on derived frames instead of rebinding `df`, so the raw data loaded
# earlier stays intact. Rows without a label are dropped rather than
# zero-filled: a blanket `fillna(0)` would silently turn them into class-0
# training examples.
labelled = df.dropna(subset=[target_col])
X = labelled[feature_cols].fillna(0)  # a missing score is treated as 0
y = labelled[target_col]

# -------- TRAIN-TEST SPLIT (stratified) --------
# Stratifying on y keeps the class ratio the same in both partitions.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y
)

# -------- TRAIN LOGISTIC REGRESSION --------
# Seed the estimator as well as the split so a re-run reproduces the same fit.
# NOTE(review): the recorded run of this cell shows recall of only 0.04 for
# class 0, i.e. the model predicts class 1 almost always. Consider
# class_weight="balanced" or a tuned decision threshold before relying on it.
model = LogisticRegression(solver='liblinear', random_state=42)
model.fit(X_train, y_train)

# -------- PREDICT PROBABILITIES & DEFAULT THRESHOLD --------
# predict_proba columns follow model.classes_, so look up the column of the
# positive class (label 1) instead of assuming it is the second one.
positive_col = list(model.classes_).index(1)
y_prob = model.predict_proba(X_test)[:, positive_col]
y_pred = (y_prob >= 0.5).astype(int)  # default threshold

# -------- EVALUATION --------
print("\nClassification Report (threshold = 0.5):")
print(classification_report(y_test, y_pred))

# -------- INSPECT COEFFICIENTS --------
# For a binary problem coef_ has a single row, ordered like the feature columns.
for feat, coef in zip(X.columns, model.coef_[0]):
    print(f"Coefficient for {feat}: {coef:.4f}")
print(f"Intercept: {model.intercept_[0]:.4f}")



Classification Report (threshold = 0.5):
              precision    recall  f1-score   support

           0       0.36      0.04      0.07       216
           1       0.79      0.98      0.87       786

    accuracy                           0.78      1002
   macro avg       0.58      0.51      0.47      1002
weighted avg       0.70      0.78      0.70      1002

Coefficient for categoryRelevanceScore: 8.1405
Coefficient for imageRelevanceScore: -0.0831
Intercept: -1.2733


In [9]:
# Serialize the fitted classifier to disk so it can be loaded again later.
# NOTE(review): only unpickle this file from a trusted location; loading a
# pickle executes code embedded in it.
with open("logreg_model.pkl", mode="wb") as model_file:
    pickle.dump(model, model_file)