In [35]:
import json
import pandas as pd
import numpy as np


with open("filter_all_t.json", "r") as f:
    data = json.load(f)

train_df = pd.DataFrame(data["train"])
val_df   = pd.DataFrame(data.get("val", []))
test_df  = pd.DataFrame(data.get("test", []))

In [22]:
train_df.head()

Unnamed: 0,business_id,user_id,rating,review_text,pics,history_reviews
0,60567465d335d0abfb415b26,101074926318992653684,4,The tang of the tomato sauce is outstanding. A...,"[AF1QipM-2IRmvitARbcJr7deWfe5hyVBg_ArPMQSYvq0,...",[[101074926318992653684_6056272797d555cc6fb0d1...
1,6050fa9f5b4ccec8d5cae994,117065749986299237881,5,Chicken and waffles were really good!,[AF1QipMpfxIZUT_aymQ3qPGO-QgGYzxbtLZGmHufAp2s],[[117065749986299237881_605206f8d8c08f462b93e8...
2,604be10877e81aaed3cc9a1e,106700937793048450809,4,The appetizer of colossal shrimp was very good...,"[AF1QipMNnqM5X9sSyZ9pXRZ1jvrURHN9bZhGdzuEXoP8,...",[[106700937793048450809_6044300b27f39b7b5d1dbf...
3,60411e017cd8bf130362365a,101643045857250355161,5,The fish tacos here omg! The salad was great ...,"[AF1QipM-a6AGGp4Hgk5RD0gY5sDRp5kEfB1hZLvlRkft,...",[[101643045857250355161_604fbdd099686c10168c91...
4,604139dd7cd8bf1303624208,109802745326785766951,4,"Ribs are great, as are the mac and cheese, fri...",[AF1QipNVys4yq-5w_3EsDdHpSc9ZNb7Nl30Mfb6Y0Gup],[[109802745326785766951_60524fa9f09a4ffff042f9...


In [4]:
user_avg = train_df.groupby("user_id")["rating"].mean()
item_avg = train_df.groupby("business_id")["rating"].mean()
user_count = train_df.groupby("user_id").size()
item_count = train_df.groupby("business_id").size()

In [None]:
def add_features(df):
    df["user_avg"]   = df["user_id"].map(user_avg)
    df["item_avg"]   = df["business_id"].map(item_avg)
    df["user_count"] = df["user_id"].map(user_count)
    df["item_count"] = df["business_id"].map(item_count)
    df["review_len"] = df["review_text"].str.len()

    df[["user_avg", "item_avg", "user_count", "item_count"]] = \
    df[["user_avg", "item_avg", "user_count", "item_count"]].fillna(0)

    return df

In [15]:
train_df = add_features(train_df)
val_df   = add_features(val_df)
test_df  = add_features(test_df)

In [16]:
from sklearn.feature_extraction.text import TfidfVectorizer

tfidf = TfidfVectorizer(max_features=3000, stop_words="english")

X_train_text = tfidf.fit_transform(train_df["review_text"])
X_val_text   = tfidf.transform(val_df["review_text"])
X_test_text  = tfidf.transform(test_df["review_text"])

In [17]:
import numpy as np
from scipy.sparse import hstack

numeric_cols = ["user_avg", "item_avg", "user_count", "item_count", "review_len"]

X_train_num = train_df[numeric_cols].values
X_val_num   = val_df[numeric_cols].values
X_test_num  = test_df[numeric_cols].values

X_train = hstack([X_train_text, X_train_num])
X_val   = hstack([X_val_text,   X_val_num])
X_test  = hstack([X_test_text,  X_test_num])

y_train = train_df["rating"].values
y_val   = val_df["rating"].values
y_test  = test_df["rating"].values

In [18]:
from sklearn.linear_model import LinearRegression
model = LinearRegression()
model.fit(X_train, y_train)

In [19]:
from sklearn.metrics import mean_squared_error
import numpy as np

pred_val = model.predict(X_val)
pred_test = model.predict(X_test)

rmse_val = np.sqrt(mean_squared_error(y_val, pred_val))
rmse_test = np.sqrt(mean_squared_error(y_test, pred_test))

print("Val RMSE:", rmse_val)
print("Test RMSE:", rmse_test)

Val RMSE: 3.429614772889446
Test RMSE: 3.4324610016438184


In [10]:
from sentence_transformers import SentenceTransformer

model = SentenceTransformer('all-MiniLM-L6-v2')

modules.json:   0%|          | 0.00/349 [00:00<?, ?B/s]

config_sentence_transformers.json:   0%|          | 0.00/116 [00:00<?, ?B/s]

README.md: 0.00B [00:00, ?B/s]

sentence_bert_config.json:   0%|          | 0.00/53.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/612 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/90.9M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/350 [00:00<?, ?B/s]

vocab.txt: 0.00B [00:00, ?B/s]

tokenizer.json: 0.00B [00:00, ?B/s]

special_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/190 [00:00<?, ?B/s]

In [None]:
def embed_reviews(df, model):
    review_list = df['review_text'].fillna("").tolist()

    embeddings = model.encode(
        review_list,
        batch_size=64,
        show_progress_bar=False,
        convert_to_numpy=True
    )
    df = df.copy()
    df['emb'] = list(embeddings)
    return df

def build_user_item_dict(train_df):
    # user embedding from train
    user_emb_dict = (
        train_df.groupby('user_id')['emb']
        .apply(lambda x: np.mean(np.vstack(x.values), axis=0))
        .to_dict()
    )

    # item embedding from train
    item_emb_dict = (
        train_df.groupby('business_id')['emb']
        .apply(lambda x: np.mean(np.vstack(x.values), axis=0))
        .to_dict()
    )

    # global average for cold-start users/items
    global_user_emb = np.mean(np.vstack(train_df['emb']), axis=0)
    global_item_emb = global_user_emb.copy()

    return user_emb_dict, item_emb_dict, global_user_emb, global_item_emb

def build_Xy(df, user_emb_dict, item_emb_dict, global_user_emb, global_item_emb):
    X, y = [], []

    for idx, row in df.iterrows():
        uid = row['user_id']
        bid = row['business_id']

        # 如果 val/test user 沒看過 → cold start
        u_emb = user_emb_dict.get(uid, global_user_emb)
        i_emb = item_emb_dict.get(bid, global_item_emb)

        X.append(np.concatenate([u_emb, i_emb]))

        y.append(row['rating'])

    return np.array(X), np.array(y)

In [37]:
X_train, y_train = EMB(train_df)
X_val, y_val = EMB(val_df)
X_test, y_test = EMB(test_df)

Batches:   0%|          | 0/1360 [00:00<?, ?it/s]

Batches:   0%|          | 0/170 [00:00<?, ?it/s]

ValueError: Length of values (10860) does not match length of index (87013)

In [None]:
import xgboost as xgb

model = xgb.XGBRegressor(
    n_estimators=300,
    max_depth=6,
    learning_rate=0.05
)

model.fit(X_train, y_train)

In [None]:
pred_val = model.predict(X_val)
pred_test = model.predict(X_test)

rmse_val = np.sqrt(mean_squared_error(y_val, pred_val))
rmse_test = np.sqrt(mean_squared_error(y_test, pred_test))

print("Val RMSE:", rmse_val)
print("Test RMSE:", rmse_test)