In [7]:
import os
import numpy as np
import pandas as pd
import AbstractBaseCollabFilterSGD
from CollabFilterOneVectorPerItem import CollabFilterOneVectorPerItem
from train_valid_test_loader import load_train_valid_test_datasets
from sklearn.model_selection import train_test_split

from lightfm import LightFM
from lightfm.data import Dataset as LFDataset
from lightfm.evaluation import precision_at_k, auc_score

In [2]:
# Plotting libraries used by this notebook.
import matplotlib
import matplotlib.pyplot as plt
import seaborn as sns

# Start from matplotlib's bundled seaborn-like stylesheet ...
plt.style.use('seaborn-v0_8')

# ... then apply seaborn's own settings: notebook context, white grid,
# default font scale.
sns.set(context='notebook', style='whitegrid', font_scale=1.0)

In [3]:
# Pre-split datasets and id counts provided by the course loader.
(train_data_tuple,
 valid_data_tuple,
 test_data_tuple,
 total_n_users,
 total_n_items) = load_train_valid_test_datasets()

# Directory holding the raw MovieLens-100k CSV files.
data_path = 'data_movie_lens_100k/'

# Read the four CSV files in one pass: development ratings, user info,
# movie info, and the leaderboard pairs to be predicted.
ratings_df, users_df, movies_df, masked_test_df = (
    pd.read_csv(os.path.join(data_path, csv_name))
    for csv_name in (
        'ratings_all_development_set.csv',
        'user_info.csv',
        'movie_info.csv',
        'ratings_masked_leaderboard_set.csv',
    )
)

# Hold out 20% of the development ratings for validation (fixed seed).
train_df, val_df = train_test_split(ratings_df, random_state=42, test_size=0.2)

In [5]:
# Register every user and item id from the info tables so that LightFM can
# translate raw ids into its internal matrix indices.
lf_data = LFDataset()
lf_data.fit(users_df['user_id'], movies_df['item_id'])

# Add features
# Each entry is (raw_id, [feature_name, ...]). Names are prefixed with their
# source column so that equal string values from different columns (e.g. an
# age of "1" and is_male "1") stay distinct features.
user_features = [
    (user_id, [f'age:{age}', f'is_male:{is_male}'])
    for user_id, age, is_male in zip(users_df['user_id'],
                                     users_df['age'],
                                     users_df['is_male'])
]
# The single year feature is wrapped in a list on purpose: a bare string here
# is iterated character by character below, which would register the digits
# of the year as features instead of the year itself.
movie_features = [
    (item_id, [f'release_year:{year}'])
    for item_id, year in zip(movies_df['item_id'], movies_df['release_year'])
]

# Register the feature vocabulary (duplicates in these lists are harmless).
lf_data.fit_partial(
    users=[user_id for user_id, _ in user_features],
    items=[item_id for item_id, _ in movie_features],
    user_features=[name for _, names in user_features for name in names],
    item_features=[name for _, names in movie_features for name in names],
)

# Build interactions
# Rows of train_df are unpacked as (user_id, item_id, rating). `interactions`
# marks which pairs were rated; `weights` carries the rating values.
(interactions, weights) = lf_data.build_interactions(
    (user_id, item_id, rating) for user_id, item_id, rating in train_df.values
)

# Sparse feature matrices aligned with the dataset's internal indices.
user_feat = lf_data.build_user_features(user_features)
item_feat = lf_data.build_item_features(movie_features)

In [6]:
# Train a hybrid LightFM model with the WARP ranking loss.
# Losses accepted by LightFM: 'logistic', 'bpr', 'warp', 'warp-kos'
# (there is no 'mae' loss).
# random_state seeds the model so reruns start from the same initialisation;
# multi-threaded training may still introduce small run-to-run differences.
# NOTE(review): the rating `weights` returned by build_interactions are not
# passed as sample_weight, so every rated pair (including low ratings) is
# treated as an equally positive interaction - confirm this is intended.
model = LightFM(loss='warp', random_state=42)
model.fit(interactions,
          user_features=user_feat,
          item_features=item_feat,
          epochs=30,
          num_threads=4)

<lightfm.lightfm.LightFM at 0x16cfeca10>

In [17]:
# --- Ranking metrics on the held-out validation split -----------------------
# Scoring on the training interactions only measures how well the model
# memorised them, so the validation rows are turned into their own matrix.
(val_interactions, _) = lf_data.build_interactions(
    (user_id, item_id, rating) for user_id, item_id, rating in val_df.values
)

# train_interactions excludes pairs already seen in training from each user's
# ranking, so they do not crowd out the held-out items.
eval_kwargs = dict(train_interactions=interactions,
                   user_features=user_feat,
                   item_features=item_feat)
precision = precision_at_k(model, val_interactions, k=5, **eval_kwargs).mean()
auc = auc_score(model, val_interactions, **eval_kwargs).mean()
print(f'validation precision@5 = {precision:.4f}, AUC = {auc:.4f}')

# --- Leaderboard predictions ------------------------------------------------
masked_user_ids = masked_test_df['user_id'].values
masked_item_ids = masked_test_df['item_id'].values

# model.predict works on the dataset's internal indices, not raw ids, so the
# raw ids are translated through the mapping built by lf_data. An id that was
# never registered raises KeyError instead of silently scoring the wrong row.
user_id_map, _, item_id_map, _ = lf_data.mapping()
masked_user_idx = np.array([user_id_map[u] for u in masked_user_ids], dtype=np.int32)
masked_item_idx = np.array([item_id_map[i] for i in masked_item_ids], dtype=np.int32)

lightfm_preds = model.predict(
    masked_user_idx,
    masked_item_idx,
    user_features=user_feat,
    item_features=item_feat
)

# NOTE(review): with a ranking loss such as WARP these are ranking scores, not
# ratings on a 1-5 scale; clipping only forces them into range. Confirm the
# leaderboard metric, or calibrate scores to ratings before submitting.
lightfm_preds = np.clip(lightfm_preds, 1.0, 5.0)
np.savetxt("predicted_ratings_leaderboard.txt", lightfm_preds, fmt="%.2f")
