In [214]:
import pandas as pd
import numpy as np
from lightfm import LightFM

## 1. Data Pre-Proceesing

### A- User Interactions

In [215]:
ratings_df = pd.read_csv("Data/archive/rating.csv")

In [216]:
ratings_df["userId"] = ratings_df["userId"].astype("int32")
ratings_df["movieId"] = ratings_df["movieId"].astype("int32")
ratings_df["rating"] = ratings_df["rating"].astype("float32")

In [217]:
ratings_df.drop(["timestamp"], axis=1, inplace=True)
ratings_df.head(2)

Unnamed: 0,userId,movieId,rating
0,1,2,3.5
1,1,29,3.5


In [218]:
ratings_df.values

array([[1.00000e+00, 2.00000e+00, 3.50000e+00],
       [1.00000e+00, 2.90000e+01, 3.50000e+00],
       [1.00000e+00, 3.20000e+01, 3.50000e+00],
       ...,
       [1.38493e+05, 6.96440e+04, 3.00000e+00],
       [1.38493e+05, 7.02860e+04, 5.00000e+00],
       [1.38493e+05, 7.16190e+04, 2.50000e+00]])

### B- Movie Features

In [230]:
movies_df = pd.read_csv("Data/imdb_encoded_3.csv")

In [231]:
movies_df.head(2)

Unnamed: 0,originalTitle,imdbId,movieId,ImdbRating,youth_rate,popularity_rate,Action,Adventure,Animation,Biography,...,Music,Musical,Mystery,Romance,Sci-Fi,Short,Sport,Thriller,War,Western
0,Blacksmith Scene,5,95541,0.033107,0.121019,0.000956,0,0,0,0,...,0,0,0,0,0,1,0,0,0,0
1,Edison Kinetoscopic Record of a Sneeze,8,88674,-0.766892,0.127389,0.000756,0,0,0,0,...,0,0,0,0,0,1,0,0,0,0


In [195]:
movies_df = movies_df.round(2)

In [196]:
movies_df.columns

Index(['originalTitle', 'imdbId', 'movieId', 'ImdbRating', 'youth_rate',
       'popularity_rate', 'Action', 'Adventure', 'Animation', 'Biography',
       'Comedy', 'Crime', 'Documentary', 'Drama', 'Family', 'Fantasy',
       'Film-Noir', 'History', 'Horror', 'Music', 'Musical', 'Mystery',
       'Romance', 'Sci-Fi', 'Short', 'Sport', 'Thriller', 'War', 'Western'],
      dtype='object')

In [197]:
movies_df.drop(["imdbId"], inplace=True, axis=1)

In [198]:
movie_features_matrix = pd.DataFrame(
    {
        "movieId": movies_df["movieId"],
        "features": movies_df[movies_df.columns.drop("movieId")].to_dict(
            orient="records"
        ),
    }
)
movie_features_matrix.head(2)

Unnamed: 0,movieId,features
0,95541,"{'originalTitle': 'Blacksmith Scene', 'ImdbRat..."
1,88674,{'originalTitle': 'Edison Kinetoscopic Record ...


In [199]:
movie_features_matrix["features"] = movie_features_matrix["features"].apply(
    lambda x: list(f"{k}:{v}" for k, v in x.items())
)

In [200]:
movie_features = (
    movie_features_matrix["features"].explode().reset_index(drop=True).unique()
)
movie_features

array(['originalTitle:Blacksmith Scene', 'ImdbRating:0.03',
       'youth_rate:0.12', ..., 'originalTitle:Polskie gówno',
       'originalTitle:Power Rangers',
       'originalTitle:Aziz Ansari Live in Madison Square Garden'],
      dtype=object)

In [201]:
movie_features_matrix=list(zip(movie_features_matrix.movieId, movie_features_matrix.features))
movie_features_matrix

[(95541,
  ['originalTitle:Blacksmith Scene',
   'ImdbRating:0.03',
   'youth_rate:0.12',
   'popularity_rate:0.0',
   'Action:0',
   'Adventure:0',
   'Animation:0',
   'Biography:0',
   'Comedy:1',
   'Crime:0',
   'Documentary:0',
   'Drama:0',
   'Family:0',
   'Fantasy:0',
   'Film-Noir:0',
   'History:0',
   'Horror:0',
   'Music:0',
   'Musical:0',
   'Mystery:0',
   'Romance:0',
   'Sci-Fi:0',
   'Short:1',
   'Sport:0',
   'Thriller:0',
   'War:0',
   'Western:0']),
 (88674,
  ['originalTitle:Edison Kinetoscopic Record of a Sneeze',
   'ImdbRating:-0.77',
   'youth_rate:0.13',
   'popularity_rate:0.0',
   'Action:0',
   'Adventure:0',
   'Animation:0',
   'Biography:0',
   'Comedy:0',
   'Crime:0',
   'Documentary:1',
   'Drama:0',
   'Family:0',
   'Fantasy:0',
   'Film-Noir:0',
   'History:0',
   'Horror:0',
   'Music:0',
   'Musical:0',
   'Mystery:0',
   'Romance:0',
   'Sci-Fi:0',
   'Short:1',
   'Sport:0',
   'Thriller:0',
   'War:0',
   'Western:0']),
 (120869,
  ["ori

---

## 2. Dataset preparation

In [202]:
from lightfm.data import Dataset

dataset = Dataset()

In [203]:
dataset.fit(
    users=ratings_df["userId"],
    items=ratings_df["movieId"],
    item_features=movie_features,
)

In [204]:
dataset.model_dimensions()

(138493, 53100)

In [205]:
dataset.item_features_shape()

(26744, 53100)

### A- Interactions

In [206]:
interactions, weights = dataset.build_interactions(ratings_df.values)

In [207]:
from lightfm.cross_validation import random_train_test_split

train_interactions, test_interactions = random_train_test_split(
    interactions, test_percentage=0.2, random_state=42
)
train_weights, test_weights = random_train_test_split(
    weights, test_percentage=0.2, random_state=42
)

In [208]:
train_interactions, test_interactions

(<138493x26744 sparse matrix of type '<class 'numpy.int32'>'
 	with 16000210 stored elements in COOrdinate format>,
 <138493x26744 sparse matrix of type '<class 'numpy.int32'>'
 	with 4000053 stored elements in COOrdinate format>)

In [209]:
train_weights, test_weights

(<138493x26744 sparse matrix of type '<class 'numpy.float32'>'
 	with 16000210 stored elements in COOrdinate format>,
 <138493x26744 sparse matrix of type '<class 'numpy.float32'>'
 	with 4000053 stored elements in COOrdinate format>)

### B- Movies Features

In [211]:
processed_movie_features = dataset.build_item_features(movie_features_matrix)

ValueError: item id 111901 not in item id mappings.

---

## 3. Model

In [None]:
model = LightFM(loss="warp", learning_schedule="adagrad", random_state=42)
model.fit(
    interactions=train_interactions,
    sample_weight=train_weights,
    item_features=processed_movie_features,
    epochs=50,
    num_threads=14,
    verbose=True,
)

Epoch: 100%|██████████| 75/75 [08:33<00:00,  6.84s/it]


<lightfm.lightfm.LightFM at 0x73beeae409d0>

---

## 4. Evaulation

In [None]:
from lightfm.evaluation import precision_at_k, recall_at_k, auc_score

train_roc_auc = auc_score(
    model, train_interactions, item_features=processed_movie_features, num_threads=14
).mean()
test_roc_auc = auc_score(
    model, test_interactions, item_features=processed_movie_features, num_threads=14
).mean()

train_precision = precision_at_k(
    model, train_interactions, item_features=processed_movie_features, num_threads=14
).mean()
test_precision = precision_at_k(
    model,
    test_interactions,
    train_interactions=train_interactions,
    item_features=processed_movie_features,
    k=15,
    num_threads=14,
).mean()

train_recall = recall_at_k(
    model, train_interactions, item_features=processed_movie_features, num_threads=14
).mean()
test_recall = recall_at_k(
    model,
    test_interactions,
    train_interactions=train_interactions,
    k=15,
    item_features=processed_movie_features,
    num_threads=14,
).mean()

In [None]:
print(f"Train ROC AUC: {train_roc_auc:.2f}\tTest ROC AUC: {test_roc_auc:.2f}")
print(f"Train Precision: {train_precision:.2f}\tTest Precision: {test_precision:.2f}")
print(f"Train Recall: {train_recall:.2f}\tTest Recall: {test_recall:.2f}")

Train ROC AUC: 0.99	Test ROC AUC: 0.98
Train Precision: 0.49	Test Precision: 0.25
Train Recall: 0.09	Test Recall: 0.19
