In [171]:
import pandas as pd
import numpy as np
from lightfm import LightFM

## 1. Data Pre-Proceesing

### A- User Interactions

In [172]:
ratings_df = pd.read_csv("Data/Data_LightFm/ratings.csv")

In [173]:
ratings_df["userId"] = ratings_df["userId"].astype("int32")
ratings_df["movieId"] = ratings_df["movieId"].astype("int32")
ratings_df["rating"] = ratings_df["rating"].astype("float32")

In [174]:
#ratings_df.drop(["timestamp"], axis=1, inplace=True)
ratings_df.head(2)

Unnamed: 0,userId,movieId,rating
0,1,2,3.5
1,1,29,3.5


In [175]:
ratings_df['rating'] = ratings_df['rating'].apply(lambda x: 1 if x >= 4 else 0)

In [176]:
ratings_df.values

array([[     1,      2,      0],
       [     1,     29,      0],
       [     1,     32,      0],
       ...,
       [138493,  69644,      0],
       [138493,  70286,      1],
       [138493,  71619,      0]])

### B- Movie Features

In [177]:
#movies_df = pd.read_csv("Data/Data_LightFm/movies.csv")

In [178]:
#movies_df.head(2)

In [179]:
#movies_df = movies_df.round(2)

In [180]:
#movies_df.columns

In [181]:
#movies_df.drop(["title","avg_movie_rating","movie_youth_rate","movie_popularity_rate"], inplace=True, axis=1)

In [182]:
#movies_df.head(2)

In [183]:
# genres = movies_df.drop("movieId",axis=1).columns
# genres

In [184]:
# genres_features_matrix = pd.DataFrame({"movieId":movies_df["movieId"],"genres":movies_df[movies_df.columns[1:]].apply(lambda x: list(genres[list((x>0))]),axis=1)})
# genres_features_matrix.head(2)

In [185]:
# genres_features_matrix=list(zip(genres_features_matrix["movieId"],genres_features_matrix["genres"]))
# genres_features_matrix

In [186]:
# movie_features_matrix = pd.DataFrame(
#     {
#         "movieId": movies_df["movieId"],
#         "features": movies_df[movies_df.columns.drop("movieId")].to_dict(
#             orient="records"
#         ),
#     }
# )
# movie_features_matrix.head(2)

In [187]:
# movie_features_matrix["features"] = movie_features_matrix["features"].apply(
#     lambda x: list(f"{k}:{v}" for k, v in x.items())
# )

In [188]:
# movie_features = (
#     movie_features_matrix["features"].explode().reset_index(drop=True).unique()
# )
# movie_features

In [189]:
# movie_features_matrix=list(zip(movie_features_matrix.movieId, movie_features_matrix.features))
# movie_features_matrix

---

## 2. Dataset preparation

In [190]:
from lightfm.data import Dataset
dataset = Dataset()

In [191]:
dataset.fit(
    users=ratings_df["userId"],
    items=ratings_df["movieId"],
    #item_features=movie_features,
)

In [192]:
dataset.model_dimensions()

(138493, 26689)

In [193]:
#dataset.item_features_shape()

### A- Interactions

In [194]:
interactions, weights = dataset.build_interactions(ratings_df.values)

In [195]:
from lightfm.cross_validation import random_train_test_split

train_interactions, test_interactions = random_train_test_split(
    interactions, test_percentage=0.7, random_state=42
)
train_weights, test_weights = random_train_test_split(
    weights, test_percentage=0.7, random_state=42
)

In [196]:
train_interactions, test_interactions

(<138493x26689 sparse matrix of type '<class 'numpy.int32'>'
 	with 5999824 stored elements in COOrdinate format>,
 <138493x26689 sparse matrix of type '<class 'numpy.int32'>'
 	with 13999591 stored elements in COOrdinate format>)

In [197]:
train_weights, test_weights

(<138493x26689 sparse matrix of type '<class 'numpy.float32'>'
 	with 5999824 stored elements in COOrdinate format>,
 <138493x26689 sparse matrix of type '<class 'numpy.float32'>'
 	with 13999591 stored elements in COOrdinate format>)

### B- Movies Features

In [198]:
#processed_movie_features = dataset.build_item_features(movie_features_matrix)

---

## 3. Model

In [199]:
model = LightFM(loss="warp", learning_schedule="adagrad", random_state=42)
model.fit(
    interactions=train_interactions,
    sample_weight=train_weights,
    #item_features=processed_movie_features,
    epochs=50,
    num_threads=14,
    verbose=True,
)

Epoch:   0%|          | 0/50 [00:00<?, ?it/s]

Epoch: 100%|██████████| 50/50 [01:40<00:00,  2.01s/it]


<lightfm.lightfm.LightFM at 0x79658ccc4d10>

---

## 4. Evaulation

In [200]:
from lightfm.evaluation import precision_at_k, recall_at_k, auc_score

train_roc_auc = auc_score(
    model,
    train_interactions,
    # item_features=processed_movie_features,
    num_threads=14,
).mean()
test_roc_auc = auc_score(
    model, test_interactions, num_threads=14
).mean()

train_precision = precision_at_k(
    model, train_interactions, num_threads=14
).mean()
test_precision = precision_at_k(
    model,
    test_interactions,
    train_interactions=train_interactions,
    #item_features=processed_movie_features,
    k=15,
    num_threads=14,
).mean()

train_recall = recall_at_k(
    model, train_interactions,  num_threads=14
).mean()
test_recall = recall_at_k(
    model,
    test_interactions,
    train_interactions=train_interactions,
    k=15,
    # item_features=processed_movie_features,
    num_threads=14,
).mean()

In [201]:
print(f"Train ROC AUC: {train_roc_auc:.2f}\tTest ROC AUC: {test_roc_auc:.2f}")
print(f"Train Precision: {train_precision:.2f}\tTest Precision: {test_precision:.2f}")
print(f"Train Recall: {train_recall:.2f}\tTest Recall: {test_recall:.2f}")

Train ROC AUC: 0.98	Test ROC AUC: 0.97
Train Precision: 0.17	Test Precision: 0.43
Train Recall: 0.08	Test Recall: 0.11


In [202]:
import numpy as np
from lightfm import LightFM
from scipy.sparse import csr_matrix

def hit_rate_at_k(model, test_interactions, k=10):
    # Convert to CSR format if not already
    if not isinstance(test_interactions, csr_matrix):
        test_interactions = test_interactions.tocsr()
    
    n_users, n_items = test_interactions.shape
    hits = 0
    
    for user_id in range(n_users):
        # Predict scores for all items for this user
        scores = model.predict(user_id, np.arange(n_items))
        
        # Get the top-k items
        top_k_items = np.argsort(-scores)[:k]
        
        # Get the items the user interacted with in the test set
        test_items = test_interactions[user_id].indices
        
        # Check if any of the top-k items are in the test set
        if np.intersect1d(top_k_items, test_items).size > 0:
            hits += len(np.intersect1d(top_k_items, test_items))
        all_test_items = test_interactions[user_id].indices
    # Calculate the hit rate
    hit_rate = hits / n_items
    return hit_rate

# Example usage
hit_rate = hit_rate_at_k(model, test_interactions, k=10)
print(f'Hit Rate@10: {hit_rate}')


Hit Rate@10: 18.79152459814905


In [203]:
test_interactions.size

13999591

In [204]:
n_users, n_items = test_interactions.shape
hits = hit_rate*n_items
hits/test_interactions.size

0.035824403727223175