In [257]:
import pandas as pd
import numpy as np
from lightfm import LightFM

## 1. Data Pre-Proceesing

### A- User Interactions

In [284]:
ratings_df = pd.read_csv("Data/Data_LightFm/ratings.csv")

In [285]:
ratings_df["userId"] = ratings_df["userId"].astype("int32")
ratings_df["movieId"] = ratings_df["movieId"].astype("int32")
ratings_df["rating"] = ratings_df["rating"].astype("float32")

In [286]:
#ratings_df.drop(["timestamp"], axis=1, inplace=True)
ratings_df.head(2)

Unnamed: 0,userId,movieId,rating
0,1,2,3.5
1,1,29,3.5


In [287]:
ratings_df['rating'] = ratings_df['rating'].apply(lambda x: 1 if x >= 4 else 0)

In [288]:
ratings_df.values

array([[     1,      2,      0],
       [     1,     29,      0],
       [     1,     32,      0],
       ...,
       [138493,  69644,      0],
       [138493,  70286,      1],
       [138493,  71619,      0]])

### B- Movie Features

In [263]:
#movies_df = pd.read_csv("Data/Data_LightFm/movies.csv")

In [264]:
#movies_df.head(2)

Unnamed: 0,movieId,avg_movie_rating,title,movie_youth_rate,movie_popularity_rate,Action,Adventure,Animation,Children,Comedy,...,Film-Noir,Horror,IMAX,Musical,Mystery,Romance,Sci-Fi,Thriller,War,Western
0,1,3.92124,toy story,0.83871,0.738297,0,1,1,1,1,...,0,0,0,0,0,0,0,0,0,0
1,2,3.211977,jumanji,0.83871,0.330446,0,1,0,1,0,...,0,0,0,0,0,0,0,0,0,0


In [265]:
#ratings_df = ratings_df.merge(movies_df[["movieId","avg_movie_rating","movie_youth_rate","movie_popularity_rate"]],on="movieId",how="left")

In [267]:
#ratings_df.head(2)

Unnamed: 0,userId,movieId,rating,avg_movie_rating,movie_youth_rate,movie_popularity_rate
0,1,2,0,3.211977,0.83871,0.330446
1,1,29,0,3.95223,0.83871,0.126566


In [268]:
#movies_df = movies_df.round(2)

In [269]:
#movies_df.columns

In [270]:
#movies_df.drop(["title","avg_movie_rating","movie_youth_rate","movie_popularity_rate"], inplace=True, axis=1)

In [271]:
#movies_df.head(2)

In [272]:
# genres = movies_df.drop("movieId",axis=1).columns
# genres

In [273]:
# genres_features_matrix = pd.DataFrame({"movieId":movies_df["movieId"],"genres":movies_df[movies_df.columns[1:]].apply(lambda x: list(genres[list((x>0))]),axis=1)})
# genres_features_matrix.head(2)

In [274]:
# genres_features_matrix=list(zip(genres_features_matrix["movieId"],genres_features_matrix["genres"]))
# genres_features_matrix

In [275]:
# movie_features_matrix = pd.DataFrame(
#     {
#         "movieId": movies_df["movieId"],
#         "features": movies_df[movies_df.columns.drop("movieId")].to_dict(
#             orient="records"
#         ),
#     }
# )
# movie_features_matrix.head(2)

In [276]:
# movie_features_matrix["features"] = movie_features_matrix["features"].apply(
#     lambda x: list(f"{k}:{v}" for k, v in x.items())
# )

In [277]:
# movie_features = (
#     movie_features_matrix["features"].explode().reset_index(drop=True).unique()
# )
# movie_features

In [278]:
# movie_features_matrix=list(zip(movie_features_matrix.movieId, movie_features_matrix.features))
# movie_features_matrix

---

## 2. Dataset preparation

In [289]:
from lightfm.data import Dataset
dataset = Dataset()

In [290]:
dataset.fit(
    users=ratings_df["userId"],
    items=ratings_df["movieId"],
    #item_features=movie_features,
)

In [291]:
dataset.model_dimensions()

(138493, 26689)

In [282]:
#dataset.item_features_shape()

### A- Interactions

In [292]:
interactions, weights = dataset.build_interactions(ratings_df.values)

In [293]:
from lightfm.cross_validation import random_train_test_split

train_interactions, test_interactions = random_train_test_split(
    interactions, test_percentage=0.7, random_state=42
)
train_weights, test_weights = random_train_test_split(
    weights, test_percentage=0.7, random_state=42
)

In [294]:
train_interactions, test_interactions

(<138493x26689 sparse matrix of type '<class 'numpy.int32'>'
 	with 5999824 stored elements in COOrdinate format>,
 <138493x26689 sparse matrix of type '<class 'numpy.int32'>'
 	with 13999591 stored elements in COOrdinate format>)

In [295]:
train_weights, test_weights

(<138493x26689 sparse matrix of type '<class 'numpy.float32'>'
 	with 5999824 stored elements in COOrdinate format>,
 <138493x26689 sparse matrix of type '<class 'numpy.float32'>'
 	with 13999591 stored elements in COOrdinate format>)

### B- Movies Features

In [296]:
#processed_movie_features = dataset.build_item_features(movie_features_matrix)

---

## 3. Model

In [297]:
model = LightFM(loss="warp", learning_schedule="adagrad", random_state=42)
model.fit(
    interactions=train_interactions,
    sample_weight=train_weights,
    #item_features=processed_movie_features,
    epochs=60,
    num_threads=14,
    verbose=True,
)

Epoch:   0%|          | 0/60 [00:00<?, ?it/s]

Epoch: 100%|██████████| 60/60 [01:13<00:00,  1.23s/it]


<lightfm.lightfm.LightFM at 0x745906dbcb50>

---

## 4. Evaulation

In [298]:
from lightfm.evaluation import precision_at_k, recall_at_k, auc_score

train_roc_auc = auc_score(
    model,
    train_interactions,
    # item_features=processed_movie_features,
    num_threads=14,
).mean()
test_roc_auc = auc_score(
    model, test_interactions, num_threads=14
).mean()

train_precision = precision_at_k(
    model, train_interactions, num_threads=14
).mean()
test_precision = precision_at_k(
    model,
    test_interactions,
    train_interactions=train_interactions,
    #item_features=processed_movie_features,
    k=15,
    num_threads=14,
).mean()

train_recall = recall_at_k(
    model, train_interactions,  num_threads=14
).mean()
test_recall = recall_at_k(
    model,
    test_interactions,
    train_interactions=train_interactions,
    k=15,
    # item_features=processed_movie_features,
    num_threads=14,
).mean()

In [299]:
print(f"Train ROC AUC: {train_roc_auc:.2f}\tTest ROC AUC: {test_roc_auc:.2f}")
print(f"Train Precision: {train_precision:.2f}\tTest Precision: {test_precision:.2f}")
print(f"Train Recall: {train_recall:.2f}\tTest Recall: {test_recall:.2f}")

Train ROC AUC: 0.98	Test ROC AUC: 0.97
Train Precision: 0.18	Test Precision: 0.43
Train Recall: 0.08	Test Recall: 0.11


In [300]:
import numpy as np
from lightfm import LightFM
from scipy.sparse import csr_matrix

def hit_rate_at_k(model, test_interactions, k=10):
    # Convert to CSR format if not already
    if not isinstance(test_interactions, csr_matrix):
        test_interactions = test_interactions.tocsr()
    
    n_users, n_items = test_interactions.shape
    hits = 0
    
    for user_id in range(n_users):
        # Predict scores for all items for this user
        scores = model.predict(user_id, np.arange(n_items))
        
        # Get the top-k items
        top_k_items = np.argsort(-scores)[:k]
        
        # Get the items the user interacted with in the test set
        test_items = test_interactions[user_id].indices
        
        # Check if any of the top-k items are in the test set
        if np.intersect1d(top_k_items, test_items).size > 0:
            hits += len(np.intersect1d(top_k_items, test_items))
        all_test_items = test_interactions[user_id].indices
    # Calculate the hit rate
    hit_rate = hits / n_items
    return hit_rate

# Example usage
hit_rate = hit_rate_at_k(model, test_interactions, k=10)
print(f'Hit Rate@10: {hit_rate}')


Hit Rate@10: 18.789313949567237


In [301]:
test_interactions.size

13999591

In [302]:
n_users, n_items = test_interactions.shape
hits = hit_rate*n_items
hits/test_interactions.size

0.03582018931838794

---

## Cross Validation

In [303]:
import numpy as np
import scipy.sparse as sp

def _shuffle(uids, iids, data, random_state):
    shuffle_indices = np.arange(len(uids))
    random_state.shuffle(shuffle_indices)
    return uids[shuffle_indices], iids[shuffle_indices], data[shuffle_indices]

def inverse_cross_validation_split(interactions, n_splits=5, random_state=None):
    """
    Split interactions into `n_splits` test sets and `n-1` train sets for inverse cross-validation.
    
    This function creates `n_splits` folds of the dataset, with each fold being used as the training set,
    and the remaining `n_splits-1` folds combined to form the test set.

    Parameters
    ----------
    interactions: scipy sparse matrix
        The interactions to split.
    n_splits: int, optional
        Number of folds. Must be at least 2.
    random_state: int or numpy.random.RandomState, optional
        Random seed used to initialize the numpy.random.RandomState number generator.
        Accepts an instance of numpy.random.RandomState for backwards compatibility.

    Yields
    ------
    (train, test): (scipy.sparse.COOMatrix, scipy.sparse.COOMatrix)
        A generator yielding `n_splits` tuples of (train data, test data).
    """
    
    if not sp.issparse(interactions):
        raise ValueError("Interactions must be a scipy.sparse matrix.")
    
    if not isinstance(random_state, np.random.RandomState):
        random_state = np.random.RandomState(seed=random_state)
    
    interactions = interactions.tocoo()
    shape = interactions.shape
    uids, iids, data = interactions.row, interactions.col, interactions.data
    
    uids, iids, data = _shuffle(uids, iids, data, random_state)
    
    fold_size = len(uids) // n_splits
    
    for i in range(n_splits):
        train_start = i * fold_size
        train_end = (i + 1) * fold_size if i < n_splits - 1 else len(uids)
        
        train_idx = slice(train_start, train_end)
        test_idx = np.concatenate([np.arange(0, train_start), np.arange(train_end, len(uids))])
        
        train = sp.coo_matrix(
            (data[train_idx], (uids[train_idx], iids[train_idx])),
            shape=shape,
            dtype=interactions.dtype,
        )
        test = sp.coo_matrix(
            (data[test_idx], (uids[test_idx], iids[test_idx])),
            shape=shape,
            dtype=interactions.dtype,
        )
        
        yield train, test


In [304]:
from lightfm.evaluation import precision_at_k, recall_at_k, auc_score

def test_model(model,train_interactions,test_interactions):
    train_roc_auc = auc_score(
        model,
        train_interactions,
        # item_features=processed_movie_features,
        num_threads=14,
    ).mean()
    test_roc_auc = auc_score(
        model, test_interactions, num_threads=14
    ).mean()

    train_precision = precision_at_k(
        model, train_interactions, num_threads=14
    ).mean()
    test_precision = precision_at_k(
        model,
        test_interactions,
        train_interactions=train_interactions,
        #item_features=processed_movie_features,
        k=15,
        num_threads=14,
    ).mean()

    train_recall = recall_at_k(
        model, train_interactions,  num_threads=14
    ).mean()
    test_recall = recall_at_k(
        model,
        test_interactions,
        train_interactions=train_interactions,
        k=15,
        # item_features=processed_movie_features,
        num_threads=14,
    ).mean()
    print(f"Train ROC AUC: {train_roc_auc:.2f}\tTest ROC AUC: {test_roc_auc:.2f}")
    print(f"Train Precision: {train_precision:.2f}\tTest Precision: {test_precision:.2f}")
    print(f"Train Recall: {train_recall:.2f}\tTest Recall: {test_recall:.2f}")
    return train_roc_auc,test_roc_auc,train_precision,test_precision,train_recall,test_recall

In [305]:
from scipy.sparse import coo_matrix
# Assuming you have a sparse interaction matrix named 'interactions'
n_splits = 5
random_state = 42
results={}
model = LightFM(loss="warp", learning_schedule="adagrad", random_state=random_state)
for i, (train, test) in enumerate(inverse_cross_validation_split(interactions, n_splits=n_splits, random_state=random_state)):
    print(f"Fold {i + 1}")
    model.fit(
        train,
        epochs=50,
        num_threads=14,
        verbose=True,
    )
    results[i+1]=test_model(model,train,test)
    print("Train set shape:", train.shape, "Test set shape:", test.shape)


Fold 1


Epoch: 100%|██████████| 50/50 [00:35<00:00,  1.41it/s]


Train ROC AUC: 0.99	Test ROC AUC: 0.98
Train Precision: 0.12	Test Precision: 0.47
Train Recall: 0.09	Test Recall: 0.11
Train set shape: (138493, 26689) Test set shape: (138493, 26689)
Fold 2


Epoch: 100%|██████████| 50/50 [00:36<00:00,  1.37it/s]


Train ROC AUC: 0.99	Test ROC AUC: 0.98
Train Precision: 0.13	Test Precision: 0.47
Train Recall: 0.09	Test Recall: 0.11
Train set shape: (138493, 26689) Test set shape: (138493, 26689)
Fold 3


Epoch: 100%|██████████| 50/50 [00:36<00:00,  1.37it/s]


Train ROC AUC: 0.99	Test ROC AUC: 0.98
Train Precision: 0.12	Test Precision: 0.46
Train Recall: 0.09	Test Recall: 0.11
Train set shape: (138493, 26689) Test set shape: (138493, 26689)
Fold 4


Epoch: 100%|██████████| 50/50 [00:36<00:00,  1.36it/s]


Train ROC AUC: 0.99	Test ROC AUC: 0.98
Train Precision: 0.12	Test Precision: 0.46
Train Recall: 0.09	Test Recall: 0.11
Train set shape: (138493, 26689) Test set shape: (138493, 26689)
Fold 5


Epoch: 100%|██████████| 50/50 [00:44<00:00,  1.11it/s]


Train ROC AUC: 0.99	Test ROC AUC: 0.98
Train Precision: 0.12	Test Precision: 0.46
Train Recall: 0.09	Test Recall: 0.11
Train set shape: (138493, 26689) Test set shape: (138493, 26689)


In [306]:
items_mapping = {v: k for k, v in dataset.mapping()[2].items()}
items_mapping

{0: 2,
 1: 29,
 2: 32,
 3: 47,
 4: 50,
 5: 112,
 6: 151,
 7: 223,
 8: 253,
 9: 260,
 10: 293,
 11: 296,
 12: 318,
 13: 337,
 14: 367,
 15: 541,
 16: 589,
 17: 593,
 18: 653,
 19: 919,
 20: 924,
 21: 1009,
 22: 1036,
 23: 1079,
 24: 1080,
 25: 1089,
 26: 1090,
 27: 1097,
 28: 1136,
 29: 1193,
 30: 1196,
 31: 1198,
 32: 1200,
 33: 1201,
 34: 1208,
 35: 1214,
 36: 1215,
 37: 1217,
 38: 1219,
 39: 1222,
 40: 1240,
 41: 1243,
 42: 1246,
 43: 1249,
 44: 1258,
 45: 1259,
 46: 1261,
 47: 1262,
 48: 1266,
 49: 1278,
 50: 1291,
 51: 1304,
 52: 1321,
 53: 1333,
 54: 1348,
 55: 1350,
 56: 1358,
 57: 1370,
 58: 1374,
 59: 1387,
 60: 1525,
 61: 1584,
 62: 1750,
 63: 1848,
 64: 1920,
 65: 1967,
 66: 1994,
 67: 1997,
 68: 2021,
 69: 2100,
 70: 2118,
 71: 2138,
 72: 2140,
 73: 2143,
 74: 2173,
 75: 2174,
 76: 2193,
 77: 2194,
 78: 2253,
 79: 2288,
 80: 2291,
 81: 2542,
 82: 2628,
 83: 2644,
 84: 2648,
 85: 2664,
 86: 2683,
 87: 2692,
 88: 2716,
 89: 2761,
 90: 2762,
 91: 2804,
 92: 2872,
 93: 2918,
 94

In [320]:
original_ratings = pd.read_csv("Data/Data_LightFm/ratings.csv")

In [309]:
movies_df = pd.read_csv("Data/archive/movie.csv")

In [324]:
def make_pred (user_id, model,movies_df,nb_preds):
    mapped_user_id = dataset.mapping()[0][user_id]
    watched_movies = ratings_df[ratings_df["userId"]==user_id]["movieId"].values
    mapped_watched_movies = [dataset.mapping()[2][movie] for movie in watched_movies]
    non_watched_movies = np.setdiff1d(np.arange(n_items), mapped_watched_movies)
    prediction = model.predict(mapped_user_id,non_watched_movies)
    predicted_movies = prediction.argsort()[::-1] [:nb_preds]
    results = [items_mapping[item] for item in predicted_movies]
    recommended_movies = pd.DataFrame({"movieId":results,"title":movies_df[movies_df["movieId"].isin(results)]["title"],"genres":movies_df[movies_df["movieId"].isin(results)]["genres"],"Score":sorted(prediction)[::-1][:nb_preds]})
    return recommended_movies

In [321]:
def get_watched_movies(user_id):
    # mapped_user_id = dataset.mapping()[0][user_id]
    # watched_movies = interactions.tocsr()[mapped_user_id]
    # watched_movies.indices
    # mapped_watched_movies = [items_mapping[movie_id] for movie_id in watched_movies.indices]
    # rated_watched_movies = pd.DataFrame({"movieId":mapped_watched_movies,"title":movies_df[movies_df.movieId.isin(mapped_watched_movies)].title}).merge(ratings_df[ratings_df.userId==user_id],on="movieId",how="left")
    # return rated_watched_movies.sort_values("rating",ascending=False)
    return original_ratings[original_ratings.userId==user_id].merge(movies_df,on="movieId",how="left").sort_values("rating",ascending=False)

In [334]:
def user_pred(user_id,nb_preds=10):
    print("Watched Movies by user ",user_id)
    display(get_watched_movies(user_id)[:nb_preds])
    print("Recommended Movies for user ",user_id)
    display(make_pred(user_id,model,movies_df,nb_preds))

In [337]:
user_pred(1609)

Watched Movies by user  1609


Unnamed: 0,userId,movieId,rating,title,genres
97,1609,39292,5.0,"Good Night, and Good Luck. (2005)",Crime|Drama
16,1609,919,5.0,"Wizard of Oz, The (1939)",Adventure|Children|Fantasy|Musical
56,1609,3996,5.0,"Crouching Tiger, Hidden Dragon (Wo hu cang lon...",Action|Drama|Romance
57,1609,4011,5.0,Snatch (2000),Comedy|Crime|Thriller
59,1609,4226,5.0,Memento (2000),Mystery|Thriller
60,1609,4306,5.0,Shrek (2001),Adventure|Animation|Children|Comedy|Fantasy|Ro...
63,1609,4993,5.0,"Lord of the Rings: The Fellowship of the Ring,...",Adventure|Fantasy
32,1609,1527,5.0,"Fifth Element, The (1997)",Action|Adventure|Comedy|Sci-Fi
66,1609,5618,5.0,Spirited Away (Sen to Chihiro no kamikakushi) ...,Adventure|Animation|Fantasy
30,1609,1291,5.0,Indiana Jones and the Last Crusade (1989),Action|Adventure


Recommended Movies for user  1609


Unnamed: 0,movieId,title,genres,Score
250,2042,Interview with the Vampire: The Vampire Chroni...,Drama|Horror,1.989173
290,3753,Léon: The Professional (a.k.a. The Professiona...,Action|Crime|Drama|Thriller,1.969483
363,924,"Mask, The (1994)",Action|Comedy|Crime|Fantasy,1.962369
372,253,"River Wild, The (1994)",Action|Thriller,1.958772
902,367,"Wizard of Oz, The (1939)",Adventure|Children|Fantasy|Musical,1.907299
907,376,2001: A Space Odyssey (1968),Adventure|Drama|Sci-Fi,1.902722
1958,2448,D2: The Mighty Ducks (1994),Children|Comedy,1.89065
2363,2793,Virus (1999),Horror|Sci-Fi,1.880929
2707,293,"American Werewolf in Paris, An (1997)",Comedy|Horror|Romance|Thriller,1.871633
3662,919,"Patriot, The (2000)",Action|Drama|War,1.84112
