# 0. Configuration

In [1]:
# links to shared data MovieLens
# source on kaggle: https://www.kaggle.com/code/quangnhatbui/movie-recommender/data
RATINGS_SMALL_URL = 'https://drive.google.com/file/d/1BlZfCLLs5A13tbNSJZ1GPkHLWQOnPlE4/view?usp=share_link'
MOVIES_METADATA_URL = 'https://drive.google.com/file/d/19g6-apYbZb5D-wRj4L7aYKhxS-fDM4Fb/view?usp=share_link'

# 1. Modules and functions

In [2]:
# just to make it available to download w/o SSL verification
import ssl
ssl._create_default_https_context = ssl._create_unverified_context

import numpy as np
import pandas as pd

from itertools import islice, cycle, product

from lightfm.data import Dataset
from lightfm import LightFM

from tqdm import tqdm_notebook
import warnings
warnings.filterwarnings('ignore')




## 1. 1. Helper functions to avoid copy paste

In [3]:
def read_csv_from_gdrive(url):
    """
    gets csv data from a given url (taken from file -> share -> copy link)
    :url: example https://drive.google.com/file/d/1BlZfCLLs5A13tbNSJZ1GPkHLWQOnPlE4/view?usp=share_link
    """
    file_id = url.split('/')[-2]
    file_path = 'https://drive.google.com/uc?export=download&id=' + file_id
    data = pd.read_csv(file_path)

    return data

# 2. Main

## 2.1. Load Data

`interactions` dataset shows list of movies that users watched, along with given ratings:

In [4]:
# interactions data
interactions = read_csv_from_gdrive(RATINGS_SMALL_URL)
interactions.head()

Unnamed: 0,userId,movieId,rating,timestamp
0,1,31,2.5,1260759144
1,1,1029,3.0,1260759179
2,1,1061,3.0,1260759182
3,1,1129,2.0,1260759185
4,1,1172,4.0,1260759205


`movies_metadata` dataset shows the list of movies existing on OKKO platform:

In [5]:
# information about films etc
movies_metadata = read_csv_from_gdrive(MOVIES_METADATA_URL)
movies_metadata.head(3)

Unnamed: 0,adult,belongs_to_collection,budget,genres,homepage,id,imdb_id,original_language,original_title,overview,...,release_date,revenue,runtime,spoken_languages,status,tagline,title,video,vote_average,vote_count
0,False,"{'id': 10194, 'name': 'Toy Story Collection', ...",30000000,"[{'id': 16, 'name': 'Animation'}, {'id': 35, '...",http://toystory.disney.com/toy-story,862,tt0114709,en,Toy Story,"Led by Woody, Andy's toys live happily in his ...",...,1995-10-30,373554033.0,81.0,"[{'iso_639_1': 'en', 'name': 'English'}]",Released,,Toy Story,False,7.7,5415.0
1,False,,65000000,"[{'id': 12, 'name': 'Adventure'}, {'id': 14, '...",,8844,tt0113497,en,Jumanji,When siblings Judy and Peter discover an encha...,...,1995-12-15,262797249.0,104.0,"[{'iso_639_1': 'en', 'name': 'English'}, {'iso...",Released,Roll the dice and unleash the excitement!,Jumanji,False,6.9,2413.0
2,False,"{'id': 119050, 'name': 'Grumpy Old Men Collect...",0,"[{'id': 10749, 'name': 'Romance'}, {'id': 35, ...",,15602,tt0113228,en,Grumpier Old Men,A family wedding reignites the ancient feud be...,...,1995-12-22,0.0,101.0,"[{'iso_639_1': 'en', 'name': 'English'}]",Released,Still Yelling. Still Fighting. Still Ready for...,Grumpier Old Men,False,6.5,92.0


In [6]:
movies_metadata['id'] = movies_metadata['id'].astype(str)
interactions['movieId'] = interactions['movieId'].astype(str)

In [7]:
# leave only those films that intersect with each other
interactions_filtered = interactions.loc[interactions['movieId'].isin(movies_metadata['id'])]
print(interactions.shape, interactions_filtered.shape)

(100004, 4) (44989, 4)


## 2.2 Data preparation using LightFM Dataset

To use this class we need the in the following format:
- userId
- movieId
- user_features - user feature names
- item_features - item feature names

It has several methods:
- build_interactions - definition of user / item interactions matrix using iterators on top of tuples:
1. (userId, movieId);
2. (userId, movieId, weight / rating)
- build_user_features/build_item_features - defition of user/item features using iterators on top of tuples:
1. (userId, [user_feature_name1, user_feature_name2, ...]);
2. (userId, {user_feature_name1: weight});
3. The same goes for item features


In [8]:
# init class
dataset = Dataset()

In [9]:
# fit tuple of user and movie interactions
dataset.fit(interactions['userId'].unique(), interactions['movieId'].unique())

We do not have users data in MovieLens dataset so let's skip part features generation

In [10]:
# now, we define lightfm mapper to use it later for checks
lightfm_mapping = dataset.mapping()
# lightfm_mapping

In [11]:
lightfm_mapping = {
    'users_mapping': lightfm_mapping[0],
    'user_features_mapping': lightfm_mapping[1],
    'items_mapping': lightfm_mapping[2],
    'item_features_mapping': lightfm_mapping[3],
}
print('user mapper length - ', len(lightfm_mapping['users_mapping']))
print('user features mapper length - ', len(lightfm_mapping['user_features_mapping']))
print('movies mapper length - ', len(lightfm_mapping['items_mapping']))
print('Users movie features mapper length - ', len(lightfm_mapping['item_features_mapping']))

user mapper length -  671
user features mapper length -  671
movies mapper length -  9066
Users movie features mapper length -  9066


As we do not have user / movie features their length are the same as userId and movieId

In [12]:
# here we create inverted mappers to check recommendations later
lightfm_mapping['users_inv_mapping'] = {v: k for k, v in lightfm_mapping['users_mapping'].items()}
lightfm_mapping['items_inv_mapping'] = {v: k for k, v in lightfm_mapping['items_mapping'].items()}

As we mentioned earlier, we need to create iterators

In [13]:
def df_to_tuple_iterator(df: pd.DataFrame):
    '''
    :df: pd.DataFrame, interactions dataframe
    returs iterator
    '''
    return zip(*df.values.T)

def concat_last_to_list(t):
    return (t[0], list(t[1:])[0])

def df_to_tuple_list_iterator(df):
    return map(concat_last_to_list, zip(*df.values.T))

In [14]:
# defining train set on the whole interactions dataset (as HW you will have to split into test and train for evaluation)
train_mat, train_mat_weights = dataset.build_interactions(df_to_tuple_iterator(interactions_filtered[['userId', 'movieId']]))

In [15]:
train_mat

<671x9066 sparse matrix of type '<class 'numpy.int32'>'
	with 44989 stored elements in COOrdinate format>

In [16]:
train_mat_weights

<671x9066 sparse matrix of type '<class 'numpy.float32'>'
	with 44989 stored elements in COOrdinate format>

## 2.3. Model Training & Evaluation

### 2.3.1. Train Model

In [17]:
# set params
NO_COMPONENTS = 64
LEARNING_RATE = .03
LOSS = 'warp'
MAX_SAMPLED = 5
RANDOM_STATE = 42
EPOCHS = 20

In [18]:
# init model
lfm_model = LightFM(
    no_components = NO_COMPONENTS,
    learning_rate = LEARNING_RATE,
    loss = LOSS,
    max_sampled = MAX_SAMPLED,
    random_state = RANDOM_STATE
    )

In [19]:
# execute training
for _ in tqdm_notebook(range(EPOCHS), total = EPOCHS):
    lfm_model.fit_partial(
        train_mat, 
        num_threads = 4
    )

  0%|          | 0/20 [00:00<?, ?it/s]

### 2.3.2. Evaluate the Model

In [42]:
# let's make sense-check
top_N = 10
user_id = interactions['userId'][1000]
row_id = lightfm_mapping['users_mapping'][user_id]
print(f'Rekko for user {user_id}, row number in matrix - {row_id}')

Rekko for user 15, row number in matrix - 14


In [36]:
all_cols = list(lightfm_mapping['items_mapping'].values())
len(all_cols)

9066

In [43]:
pred = lfm_model.predict(
    row_id,
    all_cols,
    num_threads = 4)
pred, pred.shape

(array([-4.3351274, -4.6056437, -4.6856403, ..., -3.575996 , -4.7737412,
        -4.539937 ], dtype=float32),
 (9066,))

In [44]:
top_cols = np.argpartition(pred, -np.arange(top_N))[-top_N:][::-1]
top_cols

array([ 49,  92,  99, 122, 119, 480,  72, 474, 110, 468])

In [45]:
pred[top_cols]

array([2.8615909, 2.6189132, 2.606794 , 2.4146852, 2.177145 , 2.144121 ,
       2.1371117, 1.9876009, 1.9398937, 1.8170773], dtype=float32)

In [40]:
# crate mapper for movieId and title names
item_name_mapper = dict(zip(movies_metadata['id'], movies_metadata['original_title']))

In [46]:
recs = pd.DataFrame({'col_id': top_cols})
recs['movieId'] = recs['col_id'].map(lightfm_mapping['items_inv_mapping'].get).astype(str)
recs['title'] = recs['movieId'].map(item_name_mapper)
recs

Unnamed: 0,col_id,movieId,title
0,49,296,Terminator 3: Rise of the Machines
1,92,593,Солярис
2,99,318,The Million Dollar Hotel
3,122,2959,License to Wed
4,119,2762,Young and Innocent
5,480,4993,5 Card Stud
6,72,480,Monsoon Wedding
7,474,4226,Shriek If You Know What I Did Last Friday the ...
8,110,1580,Rope
9,468,3578,Der Tunnel


In [26]:
recs = pd.DataFrame({'col_id': top_cols})
recs['movieId'] = recs['col_id'].map(lightfm_mapping['items_inv_mapping'].get).astype(str)
recs['title'] = recs['movieId'].map(item_name_mapper)
recs

Unnamed: 0,col_id,movieId,title
0,143,260,The 39 Steps
1,72,480,Monsoon Wedding
2,49,296,Terminator 3: Rise of the Machines
3,92,593,Солярис
4,113,2028,Say Anything...
5,110,1580,Rope
6,119,2762,Young and Innocent
7,505,608,Men in Black II
8,79,527,Once Were Warriors
9,99,318,The Million Dollar Hotel


# TODO
- Make train/test split -- train the model appropiately and predict on test set;
- Wrap up in function recommendations - lfm_recommend();
- Calculate `NDCG@10` on test set