#  Задача. Стажёр в команду core ml. 2 этап

На датасете MovieLens 20M сравнить два подохода к построени рекомендаций:
* коллаборативный: использовать только рейтинги. Например, SVD-like алгоритмы, ALS, Implicit-ALS
* коллаборативрный + контентный: использовать рейтинги и всю дополнительную инфомрацию о фильмах, имеющуюся в датасете. Нарпимер, LightFM.

Для выполнения задания:
1. Выбрать метрику и обосновать выбор.
1. Придумать и обосновать способ разбиения данных на обучение и валидацию.
3. Обратить внимание на сходимость обучения и настрйоку важных гиперпараметров моделей.
4. Выбрать лучшую модель (При необхдоимости, оценить ститистическую значимость результатов).
1. Ввыложить на github воспроизводимый код и краткий текстовый отчёт с выводами.

**Цель рекомендации:**

**Метрики:**

* качество рекомендаций
* бизнесовость? например, если показали фильм, значит типо пользователь его "купил", мы заработали
* чем точнее покажем, тем больше заработаем

**Разбиение данных**
Исходя из последнего данные следует разбить по времени. Учится на ранних данных, а тестировать на поздних.


In [1]:
import os
import pandas as pd
from scipy.linalg import sqrtm

from scipy.sparse import csr_matrix


# Данные

In [2]:
print(os.listdir("./data"))

['archive.zip', 'codf.csv', 'genome_scores.csv', 'genome_tags.csv', 'link.csv', 'movie.csv', 'movie_svd', 'rating.csv', 'tag.csv']


The data are contained in six files.

tag.csv that contains tags applied to movies by users:

    userId
    movieId
    tag
    timestamp

rating.csv that contains ratings of movies by users:

    userId
    movieId
    rating
    timestamp

movie.csv that contains movie information:

    movieId
    title
    genres

link.csv that contains identifiers that can be used to link to other sources:

    movieId
    imdbId
    tmbdId

genome_scores.csv that contains movie-tag relevance data:

    movieId
    tagId
    relevance

genome_tags.csv that contains tag descriptions:

    tagId
    tag


In [3]:
# tag = pd.read_csv('./data/tag.csv')
rating = pd.read_csv('./data/rating.csv')
movies = pd.read_csv('./data/movie.csv')
# genome_scores = pd.read_csv('./data/genome_scores.csv')
# link = pd.read_csv('./data/link.csv')
# genome_tag = pd.read_csv('./data/genome_tags.csv')

In [4]:
# сколько оценок сделали пользователи

print('Сколько уникальных пользователей сделало оценки', rating['userId'].value_counts().shape)
rating['userId'].value_counts()
# rating[rating['userId'] == 80291]

Сколько уникальных пользователей сделало оценки (138493,)


118205    9254
8405      7515
82418     5646
121535    5520
125794    5491
          ... 
59390       20
23558       20
34668       20
80291       20
58028       20
Name: userId, Length: 138493, dtype: int64

In [6]:
movies.head()

Unnamed: 0,movieId,title,genres
0,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy
1,2,Jumanji (1995),Adventure|Children|Fantasy
2,3,Grumpier Old Men (1995),Comedy|Romance
3,4,Waiting to Exhale (1995),Comedy|Drama|Romance
4,5,Father of the Bride Part II (1995),Comedy


# Простое решение без разделения по времени

## Формирование таблицы "пользователь-фильм"

In [5]:
# Оставляем пользователей, которые сделали много оценок

x = rating['userId'].value_counts() > 500  # сделано оценок больше 500
y = x[x].index
rating = rating[rating['userId'].isin(y)]
print(rating.shape)

del x, y

(6554416, 4)


In [6]:
# Объединяем рейтинги и характеристики фильмов

movie_details = movies.merge(rating,on='movieId')
movie_details.drop(columns=['timestamp'], inplace=True)
movie_details.head()

Unnamed: 0,movieId,title,genres,userId,rating
0,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy,11,4.5
1,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy,24,4.0
2,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy,54,4.0
3,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy,58,5.0
4,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy,91,4.0


In [7]:
# Сколько оценок у фильма

number_rating = movie_details.groupby('title')['rating'].count().reset_index()
number_rating.rename(columns={'rating':'number of rating'},inplace=True)
number_rating.head()

Unnamed: 0,title,number of rating
0,#chicagoGirl: The Social Network Takes on a Di...,2
1,$ (Dollars) (1971),7
2,$5 a Day (2008),10
3,$9.99 (2008),17
4,$ellebrity (Sellebrity) (2012),1


In [8]:
# Оставляем фильмы с большим количеством оценок

df = movie_details.merge(number_rating, on='title')
df = df[df['number of rating'] >= 50]
df.drop_duplicates(['title','userId'], inplace=True)
df.drop(columns=['number of rating'], inplace=True)

# df['rating']=df['rating'].astype(int)

movie_details = df
del df
movie_details.head()

Unnamed: 0,movieId,title,genres,userId,rating
0,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy,11,4.5
1,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy,24,4.0
2,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy,54,4.0
3,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy,58,5.0
4,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy,91,4.0


In [9]:
# Таблица "пользователь-фильм"

movie_pivot_na = movie_details.pivot_table(columns='userId', index='movieId', values='rating')
movie_pivot = movie_pivot_na.fillna(0)
movie_sparse = csr_matrix(movie_pivot)

movie_pivot

userId,11,24,54,58,91,104,116,134,156,208,...,138270,138301,138307,138325,138382,138397,138406,138411,138437,138474
movieId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,4.5,4.0,4.0,5.0,4.0,0.0,3.0,4.0,5.0,4.0,...,0.0,2.5,3.5,5.0,3.0,0.0,4.0,5.0,4.0,5.0
2,0.0,0.0,3.0,0.0,3.5,0.0,2.0,0.0,5.0,0.0,...,0.0,2.5,2.5,3.0,4.0,0.0,3.0,0.0,0.0,4.0
3,0.0,0.0,0.0,0.0,3.0,0.0,2.0,0.0,2.0,0.0,...,0.0,0.0,0.0,0.0,3.0,5.0,0.0,0.0,0.0,0.0
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,3.0,0.0,...,1.0,0.0,0.0,0.0,4.0,0.0,0.0,0.0,0.0,0.0
5,0.0,2.0,3.0,0.0,0.0,0.0,0.0,0.0,3.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,4.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
118700,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
118900,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
118997,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
119141,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [14]:
# movie_details.drop(columns=['genres','userId','rating'],inplace=True)
# movie_details.drop_duplicates(inplace=True)
# movie_details

In [158]:
# Название фильмов 

ids = movie_pivot.index.values
movies_idtitle = movie_details[['movieId', 'title']].drop_duplicates('movieId')
movies_idtitle[movies_idtitle['movieId'].isin(ids)]
movies_idtitle = movies_idtitle.reset_index()
movies_idtitle

Unnamed: 0,index,movieId,title
0,0,1,Toy Story (1995)
1,6303,2,Jumanji (1995)
2,10913,3,Grumpier Old Men (1995)
3,13130,4,Waiting to Exhale (1995)
4,13776,5,Father of the Bride Part II (1995)
...,...,...,...
9360,6550974,118700,Selma (2014)
9361,6551253,118900,Wild (2014)
9362,6551476,118997,Into the Woods (2014)
9363,6551623,119141,The Interview (2014)


In [11]:
# del tag, rating, movies, link, genome_scores, genome_tag, number_rating, ids
del rating, movies, number_rating, ids

## Коллаборативный по рейтингам

На вход принимаем фильм, а на выходе список фильмов, рекомендуемые к просмотру после этого фильма

In [152]:
from sklearn.neighbors import NearestNeighbors
model = NearestNeighbors( n_neighbors=7,algorithm='brute',metric='cosine')

model.fit(movie_sparse)

NearestNeighbors(algorithm='brute', metric='cosine', n_neighbors=7)

###  Чистые данные

In [153]:
# Рекмоендация фильма
# Пока что сам фильм, для которго рекомендуем, входит в список рекомендуемых

def reco(movie_name):
    movie_id = movies_idtitle[movies_idtitle['title'] == movie_name]['movieId']

    if(len(movie_id) == 0):  # фильм с таким названием не найден
        return ['No films']

    movie_id = movie_id.values[0]
    print('Рекомендация для фильма:', movie_id, "-", movie_name)

    distances, suggestions = model.kneighbors(movie_pivot.loc[movie_id,:].values.reshape(1,-1))
    res = movies_idtitle[movies_idtitle['movieId'].isin(movie_pivot.index[suggestions[0]].values)]['title'].values
#     print(movie_pivot.index[suggestions[0]].values)
#     remove = movie_pivot.index[suggestions[0]].values == movie_id   # удаляем сам фильм, по которому рекомендуем ,если он попал в список
#     res = res[~remove]

    return res  # , res[~remove], res[remove]
# reco(title)

In [154]:
random_films = pd.Series(movies_idtitle.movieId.values).sample(n=10, random_state=123456).values

for film in random_films:
    title = movies_idtitle[movies_idtitle['movieId'] == film].title.values[0]
#     print(film, title)
    
    res = reco(title)
    for i in res:
        print(i)
    print()

Рекомендация для фильма: 5291 - Rashomon (Rashômon) (1950)
Third Man, The (1949)
Ran (1985)
Seven Samurai (Shichinin no samurai) (1954)
Yojimbo (1961)
Wild Strawberries (Smultronstället) (1957)
Rashomon (Rashômon) (1950)
Throne of Blood (Kumonosu jô) (1957)

Рекомендация для фильма: 57532 - Meet the Spartans (2008)
Date Movie (2006)
Scary Movie 4 (2006)
Epic Movie (2007)
Meet the Spartans (2008)
Love Guru, The (2008)
Superhero Movie (2008)
Disaster Movie (2008)

Рекомендация для фильма: 86880 - Pirates of the Caribbean: On Stranger Tides (2011)
Pirates of the Caribbean: At World's End (2007)
Thor (2011)
Pirates of the Caribbean: On Stranger Tides (2011)
X-Men: First Class (2011)
Captain America: The First Avenger (2011)
Avengers, The (2012)
Sherlock Holmes: A Game of Shadows (2011)

Рекомендация для фильма: 4622 - Loverboy (1989)
Can't Buy Me Love (1987)
Love Potion #9 (1992)
Johnny Be Good (1988)
License to Drive (1988)
Dream a Little Dream (1989)
Loverboy (1989)
Road House (1989)

Ре

## SVD

"Восстанавливаем" матрицу пользователь-фильмы

In [78]:
%%time

def svd(train, k):
    utilMat = np.array(train)    # the nan or unavailable entries are masked
    mask = np.isnan(utilMat)
    masked_arr = np.ma.masked_array(utilMat, mask)
    
    item_means = np.mean(masked_arr, axis=0)    # nan entries will replaced by the average rating for each item
    utilMat = masked_arr.filled(item_means)    
    x = np.tile(item_means, (utilMat.shape[0],1))    # we remove the per item average from all entries.
    
    # the above mentioned nan entries will be essentially zero now
    utilMat = utilMat - x    # The magic happens here. U and V are user and item features
    
    U, s, V = np.linalg.svd(utilMat, full_matrices=False)
    
    s = np.diag(s)    # we take only the k most significant features
    s = s[0:k,0:k]
    U = U[:,0:k]
    V = V[0:k,:]    
    
    s_root = sqrtm(s)    
    Usk=np.dot(U,s_root)
    skV = np.dot(s_root,V)
    UsV = np.dot(Usk, skV)    
    UsV = UsV + x    
    print("svd done")
    return UsV

movie_svd = svd(movie_pivot_na, 3)

svd done
Wall time: 1min 30s


In [200]:
# Save data to file
# movie_svd.dump('./data/movie_svd')

# Считываем данные
# movie_svd = np.load('./data/movie_svd', allow_pickle=True)
# movie_svd = movie_svd.data

print(movie_svd.shape)

# Обновить модель
model = NearestNeighbors( n_neighbors=20, algorithm='brute', metric='cosine')
model.fit(movie_svd)

(9365, 7441)


NearestNeighbors(algorithm='brute', metric='cosine', n_neighbors=20)

In [226]:
# Рекомендация фильма
# Функция рекомендаций с использованием SVD-"восстановленными" оценками
# Пока что сам фильм, для которго рекомендуем, входит в список рекомендуемых

# mp = movie_svd
# mp = movie_pivot

def reco(movie_name, model, movie_pivot):
# if True:
    movie_id = movies_idtitle[movies_idtitle['title'] == movie_name]['movieId']

    if(len(movie_id) == 0):  # фильм с таким названием не найден
        return ['Такой фильм не найден']

    movie_id = movie_id.values[0]
#     print('Рекомендация для фильма:', movie_id, "-", movie_name)

    if isinstance(movie_pivot,  np.ndarray):
        movie_id = movies_idtitle[movies_idtitle['movieId'] == movie_id].index[0]
        distances, suggestions = model.kneighbors(movie_pivot[movie_id,:].reshape(1,-1))
        res = movies_idtitle[movies_idtitle['movieId'].isin(suggestions[0])]['title'].values
    else:
        distances, suggestions = model.kneighbors(movie_pivot.loc[movie_id,:].values.reshape(1,-1))
        res = movies_idtitle[movies_idtitle['movieId'].isin(movie_pivot.index[suggestions[0]].values)]['title'].values
    
#     print(movie_pivot.index[suggestions[0]].values)
#     remove = movie_pivot.index[suggestions[0]].values == movie_id   # удаляем сам фильм, по которому рекомендуем ,если он попал в список
#     res = res[~remove]

    return res, distances  # , res[~remove], res[remove]


In [230]:
random_films = pd.Series(movies_idtitle.movieId.values).sample(n=10, random_state=12).values

for film in random_films:
    title = movies_idtitle[movies_idtitle['movieId'] == film].title.values[0]
    print('Рекомендация для фильма:', film, "-", title)
    res, sug = reco(title, model, movie_svd)
    for i in res[:10]:
        print(i)
    print()

Рекомендация для фильма: 8783 - Village, The (2004)
Little Lord Fauntleroy (1936)
Trick (1999)
Girl, Interrupted (1999)
Romeo Must Die (2000)
Filth and the Fury, The (2000)
Solas (1999)
Slumber Party Massacre II (1987)
Feds (1988)
Watchers (1988)
Domestic Disturbance (2001)

Рекомендация для фильма: 77330 - Red Riding: 1980 (2009)
Lethal Weapon 4 (1998)
A.I. Artificial Intelligence (2001)
Torch Song Trilogy (1988)
In a Lonely Place (1950)
Brother's Keeper (1992)
Ghost and Mr. Chicken, The (1966)
White Heat (1949)
Dear Frankie (2004)
Sherlock Holmes: Dressed to Kill (1946)
Come Back, Little Sheba (1952)

Рекомендация для фильма: 31737 - Bunny Lake Is Missing (1965)
Bed of Roses (1996)
Metro (1997)
Loss of Sexual Innocence, The (1999)
Night to Remember, A (1958)
Marathon Man (1976)
Girls Just Want to Have Fun (1985)
Chopper (2000)
Scary Movie 2 (2001)
Business of Strangers, The (2001)
Seance on a Wet Afternoon (1964)

Рекомендация для фильма: 1599 - Steel (1997)
Speed (1994)
Highlander I

##  Коллаборативный + контентный

# Решение с разделением на учебный/тестовый выборки

Требует довольно много памяти. у меня на лаптопе с 16Гб не уместилась.

In [1]:
import pandas as pd
import numpy as np
import scipy

In [2]:
# tag = pd.read_csv('./data/tag.csv')
rating = pd.read_csv('./data/rating.csv')
movies = pd.read_csv('./data/movie.csv')
# genome_scores = pd.read_csv('./data/genome_scores.csv')
# link = pd.read_csv('./data/link.csv')
# genome_tag = pd.read_csv('./data/genome_tags.csv')

In [3]:
data = movies.merge(rating,on='movieId')
# data.drop(columns=['timestamp'], inplace=True)
data.head()

Unnamed: 0,movieId,title,genres,userId,rating,timestamp
0,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy,3,4.0,1999-12-11 13:36:47
1,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy,6,5.0,1997-03-13 17:50:52
2,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy,8,4.0,1996-06-05 13:37:51
3,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy,10,4.0,1999-11-25 02:44:47
4,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy,11,4.5,2009-01-02 01:13:41


In [4]:
users = data['userId'].unique() #list of all users
movies = data['movieId'].unique() #list of all movies



In [13]:
%%time

test = pd.DataFrame(columns=data.columns)
train = pd.DataFrame(columns=data.columns)

test_ratio = 0.2 # fraction of data to be used as test set.

for u in users:
    temp = data[data['userId'] == u]
    n = len(temp)
    test_size = int(test_ratio*n)
    
    temp = data.sort_values('timestamp').reset_index()
    temp.drop('index', axis=1, inplace=True)
    
    dummy_test = temp[n-1-test_size :]
    dummy_train = temp[: n-2-test_size]
    
    test = pd.concat([test, dummy_test])
    train = pd.concat([train, dummy_train])
    
    if (u > 10):
        break

Wall time: 7min 29s


In [16]:
%%time
# test.to_csv('./data/test.zip', compression='zip')

test = pd.read_csv('./data/test.zip', compression='zip')

Wall time: 2min 3s


In [17]:
%%time
# train.to_csv('./data/train.zip', compression='zip')
train = pd.read_csv('./data/train.zip', compression='zip')

Wall time: 8.31 ms


In [18]:
from scipy.linalg import sqrtm

def create_utility_matrix(data, formatizer = {'user':3, 'item': 0, 'value': 4}):    
    """
        :param data:      Array-like, 2D, nx3
        :param formatizer:pass the formatizer
        :return:          utility matrix (n x m), n=users, m=items
    """
        
    itemField = formatizer['item']
    userField = formatizer['user']
    valueField = formatizer['value']
    userList = data.iloc[:,userField].tolist()
    itemList = data.iloc[:,itemField].tolist()
    valueList = data.iloc[:,valueField].tolist()
    users = list(set(data.iloc[:,userField]))
    items = list(set(data.iloc[:,itemField]))
    
    users_index = {users[i]: i for i in range(len(users))}
    pd_dict = {item: [np.nan for i in range(len(users))] for item in items}
    
    for i in range(0,len(data)):
        item = itemList[i]
        user = userList[i]
        value = valueList[i]    
        pd_dict[item][users_index[user]] = value
        
    X = pd.DataFrame(pd_dict)
    X.index = users
        
    itemcols = list(X.columns)
    items_index = {itemcols[i]: i for i in range(len(itemcols))}
    # users_index gives us a mapping of user_id to index of user
    # items_index provides the same for items    
    
    return X, users_index, items_index

In [19]:
def svd(train, k):
    utilMat = np.array(train)    # the nan or unavailable entries are masked
    mask = np.isnan(utilMat)
    masked_arr = np.ma.masked_array(utilMat, mask)
    
    item_means = np.mean(masked_arr, axis=0)    # nan entries will replaced by the average rating for each item
   
    utilMat = masked_arr.filled(item_means)    
    x = np.tile(item_means, (utilMat.shape[0],1))    # we remove the per item average from all entries.
    
    # the above mentioned nan entries will be essentially zero now
    utilMat = utilMat - x    
    
    U, s, V = np.linalg.svd(utilMat, full_matrices=False)  # The magic happens here. U and V are user and item features
    
    s = np.diag(s)    # we take only the k most significant features
    s = s[0:k,0:k]
    U = U[:,0:k]
    V = V[0:k,:]    
    
    s_root = sqrtm(s)    
    Usk=np.dot(U,s_root)
    skV = np.dot(s_root,V)
    UsV = np.dot(Usk, skV)    
    UsV = UsV + x    
    print("svd done")
    return UsV

In [None]:
%%time

def rmse(true, pred):
    # this will be used towards the end
    x = true - pred
    return sum([xi*xi for xi in x])/len(x)   # to test the performance over a different number of features

no_of_features = [8,10]
utilMat, users_index, items_index = create_utility_matrix(train)

for f in no_of_features: 
    svdout = svd(utilMat, k=f)
    pred = [] #to store the predicted ratings    
    
    for _,row in test.iterrows():
        user = row['userId']
        item = row['movieId']        
        
        u_index = users_index.get(user)
        if u_index is None:
            continue
            
        if item in items_index:
            i_index = items_index[item]
            pred_rating = svdout[u_index, i_index]
        else:
            pred_rating = np.mean(svdout[u_index, :])
        pred.append(pred_rating)
        
    print(f)
    
print(rmse(test['rating'], pred))

svd done


True

# Использованные материалы

Я решал эту задачу впервые. поэтмоу пришлось много всего поискать, посмотреть, разобраться.

    https://www.kaggle.com/datasets/grouplens/movielens-20m-dataset
    https://www.kaggle.com/code/sankha1998/collaborative-movie-recommendation-system
    https://towardsdatascience.com/beginners-guide-to-creating-an-svd-recommender-system-1fd7326d1f65
    https://medium.com/radon-dev/als-implicit-collaborative-filtering-5ed653ba39fe
    
    возможно использование https://implicit.readthedocs.io/en/latest/index.html
    
    немного идей про метрики Takács, Gábor & Tikk, Domonkos. (2012). Alternating least squares for personalized ranking. 10.1145/2365952.2365972. 