# DZ Рекомендательные системы

## Задание

Использовать датасет MovieLens.

1. Построить рекомендации (регрессия, предсказываем оценку) на фичах:
- TF-IDF на тегах и жанрах;
- средние оценки (+ median, variance и т. д.) пользователя и фильма.
2. Оценить RMSE на тестовой выборке.

In [1]:
import pandas as pd
import numpy as np
from collections import Counter
from datetime import datetime
from tqdm.notebook import tqdm
import matplotlib.pyplot as plt
import statistics

In [2]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LinearRegression
from sklearn.neighbors import NearestNeighbors

In [3]:
# Data
# Single place to point the notebook at the MovieLens "ml-latest-small" folder;
# change DATA_DIR to run the notebook on another machine.
DATA_DIR = 'D:/Work/KimT/Учеба/Новая папка/ml-latest-small'

links = pd.read_csv(f'{DATA_DIR}/links.csv')
movies = pd.read_csv(f'{DATA_DIR}/movies.csv')
ratings = pd.read_csv(f'{DATA_DIR}/ratings.csv')
tags = pd.read_csv(f'{DATA_DIR}/tags.csv')

In [4]:
# Only the last expression of the cell is rendered, so the head() preview below
# is evaluated but not shown; the cell displays the number of rows in `movies`.
movies.head()
# tags.head()
len(movies.movieId)

9742

### Жанры фильмов

In [5]:
# movies.genres[0].split('|')

In [6]:
# movies.genres.values[0]

In [7]:
def change_string(s):
    """Turn a pipe-separated genre string into space-separated single tokens.

    Spaces and hyphens inside each genre are removed first (so a genre such as
    'Sci-Fi' becomes the single token 'SciFi'), then every '|' separator is
    replaced by one space.
    """
    compact = s.replace(' ', '').replace('-', '')
    return compact.replace('|', ' ')

In [8]:
# One normalised genre string per row of `movies`, in row order.
movie_genres = list(map(change_string, movies.genres.values))
movie_genres[:10]

['Adventure Animation Children Comedy Fantasy',
 'Adventure Children Fantasy',
 'Comedy Romance',
 'Comedy Drama Romance',
 'Comedy',
 'Action Crime Thriller',
 'Comedy Romance',
 'Adventure Children',
 'Action',
 'Action Adventure Thriller']

In [9]:
# Learn a TF-IDF vocabulary from the genre strings and vectorise them in one step;
# the result is a sparse matrix with one row per entry of movie_genres.
tfidf = TfidfVectorizer()
X_train_tfidf = tfidf.fit_transform(movie_genres)
X_train_tfidf

<9742x20 sparse matrix of type '<class 'numpy.float64'>'
	with 22084 stored elements in Compressed Sparse Row format>

In [10]:
# ?TfidfVectorizer

In [11]:
# `get_feature_names()` was removed in scikit-learn 1.2; use its replacement
# when it exists and fall back only on old installations.
genre_terms = (tfidf.get_feature_names_out()
               if hasattr(tfidf, 'get_feature_names_out')
               else tfidf.get_feature_names())
pd.DataFrame(X_train_tfidf.toarray(), columns=genre_terms)

Unnamed: 0,action,adventure,animation,children,comedy,crime,documentary,drama,fantasy,filmnoir,horror,imax,musical,mystery,nogenreslisted,romance,scifi,thriller,war,western
0,0.000000,0.416846,0.516225,0.504845,0.267586,0.0,0.0,0.000000,0.482990,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,0.0
1,0.000000,0.512361,0.000000,0.620525,0.000000,0.0,0.0,0.000000,0.593662,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,0.0
2,0.000000,0.000000,0.000000,0.000000,0.570915,0.0,0.0,0.000000,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.821009,0.0,0.0,0.0,0.0
3,0.000000,0.000000,0.000000,0.000000,0.505015,0.0,0.0,0.466405,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.726241,0.0,0.0,0.0,0.0
4,0.000000,0.000000,0.000000,0.000000,1.000000,0.0,0.0,0.000000,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
9737,0.436010,0.000000,0.614603,0.000000,0.318581,0.0,0.0,0.000000,0.575034,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,0.0
9738,0.000000,0.000000,0.682937,0.000000,0.354002,0.0,0.0,0.000000,0.638968,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,0.0
9739,0.000000,0.000000,0.000000,0.000000,0.000000,0.0,0.0,1.000000,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,0.0
9740,0.578606,0.000000,0.815607,0.000000,0.000000,0.0,0.0,0.000000,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,0.0


In [12]:
# Index the genre TF-IDF rows for nearest-neighbour lookup:
# 7 neighbours per query, compared by Euclidean distance.
neigh = NearestNeighbors(n_neighbors=7, metric='euclidean')
neigh.fit(X_train_tfidf)

NearestNeighbors(metric='euclidean', n_neighbors=7)

In [13]:
# Example query: a pipe-separated genre list passed through change_string,
# the same helper used to build movie_genres.
test = change_string("Adventure|Comedy|Fantasy|Documentary")

# transform (not fit_transform): project the query onto the already fitted vocabulary.
X_tfidf = tfidf.transform([test])

# With return_distance=True the result is a (distances, indices) pair.
res = neigh.kneighbors(X_tfidf, return_distance=True)

In [14]:
res

(array([[0.38758619, 0.56525059, 0.58114553, 0.65617877, 0.65617877,
         0.66834044, 0.66834044]]),
 array([[8014, 5836, 8161, 4853, 7597, 5636, 5627]], dtype=int64))

In [15]:
movies.iloc[res[1][0]]

Unnamed: 0,movieId,title,genres
8014,97757,'Hellboy': The Seeds of Creation (2004),Action|Adventure|Comedy|Documentary|Fantasy
5836,32314,Incident at Loch Ness (2004),Adventure|Comedy|Documentary
8161,102590,Darkon (2006),Documentary|Fantasy
4853,7256,Touching the Void (2003),Adventure|Documentary
7597,86593,African Cats (2011),Adventure|Documentary
5636,27368,Asterix & Obelix: Mission Cleopatra (Astérix &...,Adventure|Comedy|Fantasy
5627,27251,"10th Kingdom, The (2000)",Adventure|Comedy|Fantasy


### Теги фильмов

In [16]:
# Pair each movie row with the tag rows that share its movieId
# (one output row per movie/tag pair, as the preview shows).
movies_with_tags = movies.merge(tags, on='movieId')
movies_with_tags.head()

Unnamed: 0,movieId,title,genres,userId,tag,timestamp
0,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy,336,pixar,1139045764
1,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy,474,pixar,1137206825
2,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy,567,fun,1525286013
3,2,Jumanji (1995),Adventure|Children|Fantasy,62,fantasy,1528843929
4,2,Jumanji (1995),Adventure|Children|Fantasy,62,magic board game,1528843932


In [17]:
movies_with_tags.tag.unique()

array(['pixar', 'fun', 'fantasy', ..., 'star wars', 'gintama', 'remaster'],
      dtype=object)

In [18]:
# Drop rows that contain a missing value. Rebinding the name (instead of
# inplace=True) keeps the cell chain-friendly and safe to re-run.
movies_with_tags = movies_with_tags.dropna()
# Number of distinct tagged titles (the only value this cell displays).
movies_with_tags.title.unique().shape

(1572,)

In [19]:
def normalize_tag(s):
    """Collapse one raw tag into a single lowercase token.

    The value is converted to str, spaces and hyphens are removed, and the
    result is lowercased, so a multi-word tag becomes one vocabulary term.

    Deliberately NOT named `change_string`: that name already belongs to the
    genre helper defined earlier in the notebook, and redefining it here would
    silently change what the genre cells do when they are re-run.
    """
    return str(s).replace(' ', '').replace('-', '').lower()

tag_strings = []
movies_ = []

# One space-separated "document" of normalised tags per movie title;
# movies_[i] is the title that tag_strings[i] belongs to.
for movie, group in tqdm(movies_with_tags.groupby('title')):
    tag_strings.append(' '.join(normalize_tag(s) for s in group.tag.values))
    movies_.append(movie)

  0%|          | 0/1572 [00:00<?, ?it/s]

In [20]:
tag_strings[:10]

['artistic funny humorous inspiring intelligent quirky romance zooeydeschanel',
 'lawyers',
 'creepy suspense',
 'shakespearesortof',
 'dogs remake',
 'disney',
 'terrorism',
 'court claustrophobic confrontational earnest gooddialogue greatscreenplay gritty motivational thoughtprovoking',
 'stranded',
 'markruffalo']

In [21]:
# Separate TF-IDF vocabulary for tags: one row per title in movies_,
# one column per distinct normalised tag token.
tfidf_tag = TfidfVectorizer()
X_train_tfidf_tag = tfidf_tag.fit_transform(tag_strings)
X_train_tfidf_tag

<1572x1472 sparse matrix of type '<class 'numpy.float64'>'
	with 3598 stored elements in Compressed Sparse Row format>

In [22]:
# `get_feature_names()` was removed in scikit-learn 1.2; use its replacement
# when it exists and fall back only on old installations.
tag_terms = (tfidf_tag.get_feature_names_out()
             if hasattr(tfidf_tag, 'get_feature_names_out')
             else tfidf_tag.get_feature_names())
pd.DataFrame(X_train_tfidf_tag.toarray(), columns=tag_terms)

Unnamed: 0,06oscarnominatedbestmovieanimation,1900s,1920s,1950s,1960s,1970s,1980s,1990s,2001like,2danimation,...,worldwari,worldwarii,writing,wrongfulimprisonment,wry,youngermen,zither,zoekazan,zombies,zooeydeschanel
0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.420342
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.000000
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.000000
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.000000
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.000000
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1567,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.000000
1568,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.000000
1569,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.000000
1570,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.000000


In [23]:
# Nearest-neighbour index over the tag TF-IDF rows, 10 neighbours per query.
# NOTE(review): no metric is passed, only p=1 — under sklearn's documented default
# (Minkowski) that means Manhattan distance, unlike the Euclidean index used for
# genres; confirm the difference is intended.
neigh_tag = NearestNeighbors(n_neighbors=10, p=1)
neigh_tag.fit(X_train_tfidf_tag)

NearestNeighbors(n_neighbors=10, p=1)

In [24]:
# Example query written directly in the normalised form used by tag_strings
# (lowercase tokens separated by spaces).
test = 'highschool pixar fun'

# Project the query onto the tag vocabulary fitted above.
X_tfidf_tag = tfidf_tag.transform([test])

# (distances, indices) pair; the indices address rows of X_train_tfidf_tag.
res = neigh_tag.kneighbors(X_tfidf_tag, return_distance=True)

In [25]:
res

(array([[0.95532444, 1.49356676, 1.52685966, 1.70596435, 1.70596435,
         1.70596435, 1.70596435, 1.70596435, 1.70596435, 1.70596435]]),
 array([[1432,  211,  565,  940,  439,  590,  336,  951,  432,  546]],
       dtype=int64))

In [26]:
# res[1][0] holds the neighbour row indices for the single query;
# print each neighbour's title next to its tag document.
for i in res[1][0]:
    print(movies_[i], tag_strings[i])

Toy Story (1995) pixar pixar fun
Bug's Life, A (1998) pixar
Guardians of the Galaxy 2 (2017) fun
Napoleon Dynamite (2004) highschool
Ferris Bueller's Day Off (1986) highschool
Heathers (1989) highschool
Dead Poets Society (1989) highschool highschool
Never Been Kissed (1999) highschool
Fast Times at Ridgemont High (1982) highschool
Grease (1978) highschool


### Средняя оценка (+median, variance) пользователя и фильма

In [27]:
ratings.head()

Unnamed: 0,userId,movieId,rating,timestamp
0,1,1,4.0,964982703
1,1,3,4.0,964981247
2,1,6,4.0,964982224
3,1,47,5.0,964983815
4,1,50,5.0,964982931


In [28]:
ratings.userId.unique().shape

(610,)

In [29]:
# Parallel lists: users[k] is a userId and rating_list_user[k] is the array
# of every rating that user gave.
users = []
rating_list_user = []

for user_id, user_rows in tqdm(ratings.groupby('userId')):
    users.append(user_id)
    rating_list_user.append(user_rows.rating.values)

  0%|          | 0/610 [00:00<?, ?it/s]

In [30]:
users
rating_list_user[:6]

[array([4., 4., 4., 5., 5., 3., 5., 4., 5., 5., 5., 5., 3., 5., 4., 5., 3.,
        3., 5., 4., 4., 5., 4., 3., 4., 5., 4., 3., 5., 4., 4., 5., 4., 4.,
        4., 5., 5., 3., 5., 3., 4., 3., 3., 4., 5., 5., 5., 4., 5., 3., 5.,
        5., 5., 5., 3., 5., 5., 4., 5., 4., 5., 5., 5., 4., 5., 5., 4., 5.,
        5., 5., 5., 5., 4., 5., 5., 4., 2., 5., 5., 5., 5., 5., 5., 3., 4.,
        5., 5., 5., 5., 5., 5., 4., 3., 3., 3., 3., 4., 4., 5., 4., 5., 3.,
        5., 5., 4., 5., 3., 3., 5., 4., 4., 5., 4., 4., 5., 5., 4., 4., 5.,
        4., 5., 4., 5., 4., 5., 4., 5., 5., 5., 3., 5., 4., 4., 4., 5., 5.,
        5., 5., 5., 4., 5., 4., 4., 2., 4., 4., 5., 5., 2., 5., 4., 5., 2.,
        5., 4., 3., 5., 4., 5., 5., 4., 4., 5., 3., 5., 5., 5., 5., 5., 4.,
        2., 4., 4., 5., 4., 4., 5., 3., 5., 5., 5., 5., 4., 4., 5., 5., 5.,
        4., 5., 5., 5., 5., 5., 4., 5., 5., 5., 4., 5., 5., 5., 5., 4., 5.,
        4., 1., 3., 3., 5., 5., 5., 4., 4., 5., 5., 5., 4., 4., 4., 5., 4.,
        4., 

In [31]:
# Parallel lists: movie_rat[k] is a movieId and rating_list_movie[k] is the
# array of every rating that movie received.
movie_rat = []
rating_list_movie = []

for movie_id, movie_rows in tqdm(ratings.groupby('movieId')):
    movie_rat.append(movie_id)
    rating_list_movie.append(movie_rows.rating.values)

  0%|          | 0/9724 [00:00<?, ?it/s]

In [32]:
rating_list_movie[:6]
# movie_rat

[array([4. , 4. , 4.5, 2.5, 4.5, 3.5, 4. , 3.5, 3. , 5. , 3. , 3. , 5. ,
        5. , 3. , 4. , 5. , 3. , 3. , 5. , 5. , 4. , 4. , 2.5, 5. , 4.5,
        0.5, 4. , 2.5, 4. , 3. , 3. , 4. , 3. , 5. , 4.5, 4. , 4. , 3. ,
        3.5, 4. , 4. , 3. , 2. , 3. , 4. , 4. , 3. , 4. , 3.5, 5. , 5. ,
        2. , 3. , 4. , 4.5, 4. , 4. , 5. , 3.5, 4.5, 5. , 5. , 4. , 4. ,
        4. , 4. , 4. , 4. , 2. , 3.5, 5. , 4. , 5. , 3.5, 3. , 3. , 4. ,
        3.5, 5. , 3.5, 3.5, 5. , 3.5, 3. , 5. , 4. , 5. , 5. , 4. , 4.5,
        4.5, 4. , 4. , 2. , 5. , 5. , 5. , 4. , 5. , 4. , 4. , 3. , 4.5,
        4.5, 3. , 4.5, 4. , 4. , 4. , 3. , 2. , 5. , 4. , 3. , 3.5, 3.5,
        5. , 4. , 4. , 3.5, 4. , 4. , 4. , 5. , 5. , 4. , 5. , 5. , 4. ,
        5. , 5. , 3. , 3. , 4.5, 5. , 3.5, 4.5, 4. , 5. , 3. , 5. , 4. ,
        3.5, 5. , 2. , 4. , 4. , 4. , 2.5, 4. , 4. , 4.5, 4. , 5. , 5. ,
        5. , 5. , 4.5, 1.5, 4. , 4. , 4. , 5. , 4. , 4. , 4. , 3. , 4. ,
        4.5, 4.5, 3.5, 4. , 4. , 4. , 4. , 4. , 4. 

In [33]:
# Mean rating
def rating_mean(rating_list):
    """Return the mean of each rating collection, rounded to one decimal place.

    rating_list: sequence of rating collections (one per user or per movie).
    Returns a list with one rounded mean per collection, in the same order.
    """
    return [round(np.mean(scores), 1) for scores in rating_list]

In [34]:
# Median
def rating_median(rating_list):
    """Return the median of each rating collection.

    rating_list: sequence of rating collections (one per user or per movie).
    Returns a list with one median per collection, in the same order.
    """
    return [statistics.median(scores) for scores in rating_list]

In [35]:
# Variance
def rating_variance(rating_list):
    """Return the sample variance of each rating collection.

    A collection with fewer than two ratings has no sample variance, so 0 is
    reported for it instead of calling statistics.variance.
    Returns a list with one value per collection, in the same order.
    """
    return [
        statistics.variance(scores) if len(scores) >= 2 else 0
        for scores in rating_list
    ]

In [36]:
# Per-user rating statistics, one row per userId (same order as `users`).
user_stat_columns = {
    'userId': users,
    'rating_mean_us': rating_mean(rating_list_user),
    'rating_median_us': rating_median(rating_list_user),
    'rating_variance_us': rating_variance(rating_list_user),
}
user_rating_stat = pd.DataFrame(user_stat_columns)

user_rating_stat.head()

Unnamed: 0,userId,rating_mean_us,rating_median_us,rating_variance_us
0,1,4.4,5.0,0.640077
1,2,3.9,4.0,0.649015
2,3,2.4,0.5,4.370783
3,4,3.6,4.0,1.727132
4,5,3.6,4.0,0.980973


In [37]:
# Per-movie rating statistics, one row per movieId (same order as `movie_rat`).
movie_stat_columns = {
    'movieId': movie_rat,
    'rating_mean_mov': rating_mean(rating_list_movie),
    'rating_median_mov': rating_median(rating_list_movie),
    'rating_variance_mov': rating_variance(rating_list_movie),
}
movie_rating_stat = pd.DataFrame(movie_stat_columns)

movie_rating_stat.head()

Unnamed: 0,movieId,rating_mean_mov,rating_median_mov,rating_variance_mov
0,1,3.9,4.0,0.69699
1,2,3.4,3.5,0.777419
2,3,3.3,3.0,1.112651
3,4,2.4,3.0,0.72619
4,5,3.1,3.0,0.822917


In [38]:
ratings.head()

Unnamed: 0,userId,movieId,rating,timestamp
0,1,1,4.0,964982703
1,1,3,4.0,964981247
2,1,6,4.0,964982224
3,1,47,5.0,964983815
4,1,50,5.0,964982931


In [39]:
movies.head()

Unnamed: 0,movieId,title,genres
0,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy
1,2,Jumanji (1995),Adventure|Children|Fantasy
2,3,Grumpier Old Men (1995),Comedy|Romance
3,4,Waiting to Exhale (1995),Comedy|Drama|Romance
4,5,Father of the Bride Part II (1995),Comedy


In [40]:
ratings.movieId.unique().shape

(9724,)

In [41]:
# Pair each movie row with the rating rows that share its movieId, so every
# resulting row is one (movie, user, rating) observation.
movies_with_rating = movies.merge(ratings, on='movieId')
movies_with_rating.head()

Unnamed: 0,movieId,title,genres,userId,rating,timestamp
0,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy,1,4.0,964982703
1,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy,5,4.0,847434962
2,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy,7,4.5,1106635946
3,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy,15,2.5,1510577970
4,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy,17,4.5,1305696483


In [42]:
# Attach the per-user statistics (matched on userId) and then the per-movie
# statistics (matched on movieId), so each rating row carries both feature sets.
movies_with_rating_stat_1 = movies_with_rating.merge(user_rating_stat, on='userId')
movies_with_rating_stat_full = movies_with_rating_stat_1.merge(movie_rating_stat, on='movieId')
movies_with_rating_stat_full.head()

Unnamed: 0,movieId,title,genres,userId,rating,timestamp,rating_mean_us,rating_median_us,rating_variance_us,rating_mean_mov,rating_median_mov,rating_variance_mov
0,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy,1,4.0,964982703,4.4,5.0,0.640077,3.9,4.0,0.69699
1,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy,5,4.0,847434962,3.6,4.0,0.980973,3.9,4.0,0.69699
2,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy,7,4.5,1106635946,3.2,3.5,1.76782,3.9,4.0,0.69699
3,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy,15,2.5,1510577970,3.4,3.5,1.284605,3.9,4.0,0.69699
4,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy,17,4.5,1305696483,4.2,4.0,0.258562,3.9,4.0,0.69699


In [43]:
movies_with_rating_stat_full.columns

Index(['movieId', 'title', 'genres', 'userId', 'rating', 'timestamp',
       'rating_mean_us', 'rating_median_us', 'rating_variance_us',
       'rating_mean_mov', 'rating_median_mov', 'rating_variance_mov'],
      dtype='object')

In [44]:
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split 

In [45]:
model = LogisticRegression() # берем в качестве модели логистическую регресиию 

Предсказание по средней оценке пользователя

In [46]:
X = movies_with_rating_stat_full[['movieId', 'userId', 'rating', 'timestamp',
       'rating_mean_us', 'rating_median_us', 'rating_variance_us',
       'rating_mean_mov', 'rating_median_mov', 'rating_variance_mov']]
y = movies_with_rating_stat_full['movieId']

In [47]:
y

0              1
1              1
2              1
3              1
4              1
           ...  
100831    193579
100832    193581
100833    193583
100834    193585
100835    193587
Name: movieId, Length: 100836, dtype: int64

In [48]:
# Split the data: hold out 20% of the rows for testing; the fixed random_state
# keeps the split the same on every run.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [50]:
model.fit(X_train, y_train)

# Step 2 of the assignment: RMSE on the held-out test rows.
y_pred = model.predict(X_test)
rmse = np.sqrt(np.mean((np.asarray(y_test) - y_pred) ** 2))
rmse

KeyboardInterrupt: 