# DZ Рекомендательные системы

## Задание

Использовать датасет MovieLens.

1. Построить рекомендации (регрессия, предсказываем оценку) на фичах:
- TF-IDF на тегах и жанрах;
- средние оценки (+ median, variance и т. д.) пользователя и фильма.
2. Оценить RMSE на тестовой выборке.

In [1]:
import pandas as pd
import numpy as np
from collections import Counter
from datetime import datetime
from tqdm.notebook import tqdm
import matplotlib.pyplot as plt
import statistics

In [2]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.neighbors import NearestNeighbors

In [3]:
# Data
links = pd.read_csv('links.csv')
movies = pd.read_csv('movies.csv')
ratings = pd.read_csv('ratings.csv')
tags = pd.read_csv('tags.csv')

In [4]:
movies.head()
# tags.head()
len(movies.movieId)

9742

### Жанры фильмов

In [5]:
def change_string(s):
    return ' '.join(s.replace(' ', '').replace('-', '').split('|'))

In [6]:
movie_genres = [change_string(g) for g in movies.genres.values]
movie_genres[:10]

['Adventure Animation Children Comedy Fantasy',
 'Adventure Children Fantasy',
 'Comedy Romance',
 'Comedy Drama Romance',
 'Comedy',
 'Action Crime Thriller',
 'Comedy Romance',
 'Adventure Children',
 'Action',
 'Action Adventure Thriller']

In [7]:
tfidf = TfidfVectorizer()
X_train_tfidf = tfidf.fit_transform(movie_genres)
X_train_tfidf

<9742x20 sparse matrix of type '<class 'numpy.float64'>'
	with 22084 stored elements in Compressed Sparse Row format>

In [8]:
mov_gens = movies.merge(pd.DataFrame(X_train_tfidf.toarray(), columns=tfidf.get_feature_names()), left_index=True, right_index=True)

# mov_gens.title.unique().shape
mov_gens

Unnamed: 0,movieId,title,genres,action,adventure,animation,children,comedy,crime,documentary,...,horror,imax,musical,mystery,nogenreslisted,romance,scifi,thriller,war,western
0,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy,0.000000,0.416846,0.516225,0.504845,0.267586,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,0.0
1,2,Jumanji (1995),Adventure|Children|Fantasy,0.000000,0.512361,0.000000,0.620525,0.000000,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,0.0
2,3,Grumpier Old Men (1995),Comedy|Romance,0.000000,0.000000,0.000000,0.000000,0.570915,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.821009,0.0,0.0,0.0,0.0
3,4,Waiting to Exhale (1995),Comedy|Drama|Romance,0.000000,0.000000,0.000000,0.000000,0.505015,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.726241,0.0,0.0,0.0,0.0
4,5,Father of the Bride Part II (1995),Comedy,0.000000,0.000000,0.000000,0.000000,1.000000,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
9737,193581,Black Butler: Book of the Atlantic (2017),Action|Animation|Comedy|Fantasy,0.436010,0.000000,0.614603,0.000000,0.318581,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,0.0
9738,193583,No Game No Life: Zero (2017),Animation|Comedy|Fantasy,0.000000,0.000000,0.682937,0.000000,0.354002,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,0.0
9739,193585,Flint (2017),Drama,0.000000,0.000000,0.000000,0.000000,0.000000,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,0.0
9740,193587,Bungo Stray Dogs: Dead Apple (2018),Action|Animation,0.578606,0.000000,0.815607,0.000000,0.000000,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,0.0


In [9]:
neigh = NearestNeighbors(n_neighbors=7, metric='euclidean')
neigh.fit(X_train_tfidf)

NearestNeighbors(metric='euclidean', n_neighbors=7)

In [10]:
test = change_string("Adventure|Comedy|Fantasy|Documentary")

X_tfidf = tfidf.transform([test])

res = neigh.kneighbors(X_tfidf, return_distance=True)

In [11]:
res

(array([[0.38758619, 0.56525059, 0.58114553, 0.65617877, 0.65617877,
         0.66834044, 0.66834044]]),
 array([[8014, 5836, 8161, 4853, 7597, 5636, 5627]], dtype=int64))

In [12]:
movies.iloc[res[1][0]]

Unnamed: 0,movieId,title,genres
8014,97757,'Hellboy': The Seeds of Creation (2004),Action|Adventure|Comedy|Documentary|Fantasy
5836,32314,Incident at Loch Ness (2004),Adventure|Comedy|Documentary
8161,102590,Darkon (2006),Documentary|Fantasy
4853,7256,Touching the Void (2003),Adventure|Documentary
7597,86593,African Cats (2011),Adventure|Documentary
5636,27368,Asterix & Obelix: Mission Cleopatra (Astérix &...,Adventure|Comedy|Fantasy
5627,27251,"10th Kingdom, The (2000)",Adventure|Comedy|Fantasy


## Теги фильмов

In [13]:
movies_with_tags = movies.merge(tags, on='movieId')
movies_with_tags.head()

Unnamed: 0,movieId,title,genres,userId,tag,timestamp
0,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy,336,pixar,1139045764
1,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy,474,pixar,1137206825
2,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy,567,fun,1525286013
3,2,Jumanji (1995),Adventure|Children|Fantasy,62,fantasy,1528843929
4,2,Jumanji (1995),Adventure|Children|Fantasy,62,magic board game,1528843932


In [14]:
movies_with_tags.tag.unique()

array(['pixar', 'fun', 'fantasy', ..., 'star wars', 'gintama', 'remaster'],
      dtype=object)

In [15]:
movies_with_tags.dropna(inplace=True)
movies_with_tags.tag.unique().shape
movies_with_tags.title.unique().shape

(1572,)

In [16]:
movies_with_tags.movieId.unique().shape

(1572,)

In [17]:
def change_string(s):
    return str(s).replace(' ', '').replace('-', '').lower()

tag_strings = []
movies_ = []

for movie, group in tqdm(movies_with_tags.groupby('title')):
    tag_strings.append(' '.join([change_string(s) for s in group.tag.values]))
    movies_.append(movie)

  0%|          | 0/1572 [00:00<?, ?it/s]

In [18]:
type(movies_)

list

In [19]:
movies.loc[movies['title']=='eXistenZ (1999)']

Unnamed: 0,movieId,title,genres
1961,2600,eXistenZ (1999),Action|Sci-Fi|Thriller


In [20]:
tag_strings[:1]

['artistic funny humorous inspiring intelligent quirky romance zooeydeschanel']

In [21]:
tfidf_tag = TfidfVectorizer()
X_train_tfidf_tag = tfidf_tag.fit_transform(tag_strings)
X_train_tfidf_tag

<1572x1472 sparse matrix of type '<class 'numpy.float64'>'
	with 3598 stored elements in Compressed Sparse Row format>

In [22]:
mov_df = pd.DataFrame(movies_,  columns =['title'])
mov_df

Unnamed: 0,title
0,(500) Days of Summer (2009)
1,...And Justice for All (1979)
2,10 Cloverfield Lane (2016)
3,10 Things I Hate About You (1999)
4,101 Dalmatians (1996)
...,...
1567,Zero Dark Thirty (2012)
1568,Zombieland (2009)
1569,Zoolander (2001)
1570,Zulu (1964)


In [23]:
mov_tags = mov_df.merge(pd.DataFrame(X_train_tfidf_tag.toarray(), columns=tfidf_tag.get_feature_names()), left_index=True, right_index=True)
mov_tags

Unnamed: 0,title,06oscarnominatedbestmovieanimation,1900s,1920s,1950s,1960s,1970s,1980s,1990s,2001like,...,worldwari,worldwarii,writing,wrongfulimprisonment,wry,youngermen,zither,zoekazan,zombies,zooeydeschanel
0,(500) Days of Summer (2009),0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.420342
1,...And Justice for All (1979),0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.000000
2,10 Cloverfield Lane (2016),0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.000000
3,10 Things I Hate About You (1999),0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.000000
4,101 Dalmatians (1996),0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.000000
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1567,Zero Dark Thirty (2012),0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.000000
1568,Zombieland (2009),0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.000000
1569,Zoolander (2001),0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.000000
1570,Zulu (1964),0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.000000


In [24]:
neigh_tag = NearestNeighbors(n_neighbors=10, p=1)
neigh_tag.fit(X_train_tfidf_tag)

NearestNeighbors(n_neighbors=10, p=1)

In [25]:
test = 'highschool pixar fun'

X_tfidf_tag = tfidf_tag.transform([test])

res = neigh_tag.kneighbors(X_tfidf_tag, return_distance=True)

In [26]:
res

(array([[0.95532444, 1.49356676, 1.52685966, 1.70596435, 1.70596435,
         1.70596435, 1.70596435, 1.70596435, 1.70596435, 1.70596435]]),
 array([[1432,  211,  565,  940,  439,  590,  336,  951,  432,  546]],
       dtype=int64))

In [27]:
for i in res[1][0]:
    print(movies_[i], tag_strings[i])

Toy Story (1995) pixar pixar fun
Bug's Life, A (1998) pixar
Guardians of the Galaxy 2 (2017) fun
Napoleon Dynamite (2004) highschool
Ferris Bueller's Day Off (1986) highschool
Heathers (1989) highschool
Dead Poets Society (1989) highschool highschool
Never Been Kissed (1999) highschool
Fast Times at Ridgemont High (1982) highschool
Grease (1978) highschool


In [28]:
mov_gens_tags = mov_gens.merge(mov_tags, how='inner', on='title')
# mov_gens_tags.loc[mov_gens_tags['movieId']==2600]
mov_gens_tags.title.unique().shape
mov_gens_tags

Unnamed: 0,movieId,title,genres,action_x,adventure_x,animation_x,children_x,comedy_x,crime_x,documentary_x,...,worldwari,worldwarii,writing,wrongfulimprisonment,wry,youngermen,zither,zoekazan,zombies,zooeydeschanel
0,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy,0.000000,0.416846,0.516225,0.504845,0.267586,0.000000,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,2,Jumanji (1995),Adventure|Children|Fantasy,0.000000,0.512361,0.000000,0.620525,0.000000,0.000000,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,3,Grumpier Old Men (1995),Comedy|Romance,0.000000,0.000000,0.000000,0.000000,0.570915,0.000000,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,5,Father of the Bride Part II (1995),Comedy,0.000000,0.000000,0.000000,0.000000,1.000000,0.000000,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,7,Sabrina (1995),Comedy|Romance,0.000000,0.000000,0.000000,0.000000,0.570915,0.000000,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1569,183611,Game Night (2018),Action|Comedy|Crime|Horror,0.476923,0.000000,0.000000,0.000000,0.348475,0.552125,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1570,184471,Tomb Raider (2018),Action|Adventure|Fantasy,0.497828,0.566649,0.000000,0.000000,0.000000,0.000000,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1571,187593,Deadpool 2 (2018),Action|Comedy|Sci-Fi,0.572190,0.000000,0.000000,0.000000,0.418084,0.000000,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1572,187595,Solo: A Star Wars Story (2018),Action|Adventure|Children|Sci-Fi,0.418252,0.476072,0.000000,0.576575,0.000000,0.000000,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


### Средняя оценка (+median, variance) пользователя и фильма

In [29]:
ratings

Unnamed: 0,userId,movieId,rating,timestamp
0,1,1,4.0,964982703
1,1,3,4.0,964981247
2,1,6,4.0,964982224
3,1,47,5.0,964983815
4,1,50,5.0,964982931
...,...,...,...,...
100831,610,166534,4.0,1493848402
100832,610,168248,5.0,1493850091
100833,610,168250,5.0,1494273047
100834,610,168252,5.0,1493846352


In [30]:
ratings.userId.unique().shape

(610,)

In [31]:
rating_list_user = []
users = []

for user, group in tqdm(ratings.groupby('userId')):
    rating_list_user.append(group.rating.values)
    users.append(user)

  0%|          | 0/610 [00:00<?, ?it/s]

In [32]:
users
rating_list_user[:6]

[array([4., 4., 4., 5., 5., 3., 5., 4., 5., 5., 5., 5., 3., 5., 4., 5., 3.,
        3., 5., 4., 4., 5., 4., 3., 4., 5., 4., 3., 5., 4., 4., 5., 4., 4.,
        4., 5., 5., 3., 5., 3., 4., 3., 3., 4., 5., 5., 5., 4., 5., 3., 5.,
        5., 5., 5., 3., 5., 5., 4., 5., 4., 5., 5., 5., 4., 5., 5., 4., 5.,
        5., 5., 5., 5., 4., 5., 5., 4., 2., 5., 5., 5., 5., 5., 5., 3., 4.,
        5., 5., 5., 5., 5., 5., 4., 3., 3., 3., 3., 4., 4., 5., 4., 5., 3.,
        5., 5., 4., 5., 3., 3., 5., 4., 4., 5., 4., 4., 5., 5., 4., 4., 5.,
        4., 5., 4., 5., 4., 5., 4., 5., 5., 5., 3., 5., 4., 4., 4., 5., 5.,
        5., 5., 5., 4., 5., 4., 4., 2., 4., 4., 5., 5., 2., 5., 4., 5., 2.,
        5., 4., 3., 5., 4., 5., 5., 4., 4., 5., 3., 5., 5., 5., 5., 5., 4.,
        2., 4., 4., 5., 4., 4., 5., 3., 5., 5., 5., 5., 4., 4., 5., 5., 5.,
        4., 5., 5., 5., 5., 5., 4., 5., 5., 5., 4., 5., 5., 5., 5., 4., 5.,
        4., 1., 3., 3., 5., 5., 5., 4., 4., 5., 5., 5., 4., 4., 4., 5., 4.,
        4., 

In [33]:
rating_list_movie = []
movie_rat = []

for movie, group in tqdm(ratings.groupby('movieId')):
    rating_list_movie.append(group.rating.values)
    movie_rat.append(movie)

  0%|          | 0/9724 [00:00<?, ?it/s]

In [34]:
rating_list_movie[:6]
# movie_rat

[array([4. , 4. , 4.5, 2.5, 4.5, 3.5, 4. , 3.5, 3. , 5. , 3. , 3. , 5. ,
        5. , 3. , 4. , 5. , 3. , 3. , 5. , 5. , 4. , 4. , 2.5, 5. , 4.5,
        0.5, 4. , 2.5, 4. , 3. , 3. , 4. , 3. , 5. , 4.5, 4. , 4. , 3. ,
        3.5, 4. , 4. , 3. , 2. , 3. , 4. , 4. , 3. , 4. , 3.5, 5. , 5. ,
        2. , 3. , 4. , 4.5, 4. , 4. , 5. , 3.5, 4.5, 5. , 5. , 4. , 4. ,
        4. , 4. , 4. , 4. , 2. , 3.5, 5. , 4. , 5. , 3.5, 3. , 3. , 4. ,
        3.5, 5. , 3.5, 3.5, 5. , 3.5, 3. , 5. , 4. , 5. , 5. , 4. , 4.5,
        4.5, 4. , 4. , 2. , 5. , 5. , 5. , 4. , 5. , 4. , 4. , 3. , 4.5,
        4.5, 3. , 4.5, 4. , 4. , 4. , 3. , 2. , 5. , 4. , 3. , 3.5, 3.5,
        5. , 4. , 4. , 3.5, 4. , 4. , 4. , 5. , 5. , 4. , 5. , 5. , 4. ,
        5. , 5. , 3. , 3. , 4.5, 5. , 3.5, 4.5, 4. , 5. , 3. , 5. , 4. ,
        3.5, 5. , 2. , 4. , 4. , 4. , 2.5, 4. , 4. , 4.5, 4. , 5. , 5. ,
        5. , 5. , 4.5, 1.5, 4. , 4. , 4. , 5. , 4. , 4. , 4. , 3. , 4. ,
        4.5, 4.5, 3.5, 4. , 4. , 4. , 4. , 4. , 4. 

In [35]:
# Средняя оценка 
def rating_mean(rating_list):
    rating_mean = []
    for i in range(len(rating_list)):
        rating_mean.append(round(np.mean(rating_list[i]), 1))
    return rating_mean

In [36]:
# Медиана
def rating_median(rating_list):
    rating_median = []
    for i in range(len(rating_list)):
        rating_median.append(statistics.median(rating_list[i]))
    return rating_median

In [37]:
# Дисперсия
def rating_variance(rating_list):
    rating_var = []
    
    for i in range(len(rating_list)):
        if len(rating_list[i])<2: rating_var.append(0)
        else:
            rating_var.append(statistics.variance(rating_list[i]))
    return rating_var

In [38]:
user_rating_stat = pd.DataFrame({'userId': users,
#                                  'ratings': rating_list_user,
                             'rating_mean_us': rating_mean(rating_list_user),
                             'rating_median_us': rating_median(rating_list_user), 
                             'rating_variance_us': rating_variance(rating_list_user)})

user_rating_stat.head()

Unnamed: 0,userId,rating_mean_us,rating_median_us,rating_variance_us
0,1,4.4,5.0,0.640077
1,2,3.9,4.0,0.649015
2,3,2.4,0.5,4.370783
3,4,3.6,4.0,1.727132
4,5,3.6,4.0,0.980973


In [39]:
movie_rating_stat = pd.DataFrame({'movieId': movie_rat,
#                                   'ratings': rating_list_movie,
                             'rating_mean_mov': rating_mean(rating_list_movie),
                             'rating_median_mov': rating_median(rating_list_movie), 
                             'rating_variance_mov': rating_variance(rating_list_movie)})

movie_rating_stat.head()

Unnamed: 0,movieId,rating_mean_mov,rating_median_mov,rating_variance_mov
0,1,3.9,4.0,0.69699
1,2,3.4,3.5,0.777419
2,3,3.3,3.0,1.112651
3,4,2.4,3.0,0.72619
4,5,3.1,3.0,0.822917


In [40]:
ratings.rating.unique()

array([4. , 5. , 3. , 2. , 1. , 4.5, 3.5, 2.5, 0.5, 1.5])

In [41]:
movies.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 9742 entries, 0 to 9741
Data columns (total 3 columns):
 #   Column   Non-Null Count  Dtype 
---  ------   --------------  ----- 
 0   movieId  9742 non-null   int64 
 1   title    9742 non-null   object
 2   genres   9742 non-null   object
dtypes: int64(1), object(2)
memory usage: 228.5+ KB


In [42]:
ratings.movieId.unique().shape

(9724,)

In [43]:
mov_gens_tags_full = mov_gens_tags.merge(movie_rating_stat, how='inner', on='movieId')
mov_gens_tags_full

Unnamed: 0,movieId,title,genres,action_x,adventure_x,animation_x,children_x,comedy_x,crime_x,documentary_x,...,wrongfulimprisonment,wry,youngermen,zither,zoekazan,zombies,zooeydeschanel,rating_mean_mov,rating_median_mov,rating_variance_mov
0,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy,0.000000,0.416846,0.516225,0.504845,0.267586,0.000000,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,3.9,4.0,0.696990
1,2,Jumanji (1995),Adventure|Children|Fantasy,0.000000,0.512361,0.000000,0.620525,0.000000,0.000000,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,3.4,3.5,0.777419
2,3,Grumpier Old Men (1995),Comedy|Romance,0.000000,0.000000,0.000000,0.000000,0.570915,0.000000,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,3.3,3.0,1.112651
3,5,Father of the Bride Part II (1995),Comedy,0.000000,0.000000,0.000000,0.000000,1.000000,0.000000,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,3.1,3.0,0.822917
4,7,Sabrina (1995),Comedy|Romance,0.000000,0.000000,0.000000,0.000000,0.570915,0.000000,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,3.2,3.0,0.955625
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1551,183611,Game Night (2018),Action|Comedy|Crime|Horror,0.476923,0.000000,0.000000,0.000000,0.348475,0.552125,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,4.0,4.0,0.000000
1552,184471,Tomb Raider (2018),Action|Adventure|Fantasy,0.497828,0.566649,0.000000,0.000000,0.000000,0.000000,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,2.5,3.0,1.833333
1553,187593,Deadpool 2 (2018),Action|Comedy|Sci-Fi,0.572190,0.000000,0.000000,0.000000,0.418084,0.000000,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,3.9,4.0,1.505682
1554,187595,Solo: A Star Wars Story (2018),Action|Adventure|Children|Sci-Fi,0.418252,0.476072,0.000000,0.576575,0.000000,0.000000,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,3.9,4.0,0.550000


In [44]:
mov_gens_tags_rating_full = mov_gens_tags_full.merge(ratings, how='inner', on='movieId')
mov_gens_tags_rating_full

Unnamed: 0,movieId,title,genres,action_x,adventure_x,animation_x,children_x,comedy_x,crime_x,documentary_x,...,zither,zoekazan,zombies,zooeydeschanel,rating_mean_mov,rating_median_mov,rating_variance_mov,userId,rating,timestamp
0,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy,0.000000,0.416846,0.516225,0.504845,0.267586,0.0,0.0,...,0.0,0.0,0.0,0.0,3.9,4.0,0.69699,1,4.0,964982703
1,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy,0.000000,0.416846,0.516225,0.504845,0.267586,0.0,0.0,...,0.0,0.0,0.0,0.0,3.9,4.0,0.69699,5,4.0,847434962
2,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy,0.000000,0.416846,0.516225,0.504845,0.267586,0.0,0.0,...,0.0,0.0,0.0,0.0,3.9,4.0,0.69699,7,4.5,1106635946
3,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy,0.000000,0.416846,0.516225,0.504845,0.267586,0.0,0.0,...,0.0,0.0,0.0,0.0,3.9,4.0,0.69699,15,2.5,1510577970
4,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy,0.000000,0.416846,0.516225,0.504845,0.267586,0.0,0.0,...,0.0,0.0,0.0,0.0,3.9,4.0,0.69699,17,4.5,1305696483
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
48284,187595,Solo: A Star Wars Story (2018),Action|Adventure|Children|Sci-Fi,0.418252,0.476072,0.000000,0.576575,0.000000,0.0,0.0,...,0.0,0.0,0.0,0.0,3.9,4.0,0.55000,380,4.0,1536872728
48285,187595,Solo: A Star Wars Story (2018),Action|Adventure|Children|Sci-Fi,0.418252,0.476072,0.000000,0.576575,0.000000,0.0,0.0,...,0.0,0.0,0.0,0.0,3.9,4.0,0.55000,414,3.5,1527977920
48286,187595,Solo: A Star Wars Story (2018),Action|Adventure|Children|Sci-Fi,0.418252,0.476072,0.000000,0.576575,0.000000,0.0,0.0,...,0.0,0.0,0.0,0.0,3.9,4.0,0.55000,514,3.0,1537674946
48287,187595,Solo: A Star Wars Story (2018),Action|Adventure|Children|Sci-Fi,0.418252,0.476072,0.000000,0.576575,0.000000,0.0,0.0,...,0.0,0.0,0.0,0.0,3.9,4.0,0.55000,586,5.0,1529899556


In [45]:
# Конечный полный дата-сет
all_data_full = mov_gens_tags_rating_full.merge(user_rating_stat, how='inner', on='userId')
all_data_full

Unnamed: 0,movieId,title,genres,action_x,adventure_x,animation_x,children_x,comedy_x,crime_x,documentary_x,...,zooeydeschanel,rating_mean_mov,rating_median_mov,rating_variance_mov,userId,rating,timestamp,rating_mean_us,rating_median_us,rating_variance_us
0,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy,0.000000,0.416846,0.516225,0.504845,0.267586,0.000000,0.0,...,0.0,3.9,4.0,0.696990,1,4.0,964982703,4.4,5.0,0.640077
1,3,Grumpier Old Men (1995),Comedy|Romance,0.000000,0.000000,0.000000,0.000000,0.570915,0.000000,0.0,...,0.0,3.3,3.0,1.112651,1,4.0,964981247,4.4,5.0,0.640077
2,47,Seven (a.k.a. Se7en) (1995),Mystery|Thriller,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.0,...,0.0,4.0,4.0,0.850875,1,5.0,964983815,4.4,5.0,0.640077
3,50,"Usual Suspects, The (1995)",Crime|Mystery|Thriller,0.000000,0.000000,0.000000,0.000000,0.000000,0.553854,0.0,...,0.0,4.2,4.5,0.641475,1,5.0,964982931,4.4,5.0,0.640077
4,101,Bottle Rocket (1996),Adventure|Comedy|Crime|Romance,0.000000,0.550590,0.000000,0.000000,0.353441,0.559994,0.0,...,0.0,3.8,4.0,1.086957,1,5.0,964980868,4.4,5.0,0.640077
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
48284,174055,Dunkirk (2017),Action|Drama|Thriller|War,0.449869,0.000000,0.000000,0.000000,0.000000,0.000000,0.0,...,0.0,3.4,4.0,1.326923,184,4.0,1537109477,3.7,4.0,1.139897
48285,176371,Blade Runner 2049 (2017),Sci-Fi,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.0,...,0.0,3.8,4.0,1.092320,184,4.5,1537109683,3.7,4.0,1.139897
48286,179401,Jumanji: Welcome to the Jungle (2017),Action|Adventure|Children,0.488186,0.555673,0.000000,0.672980,0.000000,0.000000,0.0,...,0.0,3.7,4.0,0.366667,184,2.5,1537109900,3.7,4.0,1.139897
48287,180985,The Greatest Showman (2017),Drama,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.0,...,0.0,3.0,3.0,1.625000,184,3.0,1537109917,3.7,4.0,1.139897


### Предсказание оценки фильма


In [46]:
from sklearn.model_selection import train_test_split 

In [47]:
X = all_data_full.drop(['rating', 'title', 'genres'], axis=1)
y = all_data_full['rating']

In [48]:
# y
# X

In [49]:
from sklearn import preprocessing
le = preprocessing.LabelEncoder()

In [50]:
le.fit(y)
y_1 = le.fit_transform(y)

In [51]:
# set(y_1)
set(y)

{0.5, 1.0, 1.5, 2.0, 2.5, 3.0, 3.5, 4.0, 4.5, 5.0}

In [52]:
# Разделим данные
X_train, X_test, y_train, y_test = train_test_split(X, y_1, test_size=0.25, random_state=42)

In [71]:
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import classification_report

In [156]:
des_tree_class = DecisionTreeClassifier(max_depth=10, min_samples_leaf=1, min_samples_split=3, random_state=0)

In [157]:
des_tree_class.fit(X_train, y_train)

DecisionTreeClassifier(max_depth=10, min_samples_split=3, random_state=0)

In [158]:
des_tree_class.predict(X_train)

array([7, 7, 9, ..., 7, 5, 6], dtype=int64)

In [159]:
print(classification_report(y_test, des_tree_class.predict(X_test)))

              precision    recall  f1-score   support

           0       0.14      0.09      0.11        95
           1       0.21      0.09      0.13       248
           2       0.14      0.03      0.05       127
           3       0.26      0.06      0.10       604
           4       0.40      0.11      0.18       469
           5       0.35      0.43      0.38      2076
           6       0.32      0.20      0.25      1425
           7       0.38      0.58      0.46      3561
           8       0.31      0.15      0.20      1249
           9       0.48      0.44      0.46      2219

    accuracy                           0.38     12073
   macro avg       0.30      0.22      0.23     12073
weighted avg       0.36      0.38      0.35     12073



### RMSE

In [146]:
from sklearn.metrics import mean_squared_error
from math import sqrt

In [147]:
# RMSE для предсказания оценки фильма 
sqrt(mean_squared_error(y_test, des_tree_class.predict(X_test))) 

1.9109427373113153

Качество предсказания отсавляет желать лучшего

После проведенной оптимизации параметров дерева решения была получена максимальная точность 38%