# DZ Рекомендательные системы

## Задание

Использовать датасет MovieLens.

1. Построить рекомендации (регрессия, предсказываем оценку) на фичах:
- TF-IDF на тегах и жанрах;
- средние оценки (+ median, variance и т. д.) пользователя и фильма.
2. Оценить RMSE на тестовой выборке.

In [1]:
import pandas as pd
import numpy as np
from collections import Counter
from datetime import datetime
from tqdm.notebook import tqdm
import matplotlib.pyplot as plt
import statistics

In [2]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.neighbors import NearestNeighbors

In [3]:
# Data
links = pd.read_csv('links.csv')
movies = pd.read_csv('movies.csv')
ratings = pd.read_csv('ratings.csv')
tags = pd.read_csv('tags.csv')

In [4]:
movies.head()
# tags.head()
len(movies.movieId)

9742

### Жанры фильмов

In [5]:
# movies.genres[0].split('|')

In [6]:
# movies.genres.values[0]

In [7]:
def change_string(s):
    return ' '.join(s.replace(' ', '').replace('-', '').split('|'))

In [8]:
movie_genres = [change_string(g) for g in movies.genres.values]
movie_genres[:10]

['Adventure Animation Children Comedy Fantasy',
 'Adventure Children Fantasy',
 'Comedy Romance',
 'Comedy Drama Romance',
 'Comedy',
 'Action Crime Thriller',
 'Comedy Romance',
 'Adventure Children',
 'Action',
 'Action Adventure Thriller']

In [9]:
tfidf = TfidfVectorizer()
X_train_tfidf = tfidf.fit_transform(movie_genres)
X_train_tfidf

<9742x20 sparse matrix of type '<class 'numpy.float64'>'
	with 22084 stored elements in Compressed Sparse Row format>

In [10]:
# ?TfidfVectorizer

In [11]:
mov_gens = movies.merge(pd.DataFrame(X_train_tfidf.toarray(), columns=tfidf.get_feature_names()), left_index=True, right_index=True)

# mov_gens.title.unique().shape
mov_gens

Unnamed: 0,movieId,title,genres,action,adventure,animation,children,comedy,crime,documentary,...,horror,imax,musical,mystery,nogenreslisted,romance,scifi,thriller,war,western
0,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy,0.000000,0.416846,0.516225,0.504845,0.267586,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,0.0
1,2,Jumanji (1995),Adventure|Children|Fantasy,0.000000,0.512361,0.000000,0.620525,0.000000,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,0.0
2,3,Grumpier Old Men (1995),Comedy|Romance,0.000000,0.000000,0.000000,0.000000,0.570915,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.821009,0.0,0.0,0.0,0.0
3,4,Waiting to Exhale (1995),Comedy|Drama|Romance,0.000000,0.000000,0.000000,0.000000,0.505015,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.726241,0.0,0.0,0.0,0.0
4,5,Father of the Bride Part II (1995),Comedy,0.000000,0.000000,0.000000,0.000000,1.000000,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
9737,193581,Black Butler: Book of the Atlantic (2017),Action|Animation|Comedy|Fantasy,0.436010,0.000000,0.614603,0.000000,0.318581,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,0.0
9738,193583,No Game No Life: Zero (2017),Animation|Comedy|Fantasy,0.000000,0.000000,0.682937,0.000000,0.354002,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,0.0
9739,193585,Flint (2017),Drama,0.000000,0.000000,0.000000,0.000000,0.000000,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,0.0
9740,193587,Bungo Stray Dogs: Dead Apple (2018),Action|Animation,0.578606,0.000000,0.815607,0.000000,0.000000,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,0.0


In [12]:
neigh = NearestNeighbors(n_neighbors=7, metric='euclidean')
neigh.fit(X_train_tfidf)

NearestNeighbors(metric='euclidean', n_neighbors=7)

In [13]:
test = change_string("Adventure|Comedy|Fantasy|Documentary")

X_tfidf = tfidf.transform([test])

res = neigh.kneighbors(X_tfidf, return_distance=True)

In [14]:
res

(array([[0.38758619, 0.56525059, 0.58114553, 0.65617877, 0.65617877,
         0.66834044, 0.66834044]]),
 array([[8014, 5836, 8161, 4853, 7597, 5636, 5627]], dtype=int64))

In [15]:
movies.iloc[res[1][0]]

Unnamed: 0,movieId,title,genres
8014,97757,'Hellboy': The Seeds of Creation (2004),Action|Adventure|Comedy|Documentary|Fantasy
5836,32314,Incident at Loch Ness (2004),Adventure|Comedy|Documentary
8161,102590,Darkon (2006),Documentary|Fantasy
4853,7256,Touching the Void (2003),Adventure|Documentary
7597,86593,African Cats (2011),Adventure|Documentary
5636,27368,Asterix & Obelix: Mission Cleopatra (Astérix &...,Adventure|Comedy|Fantasy
5627,27251,"10th Kingdom, The (2000)",Adventure|Comedy|Fantasy


## Теги фильмов

In [16]:
movies_with_tags = movies.merge(tags, on='movieId')
movies_with_tags.head()

Unnamed: 0,movieId,title,genres,userId,tag,timestamp
0,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy,336,pixar,1139045764
1,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy,474,pixar,1137206825
2,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy,567,fun,1525286013
3,2,Jumanji (1995),Adventure|Children|Fantasy,62,fantasy,1528843929
4,2,Jumanji (1995),Adventure|Children|Fantasy,62,magic board game,1528843932


In [17]:
movies_with_tags.tag.unique()

array(['pixar', 'fun', 'fantasy', ..., 'star wars', 'gintama', 'remaster'],
      dtype=object)

In [18]:
movies_with_tags.dropna(inplace=True)
movies_with_tags.tag.unique().shape
movies_with_tags.title.unique().shape

(1572,)

In [19]:
movies_with_tags.movieId.unique().shape

(1572,)

In [20]:
def change_string(s):
    return str(s).replace(' ', '').replace('-', '').lower()

tag_strings = []
movies_ = []

for movie, group in tqdm(movies_with_tags.groupby('title')):
    tag_strings.append(' '.join([change_string(s) for s in group.tag.values]))
    movies_.append(movie)

  0%|          | 0/1572 [00:00<?, ?it/s]

In [21]:
type(movies_)

list

In [22]:
movies.loc[movies['title']=='eXistenZ (1999)']

Unnamed: 0,movieId,title,genres
1961,2600,eXistenZ (1999),Action|Sci-Fi|Thriller


In [23]:
tag_strings[:1]

['artistic funny humorous inspiring intelligent quirky romance zooeydeschanel']

In [24]:
tfidf_tag = TfidfVectorizer()
X_train_tfidf_tag = tfidf_tag.fit_transform(tag_strings)
X_train_tfidf_tag

<1572x1472 sparse matrix of type '<class 'numpy.float64'>'
	with 3598 stored elements in Compressed Sparse Row format>

In [36]:
mov_df = pd.DataFrame(movies_,  columns =['title'])
mov_df

Unnamed: 0,title
0,(500) Days of Summer (2009)
1,...And Justice for All (1979)
2,10 Cloverfield Lane (2016)
3,10 Things I Hate About You (1999)
4,101 Dalmatians (1996)
...,...
1567,Zero Dark Thirty (2012)
1568,Zombieland (2009)
1569,Zoolander (2001)
1570,Zulu (1964)


In [37]:
mov_tags = mov_df.merge(pd.DataFrame(X_train_tfidf_tag.toarray(), columns=tfidf_tag.get_feature_names()), left_index=True, right_index=True)
mov_tags

Unnamed: 0,title,06oscarnominatedbestmovieanimation,1900s,1920s,1950s,1960s,1970s,1980s,1990s,2001like,...,worldwari,worldwarii,writing,wrongfulimprisonment,wry,youngermen,zither,zoekazan,zombies,zooeydeschanel
0,(500) Days of Summer (2009),0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.420342
1,...And Justice for All (1979),0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.000000
2,10 Cloverfield Lane (2016),0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.000000
3,10 Things I Hate About You (1999),0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.000000
4,101 Dalmatians (1996),0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.000000
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1567,Zero Dark Thirty (2012),0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.000000
1568,Zombieland (2009),0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.000000
1569,Zoolander (2001),0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.000000
1570,Zulu (1964),0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.000000


In [38]:
neigh_tag = NearestNeighbors(n_neighbors=10, p=1)
neigh_tag.fit(X_train_tfidf_tag)

NearestNeighbors(n_neighbors=10, p=1)

In [39]:
test = 'highschool pixar fun'

X_tfidf_tag = tfidf_tag.transform([test])

res = neigh_tag.kneighbors(X_tfidf_tag, return_distance=True)

In [40]:
res

(array([[0.95532444, 1.49356676, 1.52685966, 1.70596435, 1.70596435,
         1.70596435, 1.70596435, 1.70596435, 1.70596435, 1.70596435]]),
 array([[1432,  211,  565,  940,  439,  590,  336,  951,  432,  546]],
       dtype=int64))

In [41]:
for i in res[1][0]:
    print(movies_[i], tag_strings[i])

Toy Story (1995) pixar pixar fun
Bug's Life, A (1998) pixar
Guardians of the Galaxy 2 (2017) fun
Napoleon Dynamite (2004) highschool
Ferris Bueller's Day Off (1986) highschool
Heathers (1989) highschool
Dead Poets Society (1989) highschool highschool
Never Been Kissed (1999) highschool
Fast Times at Ridgemont High (1982) highschool
Grease (1978) highschool


In [61]:
mov_gens_tags = mov_gens.merge(mov_tags, how='inner', on='title')
# mov_gens_tags.loc[mov_gens_tags['movieId']==2600]
mov_gens_tags.title.unique().shape
mov_gens_tags

Unnamed: 0,movieId,title,genres,action_x,adventure_x,animation_x,children_x,comedy_x,crime_x,documentary_x,...,worldwari,worldwarii,writing,wrongfulimprisonment,wry,youngermen,zither,zoekazan,zombies,zooeydeschanel
0,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy,0.000000,0.416846,0.516225,0.504845,0.267586,0.000000,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,2,Jumanji (1995),Adventure|Children|Fantasy,0.000000,0.512361,0.000000,0.620525,0.000000,0.000000,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,3,Grumpier Old Men (1995),Comedy|Romance,0.000000,0.000000,0.000000,0.000000,0.570915,0.000000,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,5,Father of the Bride Part II (1995),Comedy,0.000000,0.000000,0.000000,0.000000,1.000000,0.000000,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,7,Sabrina (1995),Comedy|Romance,0.000000,0.000000,0.000000,0.000000,0.570915,0.000000,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1569,183611,Game Night (2018),Action|Comedy|Crime|Horror,0.476923,0.000000,0.000000,0.000000,0.348475,0.552125,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1570,184471,Tomb Raider (2018),Action|Adventure|Fantasy,0.497828,0.566649,0.000000,0.000000,0.000000,0.000000,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1571,187593,Deadpool 2 (2018),Action|Comedy|Sci-Fi,0.572190,0.000000,0.000000,0.000000,0.418084,0.000000,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1572,187595,Solo: A Star Wars Story (2018),Action|Adventure|Children|Sci-Fi,0.418252,0.476072,0.000000,0.576575,0.000000,0.000000,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


### Средняя оценка (+median, variance) пользователя и фильма

In [44]:
ratings

Unnamed: 0,userId,movieId,rating,timestamp
0,1,1,4.0,964982703
1,1,3,4.0,964981247
2,1,6,4.0,964982224
3,1,47,5.0,964983815
4,1,50,5.0,964982931
...,...,...,...,...
100831,610,166534,4.0,1493848402
100832,610,168248,5.0,1493850091
100833,610,168250,5.0,1494273047
100834,610,168252,5.0,1493846352


In [45]:
ratings.userId.unique().shape

(610,)

In [46]:
rating_list_user = []
users = []

for user, group in tqdm(ratings.groupby('userId')):
    rating_list_user.append(group.rating.values)
    users.append(user)

  0%|          | 0/610 [00:00<?, ?it/s]

In [47]:
users
rating_list_user[:6]

[array([4., 4., 4., 5., 5., 3., 5., 4., 5., 5., 5., 5., 3., 5., 4., 5., 3.,
        3., 5., 4., 4., 5., 4., 3., 4., 5., 4., 3., 5., 4., 4., 5., 4., 4.,
        4., 5., 5., 3., 5., 3., 4., 3., 3., 4., 5., 5., 5., 4., 5., 3., 5.,
        5., 5., 5., 3., 5., 5., 4., 5., 4., 5., 5., 5., 4., 5., 5., 4., 5.,
        5., 5., 5., 5., 4., 5., 5., 4., 2., 5., 5., 5., 5., 5., 5., 3., 4.,
        5., 5., 5., 5., 5., 5., 4., 3., 3., 3., 3., 4., 4., 5., 4., 5., 3.,
        5., 5., 4., 5., 3., 3., 5., 4., 4., 5., 4., 4., 5., 5., 4., 4., 5.,
        4., 5., 4., 5., 4., 5., 4., 5., 5., 5., 3., 5., 4., 4., 4., 5., 5.,
        5., 5., 5., 4., 5., 4., 4., 2., 4., 4., 5., 5., 2., 5., 4., 5., 2.,
        5., 4., 3., 5., 4., 5., 5., 4., 4., 5., 3., 5., 5., 5., 5., 5., 4.,
        2., 4., 4., 5., 4., 4., 5., 3., 5., 5., 5., 5., 4., 4., 5., 5., 5.,
        4., 5., 5., 5., 5., 5., 4., 5., 5., 5., 4., 5., 5., 5., 5., 4., 5.,
        4., 1., 3., 3., 5., 5., 5., 4., 4., 5., 5., 5., 4., 4., 4., 5., 4.,
        4., 

In [48]:
rating_list_movie = []
movie_rat = []

for movie, group in tqdm(ratings.groupby('movieId')):
    rating_list_movie.append(group.rating.values)
    movie_rat.append(movie)

  0%|          | 0/9724 [00:00<?, ?it/s]

In [49]:
rating_list_movie[:6]
# movie_rat

[array([4. , 4. , 4.5, 2.5, 4.5, 3.5, 4. , 3.5, 3. , 5. , 3. , 3. , 5. ,
        5. , 3. , 4. , 5. , 3. , 3. , 5. , 5. , 4. , 4. , 2.5, 5. , 4.5,
        0.5, 4. , 2.5, 4. , 3. , 3. , 4. , 3. , 5. , 4.5, 4. , 4. , 3. ,
        3.5, 4. , 4. , 3. , 2. , 3. , 4. , 4. , 3. , 4. , 3.5, 5. , 5. ,
        2. , 3. , 4. , 4.5, 4. , 4. , 5. , 3.5, 4.5, 5. , 5. , 4. , 4. ,
        4. , 4. , 4. , 4. , 2. , 3.5, 5. , 4. , 5. , 3.5, 3. , 3. , 4. ,
        3.5, 5. , 3.5, 3.5, 5. , 3.5, 3. , 5. , 4. , 5. , 5. , 4. , 4.5,
        4.5, 4. , 4. , 2. , 5. , 5. , 5. , 4. , 5. , 4. , 4. , 3. , 4.5,
        4.5, 3. , 4.5, 4. , 4. , 4. , 3. , 2. , 5. , 4. , 3. , 3.5, 3.5,
        5. , 4. , 4. , 3.5, 4. , 4. , 4. , 5. , 5. , 4. , 5. , 5. , 4. ,
        5. , 5. , 3. , 3. , 4.5, 5. , 3.5, 4.5, 4. , 5. , 3. , 5. , 4. ,
        3.5, 5. , 2. , 4. , 4. , 4. , 2.5, 4. , 4. , 4.5, 4. , 5. , 5. ,
        5. , 5. , 4.5, 1.5, 4. , 4. , 4. , 5. , 4. , 4. , 4. , 3. , 4. ,
        4.5, 4.5, 3.5, 4. , 4. , 4. , 4. , 4. , 4. 

In [50]:
# Средняя оценка 
def rating_mean(rating_list):
    rating_mean = []
    for i in range(len(rating_list)):
        rating_mean.append(round(np.mean(rating_list[i]), 1))
    return rating_mean

In [51]:
# Медиана
def rating_median(rating_list):
    rating_median = []
    for i in range(len(rating_list)):
        rating_median.append(statistics.median(rating_list[i]))
    return rating_median

In [52]:
# Дисперсия
def rating_variance(rating_list):
    rating_var = []
    
    for i in range(len(rating_list)):
        if len(rating_list[i])<2: rating_var.append(0)
        else:
            rating_var.append(statistics.variance(rating_list[i]))
    return rating_var

In [53]:
user_rating_stat = pd.DataFrame({'userId': users,
                                 'ratings': rating_list_user,
                             'rating_mean_us': rating_mean(rating_list_user),
                             'rating_median_us': rating_median(rating_list_user), 
                             'rating_variance_us': rating_variance(rating_list_user)})

user_rating_stat.head()

Unnamed: 0,userId,ratings,rating_mean_us,rating_median_us,rating_variance_us
0,1,"[4.0, 4.0, 4.0, 5.0, 5.0, 3.0, 5.0, 4.0, 5.0, ...",4.4,5.0,0.640077
1,2,"[3.0, 4.0, 4.5, 4.0, 4.0, 3.5, 4.0, 4.0, 4.5, ...",3.9,4.0,0.649015
2,3,"[0.5, 0.5, 0.5, 0.5, 0.5, 5.0, 0.5, 0.5, 0.5, ...",2.4,0.5,4.370783
3,4,"[3.0, 2.0, 3.0, 2.0, 3.0, 3.0, 4.0, 5.0, 1.0, ...",3.6,4.0,1.727132
4,5,"[4.0, 4.0, 4.0, 4.0, 3.0, 4.0, 5.0, 4.0, 3.0, ...",3.6,4.0,0.980973


In [54]:
movie_rating_stat = pd.DataFrame({'movieId': movie_rat,
#                                   'ratings': rating_list_movie,
                             'rating_mean_mov': rating_mean(rating_list_movie),
                             'rating_median_mov': rating_median(rating_list_movie), 
                             'rating_variance_mov': rating_variance(rating_list_movie)})

movie_rating_stat.head()

Unnamed: 0,movieId,rating_mean_mov,rating_median_mov,rating_variance_mov
0,1,3.9,4.0,0.69699
1,2,3.4,3.5,0.777419
2,3,3.3,3.0,1.112651
3,4,2.4,3.0,0.72619
4,5,3.1,3.0,0.822917


In [55]:
ratings.rating.unique()

array([4. , 5. , 3. , 2. , 1. , 4.5, 3.5, 2.5, 0.5, 1.5])

In [56]:
movies.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 9742 entries, 0 to 9741
Data columns (total 3 columns):
 #   Column   Non-Null Count  Dtype 
---  ------   --------------  ----- 
 0   movieId  9742 non-null   int64 
 1   title    9742 non-null   object
 2   genres   9742 non-null   object
dtypes: int64(1), object(2)
memory usage: 228.5+ KB


In [57]:
ratings.movieId.unique().shape

(9724,)

In [70]:
mov_gens_tags_full = mov_gens_tags.merge(movie_rating_stat, how='inner', on='movieId')
mov_gens_tags_full

Unnamed: 0,movieId,title,genres,action_x,adventure_x,animation_x,children_x,comedy_x,crime_x,documentary_x,...,wrongfulimprisonment,wry,youngermen,zither,zoekazan,zombies,zooeydeschanel,rating_mean_mov,rating_median_mov,rating_variance_mov
0,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy,0.000000,0.416846,0.516225,0.504845,0.267586,0.000000,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,3.9,4.0,0.696990
1,2,Jumanji (1995),Adventure|Children|Fantasy,0.000000,0.512361,0.000000,0.620525,0.000000,0.000000,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,3.4,3.5,0.777419
2,3,Grumpier Old Men (1995),Comedy|Romance,0.000000,0.000000,0.000000,0.000000,0.570915,0.000000,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,3.3,3.0,1.112651
3,5,Father of the Bride Part II (1995),Comedy,0.000000,0.000000,0.000000,0.000000,1.000000,0.000000,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,3.1,3.0,0.822917
4,7,Sabrina (1995),Comedy|Romance,0.000000,0.000000,0.000000,0.000000,0.570915,0.000000,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,3.2,3.0,0.955625
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1551,183611,Game Night (2018),Action|Comedy|Crime|Horror,0.476923,0.000000,0.000000,0.000000,0.348475,0.552125,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,4.0,4.0,0.000000
1552,184471,Tomb Raider (2018),Action|Adventure|Fantasy,0.497828,0.566649,0.000000,0.000000,0.000000,0.000000,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,2.5,3.0,1.833333
1553,187593,Deadpool 2 (2018),Action|Comedy|Sci-Fi,0.572190,0.000000,0.000000,0.000000,0.418084,0.000000,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,3.9,4.0,1.505682
1554,187595,Solo: A Star Wars Story (2018),Action|Adventure|Children|Sci-Fi,0.418252,0.476072,0.000000,0.576575,0.000000,0.000000,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,3.9,4.0,0.550000


In [63]:
from sklearn.model_selection import train_test_split 

Предсказание по средней оценке фильма


In [66]:
mov_gens_tags.columns

Index(['movieId', 'title', 'genres', 'action_x', 'adventure_x', 'animation_x',
       'children_x', 'comedy_x', 'crime_x', 'documentary_x',
       ...
       'worldwari', 'worldwarii', 'writing', 'wrongfulimprisonment', 'wry',
       'youngermen', 'zither', 'zoekazan', 'zombies', 'zooeydeschanel'],
      dtype='object', length=1495)

In [84]:
X = mov_gens_tags_full.drop(['rating_mean_mov', 'title', 'genres'], axis=1)
y = mov_gens_tags_full['rating_mean_mov']

In [85]:
y
X

Unnamed: 0,movieId,action_x,adventure_x,animation_x,children_x,comedy_x,crime_x,documentary_x,drama_x,fantasy_x,...,writing,wrongfulimprisonment,wry,youngermen,zither,zoekazan,zombies,zooeydeschanel,rating_median_mov,rating_variance_mov
0,1,0.000000,0.416846,0.516225,0.504845,0.267586,0.000000,0.0,0.0,0.482990,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,4.0,0.696990
1,2,0.000000,0.512361,0.000000,0.620525,0.000000,0.000000,0.0,0.0,0.593662,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,3.5,0.777419
2,3,0.000000,0.000000,0.000000,0.000000,0.570915,0.000000,0.0,0.0,0.000000,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,3.0,1.112651
3,5,0.000000,0.000000,0.000000,0.000000,1.000000,0.000000,0.0,0.0,0.000000,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,3.0,0.822917
4,7,0.000000,0.000000,0.000000,0.000000,0.570915,0.000000,0.0,0.0,0.000000,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,3.0,0.955625
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1551,183611,0.476923,0.000000,0.000000,0.000000,0.348475,0.552125,0.0,0.0,0.000000,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,4.0,0.000000
1552,184471,0.497828,0.566649,0.000000,0.000000,0.000000,0.000000,0.0,0.0,0.656564,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,3.0,1.833333
1553,187593,0.572190,0.000000,0.000000,0.000000,0.418084,0.000000,0.0,0.0,0.000000,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,4.0,1.505682
1554,187595,0.418252,0.476072,0.000000,0.576575,0.000000,0.000000,0.0,0.0,0.000000,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,4.0,0.550000


In [88]:
from sklearn import preprocessing
le = preprocessing.LabelEncoder()


In [92]:
le.fit(y)
y_1 = le.fit_transform(y)

In [132]:
set(y_1)
set(y)

{0.5,
 1.0,
 1.4,
 1.5,
 1.7,
 1.8,
 1.9,
 2.0,
 2.1,
 2.2,
 2.3,
 2.4,
 2.5,
 2.6,
 2.7,
 2.8,
 2.9,
 3.0,
 3.1,
 3.2,
 3.3,
 3.4,
 3.5,
 3.6,
 3.7,
 3.8,
 3.9,
 4.0,
 4.1,
 4.2,
 4.3,
 4.4,
 4.5,
 4.6,
 4.7,
 4.8,
 5.0}

In [94]:
# Разделим данные
X_train, X_test, y_train, y_test = train_test_split(X, y_1, test_size=0.2, random_state=42)

In [121]:
from sklearn.svm import SVC
from sklearn.pipeline import make_pipeline
# используем пайплайны
from sklearn.preprocessing import StandardScaler
clf = make_pipeline(StandardScaler(), SVC(gamma='auto', class_weight='balanced'))
clf.fit(X_train, y_train)

Pipeline(steps=[('standardscaler', StandardScaler()),
                ('svc', SVC(class_weight='balanced', degree=20, gamma='auto'))])

In [122]:
clf.predict(X_test)

array([28, 18, 22, 33, 27, 27, 17, 28, 27, 35, 22, 28, 27, 28, 27, 33,  3,
       22, 28, 26, 31, 28, 22, 22, 33,  9, 31, 22, 26, 18, 33, 28, 33, 17,
        9, 33, 28, 27, 28, 28, 27, 28, 27, 22, 17, 27, 18, 27, 22, 33, 28,
       33, 22, 28,  6, 28, 28, 27, 33,  7, 14,  7, 36, 22, 33, 27, 26, 22,
       22, 22, 18, 33, 22, 18, 26, 22, 33, 33, 33, 26, 27, 33, 22, 22, 12,
       18, 21, 22, 28, 28, 28, 22, 27, 28, 19, 33, 33, 28,  5, 33,  7, 33,
       28, 22,  1, 22, 18, 22, 33, 33, 33, 33, 22, 35, 33,  9, 22, 22, 24,
       33, 22, 28, 18, 27, 27, 28, 27, 28, 26, 27, 18, 21, 27, 27, 12, 18,
       27, 22, 27, 28, 33, 22, 33, 27, 22, 22, 33, 22, 22, 18, 27, 22, 18,
       28, 27, 33,  1, 27, 22, 33, 33, 33, 27, 26, 27, 28, 27, 27, 18, 27,
       27, 22, 33, 28, 27, 27, 27, 33, 27, 22, 33, 28, 35, 33, 22, 22, 13,
       27, 33, 26,  9, 18, 21, 15, 28, 22,  6, 18, 27, 33,  8,  8, 22, 33,
       36,  9, 33,  7, 22, 27, 33, 28, 33, 27, 22, 22, 33, 27, 33,  7,  3,
       27, 13, 33, 27, 18

In [123]:
y_test

array([29, 18, 27, 24, 30, 24, 30, 24, 32, 25, 24, 27, 25, 25, 25, 27, 22,
       27, 27, 15, 27, 22, 22, 29, 21, 17, 29, 18, 28, 12, 27, 20, 28, 27,
       17, 19, 25, 32, 17, 16, 29, 21, 23, 17, 25, 25, 21, 26, 27, 29, 22,
       25, 24, 23, 28, 21, 17, 28, 22, 22, 16, 19, 19, 27, 27, 28, 16, 20,
       22, 15, 21, 26, 27, 25, 20, 24, 22, 28, 25, 28, 22, 17, 29,  8, 17,
       20, 20, 19, 19, 27, 22, 17, 22, 29, 25, 32, 20, 22, 25, 30, 26, 24,
       20, 27, 25, 18, 22, 20, 27, 23, 24, 28, 27, 29, 25, 18, 26, 22, 27,
       28, 24, 27, 15, 23, 25, 29, 18, 26, 22, 22, 25, 23, 25, 22, 22, 18,
       26, 17, 26, 21, 27, 27, 17, 27, 29, 35, 20, 18, 27,  8, 15, 25, 12,
       22, 22, 16, 25, 27, 23, 32, 36, 27, 21, 27, 28, 23, 31, 13, 27, 29,
       22, 27, 29, 21, 30, 25, 21, 19, 22, 27, 18, 21, 26, 30, 26, 22, 16,
       27, 26, 23,  9, 22, 22, 15, 28, 22,  4, 19, 22, 26, 20, 18, 27, 17,
       26, 16, 22, 28, 19, 29, 22, 26, 15, 25, 22, 27, 22, 24, 23, 25, 13,
       24, 22, 19, 29, 27

In [124]:
clf.score(X_train, y_train)

0.4565916398713826

### RMSE

In [133]:
from sklearn.metrics import mean_squared_error
from math import sqrt

In [134]:
# RMSE для предсказания средней оценки фильма 
sqrt(mean_squared_error(y_test, clf.predict(X_test))) 

7.785999927549118