In [1]:
import pandas as pd
import numpy as np
import seaborn as sns
from scipy import stats
from ast import literal_eval
from surprise import Reader, Dataset, SVD
from surprise.model_selection import cross_validate
from surprise import accuracy
from surprise.model_selection import train_test_split


import warnings
warnings.filterwarnings('ignore')

# Загрузка данных

In [5]:
md = pd.read_csv("data/movies_metadata.csv")
md.head()

Unnamed: 0,adult,belongs_to_collection,budget,genres,homepage,id,imdb_id,original_language,original_title,overview,...,release_date,revenue,runtime,spoken_languages,status,tagline,title,video,vote_average,vote_count
0,False,"{'id': 10194, 'name': 'Toy Story Collection', ...",30000000,"[{'id': 16, 'name': 'Animation'}, {'id': 35, '...",http://toystory.disney.com/toy-story,862,tt0114709,en,Toy Story,"Led by Woody, Andy's toys live happily in his ...",...,1995-10-30,373554033.0,81.0,"[{'iso_639_1': 'en', 'name': 'English'}]",Released,,Toy Story,False,7.7,5415.0
1,False,,65000000,"[{'id': 12, 'name': 'Adventure'}, {'id': 14, '...",,8844,tt0113497,en,Jumanji,When siblings Judy and Peter discover an encha...,...,1995-12-15,262797249.0,104.0,"[{'iso_639_1': 'en', 'name': 'English'}, {'iso...",Released,Roll the dice and unleash the excitement!,Jumanji,False,6.9,2413.0
2,False,"{'id': 119050, 'name': 'Grumpy Old Men Collect...",0,"[{'id': 10749, 'name': 'Romance'}, {'id': 35, ...",,15602,tt0113228,en,Grumpier Old Men,A family wedding reignites the ancient feud be...,...,1995-12-22,0.0,101.0,"[{'iso_639_1': 'en', 'name': 'English'}]",Released,Still Yelling. Still Fighting. Still Ready for...,Grumpier Old Men,False,6.5,92.0
3,False,,16000000,"[{'id': 35, 'name': 'Comedy'}, {'id': 18, 'nam...",,31357,tt0114885,en,Waiting to Exhale,"Cheated on, mistreated and stepped on, the wom...",...,1995-12-22,81452156.0,127.0,"[{'iso_639_1': 'en', 'name': 'English'}]",Released,Friends are the people who let you be yourself...,Waiting to Exhale,False,6.1,34.0
4,False,"{'id': 96871, 'name': 'Father of the Bride Col...",0,"[{'id': 35, 'name': 'Comedy'}]",,11862,tt0113041,en,Father of the Bride Part II,Just when George Banks has recovered from his ...,...,1995-02-10,76578911.0,106.0,"[{'iso_639_1': 'en', 'name': 'English'}]",Released,Just When His World Is Back To Normal... He's ...,Father of the Bride Part II,False,5.7,173.0


In [6]:
md.isnull().sum()

adult                        0
belongs_to_collection    40972
budget                       0
genres                       0
homepage                 37684
id                           0
imdb_id                     17
original_language           11
original_title               0
overview                   954
popularity                   5
poster_path                386
production_companies         3
production_countries         3
release_date                87
revenue                      6
runtime                    263
spoken_languages             6
status                      87
tagline                  25054
title                        6
video                        6
vote_average                 6
vote_count                   6
dtype: int64

# Cleaning the Dataset

In [7]:
md['genres'] = md['genres'].fillna('[]').apply(literal_eval).apply(lambda x: [i['name'] for i in x] if isinstance(x, list) else [])

md['production_companies']= md['production_companies'].fillna('[]').apply(literal_eval).apply(lambda x: [i['name'] for i in x] if isinstance(x, list) else [])

md['production_countries'] = md['production_countries'].fillna('[]').apply(literal_eval).apply(lambda x: [i['name'] for i in x] if isinstance(x, list) else [])

md['spoken_languages'] = md['spoken_languages'].fillna('[]').apply(literal_eval).apply(lambda x: [i['name'] for i in x] if isinstance(x, list) else [])

md['year'] = pd.to_datetime(md['release_date'], errors='coerce').apply(lambda x: str(x).split('-')[0] if x != np.nan else np.nan)


In [8]:
md.head()

Unnamed: 0,adult,belongs_to_collection,budget,genres,homepage,id,imdb_id,original_language,original_title,overview,...,revenue,runtime,spoken_languages,status,tagline,title,video,vote_average,vote_count,year
0,False,"{'id': 10194, 'name': 'Toy Story Collection', ...",30000000,"[Animation, Comedy, Family]",http://toystory.disney.com/toy-story,862,tt0114709,en,Toy Story,"Led by Woody, Andy's toys live happily in his ...",...,373554033.0,81.0,[English],Released,,Toy Story,False,7.7,5415.0,1995
1,False,,65000000,"[Adventure, Fantasy, Family]",,8844,tt0113497,en,Jumanji,When siblings Judy and Peter discover an encha...,...,262797249.0,104.0,"[English, Français]",Released,Roll the dice and unleash the excitement!,Jumanji,False,6.9,2413.0,1995
2,False,"{'id': 119050, 'name': 'Grumpy Old Men Collect...",0,"[Romance, Comedy]",,15602,tt0113228,en,Grumpier Old Men,A family wedding reignites the ancient feud be...,...,0.0,101.0,[English],Released,Still Yelling. Still Fighting. Still Ready for...,Grumpier Old Men,False,6.5,92.0,1995
3,False,,16000000,"[Comedy, Drama, Romance]",,31357,tt0114885,en,Waiting to Exhale,"Cheated on, mistreated and stepped on, the wom...",...,81452156.0,127.0,[English],Released,Friends are the people who let you be yourself...,Waiting to Exhale,False,6.1,34.0,1995
4,False,"{'id': 96871, 'name': 'Father of the Bride Col...",0,[Comedy],,11862,tt0113041,en,Father of the Bride Part II,Just when George Banks has recovered from his ...,...,76578911.0,106.0,[English],Released,Just When His World Is Back To Normal... He's ...,Father of the Bride Part II,False,5.7,173.0,1995


# Collaborative Filtering

In [10]:
reader = Reader()
ratings = pd.read_csv('data/train.csv')
ratings.head()

Unnamed: 0,userId,movieId,rating
0,0,527,4.0
1,0,778,3.0
2,0,1060,3.0
3,0,1097,4.0
4,0,1985,3.0


In [11]:
ratings

Unnamed: 0,userId,movieId,rating
0,0,527,4.0
1,0,778,3.0
2,0,1060,3.0
3,0,1097,4.0
4,0,1985,3.0
...,...,...,...
93544,998,3100,3.0
93545,998,3826,4.0
93546,998,3969,2.0
93547,999,318,5.0


In [12]:
data = Dataset.load_from_df(ratings[['userId', 'movieId', 'rating']], reader)
svd = SVD()
cross_validate(svd, data, measures=['RMSE', 'MAE'],cv=5)

{'test_rmse': array([0.89017417, 0.88626176, 0.88388671, 0.89136533, 0.88005709]),
 'test_mae': array([0.6828109 , 0.67815045, 0.67799926, 0.68230927, 0.67618183]),
 'fit_time': (2.2159249782562256,
  1.6589713096618652,
  1.513256549835205,
  1.5221121311187744,
  1.481590986251831),
 'test_time': (0.14160394668579102,
  0.1685469150543213,
  0.1683521270751953,
  0.15624570846557617,
  0.32079315185546875)}

In [13]:
trainset = data.build_full_trainset()
svd.fit(trainset)

<surprise.prediction_algorithms.matrix_factorization.SVD at 0x7f23003838d0>

In [14]:
md = md.drop([19730, 29503, 35587])
md['id'] = md['id'].astype('int')

In [16]:
# test_ratings = pd.read_csv('data/test.csv')

# def replace_id_column(dataset):
#     dataset["id"] = dataset["id"].apply(lambda x: x.split("_", 1))
#     dataset[["userId", "movieId"]] = dataset["id"].apply(pd.Series)
#     dataset = dataset.drop("id", axis=1)
#     return dataset

In [22]:
test_ratings = pd.read_csv('data/test_ratings.csv')

In [33]:
svd.predict(test_ratings)

TypeError: AlgoBase.predict() missing 1 required positional argument: 'iid'

In [31]:
Dataset.load_from_df(test_ratings[['userId', 'movieId']], reader)

ValueError: not enough values to unpack (expected 3, got 2)

In [35]:
user_id = 89
user_ratings[user_ratings['userId'] == user_id]

NameError: name 'user_ratings' is not defined

In [None]:
movie=md['original_title']=='The Green Mile'
md[movie][['original_title','id']]

Unnamed: 0,original_title,id
3030,The Green Mile,497


In [None]:
movie_id = md[movie][['id']].values[0][0]

In [34]:
svd.predict(1000, 110)

Prediction(uid=1000, iid=110, r_ui=None, est=4.097246099204283, details={'was_impossible': False})