In [1]:
from tqdm import tqdm_notebook

from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.neighbors import NearestNeighbors

import pandas as pd
import numpy as np
import random

In [2]:
#Загружаем данные

In [3]:
links = pd.read_csv('links.csv')
movies = pd.read_csv('movies.csv')
ratings = pd.read_csv('ratings.csv')
tags = pd.read_csv('tags.csv')

In [4]:
movies.head()

Unnamed: 0,movieId,title,genres
0,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy
1,2,Jumanji (1995),Adventure|Children|Fantasy
2,3,Grumpier Old Men (1995),Comedy|Romance
3,4,Waiting to Exhale (1995),Comedy|Drama|Romance
4,5,Father of the Bride Part II (1995),Comedy


In [5]:
tags.head()

Unnamed: 0,userId,movieId,tag,timestamp
0,2,60756,funny,1445714994
1,2,60756,Highly quotable,1445714996
2,2,60756,will ferrell,1445714992
3,2,89774,Boxing story,1445715207
4,2,89774,MMA,1445715200


In [6]:
#Соединяем ratings и tags

In [7]:
r_t=pd.merge(ratings, tags , on=['movieId', 'userId'])

In [8]:
#Соединяем r_t и movies

In [9]:
r_t_m=pd.merge(r_t, movies , on='movieId')

In [10]:
r_t_m.head()

Unnamed: 0,userId,movieId,rating,timestamp_x,tag,timestamp_y,title,genres
0,2,60756,5.0,1445714980,funny,1445714994,Step Brothers (2008),Comedy
1,2,60756,5.0,1445714980,Highly quotable,1445714996,Step Brothers (2008),Comedy
2,2,60756,5.0,1445714980,will ferrell,1445714992,Step Brothers (2008),Comedy
3,62,60756,3.5,1528934376,comedy,1528934384,Step Brothers (2008),Comedy
4,62,60756,3.5,1528934376,funny,1528934381,Step Brothers (2008),Comedy


In [11]:
r_t_m.shape

(3476, 8)

In [12]:
#Чистим жанры и теги

In [13]:
def change_string(s):
    return ' '.join(s.replace(' ', '').replace('-', '').split('|'))

In [14]:
movie_genres = [change_string(g) for g in r_t_m.genres.values]

In [15]:
len(movie_genres)

3476

In [16]:
movie_tags = [change_string(g) for g in r_t_m.tag.values]

In [17]:
movie_genres[0]

'Comedy'

In [18]:
#Делаем tfidf векторизацию

In [19]:
tfidf_vectorizer_m = TfidfVectorizer()
tfidf_m = tfidf_vectorizer_m.fit_transform(movie_genres)

In [20]:
tfidf_vectorizer_t = TfidfVectorizer()
tfidf_t = tfidf_vectorizer_t.fit_transform(movie_tags)

In [21]:
tfidf_m.shape

(3476, 20)

In [22]:
tfidf_t.shape

(3476, 1436)

In [23]:
a=tfidf_t.toarray()

In [24]:
b=tfidf_m.toarray()

In [25]:
#Конкатенируем векторы тегов с векторами жанров

In [26]:
tfidf_vectorizer_con=np.concatenate((a,b),axis=1)

In [27]:
# Разбиваем на учебную и тестовую выборку

In [28]:
target=r_t_m['rating']
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(tfidf_vectorizer_con, target, test_size=0.2, random_state=43)

In [29]:
#обучаем модель

In [30]:
from sklearn.neighbors import KNeighborsRegressor
knn=KNeighborsRegressor()
knn.fit(X_train, y_train)

KNeighborsRegressor(algorithm='auto', leaf_size=30, metric='minkowski',
                    metric_params=None, n_jobs=None, n_neighbors=5, p=2,
                    weights='uniform')

In [31]:
#через NearestNeighbors находим 20 ближайших соседей

In [32]:
neigh = NearestNeighbors(n_neighbors=20, n_jobs=-1, metric='euclidean') 
neigh.fit(tfidf_vectorizer_con)

NearestNeighbors(algorithm='auto', leaf_size=30, metric='euclidean',
                 metric_params=None, n_jobs=-1, n_neighbors=20, p=2,
                 radius=1.0)

In [33]:
#Создаем словарь movieId->title

In [34]:
title_movies = {}

for index, row in tqdm_notebook(movies.iterrows()):
    title_movies[row.movieId] = row.title

HBox(children=(IntProgress(value=1, bar_style='info', max=1), HTML(value='')))




In [36]:
#Создаем словарь title->genres

In [37]:
title_genres = {}

for index, row in tqdm_notebook(movies.iterrows()):
    title_genres[row.title] = row.genres

HBox(children=(IntProgress(value=1, bar_style='info', max=1), HTML(value='')))




In [38]:
#title_movies[1]

In [39]:
#Создаем словарь movieId->tag

In [40]:
titile_tags={}
for index, row in tqdm_notebook(tags.iterrows()):
    titile_tags[row.movieId] = row.tag

HBox(children=(IntProgress(value=1, bar_style='info', max=1), HTML(value='')))




In [41]:
#Создаем функцию которая на вход принимает жанры фильмов(не обработанные прямо из дс) и теги(из дс), обрабатываем теги и жанры
#функцие change_string и делаем tfidf векторизацию и конкатенируем векторы, возвращая finish_vec

In [42]:
def tfidf_vec(movie_genres, movie_tags):
    movie_genres=change_string(movie_genres)
    movie_tags=change_string(movie_tags)
    vector_genres=tfidf_vectorizer_m.transform([movie_genres])
    vector_tag=tfidf_vectorizer_t.transform([movie_tags])
    finish_vec=np.concatenate((vector_genres.toarray(),vector_tag.toarray()),axis=1)
    return finish_vec

In [43]:
#функция принимает user_id и выдает фильм с оценкой, фильм по которому ищутся близкие фильмы выбирается случайно из оцененых
#пользователем

In [85]:
def recommend_for_user(user_id):
    current_user_id = user_id
    random_film=random.choice(r_t_m[r_t_m.userId == current_user_id].movieId.unique())
    target_movie=title_movies[random_film]
    movie_genres = title_genres[target_movie]
    movie_tags=titile_tags[random_film]
    
    movie_genres=change_string(movie_genres)
    movie_tags=change_string(movie_tags)
    
    user_movies = r_t_m[r_t_m.userId == current_user_id].movieId.unique()
    
    res = neigh.kneighbors(tfidf_vec(movie_genres, movie_tags), return_distance=True)
    
    movies_to_score = res[1][0]

    scores = []
    titles = []

    for movie in movies_to_score:
        if movie in user_movies:
            continue
        #делаю исплючение, т.к не у всех фильмов есть теги, мы их не оцениваем
        #можно сделать для таких фильмов нуливой вектор tfidf по тегам, тогда оценка будет для всех фильмов
        try:
            
            scores.append(knn.predict(tfidf_vec(title_genres[title_movies[movie]],titile_tags[movie])))
            titles.append(title_movies[movie])
        except KeyError:
            pass
    for i in range(0, len(titles)):
        print(titles[i], scores[i][0])
        
    
    dic={'title': titles, 'scores': [x[0] for x in scores]}
    sc={'scores': [x[0] for x in scores]}
    df=pd.DataFrame(data=dic)
    return df.sort_values('scores', ascending=False).head(10)

In [90]:
recommend_for_user(62)

Never Been Kissed (1999) 3.8
Honey, I Shrunk the Kids (1989) 3.7
Superman II (1980) 4.1
Invasion of the Body Snatchers (1956) 3.7
Superman III (1983) 4.0


Unnamed: 0,title,scores
2,Superman II (1980),4.1
4,Superman III (1983),4.0
0,Never Been Kissed (1999),3.8
1,"Honey, I Shrunk the Kids (1989)",3.7
3,Invasion of the Body Snatchers (1956),3.7
