In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

from tqdm import tqdm_notebook

links = pd.read_csv('links.csv')
movies = pd.read_csv('movies.csv')
ratings = pd.read_csv('ratings.csv')
tags = pd.read_csv('tags.csv')


In [2]:
from sklearn.feature_extraction.text import TfidfVectorizer

In [3]:
#для начала найдем userId активного пользователя

In [4]:
ratings.head()

Unnamed: 0,userId,movieId,rating,timestamp
0,1,1,4.0,964982703
1,1,3,4.0,964981247
2,1,6,4.0,964982224
3,1,47,5.0,964983815
4,1,50,5.0,964982931


In [5]:
gr=ratings.groupby('userId')[['rating']].sum()

In [6]:
gr.sort_values('rating', ascending=False).head(10)

Unnamed: 0_level_0,rating
userId,Unnamed: 1_level_1
414,9151.5
474,7165.0
599,6547.0
448,5307.5
610,4802.5
380,4474.0
274,4355.5
606,4078.0
68,4074.5
249,3866.5


In [7]:
#вользме пользователя с userId 474

In [7]:
ratings_user_474=ratings[ratings['userId']==474]

In [8]:
ratings_user_474.shape

(2108, 4)

In [9]:
ratings_user_474.head()

Unnamed: 0,userId,movieId,rating,timestamp
73092,474,1,4.0,978575760
73093,474,2,3.0,1046886814
73094,474,5,1.5,1053021982
73095,474,6,3.0,1047054565
73096,474,7,3.0,978576381


In [10]:
tags.head()

Unnamed: 0,userId,movieId,tag,timestamp
0,2,60756,funny,1445714994
1,2,60756,Highly quotable,1445714996
2,2,60756,will ferrell,1445714992
3,2,89774,Boxing story,1445715207
4,2,89774,MMA,1445715200


In [12]:
#ratings_user_474_add_tags=ratings_user_474.join(tags.set_index('userId'), on='userId')

In [14]:
#объединяем  ratings_user_474 и tags

In [13]:
r_t=pd.merge(ratings_user_474, tags , on=['movieId', 'userId'])

In [15]:
r_t.shape

(1414, 6)

In [16]:
r_t.head()

Unnamed: 0,userId,movieId,rating,timestamp_x,tag,timestamp_y
0,474,1,4.0,978575760,pixar,1137206825
1,474,2,3.0,1046886814,game,1137375552
2,474,5,1.5,1053021982,pregnancy,1137373903
3,474,5,1.5,1053021982,remake,1137373903
4,474,7,3.0,978576381,remake,1137375642


In [17]:
#объединяем  r_t и movies

In [18]:
r_t_m=pd.merge(r_t, movies , on='movieId')

In [19]:
r_t_m.head()

Unnamed: 0,userId,movieId,rating,timestamp_x,tag,timestamp_y,title,genres
0,474,1,4.0,978575760,pixar,1137206825,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy
1,474,2,3.0,1046886814,game,1137375552,Jumanji (1995),Adventure|Children|Fantasy
2,474,5,1.5,1053021982,pregnancy,1137373903,Father of the Bride Part II (1995),Comedy
3,474,5,1.5,1053021982,remake,1137373903,Father of the Bride Part II (1995),Comedy
4,474,7,3.0,978576381,remake,1137375642,Sabrina (1995),Comedy|Romance


In [20]:
r_t_m.shape

(1414, 8)

In [23]:
def change_string(s):
    return ' '.join(s.replace(' ', '').replace('-', '').split('|'))

In [21]:
#очищаем текст в жанрах

In [24]:
movie_genres = [change_string(g) for g in r_t_m.genres.values]

In [25]:
movie_genres[:10]

['Adventure Animation Children Comedy Fantasy',
 'Adventure Children Fantasy',
 'Comedy',
 'Comedy',
 'Comedy Romance',
 'Comedy Drama Romance',
 'Comedy Drama Romance',
 'Drama',
 'Drama',
 'Crime Drama']

In [26]:
#создаем векторы для movie_genres

In [27]:
tfidf_vectorizer = TfidfVectorizer()
X_train_tfidf = tfidf_vectorizer.fit_transform(movie_genres)

In [28]:
tfidf_vectorizer.vocabulary_

{'adventure': 1,
 'animation': 2,
 'children': 3,
 'comedy': 4,
 'fantasy': 8,
 'romance': 14,
 'drama': 7,
 'crime': 5,
 'thriller': 16,
 'horror': 10,
 'mystery': 13,
 'scifi': 15,
 'war': 17,
 'musical': 12,
 'action': 0,
 'documentary': 6,
 'imax': 11,
 'western': 18,
 'filmnoir': 9}

In [29]:
X_train_tfidf.toarray()

array([[0.        , 0.39629945, 0.54335279, ..., 0.        , 0.        ,
        0.        ],
       [0.        , 0.49995786, 0.        , ..., 0.        , 0.        ,
        0.        ],
       [0.        , 0.        , 0.        , ..., 0.        , 0.        ,
        0.        ],
       ...,
       [0.58696084, 0.        , 0.        , ..., 0.50856994, 0.        ,
        0.        ],
       [0.        , 0.        , 0.        , ..., 0.        , 0.        ,
        0.        ],
       [0.        , 0.        , 0.        , ..., 0.        , 0.        ,
        0.        ]])

In [30]:
X_train_tfidf.shape

(1414, 19)

In [31]:
#очищаем текст в тегах

In [32]:
movie_tags = [change_string(g) for g in r_t_m.tag.values]

In [33]:
movie_tags

['pixar',
 'game',
 'pregnancy',
 'remake',
 'remake',
 'politics',
 'president',
 'politics',
 'president',
 'Mafia',
 'JaneAusten',
 'Hollywood',
 'serialkiller',
 'alcoholism',
 'Shakespeare',
 'InNetflixqueue',
 'JaneAusten',
 'kidnapping',
 'highschool',
 'teacher',
 'timetravel',
 'Animalmovie',
 'pigs',
 'deathpenalty',
 'Nun',
 'twins',
 'Emma',
 'JaneAusten',
 'Shakespeare',
 'England',
 'Journalism',
 'wedding',
 'serialkiller',
 'heist',
 'adoption',
 'prostitution',
 'writing',
 'music',
 'JekyllandHyde',
 'theater',
 'crime',
 'golf',
 'muppets',
 'Scotland',
 'assassination',
 'Holocaust',
 'dating',
 'journalism',
 'moon',
 'NASA',
 'space',
 'superhero',
 'MichaelCrichton',
 'submarine',
 'InNetflixqueue',
 'computers',
 'Mademecry',
 'generationX',
 'school',
 'Ireland',
 'generationX',
 'mentalillness',
 'psychology',
 'StephenKing',
 'InNetflixqueue',
 'moviebusiness',
 'basketball',
 'France',
 'infertility',
 'basketball',
 'Australia',
 'Beethoven',
 'Einstein',
 

In [34]:
#создаем векторы для movie_tags

In [35]:
tfidf_vectorizer_tag = TfidfVectorizer()
X_train_tfidf_tag = tfidf_vectorizer_tag.fit_transform(movie_tags)

In [37]:
tfidf_vectorizer_tag.vocabulary_

{'pixar': 374,
 'game': 192,
 'pregnancy': 382,
 'remake': 410,
 'politics': 378,
 'president': 385,
 'mafia': 292,
 'janeausten': 260,
 'hollywood': 228,
 'serialkiller': 441,
 'alcoholism': 20,
 'shakespeare': 444,
 'innetflixqueue': 249,
 'kidnapping': 273,
 'highschool': 223,
 'teacher': 495,
 'timetravel': 507,
 'animalmovie': 30,
 'pigs': 373,
 'deathpenalty': 135,
 'nun': 350,
 'twins': 524,
 'emma': 163,
 'england': 165,
 'journalism': 269,
 'wedding': 544,
 'heist': 219,
 'adoption': 13,
 'prostitution': 391,
 'writing': 558,
 'music': 333,
 'jekyllandhyde': 265,
 'theater': 504,
 'crime': 124,
 'golf': 203,
 'muppets': 331,
 'scotland': 437,
 'assassination': 41,
 'holocaust': 229,
 'dating': 130,
 'moon': 319,
 'nasa': 337,
 'space': 466,
 'superhero': 491,
 'michaelcrichton': 309,
 'submarine': 486,
 'computers': 116,
 'mademecry': 291,
 'generationx': 196,
 'school': 436,
 'ireland': 255,
 'mentalillness': 305,
 'psychology': 393,
 'stephenking': 483,
 'moviebusiness': 325

In [38]:
X_train_tfidf_tag.toarray()

array([[0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       ...,
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.]])

In [39]:
X_train_tfidf_tag.shape

(1414, 563)

In [46]:
# Конкотерируем векторное предстовление movie_tags и movie_genres

In [43]:
df=np.concatenate((X_train_tfidf_tag.toarray(), X_train_tfidf.toarray()),axis=1)

In [44]:
df.shape

(1414, 582)

In [47]:
# оперделяем таргет

In [48]:
target=r_t_m['rating']

In [49]:
target.shape

(1414,)

In [None]:
# Разбиваем на учебную и тестовую выборку

In [54]:
from sklearn.model_selection import train_test_split

In [56]:
X_train, X_test, y_train, y_test = train_test_split(df, target, test_size=0.2, random_state=43)

In [57]:
X_train.shape

(1131, 582)

In [58]:
y_train.shape

(1131,)

In [59]:
#обучаем модель

In [60]:
from sklearn.neighbors import KNeighborsRegressor

In [61]:
knn=KNeighborsRegressor()

In [62]:
knn.fit(X_train, y_train)

KNeighborsRegressor(algorithm='auto', leaf_size=30, metric='minkowski',
                    metric_params=None, n_jobs=None, n_neighbors=5, p=2,
                    weights='uniform')

In [63]:
y_pred=knn.predict(X_test)

In [64]:
y_pred

array([3.4, 3.3, 4.2, 3.8, 3.9, 3.9, 3.6, 3. , 3.7, 3. , 4.1, 4.1, 4.1,
       4. , 3.3, 3.1, 3.7, 3.9, 4. , 4.2, 2.2, 3.1, 4. , 3.6, 4. , 3.8,
       3.9, 3.9, 4.7, 4.2, 3.7, 4.4, 4.1, 3.2, 4.2, 3.1, 4.1, 3.9, 3.9,
       3.5, 3.6, 3.9, 3.7, 4.1, 3.6, 4.2, 3.9, 3.6, 3.3, 4.1, 3.7, 3.6,
       3.7, 3.7, 4. , 3.3, 4.5, 4.3, 3.6, 3.5, 3.9, 4.2, 4.1, 4. , 3.4,
       4.4, 2.7, 4.2, 3.2, 4.1, 4.2, 4.5, 3.8, 1.9, 3.8, 4. , 4. , 3.9,
       1.5, 3.9, 3.9, 3.7, 3.9, 4.4, 3.6, 4.2, 3.8, 3.9, 4. , 4.5, 3.8,
       3.9, 4.1, 4.3, 4.1, 4.1, 3.6, 3.5, 3.9, 3.8, 3.9, 4. , 4. , 3.6,
       3.4, 3.9, 3.9, 4. , 4.2, 3.6, 3.9, 3.8, 3.9, 3.2, 3.2, 3.6, 4.2,
       3.5, 4.7, 3.6, 3.2, 3.7, 4.4, 3.8, 3.9, 4.1, 4.1, 4.1, 2.8, 4. ,
       4.7, 3.6, 3.4, 4.1, 3.8, 3.9, 3.7, 3. , 3.8, 3.7, 3.8, 3.1, 3.5,
       4.1, 3.8, 3.4, 4.1, 3.8, 2.8, 3.8, 3.9, 3.9, 3.8, 4.2, 3.5, 4.1,
       3.6, 4.2, 3.6, 3.9, 4. , 3.8, 4.2, 3. , 4.2, 3.4, 4.5, 3.9, 1.5,
       2.8, 4. , 3.9, 2.5, 3.7, 3.9, 4. , 4. , 3.9, 3.1, 3.8, 3.

In [68]:
from sklearn.metrics import mean_squared_error

In [72]:
mean_squared_error(y_test, y_pred)

0.6559010600706714