In [1]:
import pandas as pd
import numpy as np

from matplotlib import pyplot as plt
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.neighbors import NearestNeighbors
from sklearn.model_selection import train_test_split, cross_val_score, GridSearchCV
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error
from sklearn.preprocessing import StandardScaler

%matplotlib inline

In [2]:
movies = pd.read_csv('../1. Вводное занятие/movies.csv')
ratings = pd.read_csv('../1. Вводное занятие/ratings.csv')
tags = pd.read_csv('../1. Вводное занятие/tags.csv')

In [3]:
movies.head()

Unnamed: 0,movieId,title,genres
0,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy
1,2,Jumanji (1995),Adventure|Children|Fantasy
2,3,Grumpier Old Men (1995),Comedy|Romance
3,4,Waiting to Exhale (1995),Comedy|Drama|Romance
4,5,Father of the Bride Part II (1995),Comedy


In [4]:
ratings.head()

Unnamed: 0,userId,movieId,rating,timestamp
0,1,1,4.0,964982703
1,1,3,4.0,964981247
2,1,6,4.0,964982224
3,1,47,5.0,964983815
4,1,50,5.0,964982931


In [5]:
tags.head()

Unnamed: 0,userId,movieId,tag,timestamp
0,2,60756,funny,1445714994
1,2,60756,Highly quotable,1445714996
2,2,60756,will ferrell,1445714992
3,2,89774,Boxing story,1445715207
4,2,89774,MMA,1445715200


In [6]:
# датасет, который будем использовать
data = ratings.drop('timestamp', axis = 1)

### TF-IDF

In [7]:
def change_string(s):
    return ' '.join(s.replace(' ', '').replace('-', '').split('|'))

In [8]:
movie_genres = [change_string(g) for g in movies.genres.values]

In [9]:
tfidf_vect = TfidfVectorizer()
movies_tfidf = tfidf_vect.fit_transform(movie_genres)

In [10]:
df = pd.DataFrame(movies_tfidf.toarray(), columns = tfidf_vect.get_feature_names())

In [11]:
movies = pd.concat([movies, df], axis=1)
movies.head()

Unnamed: 0,movieId,title,genres,action,adventure,animation,children,comedy,crime,documentary,...,horror,imax,musical,mystery,nogenreslisted,romance,scifi,thriller,war,western
0,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy,0.0,0.416846,0.516225,0.504845,0.267586,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,2,Jumanji (1995),Adventure|Children|Fantasy,0.0,0.512361,0.0,0.620525,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,3,Grumpier Old Men (1995),Comedy|Romance,0.0,0.0,0.0,0.0,0.570915,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.821009,0.0,0.0,0.0,0.0
3,4,Waiting to Exhale (1995),Comedy|Drama|Romance,0.0,0.0,0.0,0.0,0.505015,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.726241,0.0,0.0,0.0,0.0
4,5,Father of the Bride Part II (1995),Comedy,0.0,0.0,0.0,0.0,1.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [12]:
data = data.join(movies.set_index('movieId'), on='movieId')
# теперь новые фичи - это TF-IDF жанров фильмов

### Другие фичи

In [14]:
# год фильма
data['movie_year'] = data['title'].str.extract('(\d{4})') 

In [17]:
# если год не указан, пишем 0
data['movie_year'] = data['movie_year'].fillna(0)

In [18]:
# количество жанров у фильма
data['genres_count'] = data['genres'].apply(lambda x: len(str(x).split('|')))

In [19]:
data.drop(['title', 'genres'], axis=1, inplace=True)

### Средние оценки

In [20]:
# средняя оценка пользователя и медиана оценки пользователя
data['user_mean_rating'] = data.groupby('userId').rating.transform('mean')
data['user_median_rating'] = data.groupby('userId').rating.transform('median')

In [21]:
# средняя оценка фильма и медиана оценки фильма
data['movie_mean_rating'] = data.groupby('movieId').rating.transform('mean')
data['movie_median_rating'] = data.groupby('movieId').rating.transform('median')

In [23]:
# количество тегов на фильм
tags['tags_count'] = tags.groupby('movieId').tag.transform('count')

In [24]:
data = data.join(tags.drop(['userId', 'tag', 'timestamp'], axis=1).drop_duplicates().set_index('movieId'), 
                 on='movieId')

In [25]:
# если тегов у фильма нет, ставим 0
data['tags_count'] = data['tags_count'].fillna(0)

In [26]:
data.head()

Unnamed: 0,userId,movieId,rating,action,adventure,animation,children,comedy,crime,documentary,...,thriller,war,western,movie_year,genres_count,user_mean_rating,user_median_rating,movie_mean_rating,movie_median_rating,tags_count
0,1,1,4.0,0.0,0.416846,0.516225,0.504845,0.267586,0.0,0.0,...,0.0,0.0,0.0,1995,5,4.366379,5.0,3.92093,4.0,3.0
1,1,3,4.0,0.0,0.0,0.0,0.0,0.570915,0.0,0.0,...,0.0,0.0,0.0,1995,2,4.366379,5.0,3.259615,3.0,2.0
2,1,6,4.0,0.549328,0.0,0.0,0.0,0.0,0.635947,0.0,...,0.542042,0.0,0.0,1995,3,4.366379,5.0,3.946078,4.0,0.0
3,1,47,5.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.566975,0.0,0.0,1995,2,4.366379,5.0,3.975369,4.0,3.0
4,1,50,5.0,0.0,0.0,0.0,0.0,0.0,0.553854,0.0,...,0.472071,0.0,0.0,1995,3,4.366379,5.0,4.237745,4.5,6.0


### Нормализация данных

In [27]:
X = data.drop('rating', axis=1)
Y = data['rating']

In [28]:
X_scaled = StandardScaler().fit_transform(X)
X_scaled = pd.DataFrame(X_scaled, columns = X.columns)

In [29]:
X_scaled.head()

Unnamed: 0,userId,movieId,action,adventure,animation,children,comedy,crime,documentary,drama,...,thriller,war,western,movie_year,genres_count,user_mean_rating,user_median_rating,movie_mean_rating,movie_median_rating,tags_count
0,-1.780374,-0.54697,-0.640547,1.20284,3.31647,2.594924,0.169681,-0.433416,-0.110016,-0.72508,...,-0.573734,-0.222979,-0.138721,0.017464,1.909005,1.873188,2.568746,0.742308,0.696738,0.063233
1,-1.780374,-0.546914,-0.640547,-0.546057,-0.26763,-0.308243,1.148996,-0.433416,-0.110016,-0.72508,...,-0.573734,-0.222979,-0.138721,0.017464,-0.605097,1.873188,2.568746,-0.428247,-0.93145,-0.028782
2,-1.780374,-0.54683,1.605502,-0.546057,-0.26763,-0.308243,-0.694237,2.127576,-0.110016,-0.72508,...,1.641743,-0.222979,-0.138721,0.017464,0.232937,1.873188,2.568746,0.786821,0.696738,-0.212813
3,-1.780374,-0.545676,-0.640547,-0.546057,-0.26763,-0.308243,-0.694237,-0.433416,-0.110016,-0.72508,...,1.743648,-0.222979,-0.138721,0.017464,-0.605097,1.873188,2.568746,0.838668,0.696738,0.063233
4,-1.780374,-0.545591,-0.640547,-0.546057,-0.26763,-0.308243,-0.694237,1.796981,-0.110016,-0.72508,...,1.355751,-0.222979,-0.138721,0.017464,0.232937,1.873188,2.568746,1.303083,1.510833,0.33928


In [30]:
X_train, X_test, y_train, y_test = train_test_split(X_scaled, Y, test_size = 0.3, random_state = 44)

In [31]:
X_train.shape, X_test.shape

((70585, 29), (30251, 29))

### Обучение

In [42]:
forest_params = {
    'n_estimators': [50, 80, 100],
    'max_depth': [10, 15, 18],
    'min_samples_leaf': [5, 8, 10, 13],
    'max_features': [15, 20, 25]
}

In [43]:
forest = RandomForestRegressor(random_state=17)
grid_forest = GridSearchCV(forest, forest_params, cv=10, n_jobs=-1)
grid_forest.fit(X_train, y_train)

GridSearchCV(cv=10, error_score='raise-deprecating',
             estimator=RandomForestRegressor(bootstrap=True, criterion='mse',
                                             max_depth=None,
                                             max_features='auto',
                                             max_leaf_nodes=None,
                                             min_impurity_decrease=0.0,
                                             min_impurity_split=None,
                                             min_samples_leaf=1,
                                             min_samples_split=2,
                                             min_weight_fraction_leaf=0.0,
                                             n_estimators='warn', n_jobs=None,
                                             oob_score=False, random_state=17,
                                             verbose=0, warm_start=False),
             iid='warn', n_jobs=-1,
             param_grid={'max_depth': [10, 15, 18],
       

In [44]:
print(grid_forest.best_params_)
print(grid_forest.best_score_)
print(grid_forest.best_estimator_)

{'max_depth': 15, 'max_features': 25, 'min_samples_leaf': 13, 'n_estimators': 100}
0.4215963060954705
RandomForestRegressor(bootstrap=True, criterion='mse', max_depth=15,
                      max_features=25, max_leaf_nodes=None,
                      min_impurity_decrease=0.0, min_impurity_split=None,
                      min_samples_leaf=13, min_samples_split=2,
                      min_weight_fraction_leaf=0.0, n_estimators=100,
                      n_jobs=None, oob_score=False, random_state=17, verbose=0,
                      warm_start=False)


### Оценка RMSE

In [52]:
train_scores = cross_val_score(grid_forest.best_estimator_, X_train, y_train, cv=10)

In [57]:
print('Средняя оценка RMSE на train выборке:', np.mean(np.sqrt(train_scores)))

Средняя оценка RMSE на train выборке: 0.6492909838082613


In [54]:
test_scores = cross_val_score(grid_forest.best_estimator_, X_test, y_test, cv=10)

In [58]:
print('Средняя оценка RMSE на test выборке:', np.mean(np.sqrt(train_scores)))

Средняя оценка RMSE на test выборке: 0.6492909838082613
