In [1]:
import pandas as pd
import numpy as np
from sklearn.feature_extraction.text import TfidfTransformer, CountVectorizer

In [2]:
links = pd.read_csv('links.csv')
movies = pd.read_csv('movies.csv')
ratings = pd.read_csv('ratings.csv')
tags = pd.read_csv('tags.csv')

In [3]:
def change_string(s):
    return ' '.join(s.replace(' ', '').replace('-', '').split('|'))

In [4]:
movies['genres'] = [change_string(g) for g in movies.genres.values]

In [5]:
movies.head()

Unnamed: 0,movieId,title,genres
0,1,Toy Story (1995),Adventure Animation Children Comedy Fantasy
1,2,Jumanji (1995),Adventure Children Fantasy
2,3,Grumpier Old Men (1995),Comedy Romance
3,4,Waiting to Exhale (1995),Comedy Drama Romance
4,5,Father of the Bride Part II (1995),Comedy


In [6]:
movies_with_tags = movies.join(tags.set_index('movieId'), on='movieId')

In [7]:
movies_with_tags.head()

Unnamed: 0,movieId,title,genres,userId,tag,timestamp
0,1,Toy Story (1995),Adventure Animation Children Comedy Fantasy,336.0,pixar,1139046000.0
0,1,Toy Story (1995),Adventure Animation Children Comedy Fantasy,474.0,pixar,1137207000.0
0,1,Toy Story (1995),Adventure Animation Children Comedy Fantasy,567.0,fun,1525286000.0
1,2,Jumanji (1995),Adventure Children Fantasy,62.0,fantasy,1528844000.0
1,2,Jumanji (1995),Adventure Children Fantasy,62.0,magic board game,1528844000.0


In [8]:
movies_with_tags.dropna(inplace=True)

In [9]:
tag_strings = []
movies_list = []

for movie, group in movies_with_tags.groupby('movieId'):
    tag_strings.append(' '.join([str(s).replace(' ', '').replace('-', '') for s in group.tag.values]))
    movies_list.append(movie)

In [10]:
movies_list[:5]

[1, 2, 3, 5, 7]

In [11]:
tag_strings[:5]

['pixar pixar fun',
 'fantasy magicboardgame RobinWilliams game',
 'moldy old',
 'pregnancy remake',
 'remake']

In [12]:
df = pd.DataFrame()

In [13]:
df['movieId'] = movies_list
df['tags'] = tag_strings

In [14]:
df.head()

Unnamed: 0,movieId,tags
0,1,pixar pixar fun
1,2,fantasy magicboardgame RobinWilliams game
2,3,moldy old
3,5,pregnancy remake
4,7,remake


In [15]:
movies_with_tags = movies.join(df.set_index('movieId'), on='movieId')

In [16]:
movies_with_tags.head()

Unnamed: 0,movieId,title,genres,tags
0,1,Toy Story (1995),Adventure Animation Children Comedy Fantasy,pixar pixar fun
1,2,Jumanji (1995),Adventure Children Fantasy,fantasy magicboardgame RobinWilliams game
2,3,Grumpier Old Men (1995),Comedy Romance,moldy old
3,4,Waiting to Exhale (1995),Comedy Drama Romance,
4,5,Father of the Bride Part II (1995),Comedy,pregnancy remake


In [17]:
movies_with_tags = movies_with_tags.fillna('')

In [18]:
movies_with_tags['genres+tags'] = movies_with_tags['genres'] + ' ' + movies_with_tags['tags']

In [19]:
movies_with_tags.head()

Unnamed: 0,movieId,title,genres,tags,genres+tags
0,1,Toy Story (1995),Adventure Animation Children Comedy Fantasy,pixar pixar fun,Adventure Animation Children Comedy Fantasy pi...
1,2,Jumanji (1995),Adventure Children Fantasy,fantasy magicboardgame RobinWilliams game,Adventure Children Fantasy fantasy magicboardg...
2,3,Grumpier Old Men (1995),Comedy Romance,moldy old,Comedy Romance moldy old
3,4,Waiting to Exhale (1995),Comedy Drama Romance,,Comedy Drama Romance
4,5,Father of the Bride Part II (1995),Comedy,pregnancy remake,Comedy pregnancy remake


In [20]:
movies_with_tags = movies_with_tags.drop(['genres', 'tags'], axis = 1)

In [21]:
movies_with_tags.head()

Unnamed: 0,movieId,title,genres+tags
0,1,Toy Story (1995),Adventure Animation Children Comedy Fantasy pi...
1,2,Jumanji (1995),Adventure Children Fantasy fantasy magicboardg...
2,3,Grumpier Old Men (1995),Comedy Romance moldy old
3,4,Waiting to Exhale (1995),Comedy Drama Romance
4,5,Father of the Bride Part II (1995),Comedy pregnancy remake


In [22]:
movies_with_tags = movies_with_tags.join(ratings.groupby(['movieId']).mean()['rating'], on='movieId')

In [23]:
movies_with_tags.head()

Unnamed: 0,movieId,title,genres+tags,rating
0,1,Toy Story (1995),Adventure Animation Children Comedy Fantasy pi...,3.92093
1,2,Jumanji (1995),Adventure Children Fantasy fantasy magicboardg...,3.431818
2,3,Grumpier Old Men (1995),Comedy Romance moldy old,3.259615
3,4,Waiting to Exhale (1995),Comedy Drama Romance,2.357143
4,5,Father of the Bride Part II (1995),Comedy pregnancy remake,3.071429


In [24]:
ratings.groupby(['userId']).count().sort_values(by='movieId', ascending=False).head()

Unnamed: 0_level_0,movieId,rating,timestamp
userId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
414,2698,2698,2698
599,2478,2478,2478
474,2108,2108,2108
448,1864,1864,1864
274,1346,1346,1346


In [25]:
user_data = ratings[ratings['userId'] == 414]

In [26]:
movies_with_tags.rename(columns={'genres+tags': 'genres_tags', 'rating': 'avg_rating'}, inplace=True)

In [27]:
user_data = user_data.join(movies_with_tags.set_index('movieId'), on='movieId')

In [28]:
user_data.head()

Unnamed: 0,userId,movieId,rating,timestamp,title,genres_tags,avg_rating
62294,414,1,4.0,961438127,Toy Story (1995),Adventure Animation Children Comedy Fantasy pi...,3.92093
62295,414,2,3.0,961594981,Jumanji (1995),Adventure Children Fantasy fantasy magicboardg...,3.431818
62296,414,3,4.0,961439278,Grumpier Old Men (1995),Comedy Romance moldy old,3.259615
62297,414,5,2.0,961437647,Father of the Bride Part II (1995),Comedy pregnancy remake,3.071429
62298,414,6,3.0,961515642,Heat (1995),Action Crime Thriller,3.946078


## предсказание на основе текстовых данных 

In [29]:
count_vect = CountVectorizer()
X_train_counts = count_vect.fit_transform(user_data['genres_tags'])

tfidf_transformer = TfidfTransformer()
X = tfidf_transformer.fit_transform(X_train_counts)



In [30]:
y = user_data['rating']

In [31]:
from sklearn.model_selection import train_test_split

In [32]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.2) 

In [33]:
from sklearn.linear_model import LinearRegression
reg = LinearRegression().fit(X_train, y_train)

In [34]:
reg.predict(X_test)[:10]

array([3.52620569, 2.99721979, 2.94316445, 2.9162438 , 3.52620569,
       3.5143114 , 3.71846893, 3.39490275, 2.94316445, 3.4106246 ])

In [35]:
y_test[:10]

64667    3.5
63603    3.0
63128    4.0
63259    1.5
64181    3.0
62826    2.0
62552    4.0
64334    3.0
63714    2.0
64729    4.0
Name: rating, dtype: float64

In [36]:
reg.score(X_train, y_train)

0.3431132793091831

In [37]:
from sklearn.metrics import mean_squared_error
np.sqrt(mean_squared_error(y_test, reg.predict(X_test)))

0.9480521277217417

## Расчет с учетом средней оценки по фильму и применением PCA

In [156]:
count_vect = CountVectorizer()
X_train_counts = count_vect.fit_transform(user_data['genres_tags'])

tfidf_transformer = TfidfTransformer()
X = tfidf_transformer.fit_transform(X_train_counts)

X = pd.DataFrame(X.toarray())

In [157]:
avg_rating_list = list(user_data['avg_rating'])

In [158]:
X['rating'] = avg_rating_list

In [159]:
X.head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,1084,1085,1086,1087,1088,1089,1090,1091,1092,rating
0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,3.92093
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,3.431818
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,3.259615
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,3.071429
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,3.946078


In [160]:
y = user_data['rating']
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.2) 

In [161]:
from sklearn import preprocessing
scaler = preprocessing.StandardScaler().fit(X_train)
X_train = scaler.transform(X_train)

In [162]:
X_train.shape

(2158, 1094)

In [163]:
from sklearn.decomposition import PCA
pca = PCA(n_components = 200)
X_train = pca.fit_transform(X_train)

In [174]:
X_train

array([[-0.1676326 , -0.25850483, -0.09019564, ...,  0.12786174,
        -0.28471257,  0.21254952],
       [-0.14648831, -0.18720451, -0.09817665, ..., -0.01462499,
         0.12641356,  0.00585555],
       [-0.15497164, -0.20565528, -0.09537215, ..., -0.20749825,
        -0.17000867,  0.244417  ],
       ...,
       [-0.06794823,  1.33352367, -0.13887934, ..., -0.50543986,
         0.60409417,  0.67235784],
       [-0.15311855, -0.19718088, -0.08590141, ...,  0.33555885,
         0.49767593, -1.17907559],
       [-0.16053462, -0.21914922, -0.08780073, ..., -0.21762127,
         0.07879295,  0.05980231]])

In [165]:
reg = LinearRegression().fit(X_train, y_train)

In [166]:
reg.score(X_train, y_train)

0.35505479874325574

In [167]:
reg.predict(pca.transform(scaler.transform(X_test)))[:10]

array([3.43161694, 4.76297457, 3.05512321, 3.3201125 , 3.82915825,
       3.08504433, 3.1468638 , 3.08826289, 3.37945242, 2.72224512])

In [168]:
y_test[:10]

64499    4.5
62454    5.0
64290    2.0
62863    3.0
63274    5.0
63018    4.0
62384    3.0
64558    2.0
62931    4.0
63504    2.0
Name: rating, dtype: float64

In [169]:
reg.score(pca.transform(scaler.transform(X_test)), y_test)

0.3286014923035069

In [170]:
np.sqrt(mean_squared_error(y_test, reg.predict(pca.transform(scaler.transform(X_test)))))

0.7596322990553603