In [1]:
import pandas as pd
import numpy as np
import warnings
# Silence every warning category for the rest of the session.
# NOTE(review): this also hides pandas warnings (e.g. SettingWithCopyWarning) that the
# in-place column assignments further down may trigger -- confirm that is intended.
warnings.filterwarnings('ignore')

## Read Data

In [3]:
# Directory that holds the ml-latest-small data files read below.
DATA_DIR = '../library/data/ml-latest-small'

# movies comes from a CSV file; ratings and genres are loaded from pickle files.
# NOTE(security): unpickling can execute arbitrary code, so only load pickle files
# that come from a trusted source.
movies = pd.read_csv(DATA_DIR + '/movies.csv')
ratings = pd.read_pickle(DATA_DIR + '/ratings_concat.p')
genres = pd.read_pickle(DATA_DIR + '/genres.p')

In [4]:
# Peek at one randomly drawn row of the genre table.
genres.sample(n=1)

Unnamed: 0_level_0,(no genres listed),Action,Adventure,Animation,Children,Comedy,Crime,Documentary,Drama,Fantasy,Film-Noir,Horror,IMAX,Musical,Mystery,Romance,Sci-Fi,Thriller,War,Western
movieId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1
31260,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0


In [5]:
# Peek at one randomly drawn row of the ratings table.
ratings.sample(n=1)

Unnamed: 0,userId,movieId,rating,timestamp
63783,414,4822,2.0,2004-01-05 15:03:43


## Preprocessing

In [6]:
# Attach the genre columns to every rating row: ratings.movieId is matched against the
# index of genres, and the inner join keeps only ratings whose movie has a genre row.
ratings = pd.merge(ratings, genres, how='inner', left_on='movieId', right_index=True)

In [7]:
# Show up to the first 20 rating rows of every movieId group.
ratings.groupby(by='movieId').head(n=20)

Unnamed: 0,userId,movieId,rating,timestamp,(no genres listed),Action,Adventure,Animation,Children,Comedy,...,Film-Noir,Horror,IMAX,Musical,Mystery,Romance,Sci-Fi,Thriller,War,Western
0,1,1,4.0,2000-07-30 18:45:03,0,0,1,1,1,1,...,0,0,0,0,0,0,0,0,0,0
516,5,1,4.0,1996-11-08 06:36:02,0,0,1,1,1,1,...,0,0,0,0,0,0,0,0,0,0
874,7,1,4.5,2005-01-25 06:52:26,0,0,1,1,1,1,...,0,0,0,0,0,0,0,0,0,0
1434,15,1,2.5,2017-11-13 12:59:30,0,0,1,1,1,1,...,0,0,0,0,0,0,0,0,0,0
1667,17,1,4.5,2011-05-18 05:28:03,0,0,1,1,1,1,...,0,0,0,0,0,0,0,0,0,0
1772,18,1,3.5,2016-02-11 16:56:56,0,0,1,1,1,1,...,0,0,0,0,0,0,0,0,0,0
2274,19,1,4.0,2000-08-08 03:33:57,0,0,1,1,1,1,...,0,0,0,0,0,0,0,0,0,0
3219,21,1,3.5,2014-08-09 21:14:38,0,0,1,1,1,1,...,0,0,0,0,0,0,0,0,0,0
4059,27,1,3.0,2000-07-04 04:34:22,0,0,1,1,1,1,...,0,0,0,0,0,0,0,0,0,0
4879,31,1,5.0,1996-12-13 08:43:36,0,0,1,1,1,1,...,0,0,0,0,0,0,0,0,0,0


In [8]:
# Turn the 0 entries of the genre columns into NaN so that the means computed later
# skip genres a movie does not belong to.
# Only the genre columns are touched: a frame-wide replace would also blank out any
# legitimate 0 in userId / movieId / rating.
genre_flag_cols = list(genres.columns)
ratings = ratings.copy()
ratings[genre_flag_cols] = ratings[genre_flag_cols].replace(0, np.nan)
ratings.sample()

Unnamed: 0,userId,movieId,rating,timestamp,(no genres listed),Action,Adventure,Animation,Children,Comedy,...,Film-Noir,Horror,IMAX,Musical,Mystery,Romance,Sci-Fi,Thriller,War,Western
43505,292,296,3.0,2001-05-17 01:23:18,,,,,,1.0,...,,,,,,,,1.0,,


## train test split

In [9]:
from sklearn.model_selection import train_test_split

# Hold out 10% of the rating rows for evaluation; the fixed random_state keeps the
# split reproducible.
# Each part is copied so that the column assignments made in later cells write to
# independent frames rather than to slices of `ratings`.
train, test = (
    part.copy()
    for part in train_test_split(ratings, test_size=0.1, random_state=4444)
)

In [10]:
# (rows, columns) of the two partitions, train first.
for part in (train, test):
    print(part.shape)

(90763, 24)
(10085, 24)


## Item profile

In [11]:
# Item profile of one movie: its row of genre flags, looked up by index label.
example_movie_id = 4896
genres.loc[example_movie_id]

(no genres listed)    0
Action                0
Adventure             1
Animation             0
Children              1
Comedy                0
Crime                 0
Documentary           0
Drama                 0
Fantasy               1
Film-Noir             0
Horror                0
IMAX                  0
Musical               0
Mystery               0
Romance               0
Sci-Fi                0
Thriller              0
War                   0
Western               0
Name: 4896, dtype: int64

## User profile

In [12]:
# Names of the genre columns; reused below to select the genre part of train/test rows.
genre_cols = genres.columns
genre_cols

Index(['(no genres listed)', 'Action', 'Adventure', 'Animation', 'Children',
       'Comedy', 'Crime', 'Documentary', 'Drama', 'Fantasy', 'Film-Noir',
       'Horror', 'IMAX', 'Musical', 'Mystery', 'Romance', 'Sci-Fi', 'Thriller',
       'War', 'Western'],
      dtype='object')

In [13]:
# Weight each genre flag by the row's rating: a cell becomes the rating where the movie
# has that genre and stays NaN otherwise.
# The result is derived from the NaN pattern instead of the current cell values, so
# re-running this cell does not multiply the rating in a second time; the row-wise
# multiply also replaces the per-column Python loop.
train = train.copy()  # work on an independent frame rather than a possible slice
genre_present = train[genre_cols].notna()
train[genre_cols] = (
    genre_present.astype(float)      # 1.0 where the genre is set, 0.0 elsewhere
    .where(genre_present)            # 0.0 -> NaN
    .mul(train['rating'], axis=0)    # scale every row by its own rating
)

In [14]:
train.head()
# The values that used to be one-hot flags are now multiplied by the rating.

Unnamed: 0,userId,movieId,rating,timestamp,(no genres listed),Action,Adventure,Animation,Children,Comedy,...,Film-Noir,Horror,IMAX,Musical,Mystery,Romance,Sci-Fi,Thriller,War,Western
81111,514,597,2.5,2018-08-11 01:01:16,,,,,,2.5,...,,,,,,2.5,,,,
5362,37,316,5.0,1996-10-13 20:44:02,,5.0,5.0,,,,...,,,,,,,5.0,,,
68233,439,5952,4.0,2015-07-15 07:06:51,,,4.0,,,,...,,,,,,,,,,
30350,212,8961,3.0,2018-04-08 19:48:40,,3.0,3.0,3.0,3.0,3.0,...,,,,,,,,,,
36973,249,62081,3.5,2014-11-08 03:05:20,,3.5,,,,,...,,,3.5,,,,,3.5,,


In [15]:
train.groupby('userId')['Action'].mean()
# Mean rating per user within the Action genre

userId
1       4.345679
2       4.000000
3       3.461538
4       3.391304
5       3.111111
6       3.637931
7       3.283019
8       3.363636
9       3.000000
10      3.613636
11      3.450000
12      4.000000
13      4.375000
14      3.400000
15      3.140000
16      3.586957
17      4.236111
18      3.604972
19      2.705882
20      2.822222
21      3.463235
22      3.166667
23      3.540000
24      3.602041
25      4.892857
26      3.000000
27      3.789474
28      3.078212
29      3.960000
30      4.772727
          ...   
582     4.068966
583     3.000000
584     4.275862
585     4.277778
586     4.414286
587     4.142857
588     3.058824
589     3.750000
590     3.277523
591     3.846154
592     3.648649
593     2.962963
594     4.404110
595     4.000000
596     3.397661
597     3.577586
598     4.300000
599     2.729136
600     2.618644
601     4.390625
602     3.540541
603     3.152778
604     3.633333
605     3.232759
606     3.170455
607     3.734375
608     3.339921
609    

* User profile

In [16]:
# User profile: for every user, the mean of each rating-weighted genre column
# (NaN cells, i.e. genres the movie does not have, are skipped by mean()).
ratings_by_user = train.groupby('userId')
user_profile = ratings_by_user[genre_cols].mean()
user_profile

Unnamed: 0_level_0,(no genres listed),Action,Adventure,Animation,Children,Comedy,Crime,Documentary,Drama,Fantasy,Film-Noir,Horror,IMAX,Musical,Mystery,Romance,Sci-Fi,Thriller,War,Western
userId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1
1,,4.345679,4.386667,4.703704,4.589744,4.263158,4.342857,,4.620690,4.300000,5.000000,3.384615,,4.666667,4.357143,4.272727,4.314286,4.217391,4.500000,4.285714
2,,4.000000,4.166667,,,4.000000,3.800000,4.000000,3.900000,,,3.000000,3.750000,,4.000000,4.500000,3.875000,3.666667,4.500000,
3,,3.461538,2.727273,0.500000,0.500000,0.500000,0.500000,,0.500000,3.375000,,4.666667,,0.500000,5.000000,0.500000,4.115385,4.083333,0.500000,
4,,3.391304,3.592593,4.000000,3.800000,3.484211,3.791667,4.000000,3.491379,3.588235,4.000000,4.000000,3.000000,3.933333,3.789474,3.428571,2.555556,3.600000,4.000000,3.800000
5,,3.111111,3.250000,4.333333,4.111111,3.466667,3.833333,,3.833333,4.142857,,,3.666667,4.400000,4.000000,3.090909,2.500000,3.555556,3.333333,3.000000
6,,3.637931,3.931818,4.166667,3.619048,3.368421,3.242424,,3.606299,3.500000,2.500000,3.277778,4.666667,4.181818,3.733333,3.590909,3.526316,3.524590,3.777778,3.900000
7,,3.283019,3.367347,3.392857,3.200000,3.177778,3.347826,,3.147059,3.065217,3.000000,4.000000,2.454545,3.562500,3.208333,2.672414,3.200000,3.432432,3.277778,
8,,3.363636,3.545455,5.000000,4.250000,3.181818,3.714286,,3.800000,3.250000,,4.500000,4.500000,5.000000,4.000000,3.500000,3.250000,3.733333,4.000000,3.000000
9,,3.000000,3.666667,4.000000,4.000000,3.714286,3.000000,,3.388889,5.000000,4.000000,1.750000,3.000000,,4.000000,3.166667,3.000000,2.545455,3.000000,4.000000
10,,3.613636,3.537037,3.846154,3.541667,3.263514,3.115385,,3.200000,3.400000,,0.500000,3.352941,3.333333,2.166667,3.381944,2.375000,3.300000,3.750000,


* My user profile

In [19]:
# Genre profile of the user whose id is 1000.
my_user_id = 1000
user_profile.loc[my_user_id]

(no genres listed)         NaN
Action                3.333333
Adventure             2.000000
Animation             2.500000
Children              3.000000
Comedy                3.500000
Crime                 5.000000
Documentary                NaN
Drama                 4.800000
Fantasy               3.500000
Film-Noir                  NaN
Horror                     NaN
IMAX                  4.000000
Musical               2.000000
Mystery               4.750000
Romance               4.000000
Sci-Fi                3.500000
Thriller              4.750000
War                        NaN
Western                    NaN
Name: 1000, dtype: float64

## Predict one sample

In [20]:
# Pick a single held-out rating row by its index label.
sample_label = 13852
sample = test.loc[sample_label]
sample

userId                                 89
movieId                             88069
rating                                  4
timestamp             2018-03-07 07:59:16
(no genres listed)                    NaN
Action                                NaN
Adventure                             NaN
Animation                             NaN
Children                              NaN
Comedy                                  1
Crime                                   1
Documentary                           NaN
Drama                                 NaN
Fantasy                               NaN
Film-Noir                             NaN
Horror                                NaN
IMAX                                  NaN
Musical                               NaN
Mystery                               NaN
Romance                               NaN
Sci-Fi                                NaN
Thriller                              NaN
War                                   NaN
Western                           

In [21]:
# Id of the user who gave the sampled rating.
sample_user = sample.loc['userId']

In [22]:
# Per-genre mean ratings (from the training rows) of the user behind the sampled rating.
sample_user_profile = user_profile.loc[sample_user]
sample_user_profile

(no genres listed)    3.000000
Action                3.367347
Adventure             3.563291
Animation             4.063291
Children              3.918478
Comedy                3.465197
Crime                 3.171053
Documentary           4.500000
Drama                 3.430851
Fantasy               3.415094
Film-Noir                  NaN
Horror                4.242424
IMAX                  4.000000
Musical               3.138889
Mystery               3.700000
Romance               3.122549
Sci-Fi                3.483333
Thriller              3.738095
War                   3.272727
Western               4.500000
Name: 89, dtype: float64

* sample user profile로 sample에 대한 예측

In [23]:
# Show which movie the sample refers to, then its genre flags (1 where set, NaN elsewhere).
print(sample.loc['movieId'])
sample.loc[genre_cols]

88069


(no genres listed)    NaN
Action                NaN
Adventure             NaN
Animation             NaN
Children              NaN
Comedy                  1
Crime                   1
Documentary           NaN
Drama                 NaN
Fantasy               NaN
Film-Noir             NaN
Horror                NaN
IMAX                  NaN
Musical               NaN
Mystery               NaN
Romance               NaN
Sci-Fi                NaN
Thriller              NaN
War                   NaN
Western               NaN
Name: 13852, dtype: object

* comedy 와 crime 하나씩 있으니, 각각을 sample_user_profile에 있는 장르 평점과 곱해주고, mean

In [24]:
# Multiply the movie's genre flags (1 / NaN) by the user's per-genre averages, then
# average the products; NaN products are skipped by mean().
genre_scores = sample[genre_cols].mul(sample_user_profile)
genre_scores.mean()

3.3181249236781047

* sample에 대해, we predict that the user will score 3.318...

## Predict all testsets

In [25]:
from tqdm import tqdm_notebook

In [26]:
# Predict every held-out rating in one vectorised step: for each test row, multiply the
# movie's genre flags (1 / NaN) by the rater's per-genre averages and take the mean of
# the non-NaN products -- the same formula as the single-sample cell above.
# reindex() returns an all-NaN profile for a user that is absent from user_profile
# (no training rows), where user_profile.loc[user] would raise KeyError; such rows end
# up with a NaN prediction, like rows whose genres the user never rated.
item_flags = np.asarray(test[genre_cols], dtype=float)
rater_profiles = np.asarray(user_profile.reindex(test['userId'].values), dtype=float)
# Positional list with one prediction per test row, in test's row order.
predict = pd.DataFrame(item_flags * rater_profiles).mean(axis=1).tolist()

HBox(children=(IntProgress(value=1, bar_style='info', max=1), HTML(value='')))




In [36]:
# Attach the predictions as a new column. assign() builds a new frame instead of
# writing into `test` in place, so nothing is written onto a possible slice of
# `ratings`, and re-running the cell simply rebuilds the column.
test = test.assign(predict=predict)
test.head(30)

Unnamed: 0,userId,movieId,rating,timestamp,(no genres listed),Action,Adventure,Animation,Children,Comedy,...,Horror,IMAX,Musical,Mystery,Romance,Sci-Fi,Thriller,War,Western,predict
55414,368,161,4.0,2000-10-11 14:34:26,,,,,,,...,,,,,,,1.0,1.0,,3.039329
95358,600,1682,3.5,2009-03-22 09:08:46,,,,,,1.0,...,,,,,,1.0,,,,2.959884
81635,517,1270,5.0,2017-02-24 17:18:10,,,1.0,,,1.0,...,,,,,,1.0,,,,2.247452
91184,590,48738,4.0,2009-11-17 03:44:12,,,,,,,...,,,,,,,1.0,,,3.477146
3752,22,53519,0.5,2010-03-16 08:12:17,,1.0,1.0,,,,...,1.0,,,,,,1.0,,,2.840439
35991,246,910,4.5,2012-11-28 18:22:45,,,,,,1.0,...,,,,,,,,,,4.131579
20425,135,1358,5.0,2001-12-30 05:45:44,,,,,,,...,,,,,,,,,,3.7625
40413,274,55118,4.0,2008-02-20 02:06:53,,,,,,,...,,,,,,,1.0,,,3.354324
35276,238,3101,4.0,2001-12-09 20:50:50,,,,,,,...,,,,,,,1.0,,,3.813859
57756,380,52950,2.0,2017-05-08 18:29:06,,1.0,,,,,...,1.0,,,,,,1.0,,,3.602275


* null 값이 있는 case들이 있다.

In [30]:
# Test rows for which no prediction could be computed.
test.loc[test['predict'].isna()]

Unnamed: 0,userId,movieId,rating,timestamp,(no genres listed),Action,Adventure,Animation,Children,Comedy,...,Horror,IMAX,Musical,Mystery,Romance,Sci-Fi,Thriller,War,Western,predict
11668,70,246,4.5,2012-12-11 04:06:33,,,,,,,...,,,,,,,,,,
81466,514,182727,2.0,2018-09-08 04:30:35,1.0,,,,,,...,,,,,,,,,,
79767,496,904,5.0,2014-11-05 05:50:05,,,,,,,...,,,,1.0,,,1.0,,,
84755,548,80862,3.5,2017-05-15 20:30:32,,,,,,,...,,,,1.0,,,,,,
79902,499,38304,4.5,2010-11-17 02:17:40,,,,,,,...,,,,,,,,,,
79793,496,114713,2.5,2014-11-09 08:17:27,,,,,,,...,1.0,,,,,,,,,
18794,121,246,3.0,1996-11-10 20:06:14,,,,,,,...,,,,,,,,,,
61119,398,82459,4.0,2011-07-21 00:24:35,,,,,,,...,,,,,,,,,1.0,
37495,252,122896,3.0,2017-06-24 06:01:10,1.0,,,,,,...,,,,,,,,,,
90221,586,122896,5.0,2018-06-25 04:06:27,1.0,,,,,,...,,,,,,,,,,


* global mean으로 채우자!

In [39]:
# Fall back to the mean training rating wherever the prediction is missing.
global_mean_rating = train['rating'].mean()
test['predict'] = test['predict'].fillna(global_mean_rating)

In [40]:
# Re-check: no row should be left without a prediction after the fill.
still_missing = test['predict'].isna()
test[still_missing]

Unnamed: 0,userId,movieId,rating,timestamp,(no genres listed),Action,Adventure,Animation,Children,Comedy,...,Horror,IMAX,Musical,Mystery,Romance,Sci-Fi,Thriller,War,Western,predict


## Model validation

In [41]:
from sklearn.metrics import mean_squared_error

In [43]:
# Root-mean-squared error between the true and the predicted test ratings.
mse = mean_squared_error(y_true=test['rating'], y_pred=test['predict'])
rmse = np.sqrt(mse)
rmse

0.918052219583595