In [31]:
import numpy as np
import pandas as pd

In [32]:
df_ratings = pd.read_csv('web_app/data/ratings.csv')

In [33]:
df_ratings

Unnamed: 0,userId,movieId,rating,timestamp
0,1,1,4.0,964982703
1,1,3,4.0,964981247
2,1,6,4.0,964982224
3,1,47,5.0,964983815
4,1,50,5.0,964982931
...,...,...,...,...
100831,610,166534,4.0,1493848402
100832,610,168248,5.0,1493850091
100833,610,168250,5.0,1494273047
100834,610,168252,5.0,1493846352


In [34]:
# pivot data
df_ratings_pivot = df_ratings.pivot(
    index="userId", columns="movieId", values="rating")

In [35]:
df_ratings_pivot

movieId,1,2,3,4,5,6,7,8,9,10,...,193565,193567,193571,193573,193579,193581,193583,193585,193587,193609
userId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,4.0,,4.0,,,4.0,,,,,...,,,,,,,,,,
2,,,,,,,,,,,...,,,,,,,,,,
3,,,,,,,,,,,...,,,,,,,,,,
4,,,,,,,,,,,...,,,,,,,,,,
5,4.0,,,,,,,,,,...,,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
606,2.5,,,,,,2.5,,,,...,,,,,,,,,,
607,4.0,,,,,,,,,,...,,,,,,,,,,
608,2.5,2.0,2.0,,,,,,,4.0,...,,,,,,,,,,
609,3.0,,,,,,,,,4.0,...,,,,,,,,,,


#### Eliminating columns based on their NaN percentage

In [36]:
# Drop movies (columns) whose share of NaN ratings is `perc` percent (90%) or more.
perc = 90.0
# `thresh` is the minimum number of non-NaN values a column needs in order to be
# kept: strictly more than (100 - perc)% of the rows (one row per userId).
# NOTE(review): this cell overwrites `df_ratings_pivot`, so re-running it
# filters the already-filtered frame rather than the original pivot.
min_count =  int(((100-perc)/100)*df_ratings_pivot.shape[0] + 1)
df_ratings_pivot = df_ratings_pivot.dropna( axis=1, 
                thresh=min_count)

In [37]:
df_ratings_pivot

movieId,1,2,6,10,11,16,17,19,21,25,...,68954,69122,70286,72998,74458,79132,89745,91529,99114,109487
userId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,4.0,,4.0,,,,,,,,...,,,,,,,,,,
2,,,,,,,,,,,...,,,,,4.0,4.0,,3.5,3.5,3.0
3,,,,,,,,,,,...,,,,,,,,,,
4,,,,,,,,,3.0,,...,,,,,,,,,,
5,4.0,,,,,,,,4.0,,...,,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
606,2.5,,,,2.5,,4.0,2.0,,,...,,,,3.0,,,,,,
607,4.0,,,,3.0,,,,,3.0,...,,,,,,,,,,
608,2.5,2.0,,4.0,,4.5,,2.0,3.5,,...,,,,,,,,,,
609,3.0,,,4.0,,,,,,,...,,,,,,,,,,


In [38]:
df_ratings_pivot.isna().sum()

movieId
1         395
2         500
6         508
10        478
11        540
         ... 
79132     467
89745     541
91529     534
99114     539
109487    537
Length: 317, dtype: int64

In [39]:
from sklearn.impute import SimpleImputer
imputer = SimpleImputer(strategy = 'mean')

In [40]:
def fill_na_with_movie_average(df_o):
    """Return a copy of `df_o` where each column's NaNs are replaced by that column's mean.

    Parameters
    ----------
    df_o : pandas.DataFrame
        Frame to impute; it is processed column by column (axis=0).

    Returns
    -------
    pandas.DataFrame
        Result of applying the per-column fill to `df_o`.
    """
    def _impute_column(series):
        # The mean is computed per column, so every column is filled with its own average.
        return series.fillna(series.mean())

    return df_o.apply(_impute_column, axis=0)

In [41]:
df_ratings_pivot_filled= imputer.fit_transform(df_ratings_pivot)

- Dump Imputer

In [42]:
import pickle as pic
# Serialise the fitted imputer to disk so it can be loaded again later.
with open('web_app/model/imputer.pk', 'wb') as imputer_file:
    pic.dump(imputer, imputer_file)

In [43]:
df_ratings_pivot_filled

array([[4.        , 3.43181818, 4.        , ..., 3.99342105, 3.94366197,
        3.99315068],
       [3.92093023, 3.43181818, 3.94607843, ..., 3.5       , 3.5       ,
        3.        ],
       [3.92093023, 3.43181818, 3.94607843, ..., 3.99342105, 3.94366197,
        3.99315068],
       ...,
       [2.5       , 2.        , 3.94607843, ..., 3.99342105, 3.94366197,
        3.99315068],
       [3.        , 3.43181818, 3.94607843, ..., 3.99342105, 3.94366197,
        3.99315068],
       [5.        , 3.43181818, 5.        , ..., 4.5       , 4.5       ,
        3.5       ]])

In [44]:
df_ratings_f = pd.DataFrame(df_ratings_pivot_filled, columns=df_ratings_pivot.columns, index=df_ratings_pivot.index)
df_ratings_f

movieId,1,2,6,10,11,16,17,19,21,25,...,68954,69122,70286,72998,74458,79132,89745,91529,99114,109487
userId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,4.00000,3.431818,4.000000,3.496212,3.671429,3.926829,3.776119,2.727273,3.494382,3.625,...,4.004762,3.631579,3.776923,3.603093,4.022388,4.066434,3.869565,3.993421,3.943662,3.993151
2,3.92093,3.431818,3.946078,3.496212,3.671429,3.926829,3.776119,2.727273,3.494382,3.625,...,4.004762,3.631579,3.776923,3.603093,4.000000,4.000000,3.869565,3.500000,3.500000,3.000000
3,3.92093,3.431818,3.946078,3.496212,3.671429,3.926829,3.776119,2.727273,3.494382,3.625,...,4.004762,3.631579,3.776923,3.603093,4.022388,4.066434,3.869565,3.993421,3.943662,3.993151
4,3.92093,3.431818,3.946078,3.496212,3.671429,3.926829,3.776119,2.727273,3.000000,3.625,...,4.004762,3.631579,3.776923,3.603093,4.022388,4.066434,3.869565,3.993421,3.943662,3.993151
5,4.00000,3.431818,3.946078,3.496212,3.671429,3.926829,3.776119,2.727273,4.000000,3.625,...,4.004762,3.631579,3.776923,3.603093,4.022388,4.066434,3.869565,3.993421,3.943662,3.993151
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
606,2.50000,3.431818,3.946078,3.496212,2.500000,3.926829,4.000000,2.000000,3.494382,3.625,...,4.004762,3.631579,3.776923,3.000000,4.022388,4.066434,3.869565,3.993421,3.943662,3.993151
607,4.00000,3.431818,3.946078,3.496212,3.000000,3.926829,3.776119,2.727273,3.494382,3.000,...,4.004762,3.631579,3.776923,3.603093,4.022388,4.066434,3.869565,3.993421,3.943662,3.993151
608,2.50000,2.000000,3.946078,4.000000,3.671429,4.500000,3.776119,2.000000,3.500000,3.625,...,4.004762,3.631579,3.776923,3.603093,4.022388,4.066434,3.869565,3.993421,3.943662,3.993151
609,3.00000,3.431818,3.946078,4.000000,3.671429,3.926829,3.776119,2.727273,3.494382,3.625,...,4.004762,3.631579,3.776923,3.603093,4.022388,4.066434,3.869565,3.993421,3.943662,3.993151


In [45]:
df_stack=df_ratings_f.stack()

In [46]:
df_stack

userId  movieId
1       1          4.000000
        2          3.431818
        6          4.000000
        10         3.496212
        11         3.671429
                     ...   
610     79132      4.000000
        89745      5.000000
        91529      4.500000
        99114      4.500000
        109487     3.500000
Length: 193370, dtype: float64

In [47]:
df_ratings_f.to_csv("web_app/data/ratings_f.csv", index=True)

In [48]:
df_ratings_rf = pd.read_csv('web_app/data/ratings_f.csv',index_col="userId")

In [49]:
df_ratings_rf

Unnamed: 0_level_0,1,2,6,10,11,16,17,19,21,25,...,68954,69122,70286,72998,74458,79132,89745,91529,99114,109487
userId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,4.00000,3.431818,4.000000,3.496212,3.671429,3.926829,3.776119,2.727273,3.494382,3.625,...,4.004762,3.631579,3.776923,3.603093,4.022388,4.066434,3.869565,3.993421,3.943662,3.993151
2,3.92093,3.431818,3.946078,3.496212,3.671429,3.926829,3.776119,2.727273,3.494382,3.625,...,4.004762,3.631579,3.776923,3.603093,4.000000,4.000000,3.869565,3.500000,3.500000,3.000000
3,3.92093,3.431818,3.946078,3.496212,3.671429,3.926829,3.776119,2.727273,3.494382,3.625,...,4.004762,3.631579,3.776923,3.603093,4.022388,4.066434,3.869565,3.993421,3.943662,3.993151
4,3.92093,3.431818,3.946078,3.496212,3.671429,3.926829,3.776119,2.727273,3.000000,3.625,...,4.004762,3.631579,3.776923,3.603093,4.022388,4.066434,3.869565,3.993421,3.943662,3.993151
5,4.00000,3.431818,3.946078,3.496212,3.671429,3.926829,3.776119,2.727273,4.000000,3.625,...,4.004762,3.631579,3.776923,3.603093,4.022388,4.066434,3.869565,3.993421,3.943662,3.993151
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
606,2.50000,3.431818,3.946078,3.496212,2.500000,3.926829,4.000000,2.000000,3.494382,3.625,...,4.004762,3.631579,3.776923,3.000000,4.022388,4.066434,3.869565,3.993421,3.943662,3.993151
607,4.00000,3.431818,3.946078,3.496212,3.000000,3.926829,3.776119,2.727273,3.494382,3.000,...,4.004762,3.631579,3.776923,3.603093,4.022388,4.066434,3.869565,3.993421,3.943662,3.993151
608,2.50000,2.000000,3.946078,4.000000,3.671429,4.500000,3.776119,2.000000,3.500000,3.625,...,4.004762,3.631579,3.776923,3.603093,4.022388,4.066434,3.869565,3.993421,3.943662,3.993151
609,3.00000,3.431818,3.946078,4.000000,3.671429,3.926829,3.776119,2.727273,3.494382,3.625,...,4.004762,3.631579,3.776923,3.603093,4.022388,4.066434,3.869565,3.993421,3.943662,3.993151


In [50]:
df_ratings_rf.columns

Index(['1', '2', '6', '10', '11', '16', '17', '19', '21', '25',
       ...
       '68954', '69122', '70286', '72998', '74458', '79132', '89745', '91529',
       '99114', '109487'],
      dtype='object', length=317)

## To use this for our project we need 
1. The user-ratings matrix
Use `pivot` `R = ratings.pivot(index = '___' ,columns = '___', values = '___')`
2. Follow the same steps in this notebook
3. Make it nicer: put it in a function

### Movies

In [51]:
df_movies = pd.read_csv('web_app/data/movies.csv', index_col="movieId")

In [52]:
df_movies

Unnamed: 0_level_0,title,genres
movieId,Unnamed: 1_level_1,Unnamed: 2_level_1
1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy
2,Jumanji (1995),Adventure|Children|Fantasy
3,Grumpier Old Men (1995),Comedy|Romance
4,Waiting to Exhale (1995),Comedy|Drama|Romance
5,Father of the Bride Part II (1995),Comedy
...,...,...
193581,Black Butler: Book of the Atlantic (2017),Action|Animation|Comedy|Fantasy
193583,No Game No Life: Zero (2017),Animation|Comedy|Fantasy
193585,Flint (2017),Drama
193587,Bungo Stray Dogs: Dead Apple (2018),Action|Animation


In [53]:
def _extract_year(title):
    """Return the year parsed from a title such as 'Toy Story (1995)'.

    The text after the last '(' (with every ')' removed) is converted with
    int(). 0 is returned when that text is not an integer, or when `title`
    has no `split` method (e.g. a missing value read as NaN).
    """
    try:
        return int(title.split('(')[-1].replace(')', ''))
    except (AttributeError, ValueError):
        # Best effort: titles without a parsable trailing year get the sentinel 0.
        # Only the two expected failures are caught so real errors still surface.
        return 0


# One pass over the title column instead of a row-by-row iterrows()/.loc loop.
df_movies['year'] = df_movies['title'].map(_extract_year)

In [54]:
df_movies.head()

Unnamed: 0_level_0,title,genres,year
movieId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy,1995
2,Jumanji (1995),Adventure|Children|Fantasy,1995
3,Grumpier Old Men (1995),Comedy|Romance,1995
4,Waiting to Exhale (1995),Comedy|Drama|Romance,1995
5,Father of the Bride Part II (1995),Comedy,1995


In [55]:
df_movies.shape

(9742, 3)

In [56]:
df_movies_f=df_movies.loc[df_movies.index.isin(df_ratings_rf.columns.astype(int))]

In [57]:
df_movies_f

Unnamed: 0_level_0,title,genres,year
movieId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy,1995
2,Jumanji (1995),Adventure|Children|Fantasy,1995
6,Heat (1995),Action|Crime|Thriller,1995
10,GoldenEye (1995),Action|Adventure|Thriller,1995
11,"American President, The (1995)",Comedy|Drama|Romance,1995
...,...,...,...
79132,Inception (2010),Action|Crime|Drama|Mystery|Sci-Fi|Thriller|IMAX,2010
89745,"Avengers, The (2012)",Action|Adventure|Sci-Fi|IMAX,2012
91529,"Dark Knight Rises, The (2012)",Action|Adventure|Crime|IMAX,2012
99114,Django Unchained (2012),Action|Drama|Western,2012


In [58]:
dic_t={"Jumanji (1995)": 4, "GoldenEye (1995)": 3, "Inception (2010)": 2, "Django Unchained (2012)":1, "American President, The (1995)":4}

In [59]:
dic_t

{'Jumanji (1995)': 4,
 'GoldenEye (1995)': 3,
 'Inception (2010)': 2,
 'Django Unchained (2012)': 1,
 'American President, The (1995)': 4}

In [60]:
df_movies_f[df_movies_f["title"]=="Jumanji (1995)"].index.values

array([2], dtype=int64)

In [61]:
def get_ids(dic_t, df_movies_f):
    """Translate a {title: rating} mapping into a {movie id: rating} mapping.

    Parameters
    ----------
    dic_t : dict
        Maps movie titles, as they appear in the ``title`` column, to ratings.
    df_movies_f : pandas.DataFrame
        Movie table with a ``title`` column; the index value of the matching
        row is used as the id.

    Returns
    -------
    dict
        The same ratings keyed by the index value of the matching row.

    Raises
    ------
    ValueError
        If a title matches no row, or more than one row.
    """
    user_n = {}
    for key, value in dic_t.items():
        matches = df_movies_f[df_movies_f["title"] == key].index.values
        if matches.size != 1:
            raise ValueError(
                f"expected exactly one movie titled {key!r}, found {matches.size}")
        # ndarray.item() is the replacement for np.asscalar, which is deprecated
        # and no longer available in recent NumPy releases.
        user_n[matches.item()] = value
    return user_n
    

In [62]:
get_ids(dic_t, df_movies_f)

  user_n[np.asscalar(df_movies_f[df_movies_f["title"]==key].index.values)]=value


{2: 4, 10: 3, 79132: 2, 99114: 1, 11: 4}

In [63]:
df_movies_f.to_csv("web_app/data/df_movies_f.csv", index=True)

### Links

In [64]:
df_links = pd.read_csv('web_app/data/links.csv',index_col="movieId")

In [65]:
df_links.head()

Unnamed: 0_level_0,imdbId,tmdbId
movieId,Unnamed: 1_level_1,Unnamed: 2_level_1
1,114709,862.0
2,113497,8844.0
3,113228,15602.0
4,114885,31357.0
5,113041,11862.0


In [106]:
df_links_f=df_links.loc[df_links.index.isin(df_ratings_rf.columns.astype(int))]

In [107]:
df_links_f

Unnamed: 0_level_0,imdbId,tmdbId
movieId,Unnamed: 1_level_1,Unnamed: 2_level_1
1,114709,862.0
2,113497,8844.0
6,113277,949.0
10,113189,710.0
11,112346,9087.0
...,...,...
79132,1375666,27205.0
89745,848228,24428.0
91529,1345836,49026.0
99114,1853728,68718.0


In [None]:
df_links_f.to_csv("web_app/data/df_links_f.csv", index=True)

In [105]:
df_links.shape

(9742, 2)

In [66]:
Rtrue=df_ratings_pivot_filled

In [67]:
Rtrue

array([[4.        , 3.43181818, 4.        , ..., 3.99342105, 3.94366197,
        3.99315068],
       [3.92093023, 3.43181818, 3.94607843, ..., 3.5       , 3.5       ,
        3.        ],
       [3.92093023, 3.43181818, 3.94607843, ..., 3.99342105, 3.94366197,
        3.99315068],
       ...,
       [2.5       , 2.        , 3.94607843, ..., 3.99342105, 3.94366197,
        3.99315068],
       [3.        , 3.43181818, 3.94607843, ..., 3.99342105, 3.94366197,
        3.99315068],
       [5.        , 3.43181818, 5.        , ..., 4.5       , 4.5       ,
        3.5       ]])

### 1. Create a model 
Model assumes $R \approx P \cdot Q$, where $P$ is the user × feature matrix and $Q$ is the feature × movie matrix.

In [68]:
from sklearn.decomposition import NMF

In [69]:
max_components=15
max_iter=500

In [70]:
def grid_search(Rtrue,max_components,max_iter):
    """Fit an NMF model for each (n_components, max_iter) pair and record its error.

    Parameters
    ----------
    Rtrue
        Matrix passed unchanged to ``NMF.fit``.
    max_components : int
        Exclusive upper bound for ``n_components``; the search starts at 5.
    max_iter : int
        Exclusive upper bound for ``max_iter``; the search starts at 50 and
        advances in steps of 50.

    Returns
    -------
    list of dict
        One entry per fitted model with the keys "components", "iterations"
        and "recons_err" (the model's ``reconstruction_err_``).
    """
    first_components = 5
    first_iter = 50
    iter_step = 50
    results = []
    for n_comp in range(first_components, max_components):
        for n_iter in range(first_iter, max_iter, iter_step):
            candidate = NMF(n_components=n_comp, max_iter=n_iter)
            candidate.fit(Rtrue)
            results.append({
                "components": n_comp,
                "iterations": n_iter,
                "recons_err": candidate.reconstruction_err_,
            })
    return results
        

In [71]:
#errors_list=grid_search(Rtrue,15,500)

#### Model Fit 

In [72]:
model = NMF(n_components=7, max_iter= 350) 
model.fit(Rtrue) 



NMF(max_iter=350, n_components=7)

#### Q: Movie-'genre' matrix

In [73]:
Q_df = pd.DataFrame(model.components_, columns=df_ratings_pivot.columns, index=['feature1', 'feature2', 'feature3', 'feature4', 'feature5','feature6','feature7'])
Q_df

movieId,1,2,6,10,11,16,17,19,21,25,...,68954,69122,70286,72998,74458,79132,89745,91529,99114,109487
feature1,1.914359,1.631759,1.914765,1.570374,1.756678,2.086889,2.122287,1.165389,1.840315,2.009414,...,2.083765,1.962269,1.918779,1.715635,2.083189,2.206731,1.882431,2.130576,2.102405,2.064316
feature2,0.89531,1.061395,0.730675,1.084555,0.791694,0.315272,0.304139,0.860099,0.475069,0.309677,...,0.210907,0.145235,0.214554,0.339064,0.429152,0.0,0.552262,0.231536,0.263256,0.318509
feature3,0.200602,0.299777,0.586286,0.495865,0.386524,0.235189,0.08767,0.399429,0.259457,0.128815,...,0.79174,0.729991,0.871093,1.062806,0.349122,1.198701,0.78806,0.922841,0.584924,0.743886
feature4,0.146835,0.115604,0.089858,0.205102,0.261432,0.269123,0.335633,0.0,0.138257,0.134958,...,0.292181,0.318133,0.333523,0.299315,0.270455,0.187785,0.304831,0.265036,0.271507,0.248962
feature5,0.667197,0.338577,0.328801,0.418644,0.40824,0.219831,0.106304,0.243497,0.160538,0.140611,...,0.026456,0.0,0.070766,0.254836,0.212746,0.0,0.176164,0.029754,0.0,0.101018
feature6,0.39567,0.394018,0.714411,0.473418,0.344901,0.688851,0.490026,0.892104,0.597336,0.578416,...,0.3045,0.116073,0.295953,0.080129,0.672282,0.213392,0.277194,0.166829,0.48901,0.413861
feature7,0.395651,0.225926,0.33353,0.0,0.236993,0.289524,0.10228,0.160427,0.304344,0.420218,...,0.410237,0.198222,0.208301,0.229549,0.288924,0.246132,0.25334,0.220857,0.287796,0.319892


In [74]:
Q_df.to_csv("web_app/data/Q_df.csv", index=True)

In [75]:
Q = model.components_
Q

array([[1.91435857, 1.63175922, 1.91476479, ..., 2.13057639, 2.10240529,
        2.06431567],
       [0.89531018, 1.06139477, 0.73067495, ..., 0.23153631, 0.26325607,
        0.31850894],
       [0.20060212, 0.29977721, 0.58628557, ..., 0.92284146, 0.58492352,
        0.74388578],
       ...,
       [0.6671974 , 0.33857738, 0.32880074, ..., 0.02975365, 0.        ,
        0.10101803],
       [0.39567013, 0.39401794, 0.71441126, ..., 0.166829  , 0.48901002,
        0.41386137],
       [0.39565096, 0.22592627, 0.33353049, ..., 0.22085748, 0.28779568,
        0.31989211]])

#### P: User-'genre' matrix

In [76]:
P = model.transform(Rtrue)
P

array([[1.35197122, 0.49152651, 0.46249677, ..., 0.79646428, 0.55223627,
        0.23441689],
       [1.3937554 , 0.51073247, 0.46282136, ..., 0.37145711, 0.38186228,
        0.33689388],
       [1.2767341 , 0.58622227, 0.53415965, ..., 0.38334872, 0.48873084,
        0.400152  ],
       ...,
       [1.2345286 , 0.01391025, 1.51629978, ..., 0.        , 0.8703741 ,
        0.        ],
       [1.29176405, 0.46716144, 0.5242878 , ..., 0.32559975, 0.4465017 ,
        0.6346095 ],
       [1.3475231 , 0.39439292, 0.99696067, ..., 0.71437138, 0.84451739,
        0.        ]])

### Saving the model

In [77]:
import pickle as pic
# Write the fitted NMF model into web_app/model/, next to the pickled imputer.
# This is the path the notebook later loads the model from
# (web_app/model/nmf_model.pk); the previous target, web_app/nmf_model.pk,
# did not match it.
# NOTE(review): confirm nothing else still reads web_app/nmf_model.pk.
with open('web_app/model/nmf_model.pk', 'wb') as model_file:
    pic.dump(model, model_file)

In [78]:
P_df = pd.DataFrame(P, columns=['feature1', 'feature2', 'feature3', 'feature4', 'feature5','feature6', 'feature7'], index=df_ratings_pivot.index)
P_df

Unnamed: 0_level_0,feature1,feature2,feature3,feature4,feature5,feature6,feature7
userId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
1,1.351971,0.491527,0.462497,1.250982,0.796464,0.552236,0.234417
2,1.393755,0.510732,0.462821,1.087357,0.371457,0.381862,0.336894
3,1.276734,0.586222,0.534160,1.414409,0.383349,0.488731,0.400152
4,1.235803,0.613281,0.108516,2.679781,0.546899,0.000000,0.533445
5,1.355687,0.470425,0.467842,1.309704,0.374758,0.413875,0.401392
...,...,...,...,...,...,...,...
606,1.484789,0.170950,0.438009,0.586627,0.279168,0.435922,0.910417
607,1.735829,0.626718,0.350035,0.000000,0.016584,0.000000,0.000000
608,1.234529,0.013910,1.516300,1.112148,0.000000,0.870374,0.000000
609,1.291764,0.467161,0.524288,1.548164,0.325600,0.446502,0.634609


In [79]:
model.reconstruction_err_

151.4554656227938

#### (Reconstruct predicted R) <- not necessary for predictions

In [80]:
R = pd.DataFrame(np.dot(P, Q).round(), index=df_ratings_pivot.index, columns=df_ratings_pivot.columns)

In [81]:
R

movieId,1,2,6,10,11,16,17,19,21,25,...,68954,69122,70286,72998,74458,79132,89745,91529,99114,109487
userId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,4.0,4.0,4.0,4.0,4.0,4.0,4.0,3.0,4.0,4.0,...,4.0,4.0,4.0,4.0,4.0,4.0,4.0,4.0,4.0,4.0
2,4.0,3.0,4.0,4.0,4.0,4.0,4.0,3.0,3.0,4.0,...,4.0,4.0,4.0,4.0,4.0,4.0,4.0,4.0,4.0,4.0
3,4.0,3.0,4.0,4.0,4.0,4.0,4.0,3.0,3.0,4.0,...,4.0,4.0,4.0,4.0,4.0,4.0,4.0,4.0,4.0,4.0
4,4.0,3.0,3.0,3.0,4.0,4.0,4.0,2.0,3.0,3.0,...,4.0,4.0,4.0,4.0,4.0,3.0,4.0,4.0,4.0,4.0
5,4.0,3.0,4.0,3.0,4.0,4.0,4.0,3.0,3.0,4.0,...,4.0,4.0,4.0,4.0,4.0,4.0,4.0,4.0,4.0,4.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
606,4.0,3.0,4.0,3.0,4.0,4.0,4.0,3.0,4.0,4.0,...,4.0,4.0,4.0,4.0,4.0,4.0,4.0,4.0,4.0,4.0
607,4.0,4.0,4.0,4.0,4.0,4.0,4.0,3.0,4.0,4.0,...,4.0,4.0,4.0,4.0,4.0,4.0,4.0,4.0,4.0,4.0
608,3.0,3.0,4.0,3.0,3.0,4.0,4.0,3.0,3.0,3.0,...,4.0,4.0,4.0,4.0,4.0,5.0,4.0,4.0,4.0,4.0
609,4.0,3.0,4.0,3.0,4.0,4.0,4.0,3.0,3.0,4.0,...,4.0,4.0,4.0,4.0,4.0,4.0,4.0,4.0,4.0,4.0


In [82]:
def get_user_array(user, df_ratings_pivot):
    """Build a 1 x n_movies rating row for one user, NaN where no rating is given.

    Parameters
    ----------
    user : dict
        Maps column labels of `df_ratings_pivot` to the user's rating.
    df_ratings_pivot : pandas.DataFrame
        Only its ``columns`` are used, to size the row and to locate each label.

    Returns
    -------
    numpy.ndarray
        Float array of shape (1, len(df_ratings_pivot.columns)).

    Raises
    ------
    KeyError
        If a key of `user` is not one of the columns.
    """
    # np.nan is the canonical spelling; the np.NaN alias was removed in NumPy 2.0.
    arr = np.full((1, len(df_ratings_pivot.columns)), np.nan)
    for key, value in user.items():
        # Position of this label among the columns.
        index_no = df_ratings_pivot.columns.get_loc(key)
        arr[0, index_no] = value
    return arr

In [83]:
import random
def create_ramdom_user(num_ratings, df_ratings_pivot):
    """Create a random user as a {movie label: rating} dict.

    Movies are drawn without replacement, so the result holds exactly
    `num_ratings` distinct movies. Drawing with replacement could pick the
    same movie twice and silently return fewer ratings than requested.

    Parameters
    ----------
    num_ratings : int
        Number of ratings wanted. Clamped to the range
        [0, number of columns], because no more distinct movies exist.
    df_ratings_pivot : pandas.DataFrame
        Only its ``columns`` are used, as the pool of movie labels.

    Returns
    -------
    dict
        Maps each chosen column label to a random integer rating from 1 to 5.
    """
    movie_ids = df_ratings_pivot.columns
    n_picks = max(0, min(num_ratings, len(movie_ids)))
    picked_positions = random.sample(range(len(movie_ids)), n_picks)
    return {movie_ids[pos]: random.randint(1, 5) for pos in picked_positions}

In [84]:
def get_prediction_array(Q,user,imputer,model,df_ratings_pivot):
    """Predict one user's rating for every column of `df_ratings_pivot`.

    Parameters
    ----------
    Q
        Matrix multiplied on the right of the user's transformed row.
    user : dict
        Maps column labels to ratings; forwarded to ``get_user_array``.
    imputer
        Object whose ``transform`` fills the missing entries of the user row.
    model
        Object whose ``transform`` maps the filled row to the user's weights.
    df_ratings_pivot : pandas.DataFrame
        Forwarded to ``get_user_array`` to lay out the user row.

    Returns
    -------
    The first row of ``np.dot(user weights, Q)``.
    """
    sparse_row = get_user_array(user, df_ratings_pivot)
    filled_row = imputer.transform(sparse_row)
    # Weights of this user on the model's components.
    user_weights = model.transform(filled_row)
    # Reconstruct the rating row for this single user only.
    reconstructed = np.dot(user_weights, Q)
    return reconstructed[0]
    

In [85]:
def get_list_of_recommendation(pred_array,user, df_ratings_pivot,df_movies, best=5, worst=5):
    """Return the highest- and lowest-predicted movies the user has not rated.

    Parameters
    ----------
    pred_array
        Predicted ratings, one per column of `df_ratings_pivot`.
    user : dict
        Its keys are dropped from the candidates.
    df_ratings_pivot : pandas.DataFrame
        Its ``columns`` label the predictions.
    df_movies : pandas.DataFrame
        Joined onto the predictions by index (left join).
    best, worst : int
        Number of rows taken from the top and bottom of the ranking.

    Returns
    -------
    tuple of pandas.DataFrame
        ``(top rows, bottom rows)`` of the predictions sorted in descending order.
    """
    predictions = pd.DataFrame(
        {'predicted_ratings': pred_array}, index=df_ratings_pivot.columns)
    unseen = predictions.drop(index=user.keys())
    # join() defaults to a left join, so every remaining prediction is kept.
    ranked = unseen.join(df_movies).sort_values(
        by='predicted_ratings', ascending=False)
    return ranked.head(best), ranked.tail(worst)

#### TEST of a USER

In [86]:
r_user=create_ramdom_user(7, R)

In [87]:
r_user

{2115: 5, 3948: 5, 8961: 1, 3623: 4, 500: 2, 48774: 3, 5418: 1}

In [88]:
arr=get_user_array(r_user, df_ratings_pivot)

In [89]:
arr

array([[nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan,
        nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan,
        nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan,
        nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan,
        nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan,
        nan, nan, nan,  2., nan, nan, nan, nan, nan, nan, nan, nan, nan,
        nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan,
        nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan,
        nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan,
        nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan,
        nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan,
        nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan,
        nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan,
        nan, nan, nan, nan, nan, nan, nan, nan, nan

In [90]:
# Checking non NAN
arr[~np.isnan(arr)]

array([2., 5., 4., 5., 1., 1., 3.])

- Load Imputer and Model

In [91]:
l_Q_df=pd.read_csv('web_app/data/Q_df.csv', index_col=0)

In [92]:
l_Q_df

Unnamed: 0,1,2,6,10,11,16,17,19,21,25,...,68954,69122,70286,72998,74458,79132,89745,91529,99114,109487
feature1,1.914359,1.631759,1.914765,1.570374,1.756678,2.086889,2.122287,1.165389,1.840315,2.009414,...,2.083765,1.962269,1.918779,1.715635,2.083189,2.206731,1.882431,2.130576,2.102405,2.064316
feature2,0.89531,1.061395,0.730675,1.084555,0.791694,0.315272,0.304139,0.860099,0.475069,0.309677,...,0.210907,0.145235,0.214554,0.339064,0.429152,0.0,0.552262,0.231536,0.263256,0.318509
feature3,0.200602,0.299777,0.586286,0.495865,0.386524,0.235189,0.08767,0.399429,0.259457,0.128815,...,0.79174,0.729991,0.871093,1.062806,0.349122,1.198701,0.78806,0.922841,0.584924,0.743886
feature4,0.146835,0.115604,0.089858,0.205102,0.261432,0.269123,0.335633,0.0,0.138257,0.134958,...,0.292181,0.318133,0.333523,0.299315,0.270455,0.187785,0.304831,0.265036,0.271507,0.248962
feature5,0.667197,0.338577,0.328801,0.418644,0.40824,0.219831,0.106304,0.243497,0.160538,0.140611,...,0.026456,0.0,0.070766,0.254836,0.212746,0.0,0.176164,0.029754,0.0,0.101018
feature6,0.39567,0.394018,0.714411,0.473418,0.344901,0.688851,0.490026,0.892104,0.597336,0.578416,...,0.3045,0.116073,0.295953,0.080129,0.672282,0.213392,0.277194,0.166829,0.48901,0.413861
feature7,0.395651,0.225926,0.33353,0.0,0.236993,0.289524,0.10228,0.160427,0.304344,0.420218,...,0.410237,0.198222,0.208301,0.229549,0.288924,0.246132,0.25334,0.220857,0.287796,0.319892


In [93]:
l_Q = l_Q_df.to_numpy()

In [94]:
l_Q

array([[1.91435857, 1.63175922, 1.91476479, ..., 2.13057639, 2.10240529,
        2.06431567],
       [0.89531018, 1.06139477, 0.73067495, ..., 0.23153631, 0.26325607,
        0.31850894],
       [0.20060212, 0.29977721, 0.58628557, ..., 0.92284146, 0.58492352,
        0.74388578],
       ...,
       [0.6671974 , 0.33857738, 0.32880074, ..., 0.02975365, 0.        ,
        0.10101803],
       [0.39567013, 0.39401794, 0.71441126, ..., 0.166829  , 0.48901002,
        0.41386137],
       [0.39565096, 0.22592627, 0.33353049, ..., 0.22085748, 0.28779568,
        0.31989211]])

In [95]:
# Use a context manager so the file handle is closed right after loading.
# Only unpickle files you created yourself: pickle can execute arbitrary code.
with open("web_app/model/imputer.pk", "rb") as imputer_file:
    l_imputer = pic.load(imputer_file)

In [96]:
# Use a context manager so the file handle is closed right after loading.
# Only unpickle files you created yourself: pickle can execute arbitrary code.
with open("web_app/model/nmf_model.pk", "rb") as model_file:
    l_model = pic.load(model_file)

In [97]:
pred_array=get_prediction_array(l_Q,r_user,l_imputer,l_model,df_ratings_pivot)

In [98]:
pred_array

array([3.92632621, 3.43626199, 3.92266217, 3.50370485, 3.65478276,
       3.89653119, 3.7620716 , 2.72362796, 3.47358199, 3.59921198,
       3.94857299, 3.65954393, 3.80346611, 3.28523196, 3.97679624,
       3.14828381, 4.20705444, 3.70158707, 3.01968111, 3.445449  ,
       4.08207032, 4.08451385, 3.47894443, 3.8887623 , 2.92437659,
       2.8123601 , 3.63264139, 3.53834843, 3.56158856, 2.66042242,
       3.04029197, 2.90467956, 3.8450501 , 3.06620409, 3.66342495,
       3.42392229, 4.27806615, 3.37851222, 3.19055419, 3.43682282,
       3.98674975, 4.1695241 , 3.507671  , 3.35937164, 3.21331792,
       4.4222831 , 3.38460357, 3.75187937, 3.48047905, 3.04588771,
       3.5941635 , 3.57740913, 4.19569629, 3.50694347, 3.9339979 ,
       3.22216082, 3.48710302, 3.56360441, 3.50036806, 3.09935141,
       3.03905086, 2.43300211, 3.53140722, 3.08333811, 3.52018932,
       4.03131007, 3.68244615, 3.8204366 , 3.40122684, 3.59105617,
       3.12747284, 4.23749538, 3.46743234, 4.11453761, 3.52227

In [99]:
len(pred_array)

317

In [100]:
# Sanity check: print every predicted rating greater than 5, then how many there were.
too_high = [rating for rating in pred_array if rating > 5]
for rating in too_high:
    print(rating)
count = len(too_high)
print(count)

0


In [101]:
best_list,worst_list=get_list_of_recommendation(pred_array,r_user,df_ratings_pivot,df_movies, best=7, worst=7)

In [102]:
best_list

Unnamed: 0_level_0,predicted_ratings,title,genres,year
movieId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
318,4.422283,"Shawshank Redemption, The (1994)",Crime|Drama,1994
260,4.278066,Star Wars: Episode IV - A New Hope (1977),Action|Adventure|Sci-Fi,1977
858,4.269841,"Godfather, The (1972)",Crime|Drama,1972
750,4.260814,Dr. Strangelove or: How I Learned to Stop Worr...,Comedy|War,1964
1221,4.248368,"Godfather: Part II, The (1974)",Crime|Drama,1974
527,4.237495,Schindler's List (1993),Drama|War,1993
1196,4.235939,Star Wars: Episode V - The Empire Strikes Back...,Action|Adventure|Sci-Fi,1980


In [None]:
# `best` exists only as a parameter of get_list_of_recommendation; at notebook
# level the recommendations are stored in `best_list`, so `a=best` raised NameError.
# NOTE(review): `a` is not used anywhere in the visible cells — confirm the intent.
a=best_list

In [103]:
best_list["title"].to_list()

['Shawshank Redemption, The (1994)',
 'Star Wars: Episode IV - A New Hope (1977)',
 'Godfather, The (1972)',
 'Dr. Strangelove or: How I Learned to Stop Worrying and Love the Bomb (1964)',
 'Godfather: Part II, The (1974)',
 "Schindler's List (1993)",
 'Star Wars: Episode V - The Empire Strikes Back (1980)']

In [104]:
worst_list

Unnamed: 0_level_0,predicted_ratings,title,genres,year
movieId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
2710,2.768835,"Blair Witch Project, The (1999)",Drama|Horror|Thriller,1999
788,2.75188,"Nutty Professor, The (1996)",Comedy|Fantasy|Romance|Sci-Fi,1996
1544,2.740222,"Lost World: Jurassic Park, The (1997)",Action|Adventure|Sci-Fi|Thriller,1997
19,2.723628,Ace Ventura: When Nature Calls (1995),Comedy,1995
3977,2.695883,Charlie's Angels (2000),Action|Comedy,2000
173,2.660422,Judge Dredd (1995),Action|Crime|Sci-Fi,1995
435,2.433002,Coneheads (1993),Comedy|Sci-Fi,1993
