In [1]:
import numpy as np
import pandas as pd
from fastai.learner import *
from fastai.column_data import *

In [4]:
import torch

In [3]:
path='./ml-latest-small/'

In [95]:
ratings=pd.read_csv(f'{path}ratings.csv')
ratings.head()

Unnamed: 0,userId,movieId,rating,timestamp
0,1,31,2.5,1260759144
1,1,1029,3.0,1260759179
2,1,1061,3.0,1260759182
3,1,1129,2.0,1260759185
4,1,1172,4.0,1260759205


In [6]:
!ls {path}

README.txt  models      ratings.csv tmp
links.csv   movies.csv  tags.csv


In [86]:
movies=pd.read_csv(f'{path}movies.csv')
movies.head()

Unnamed: 0,movieId,title,genres
0,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy
1,2,Jumanji (1995),Adventure|Children|Fantasy
2,3,Grumpier Old Men (1995),Comedy|Romance
3,4,Waiting to Exhale (1995),Comedy|Drama|Romance
4,5,Father of the Bride Part II (1995),Comedy


# Analyse Results

In [97]:
movie_names=movies.set_index('movieId')['title'].to_dict()

In [98]:
# Count how many ratings each movie received, then keep the ids of (at most)
# the 3000 most-rated movies, most-rated first.
g=ratings.groupby('movieId')['rating'].count()
top_movies=g.sort_values(ascending=False).index.values[:3000]
# Translate raw movie ids into the indices used by the embedding tables.
# NOTE(review): movie2idx is only built in the later "Dot Product" cell, so this
# cell raises NameError on a fresh top-to-bottom run — confirm the intended cell order.
top_movie_index=[movie2idx[x] for x in top_movies]

In [115]:
# Pull the per-movie bias values (the model's `ib` embedding) for the selected movies.
# NOTE(review): `learn` is created in a cell further down the notebook — confirm cell order.
m=learn.model
# presumably to_np converts the embedding output to a NumPy array; it comes from
# the fastai star-imports — verify.
movie_bias=to_np(m.ib(torch.autograd.Variable(torch.LongTensor(top_movie_index))))

In [125]:
movie_ratings=[(n[0],movie_names[m])for m,n in zip(top_movies,movie_bias)]

# Bad Movies

In [128]:
sorted(movie_ratings)[:15]

[(-0.9416478, 'Battlefield Earth (2000)'),
 (-0.72874886, 'Speed 2: Cruise Control (1997)'),
 (-0.71467185, 'Wild Wild West (1999)'),
 (-0.667259, 'Anaconda (1997)'),
 (-0.66249305, 'Inspector Gadget (1999)'),
 (-0.66146207, 'Mighty Morphin Power Rangers: The Movie (1995)'),
 (-0.65649426, 'Little Nicky (2000)'),
 (-0.65312755, 'Batman & Robin (1997)'),
 (-0.64757675, 'Super Mario Bros. (1993)'),
 (-0.6455878, 'Police Academy 4: Citizens on Patrol (1987)'),
 (-0.6263255, 'Message in a Bottle (1999)'),
 (-0.6235784, 'Superman III (1983)'),
 (-0.61898196, 'Flintstones, The (1994)'),
 (-0.61593574, 'Showgirls (1995)'),
 (-0.6071003, 'Police Academy 5: Assignment: Miami Beach (1988)')]

# Good Movies

In [131]:
sorted(movie_ratings)[-15:][::-1]

[(1.306405, 'Shawshank Redemption, The (1994)'),
 (1.0888941, 'Godfather, The (1972)'),
 (1.0259326, 'Usual Suspects, The (1995)'),
 (0.98929137, 'Godfather: Part II, The (1974)'),
 (0.9768999, "Schindler's List (1993)"),
 (0.9371151, 'Pulp Fiction (1994)'),
 (0.900492, 'Fargo (1996)'),
 (0.86708885, 'Silence of the Lambs, The (1991)'),
 (0.8611707, 'Princess Bride, The (1987)'),
 (0.8467151, 'Wallace & Gromit: A Close Shave (1995)'),
 (0.8411499, 'Stand by Me (1986)'),
 (0.83722085, 'Lord of the Rings: The Return of the King, The (2003)'),
 (0.83517283, 'Dark Knight, The (2008)'),
 (0.8328852, 'To Kill a Mockingbird (1962)'),
 (0.83017206, 'Matrix, The (1999)')]

# Embedding Vectors

In [136]:
movie_emb=to_np(m.i(V(torch.LongTensor(top_movie_index))))

In [143]:
from sklearn.decomposition import PCA

# Fit a 3-component PCA on the transposed embedding matrix and keep the
# resulting component vectors (one row per component).
pca = PCA(n_components=3)
pca.fit(movie_emb.T)
movie_pca = pca.components_

In [144]:
movie_pca.shape

(3, 3000)

In [146]:
fac0=movie_pca[0]
fac0_names=[(n,movie_names[m])for m,n in zip(top_movies,fac0)]

In [148]:
sorted(fac0_names)[:15]

[(-0.07270552, 'Taxi Driver (1976)'),
 (-0.070694804, 'Fargo (1996)'),
 (-0.06873547, 'Pulp Fiction (1994)'),
 (-0.06815638, 'Godfather, The (1972)'),
 (-0.06736045, 'Chinatown (1974)'),
 (-0.06418417, 'Goodfellas (1990)'),
 (-0.06248804, 'Apocalypse Now (1979)'),
 (-0.062106147, '2001: A Space Odyssey (1968)'),
 (-0.05973667, 'Killing Fields, The (1984)'),
 (-0.059652906, 'Being John Malkovich (1999)'),
 (-0.059412003,
  'Dr. Strangelove or: How I Learned to Stop Worrying and Love the Bomb (1964)'),
 (-0.058140848, 'Casablanca (1942)'),
 (-0.058111362, 'High Fidelity (2000)'),
 (-0.055317044, 'Memento (2000)'),
 (-0.05501783, 'Godfather: Part II, The (1974)')]

In [8]:
# Preview the user x movie rating matrix: first 10 rows, with missing
# (unrated) user/movie pairs displayed as 0.
# The aggregation is named by the string 'sum' rather than np.sum: pandas maps
# both to its grouped sum, but newer pandas releases may emit a FutureWarning
# when the NumPy callable is passed.
pd.crosstab(ratings['userId'],ratings['movieId'],values=ratings['rating'],aggfunc='sum')[:10].fillna(0)

movieId,1,2,3,4,5,6,7,8,9,10,...,161084,161155,161594,161830,161918,161944,162376,162542,162672,163949
userId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,4.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,4.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
5,0.0,0.0,4.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
6,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
7,3.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,3.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
8,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
9,4.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
10,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [9]:
val_idxs=get_cv_idxs(len(ratings))
wd=2e-4
n_factors=50

In [10]:
cf=CollabFilterDataset.from_csv(path,'ratings.csv','userId','movieId','rating')
learn = cf.get_learner(n_factors, val_idxs, 64, opt_fn=optim.Adam)

In [11]:
learn.fit(1e-2,2,wds=wd,cycle_len=1,cycle_mult=2)

HBox(children=(IntProgress(value=0, description='Epoch', max=3), HTML(value='')))

epoch      trn_loss   val_loss                                 
    0      0.794227   0.806107  
    1      0.793105   0.778302                                 
    2      0.591366   0.764303                                 



[array([0.7643])]

In [12]:
learn

EmbeddingDotBias(
  (u): Embedding(671, 50)
  (i): Embedding(9066, 50)
  (ub): Embedding(671, 1)
  (ib): Embedding(9066, 1)
)

In [13]:
preds=learn.predict()

In [14]:
a=torch.FloatTensor([[1,2],[3,4]])
b=torch.FloatTensor([[2,2],[10,10]])

In [16]:
print (a)
print (b)
print ((a*b).sum(1))


 1  2
 3  4
[torch.FloatTensor of size 2x2]


  2   2
 10  10
[torch.FloatTensor of size 2x2]


  6
 70
[torch.FloatTensor of size 2]



In [14]:
class DotProduct(nn.Module):
    """Element-wise product of two tensors, reduced by summing along dimension 1."""

    def __init__(self):
        super().__init__()

    def forward(self, x, v):
        """Return the sum over dimension 1 of `x * v`."""
        elementwise = x * v
        return elementwise.sum(dim=1)

In [15]:
d=DotProduct()

# Dot Product

In [87]:
# Re-encode raw user ids as contiguous integer indices (position of each id in
# the column's unique() array) so they can be used as embedding row indices.
user_uni=ratings.userId.unique()
user2idx={o:i for i,o in enumerate(user_uni)}
# Series.map with a dict performs the lookup inside pandas instead of calling a
# Python lambda once per row. Every id is a key of the dict (it was built from
# this same column), so no value is left unmapped.
ratings.userId=ratings.userId.map(user2idx)

# Same re-encoding for movie ids.
movie_uni=ratings.movieId.unique()
movie2idx={o:i for i,o in enumerate(movie_uni)}
ratings.movieId=ratings.movieId.map(movie2idx)

In [18]:
n_users=len(ratings.userId.unique())
n_movies=len(ratings.movieId.unique())

In [19]:
n_movies

9066

In [20]:
class EmbeddingDot(nn.Module):
    """Score a (user, movie) pair as the dot product of their embedding vectors."""

    def __init__(self, n_users, n_movies, n_flags=50):
        super().__init__()
        self.u = nn.Embedding(n_users, n_flags)
        self.m = nn.Embedding(n_movies, n_flags)
        # Replace the default initialisation with small non-negative weights.
        for table in (self.u, self.m):
            table.weight.data.uniform_(0.0, 0.05)

    def forward(self, cats, conts):
        """Return one score per row of `cats`.

        Column 0 of `cats` is used as the user index and column 1 as the
        movie index. `conts` is accepted but not used.
        """
        user_vecs = self.u(cats[:, 0])
        movie_vecs = self.m(cats[:, 1])
        return (user_vecs * movie_vecs).sum(1)
        

In [21]:
x=ratings.drop(["rating","timestamp"],axis=1)
y=ratings["rating"]

In [22]:
data=ColumnarModelData.from_data_frame(path,val_idxs,x,y,['userId','movieId'],64)

In [23]:
w_d=1e-5
model=EmbeddingDot(n_users,n_movies)
opmt=torch.optim.SGD(model.parameters(),1e-1,weight_decay=w_d,momentum=0.9)

In [24]:
fit(model,data,3,opmt,crit=F.mse_loss)

HBox(children=(IntProgress(value=0, description='Epoch', max=3), HTML(value='')))

epoch      trn_loss   val_loss                                
    0      1.67112    1.630519  
    1      1.159592   1.305827                                
    2      0.926814   1.229996                                 



[array([1.23])]

In [27]:
def get_emb(ni, nf):
    """Build an `nn.Embedding(ni, nf)` whose weights are re-drawn uniformly between -0.01 and 0.01."""
    table = nn.Embedding(ni, nf)
    init_bound = 0.01
    table.weight.data.uniform_(-init_bound, init_bound)
    return table

In [29]:
max_rating=ratings.rating.max()
min_rating=ratings.rating.min()

In [32]:
class EmbeddingDotBias(nn.Module):
    """Dot-product model with per-user and per-movie bias terms, rescaled to the rating range."""

    def __init__(self, n_users, n_movies, n_flags=50):
        super().__init__()
        self.u = get_emb(n_users, n_flags)
        self.m = get_emb(n_movies, n_flags)
        self.ub = get_emb(n_users, 1)
        self.mb = get_emb(n_movies, 1)

    def forward(self, cats, conts):
        """Score each row of `cats` (column 0: user index, column 1: movie index).

        `conts` is accepted but not used.
        """
        users = cats[:, 0]
        movies = cats[:, 1]
        interaction = (self.u(users) * self.m(movies)).sum(1)
        bias_user = self.ub(users).squeeze()
        bias_movie = self.mb(movies).squeeze()
        # The sigmoid output is stretched onto the interval spanned by the
        # module-level min_rating / max_rating values.
        squashed = F.sigmoid(interaction + bias_user + bias_movie)
        return squashed * (max_rating - min_rating) + min_rating

In [33]:
# Fresh bias model, optimised with SGD plus momentum and weight decay.
model = EmbeddingDotBias(n_users, n_movies)
optm = torch.optim.SGD(
    model.parameters(),
    lr=1e-1,
    momentum=0.9,
    weight_decay=w_d,
)

In [34]:
fit(model,data,3,optm,F.mse_loss)

HBox(children=(IntProgress(value=0, description='Epoch', max=3), HTML(value='')))

epoch      trn_loss   val_loss                                 
    0      0.805071   0.828678  
    1      0.748436   0.802094                                 
    2      0.801883   0.798069                                 



[array([0.79807])]

# Embedding Net

In [41]:
class EmbeddingNet(nn.Module):
    """Small feed-forward net over concatenated user and movie embeddings.

    Args:
        n_users: number of rows in the user embedding table.
        n_movies: number of rows in the movie embedding table.
        n_flags: width of each embedding vector.
        nh: number of units in the hidden linear layer.
    """

    def __init__(self, n_users, n_movies, n_flags=50, nh=10):
        super().__init__()
        self.u = get_emb(n_users, n_flags)
        self.m = get_emb(n_movies, n_flags)
        # The hidden layer sees the user and movie vectors side by side.
        self.lin1 = nn.Linear(n_flags * 2, nh)
        self.lin2 = nn.Linear(nh, 1)

    def forward(self, cats, consts):
        """Predict a rating for each row of `cats`.

        Column 0 of `cats` is used as the user index and column 1 as the
        movie index. `consts` is accepted but not used.

        Returns the sigmoid of the final linear layer, rescaled onto the
        interval spanned by the module-level min_rating / max_rating values.
        """
        users, movies = cats[:, 0], cats[:, 1]
        x = torch.cat((self.u(users), self.m(movies)), 1)
        # The functional dropout does not follow model.train()/model.eval() on
        # its own; pass the module's mode explicitly so units are dropped while
        # training and left untouched during validation/prediction.
        x = F.dropout(F.relu(self.lin1(x)), 0.5, training=self.training)
        return F.sigmoid(self.lin2(x)) * (max_rating - min_rating) + min_rating

In [52]:
model=EmbeddingNet(n_users,n_movies)
optm=torch.optim.Adam(model.parameters())

In [53]:
fit(model,data,3,optm,F.mse_loss)

HBox(children=(IntProgress(value=0, description='Epoch', max=3), HTML(value='')))

epoch      trn_loss   val_loss                                 
    0      0.816319   0.803015  
    1      0.777246   0.784543                                 
    2      0.720458   0.785519                                 



[array([0.78552])]