In [1]:
import random
from tqdm.notebook import tqdm
from gensim.models import Word2Vec
from sklearn import metrics
import numpy as np
import networkx as nx
from matplotlib import pyplot as plt
import pandas as pd
from surprise import SVD
from surprise import Dataset
from surprise import model_selection
from surprise import accuracy
from surprise import Reader
from surprise.prediction_algorithms.predictions import Prediction

In [2]:
def stringToTuple(s):
    strings=s.split("_")
    if len(strings)==2:
        return (strings[0],int(strings[1]))
    else:
        return (strings[0],int(strings[1]),float(strings[2]))

def tupleToString(t):
    if len(t)==2:
        return str(t[0])+"_"+str(t[1])
    else:
        return str(t[0])+"_"+str(t[1])+"_"+str(t[2])

In [3]:
class DeepWalkRecommender():
    
    def __init__(self, w=8, epochs = 10, t = 30,latent_dim=32,k=50):        
        self.w = w
        self.epochs = epochs
        self.t = t
        self.latent_dim = latent_dim
        self.k=k
        self.predicted = {}
        self.possible_ratings = []
    
    def __build_graph__(self, data):
        G = nx.Graph()
        
        for user, movie, rating in data.all_ratings():
            user_node = ('u', int(data.to_raw_uid(user)))
            movie_node = ('m', int(data.to_raw_iid(movie)), rating)
            G.add_edge(user_node, movie_node)
        
        return G
    
    def __build_val_map__(self, data):
        return {(int(user), int(movie)) : float(rating) for (user, movie, rating) in data}
    
    def make_corpus(self,G):
        corpus=[]
        nodes=list(G.nodes)
        for _ in tqdm(range(self.epochs),total=self.epochs):
            np.random.shuffle(nodes)
            for node in nodes:
                walk = [node]
                for i in range(1,self.t):
                    if len(list(G.neighbors(walk[-1]))) == 0:
                        break
                    walk.append(random.choice(list(G.neighbors(walk[-1]))))
                corpus.append([tupleToString(word) for word in walk])    
        return corpus
    
    def fit(self, data, val_data = None):
        logging = (val_data is not None)
        
        self.possible_ratings = sorted(set([rating for (_, _, rating) in data.all_ratings()]))
        self.mean_rating = data.global_mean
        
        G = self.__build_graph__(data)
        self.G_train=G
        
        corpus=self.make_corpus(G)
        
        self.word2vec=Word2Vec(corpus, size = self.latent_dim, window = self.w, min_count = 0, sg = 1, hs = 1, workers = 4)
        return self.word2vec
        
    def test(self,test_data):
        #y_predicted = []
        #y_real = []
        predictions=[]
        for user,movie,rating in test_data:
            user=int(user)
            movie=int(movie)
            y_real.append(rating)
            if ('u',user) not in self.G_train.nodes:
                print('nema ga')
            similar_users=self.word2vec.wv.most_similar(positive = [tupleToString(('u',user))], topn = self.k)
            
            rating_sum=0
            rating_cnt=0
            for similar_user in similar_users:
                if ('m',movie,rating) in self.G_train.neighbors(stringToTuple(similar_user[0])):
                    rating_sum+=rating
                    rating_cnt+=1
            
            if rating_cnt>0:
                predicted_rating=rating_sum/rating_cnt
            else:
                #rating_sum = 0
                #rating_cnt = 0
                #for rating in range(5):
                #    movieNode=('m',movie,float(rating))
                #    if movieNode in self.G_train.nodes:
                #        rating_sum += len(list(self.G_train.neighbors(movieNode)))*rating
                #        rating_cnt += len(list(self.G_train.neighbors(movieNode)))
                #if rating_cnt > 0: 
                #    predicted_rating =rating_sum / rating_cnt
                #else:
                predicted_rating = self.mean_rating
            details={'sad':True}
            predictions.append(Prediction(uid = user,
                                          iid = movie,
                                          r_ui = rating,
                                          est = float(predicted_rating),
                                          details = details))
            #y_predicted.append(predicted_rating)
        return predictions

    def gridSearch(train_data,test_data):
        ws=[2,4,8,16]
        ts=[15,30,45]
        ks=[25,50]
        epochs=[10,20]
        dimensions=[8,16,32,64]
        for w in ws:
            for t in ts:
                for epoch in epochs:
                    for dim in dimensions:
                        for k in ks:
                            self.w=w
                            self.k=k
                            self.latent_dim=dim
                            self.epochs=epoch
                            self.t=t
                            self.fit(train_data)
                            predictions=self.test(test_data)
        #TODO
        

In [100]:
d=DeepWalkRecommender()
model=d.fit(train_data)
predictions=d.test(test_data)

HBox(children=(FloatProgress(value=0.0, max=10.0), HTML(value='')))




In [101]:
def max_k_off(y_real, y_predicted, k = 1):
    return len([i for i in range(len(y_real)) if abs(y_real[i] - y_predicted[i]) <= k]) / len(y_real)

print(accuracy.rmse(predictions))
print(accuracy.mae(predictions))

#print(metrics.mean_absolute_error(y_real, y_predicted))
#print(np.sqrt(metrics.mean_squared_error(y_real, y_predicted)))
#print(metrics.r2_score(y_real, y_predicted))
#print(max_k_off(y_real, y_predicted))
print('------------------')
#y_constant4 = [4 for i in range(len(y2))]

#print('Constant 4 model score:')
#print('MAE:',metrics.mean_absolute_error(y_real, y_constant4))
#print('MSE:',metrics.mean_squared_error(y_real, y_constant4))
#print(metrics.r2_score(y_real, y_constant4))
#print(max_k_off(y_real, y_constant4))
#print('------------------')

#y_custom = [max(1, min(5, y + round(numpy.random.normal(0, 1.11)))) for y in y_real]
#print('Normal distribution random model score:')
#print('MAE:',metrics.mean_absolute_error(y_real, y_custom))
#print('MSE',metrics.mean_squared_error(y_real, y_custom))
#print(metrics.r2_score(y_real, y_custom))
#print(max_k_off(y_real, y_custom))
#print('------------------')


MODEL:
RMSE: 0.8979
0.8978689260432736
MAE:  0.5435
0.543477913125
------------------
Constant 4 model score:


ValueError: Found input variables with inconsistent numbers of samples: [40000, 20000]

In [91]:
svd = SVD(n_epochs = 10, lr_all = 0.005, reg_all = 0.4)
svd.fit(train_data)
predictionsSVD = svd.test(test_data)
accuracy.rmse(predictionsSVD)
accuracy.mae(predictionsSVD)
print(len(predictionsSVD))

RMSE: 0.9628
MAE:  0.7742
20000


In [15]:
model.save('deepwalk.model')