In [50]:
import random
from tqdm.notebook import tqdm
from gensim.models import Word2Vec
from sklearn import metrics
import numpy 
import networkx as nx
from matplotlib import pyplot as plt

In [57]:
def make_graph_from_file(filename, test_size, max_user_id=60000, n_ratings=5):
    """Build train and test user/movie-rating graphs from a ratings file.

    The file is read line by line. A line ending in ``:`` is a movie header
    (``<movie_id>:``). Any other non-blank line is comma-separated, with the
    user id in the first field and the rating in the second, and belongs to
    the most recent movie header.

    Every movie is represented by ``n_ratings`` nodes named
    ``"<movie_id>_<r>"`` (``r`` is the rating minus one) and every kept user
    by a node named ``"<user_id>"``. Both kinds of node are added to both
    graphs. A rating becomes one edge between the user node and the matching
    movie-rating node, placed in exactly one of the two graphs.

    Args:
        filename: Path of the ratings file.
        test_size: Threshold for the split; an edge goes to the train graph
            when ``random.uniform(0, 1) > test_size`` and to the test graph
            otherwise.
        max_user_id: Only users whose id is strictly below this value are kept.
        n_ratings: Number of rating nodes created per movie.

    Returns:
        Tuple ``(G, G_test)`` of the train graph and the test graph.

    Raises:
        ValueError: If a rating line appears before any movie header, or if
            an id/rating field is not an integer.
    """
    G = nx.Graph()
    G_test = nx.Graph()
    movie_id = None
    with open(filename) as data_file:
        for line in tqdm(data_file):
            line = line.strip()
            if not line:
                # Blank (e.g. trailing) lines carry no data.
                continue

            if line.endswith(":"):
                movie_id = int(line[:-1])
                for i in range(n_ratings):
                    rating_node = str(movie_id) + "_" + str(i)
                    G.add_node(rating_node)
                    G_test.add_node(rating_node)
                continue

            if movie_id is None:
                raise ValueError(
                    "rating line found before any movie header: " + repr(line)
                )

            node_info = line.split(",")
            user_id = int(node_info[0])
            rating = int(node_info[1]) - 1
            if user_id >= max_user_id:
                continue

            # add_node is a no-op for nodes that already exist.
            user_node = str(user_id)
            G.add_node(user_node)
            G_test.add_node(user_node)

            target = G if random.uniform(0, 1) > test_size else G_test
            target.add_edge(user_node, str(movie_id) + "_" + str(rating))
    return G, G_test

In [58]:
def make_corpus(G, t, epochs):
    """Generate truncated random walks over ``G`` to use as a Word2Vec corpus.

    For each of ``epochs`` passes, the node list is shuffled and one walk is
    started from every node. A walk repeatedly moves to a uniformly chosen
    neighbour of its last node until it holds ``t`` nodes or reaches a node
    with no neighbours.

    Args:
        G: Graph exposing ``nodes`` and ``neighbors(node)``.
        t: Maximum number of nodes per walk (including the start node).
        epochs: Number of passes over all nodes.

    Returns:
        List of walks; each walk is a list of node names. The list holds
        ``epochs * number_of_nodes`` walks.
    """
    corpus = []
    nodes = list(G.nodes)
    for _ in tqdm(range(epochs), total=epochs):
        # Shuffle with the stdlib RNG: this notebook imports `random` and
        # `numpy`, but never defines an `np` alias.
        random.shuffle(nodes)
        for start in nodes:
            walk = [start]
            for _step in range(1, t):
                # Materialise the neighbour list once per step.
                neighbors = list(G.neighbors(walk[-1]))
                if not neighbors:
                    break
                walk.append(random.choice(neighbors))
            corpus.append(walk)
    return corpus

In [64]:
# Run configuration.
data_file = "combined_data_1.txt"  # ratings file parsed by make_graph_from_file
test_size = 0.1                    # split threshold passed to make_graph_from_file
latent_dim = 32                    # embedding size passed to Word2Vec
w = 8                              # Word2Vec context window
t = 30                             # maximum nodes per random walk
epochs = 10                        # random-walk passes over all nodes
seed = 42                          # RNG seed for the stochastic steps below

# Seed both RNGs this notebook draws from (stdlib `random` for the edge split,
# `numpy.random` for the noise baseline) so a re-run repeats the same draws.
random.seed(seed)
numpy.random.seed(seed)

G, G_test = make_graph_from_file(data_file, test_size)

In [60]:
# Build the training corpus: `epochs` passes, each starting one random walk of
# at most `t` nodes from every node of the training graph G.
corpus = make_corpus(G, t, epochs)

HBox(children=(FloatProgress(value=0.0, max=10.0), HTML(value='')))




In [65]:
# Fit node embeddings on the random-walk corpus.
# Per the gensim documentation: sg=1 selects skip-gram, hs=1 enables
# hierarchical softmax, and min_count=0 keeps every token in the vocabulary.
# NOTE(review): `size` is the gensim 3.x keyword (gensim 4 renamed it to
# `vector_size`) — confirm against the installed version.
model = Word2Vec(
    corpus,
    size=latent_dim,
    window=w,
    min_count=0,
    sg=1,
    hs=1,
    workers=4,
)

In [66]:
def _average_movie_rating(G, movie_node, default_rating=4):
    """Return the rounded mean rating of one movie over the training graph.

    The number of neighbours of node ``"<movie_node>_<r>"`` is the number of
    users who gave rating ``r + 1``. The mean is rounded with the built-in
    ``round`` (exact halves go to the nearest even integer).

    Args:
        G: Training graph exposing ``neighbors(node)``.
        movie_node: Movie id as a string (the part before ``_``).
        default_rating: Value returned when the movie has no ratings in ``G``.

    Returns:
        The rounded mean rating, or ``default_rating`` if there are no ratings.
    """
    rating_sum = 0
    rating_cnt = 0
    for rating in range(5):
        n_users = sum(1 for _ in G.neighbors(movie_node + "_" + str(rating)))
        rating_sum += n_users * (rating + 1)
        rating_cnt += n_users
    if rating_cnt > 0:
        return round(rating_sum / rating_cnt)
    return default_rating


def predict(model, G, G_test, topn=50, default_rating=4):
    """Predict every held-out rating from the ratings of embedding neighbours.

    For each user node (a node name without ``_``) and each of its edges in
    ``G_test``, the prediction is the rounded mean of the ratings that the
    user's ``topn`` most similar nodes gave the same movie in ``G``. If none
    of them rated the movie, the movie's overall rounded mean rating in ``G``
    is used, and ``default_rating`` if the movie has no ratings at all.

    Args:
        model: Trained model exposing ``wv.most_similar(positive=, topn=)``.
        G: Training graph.
        G_test: Graph holding the held-out rating edges.
        topn: Number of most similar nodes to consult per user.
        default_rating: Prediction for movies with no ratings in ``G``.

    Returns:
        Tuple ``(y_predicted, y_real)`` of equally long lists of ratings in
        1..5, ordered by ``G.nodes`` and then by ``G_test.neighbors(user)``.
    """
    y_predicted = []
    y_real = []
    for node in tqdm(G.nodes):
        if '_' in node:
            # Movie-rating nodes are named "<movie>_<rating>"; only users are scored.
            continue

        test_neighbors = list(G_test.neighbors(node))
        if not test_neighbors:
            # Nothing to predict: skip the similarity lookup entirely.
            continue

        similar_nodes = [
            name for name, _score in model.wv.most_similar(positive=[node], topn=topn)
        ]

        for movie_rating_node in test_neighbors:
            parts = movie_rating_node.split("_")
            movie_node = parts[0]

            rating_sum = 0
            rating_cnt = 0
            for similar in similar_nodes:
                for rating in range(5):
                    # Direct edge lookup instead of scanning the neighbour iterator.
                    if G.has_edge(similar, movie_node + "_" + str(rating)):
                        rating_sum += rating + 1
                        rating_cnt += 1

            if rating_cnt > 0:
                predicted_rating = round(rating_sum / rating_cnt)
            else:
                predicted_rating = _average_movie_rating(G, movie_node, default_rating)

            y_predicted.append(predicted_rating)
            y_real.append(int(parts[1]) + 1)
    return y_predicted, y_real


def predictAllAverage(G, G_test, default_rating=4):
    """Baseline: predict every held-out rating as the movie's mean rating.

    Iterates users and their ``G_test`` edges in the same order as
    ``predict``, so the result lines up with ``predict``'s ``y_real`` when
    both are called on the same graphs.

    Args:
        G: Training graph.
        G_test: Graph holding the held-out rating edges.
        default_rating: Prediction for movies with no ratings in ``G``.

    Returns:
        List of predicted ratings, one per held-out edge.
    """
    y_allavg = []
    for node in G.nodes:
        if '_' in node:
            continue
        for movie_rating_node in G_test.neighbors(node):
            movie_node = movie_rating_node.split("_")[0]
            y_allavg.append(_average_movie_rating(G, movie_node, default_rating))
    return y_allavg

In [67]:
def max_k_off(y_real, y_predicted, k = 1):
    """Return the fraction of predictions that are at most ``k`` off.

    Args:
        y_real: Sequence of true values.
        y_predicted: Sequence of predicted values, same length as ``y_real``.
        k: Largest absolute error that still counts as a hit.

    Returns:
        Share of positions where ``abs(real - predicted) <= k``, in [0, 1].

    Raises:
        ValueError: If the inputs differ in length or are empty (the ratio
            is undefined without samples).
    """
    n_samples = len(y_real)
    if n_samples != len(y_predicted):
        raise ValueError(
            "y_real and y_predicted must have the same length, got %d and %d"
            % (n_samples, len(y_predicted))
        )
    if n_samples == 0:
        raise ValueError("max_k_off is undefined for empty input")

    hits = sum(1 for real, pred in zip(y_real, y_predicted) if abs(real - pred) <= k)
    return hits / n_samples

def _print_scores(title, y_true, y_pred):
    """Print one model's report: title, MAE, MSE, R^2, max-1-off share, separator."""
    print(title)
    print(metrics.mean_absolute_error(y_true, y_pred))
    print(metrics.mean_squared_error(y_true, y_pred))
    print(metrics.r2_score(y_true, y_pred))
    print(max_k_off(y_true, y_pred))
    print('------------------')


# Embedding-based model.
y_predicted, y_real = predict(model, G, G_test)
_print_scores('Model score:', y_real, y_predicted)

# Baseline: always predict a rating of 4.
y_constant4 = [4] * len(y_real)
_print_scores('Constant 4 model score:', y_real, y_constant4)

# Reference: the true rating plus rounded N(0, 1.11) noise, clipped to 1..5.
# It is derived from y_real itself, so it is a noise yardstick, not a predictor.
y_custom = [
    max(1, min(5, y + round(numpy.random.normal(0, 1.11))))
    for y in y_real
]
_print_scores('Normal distribution random model score:', y_real, y_custom)

# Baseline: each movie's overall mean rating in the training graph.
y_allavg = predictAllAverage(G, G_test)
_print_scores('All average model score:', y_real, y_allavg)


HBox(children=(FloatProgress(value=0.0, max=33102.0), HTML(value='')))


Model score:
0.7270280168467314
1.0675517304522981
0.0920469672858848
0.8815235304889214
------------------
Constant 4 model score:
0.8552462918879327
1.3288958066288226
-0.13022623950832868
0.854367331990478
------------------
Normal distribution random model score:
0.6797289873649515
0.9411463101995972
0.1995548110709504
0.8876945614356345
------------------
All average model score:
0.7865775498992859
1.1284563266800953
0.04024759187966609
0.8669108221937374
------------------


In [15]:
# Persist the trained model to the relative path 'deepwalk.model'.
model.save('deepwalk.model')