In [8]:
from enum import Enum
import random
from tqdm import tqdm
from gensim.models import Word2Vec
from sklearn import metrics

class NodeType(Enum):
    """Kind of entity a graph node represents; stored as the first item of each node_map tuple."""
    USER = 0   # node created for a user id
    MOVIE = 1  # node created for a movie id (one such node per rating slot of the movie)

# Build the user <-> (movie, rating) bipartite graph from the ratings file.
#
# Every movie gets N_RATINGS consecutive nodes (one per rating slot) and every
# kept user gets one node.  Each rating line becomes an undirected edge between
# the user node and the movie's node for that rating, stored either in `graph`
# (train) or in `test_graph` (held out with probability ~test_size).

# Seed the stdlib RNG so the train/test split (and any later random draws in
# the same kernel session) are reproducible on Restart & Run All.
RANDOM_SEED = 42
random.seed(RANDOM_SEED)

DATA_PATH = "combined_data_1.txt"
N_RATINGS = 5          # rating slots per movie; assumes ratings are integers in 1..5
MAX_USER_ID = 1200000  # only users with an id strictly below this are kept

node_map = []     # node index -> (NodeType, original user/movie id)
graph = []        # train adjacency lists, indexed by node index
test_graph = []   # held-out adjacency lists, same indexing as `graph`
test_size = 0.1   # approximate fraction of ratings routed to `test_graph`

user_map = {}     # user id  -> node index
movie_map = {}    # movie id -> index of the movie's first (rating slot 0) node

last_movie_idx = 0
with open(DATA_PATH) as data_file:
    for line in tqdm(data_file):
        line = line.strip()
        if not line:
            # Blank lines would otherwise crash in int("") below.
            continue

        if line.endswith(":"):
            # Movie header line: "<movie_id>:".
            movie_id = int(line[:-1])

            last_movie_idx = len(node_map)
            movie_map[movie_id] = last_movie_idx

            # One node per rating slot, laid out contiguously from last_movie_idx.
            for _ in range(N_RATINGS):
                node_map.append((NodeType.MOVIE, movie_id))
                graph.append([])
                test_graph.append([])

        else:
            # Rating line: "<user_id>,<rating>,..." belonging to the last header seen.
            node_info = line.split(",")
            user_id = int(node_info[0])
            rating = int(node_info[1]) - 1  # shift to a 0-based slot offset

            if user_id < MAX_USER_ID:
                if user_id not in user_map:
                    # First time this user is seen: allocate a node for them.
                    user_map[user_id] = len(node_map)
                    node_map.append((NodeType.USER, user_id))
                    graph.append([])
                    test_graph.append([])

                user_node_idx = user_map[user_id]
                rating_node_idx = last_movie_idx + rating

                # Route the (undirected) edge to train or test.
                if random.uniform(0, 1) > test_size:
                    graph[rating_node_idx].append(user_node_idx)
                    graph[user_node_idx].append(rating_node_idx)
                else:
                    test_graph[rating_node_idx].append(user_node_idx)
                    test_graph[user_node_idx].append(rating_node_idx)

24058263it [00:34, 690078.54it/s]


In [9]:
# Number of distinct users that were given a node (user_map holds one entry per kept user id).
len(user_map)

213543

In [10]:
# Random-walk corpus generation over the training graph.
epochs = 10       # number of passes; each pass starts one walk from every node
t = 30            # maximum number of nodes in a single walk
w = 5             # context window, consumed later by Word2Vec
latent_dim = 64   # embedding dimensionality, consumed later by Word2Vec

corpus = []

nodes = list(range(len(node_map)))
for _ in tqdm(range(epochs), total = epochs):
    # Visit the start nodes in a fresh random order on every pass.
    random.shuffle(nodes)
    for start in nodes:
        current = start
        sentence = [str(current)]
        # Take at most t - 1 uniformly random steps; stop early at a node
        # that has no training neighbours.
        for _step in range(t - 1):
            neighbours = graph[current]
            if not neighbours:
                break
            current = random.choice(neighbours)
            sentence.append(str(current))
        # Walks are stored as lists of node indices rendered as strings.
        corpus.append(sentence)

100%|██████████| 10/10 [01:30<00:00,  9.07s/it]


In [None]:
# Train skip-gram (sg=1) embeddings with hierarchical softmax enabled (hs=1) on
# the walk corpus.  min_count=0 keeps every node id in the vocabulary, including
# nodes whose only walk is the single-node walk they start.
# NOTE(review): `size` is the gensim 3.x keyword (gensim 4 renamed it to
# `vector_size`) — confirm against the installed gensim version.
model = Word2Vec(
    corpus,
    size = latent_dim,
    window = w,
    min_count = 0,
    sg = 1,
    hs = 1,
    workers = 4,
)

In [None]:
# Predict the rating of every held-out (user, movie) edge: among the movie's
# five per-rating nodes, pick the one whose embedding is most similar to the
# user's embedding.
y_predicted = []
y_real = []
N = len(node_map)
for user, node in tqdm(enumerate(node_map), total = N):
    if node[0] != NodeType.USER:
        continue

    for movie_rating_node in test_graph[user]:
        # Index of the movie's first rating-slot node; slots are contiguous.
        movie_node = movie_map[node_map[movie_rating_node][1]]

        # Arg-max over the five rating slots.  The running maximum must be
        # updated on every improvement; otherwise each score is compared with
        # the initial value and the last slot always wins.  Starting from -inf
        # with a default slot also guarantees predicted_rating is always bound.
        max_score = float("-inf")
        predicted_rating = 0
        for rating in range(5):
            t_score = model.wv.similarity(str(user), str(movie_node + rating))
            if t_score > max_score:
                max_score = t_score
                predicted_rating = rating

        predicted_rating += 1  # slot offset 0..4 -> rating 1..5
        real_rating = movie_rating_node - movie_node + 1
        y_predicted.append(predicted_rating)
        y_real.append(real_rating)

# Each group below prints mean absolute error first, then mean squared error.

# Embedding-based predictions.
for error_fn in (metrics.mean_absolute_error, metrics.mean_squared_error):
    print(error_fn(y_real, y_predicted))

# Baseline 1: a uniformly random rating in 1..5 for every held-out edge.
y_random = [random.randint(1, 5) for _ in y_real]

for error_fn in (metrics.mean_absolute_error, metrics.mean_squared_error):
    print(error_fn(y_real, y_random))

# Baseline 2: always predict the middle rating, 3.
y_constant = [3] * len(y_real)

for error_fn in (metrics.mean_absolute_error, metrics.mean_squared_error):
    print(error_fn(y_real, y_constant))