In [4]:
import os, sys
import time

import glob
import random

import json

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

from tqdm import tqdm

In [2]:
import tensorflow as tf
from tensorflow import keras
from tensorflow.keras import layers

class RecommenderNet(keras.Model):
    def __init__(self, num_tags, num_musics, embedding_size, **kwargs):
        super(RecommenderNet, self).__init__(**kwargs)
        self.num_tags = num_tags
        self.num_musics = num_musics
        self.embedding_size = embedding_size
        self.tag_embedding = layers.Embedding(
            num_tags,
            embedding_size,
            embeddings_initializer="he_normal",
            embeddings_regularizer=keras.regularizers.l2(1e-6),
        )
        self.tag_bias = layers.Embedding(num_tags, 1)
        self.music_embedding = layers.Embedding(
            num_musics,
            embedding_size,
            embeddings_initializer="he_normal",
            embeddings_regularizer=keras.regularizers.l2(1e-6),
        )
        self.music_bias = layers.Embedding(num_musics, 1)

    def call(self, inputs):
        tag_vector = self.tag_embedding(inputs[:, 0])
        tag_bias = self.tag_bias(inputs[:, 0])
        music_vector = self.music_embedding(inputs[:, 1])
        music_bias = self.music_bias(inputs[:, 1])
        dot_tag_music = tf.tensordot(tag_vector, music_vector, 2)
        x = dot_tag_music + tag_bias + music_bias
        return tf.nn.sigmoid(x)

In [None]:
RecommenderNet()

In [10]:
max(ratings['userId'].unique().tolist())

610

In [5]:
files = glob.glob('ml-latest-small/*.csv')
files

['ml-latest-small/links.csv',
 'ml-latest-small/tags.csv',
 'ml-latest-small/ratings.csv',
 'ml-latest-small/movies.csv']

In [6]:
links = pd.read_csv(files[0])
tags = pd.read_csv(files[1])
ratings = pd.read_csv(files[2])
movies = pd.read_csv(files[3])

In [26]:
ratings

Unnamed: 0,userId,movieId,rating,timestamp
0,1,1,4.0,964982703
1,1,3,4.0,964981247
2,1,6,4.0,964982224
3,1,47,5.0,964983815
4,1,50,5.0,964982931
...,...,...,...,...
100831,610,166534,4.0,1493848402
100832,610,168248,5.0,1493850091
100833,610,168250,5.0,1494273047
100834,610,168252,5.0,1493846352


In [8]:
import networkx as nx

In [9]:
G = nx.Graph()

In [10]:
for src, dst in ratings[['userId', 'movieId']].values:
    G.add_edge(int(src), int(dst))

In [11]:
len(G.nodes), len(G.edges) 

(1616, 1775)

# GraphSAGE

In [1]:
from sklearn.metrics import f1_score

import torch
import torch.nn as nn
import torch.nn.functional as F

import dgl
from dgl.data import CoraGraphDataset
import dgl.function as fn

Using backend: pytorch


In [30]:
genres = []
movies['genres'].apply(lambda x: genres.extend(x.split('|')))

piv = pd.DataFrame(index = movies.index, columns = list(set(genres)))

for idx in tqdm(movies.index.values):
    columns = movies.loc[idx, 'genres'].split('|')
    piv.loc[idx, columns] = 1

100%|██████████| 9742/9742 [00:04<00:00, 2255.43it/s]


In [33]:
piv = piv.fillna(0)

In [39]:
piv

Unnamed: 0,Film-Noir,Drama,Children,War,IMAX,Action,Musical,Sci-Fi,(no genres listed),Crime,Romance,Western,Documentary,Fantasy,Mystery,Horror,Animation,Thriller,Comedy,Adventure
0,0,0,1,0,0,0,0,0,0,0,0,0,0,1,0,0,1,0,1,1
1,0,0,1,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,1
2,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,1,0
3,0,1,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,1,0
4,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
9737,0,0,0,0,0,1,0,0,0,0,0,0,0,1,0,0,1,0,1,0
9738,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,1,0,1,0
9739,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
9740,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,1,0,0,0
