In [8]:
import os
import re
import argparse
import pickle
import pandas as pd
import numpy as np
import scipy.sparse as ssp
import dgl
import torch
import torchtext

In [1]:
import joblib

In [153]:
ratings = []
with open(os.path.join(directory, 'ratings.dat'), encoding='latin1') as f:
    for l in f:
        user_id, movie_id, rating, timestamp = [int(_) for _ in l.split('::')]
        ratings.append({
            'user_id': user_id,
            'movie_id': movie_id,
            'rating': rating,
            'timestamp': timestamp,
            })
ratings = pd.DataFrame(ratings)


In [156]:
ratings['user_id'].values

array([   1,    1,    1, ..., 6040, 6040, 6040])

In [158]:
ratings['movie_id'].values

array([1193,  661,  914, ...,  562, 1096, 1097])

In [58]:
data = joblib.load('data.pkl')

In [59]:
data

{'train-graph': Graph(num_nodes={'movie': 3706, 'user': 6040},
       num_edges={('movie', 'watched-by', 'user'): 988129, ('user', 'watched', 'movie'): 988129},
       metagraph=[('movie', 'user', 'watched-by'), ('user', 'movie', 'watched')]),
 'val-matrix': <6040x3706 sparse matrix of type '<class 'numpy.int64'>'
 	with 6040 stored elements in COOrdinate format>,
 'test-matrix': <6040x3706 sparse matrix of type '<class 'numpy.int64'>'
 	with 6040 stored elements in COOrdinate format>,
 'item-texts': {'title': array(['Toy Story', 'Jumanji', 'Grumpier Old Men', ..., 'Tigerland',
         'Two Family House', 'Contender, The'], dtype=object)},
 'item-images': None,
 'user-type': 'user',
 'item-type': 'movie',
 'user-to-item-type': 'watched',
 'item-to-user-type': 'watched-by',
 'timestamp-edge-column': 'timestamp'}

In [148]:
data['train-graph'].num_edges

<bound method DGLHeteroGraph.num_edges of Graph(num_nodes={'movie': 3706, 'user': 6040},
      num_edges={('movie', 'watched-by', 'user'): 988129, ('user', 'watched', 'movie'): 988129},
      metagraph=[('movie', 'user', 'watched-by'), ('user', 'movie', 'watched')])>

In [None]:
user : 1
movie : 5, 100, 150

In [118]:
i = 0
for arg in data['train-graph'].metagraph():
    print(arg)


movie
user


In [128]:
data['val-matrix'].todense()

matrix([[0, 0, 0, ..., 0, 0, 0],
        [0, 0, 0, ..., 0, 0, 0],
        [0, 0, 0, ..., 0, 0, 0],
        ...,
        [0, 0, 0, ..., 0, 0, 0],
        [0, 0, 0, ..., 0, 0, 0],
        [0, 0, 0, ..., 0, 0, 0]])

In [127]:
data['train-graph'].metagraph()

<networkx.classes.multidigraph.MultiDiGraph at 0x7fdc9a6e9c10>

In [15]:
directory = './ml-1m/'
users = []
with open(os.path.join(directory, 'users.dat'), encoding='latin1') as f:
    for l in f:
        id_, gender, age, occupation, zip_ = l.strip().split('::')
        users.append({
            'user_id': int(id_),
            'gender': gender,
            'age': age,
            'occupation': occupation,
            'zip': zip_,
            })
users = pd.DataFrame(users).astype('category')

In [41]:
movies = []
with open(os.path.join(directory, 'movies.dat'), encoding='latin1') as f:
    for l in f:
        id_, title, genres = l.strip().split('::')
        genres_set = set(genres.split('|'))

        # extract year
        assert re.match(r'.*\([0-9]{4}\)$', title)
        year = title[-5:-1]
        title = title[:-6].strip()

        data = {'movie_id': int(id_), 'title': title, 'year': year}
        for g in genres_set:
            data[g] = True
        movies.append(data)
movies = pd.DataFrame(movies).astype({'year': 'category'})

In [76]:
users['user_id']

0          1
1          2
2          3
3          4
4          5
        ... 
6035    6036
6036    6037
6037    6038
6038    6039
6039    6040
Name: user_id, Length: 6040, dtype: category
Categories (6040, int64): [1, 2, 3, 4, ..., 6037, 6038, 6039, 6040]

In [75]:
users['user_id'].cat.reorder_categories(users['user_id'].values)

0          1
1          2
2          3
3          4
4          5
        ... 
6035    6036
6036    6037
6037    6038
6038    6039
6039    6040
Name: user_id, Length: 6040, dtype: category
Categories (6040, int64): [1, 2, 3, 4, ..., 6037, 6038, 6039, 6040]

In [53]:
genre_columns = movies.columns.drop(['movie_id', 'title', 'year']) # columns.drop 이 가능한 것은 처음 보게 됨
movies[genre_columns] = movies[genre_columns].fillna(False).astype('bool')
movies_categorical = movTMFFies.drop('title', axis=1)

In [55]:
movies_categorical

Unnamed: 0,movie_id,year,Children's,Comedy,Animation,Fantasy,Adventure,Romance,Drama,Thriller,Action,Crime,Horror,Sci-Fi,Documentary,War,Musical,Mystery,Film-Noir,Western
0,1,1995,True,True,True,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False
1,2,1995,True,False,False,True,True,False,False,False,False,False,False,False,False,False,False,False,False,False
2,3,1995,False,True,False,False,False,True,False,False,False,False,False,False,False,False,False,False,False,False
3,4,1995,False,True,False,False,False,False,True,False,False,False,False,False,False,False,False,False,False,False
4,5,1995,False,True,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3878,3948,2000,False,True,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False
3879,3949,2000,False,False,False,False,False,False,True,False,False,False,False,False,False,False,False,False,False,False
3880,3950,2000,False,False,False,False,False,False,True,False,False,False,False,False,False,False,False,False,False,False
3881,3951,2000,False,False,False,False,False,False,True,False,False,False,False,False,False,False,False,False,False,False


In [81]:
temp = {}

In [82]:
temp['asd'] = (users['user_id'].cat.codes.values.astype('int64'), users['user_id'].cat.codes.values.astype('int64'))

In [84]:
temp

{'asd': (array([   0,    1,    2, ..., 6037, 6038, 6039]),
  array([   0,    1,    2, ..., 6037, 6038, 6039]))}

In [129]:
data_dict = {
    ('user', 'follows', 'user'): (torch.tensor([0, 1]), torch.tensor([1, 2])),
    ('user', 'follows', 'topic'): (torch.tensor([1, 1]), torch.tensor([1, 2])),
    ('user', 'plays', 'game'): (torch.tensor([0, 3]), torch.tensor([3, 4]))
}

In [130]:
data_dict

{('user', 'follows', 'user'): (tensor([0, 1]), tensor([1, 2])),
 ('user', 'follows', 'topic'): (tensor([1, 1]), tensor([1, 2])),
 ('user', 'plays', 'game'): (tensor([0, 3]), tensor([3, 4]))}

In [131]:
g = dgl.heterograph(data_dict)

In [149]:
g

Graph(num_nodes={'game': 5, 'topic': 3, 'user': 4},
      num_edges={('user', 'follows', 'topic'): 2, ('user', 'follows', 'user'): 2, ('user', 'plays', 'game'): 2},
      metagraph=[('user', 'topic', 'follows'), ('user', 'user', 'follows'), ('user', 'game', 'plays')])

In [162]:
data['train-graph']

Graph(num_nodes={'movie': 3706, 'user': 6040},
      num_edges={('movie', 'watched-by', 'user'): 988129, ('user', 'watched', 'movie'): 988129},
      metagraph=[('movie', 'user', 'watched-by'), ('user', 'movie', 'watched')])

In [176]:
data['val-matrix']

<6040x3706 sparse matrix of type '<class 'numpy.int64'>'
	with 6040 stored elements in COOrdinate format>

In [174]:
list(data['train-graph'].metagraph()['movie']['user'])

['watched-by']