In [8]:
import os
import re
import argparse
import pickle
import pandas as pd
import numpy as np
import scipy.sparse as ssp
import dgl
import torch
import torchtext

In [1]:
import joblib

In [2]:
data = joblib.load('data.pkl')

Using backend: pytorch


In [5]:
data

{'train-graph': Graph(num_nodes={'movie': 3706, 'user': 6040},
       num_edges={('movie', 'watched-by', 'user'): 988129, ('user', 'watched', 'movie'): 988129},
       metagraph=[('movie', 'user', 'watched-by'), ('user', 'movie', 'watched')]),
 'val-matrix': <6040x3706 sparse matrix of type '<class 'numpy.int64'>'
 	with 6040 stored elements in COOrdinate format>,
 'test-matrix': <6040x3706 sparse matrix of type '<class 'numpy.int64'>'
 	with 6040 stored elements in COOrdinate format>,
 'item-texts': {'title': array(['Toy Story', 'Jumanji', 'Grumpier Old Men', ..., 'Tigerland',
         'Two Family House', 'Contender, The'], dtype=object)},
 'item-images': None,
 'user-type': 'user',
 'item-type': 'movie',
 'user-to-item-type': 'watched',
 'item-to-user-type': 'watched-by',
 'timestamp-edge-column': 'timestamp'}

In [6]:
data['train-graph']

Graph(num_nodes={'movie': 3706, 'user': 6040},
      num_edges={('movie', 'watched-by', 'user'): 988129, ('user', 'watched', 'movie'): 988129},
      metagraph=[('movie', 'user', 'watched-by'), ('user', 'movie', 'watched')])

In [15]:
directory = './ml-1m/'
users = []
with open(os.path.join(directory, 'users.dat'), encoding='latin1') as f:
    for l in f:
        id_, gender, age, occupation, zip_ = l.strip().split('::')
        users.append({
            'user_id': int(id_),
            'gender': gender,
            'age': age,
            'occupation': occupation,
            'zip': zip_,
            })
users = pd.DataFrame(users).astype('category')

In [41]:
movies = []
with open(os.path.join(directory, 'movies.dat'), encoding='latin1') as f:
    for l in f:
        id_, title, genres = l.strip().split('::')
        genres_set = set(genres.split('|'))

        # extract year
        assert re.match(r'.*\([0-9]{4}\)$', title)
        year = title[-5:-1]
        title = title[:-6].strip()

        data = {'movie_id': int(id_), 'title': title, 'year': year}
        for g in genres_set:
            data[g] = True
        movies.append(data)
movies = pd.DataFrame(movies).astype({'year': 'category'})

In [47]:
movies

Unnamed: 0,movie_id,title,year,Children's,Comedy,Animation,Fantasy,Adventure,Romance,Drama,...,Action,Crime,Horror,Sci-Fi,Documentary,War,Musical,Mystery,Film-Noir,Western
0,1,Toy Story,1995,True,True,True,,,,,...,,,,,,,,,,
1,2,Jumanji,1995,True,,,True,True,,,...,,,,,,,,,,
2,3,Grumpier Old Men,1995,,True,,,,True,,...,,,,,,,,,,
3,4,Waiting to Exhale,1995,,True,,,,,True,...,,,,,,,,,,
4,5,Father of the Bride Part II,1995,,True,,,,,,...,,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3878,3948,Meet the Parents,2000,,True,,,,,,...,,,,,,,,,,
3879,3949,Requiem for a Dream,2000,,,,,,,True,...,,,,,,,,,,
3880,3950,Tigerland,2000,,,,,,,True,...,,,,,,,,,,
3881,3951,Two Family House,2000,,,,,,,True,...,,,,,,,,,,


In [53]:
genre_columns = movies.columns.drop(['movie_id', 'title', 'year']) # columns.drop 이 가능한 것은 처음 보게 됨
movies[genre_columns] = movies[genre_columns].fillna(False).astype('bool')
movies_categorical = movTMFFies.drop('title', axis=1)

In [55]:
movies_categorical

Unnamed: 0,movie_id,year,Children's,Comedy,Animation,Fantasy,Adventure,Romance,Drama,Thriller,Action,Crime,Horror,Sci-Fi,Documentary,War,Musical,Mystery,Film-Noir,Western
0,1,1995,True,True,True,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False
1,2,1995,True,False,False,True,True,False,False,False,False,False,False,False,False,False,False,False,False,False
2,3,1995,False,True,False,False,False,True,False,False,False,False,False,False,False,False,False,False,False,False
3,4,1995,False,True,False,False,False,False,True,False,False,False,False,False,False,False,False,False,False,False
4,5,1995,False,True,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3878,3948,2000,False,True,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False
3879,3949,2000,False,False,False,False,False,False,True,False,False,False,False,False,False,False,False,False,False,False
3880,3950,2000,False,False,False,False,False,False,True,False,False,False,False,False,False,False,False,False,False,False
3881,3951,2000,False,False,False,False,False,False,True,False,False,False,False,False,False,False,False,False,False,False
