In [1]:
# built-in imports
import os
import re
import argparse
import pickle
import sys

# third-party imports
import pandas as pd
import numpy as np
import scipy.sparse as ssp
import dgl
import torch
import torchtext

# local imports
sys.path.insert(0, '../src/pinsage')
from builder import PandasGraphBuilder
from data_utils import *

Using backend: pytorch


In [126]:
# Directory holding the raw data files read below (users.dat, movies.dat, ratings.dat).
directory = '../ml-1m'
# Destination of the pickled, preprocessed dataset written by the final cell.
output_path = '../pinsage/processed_movielens.pkl'

In [10]:
# Parse users.dat; each line is "UserID::Gender::Age::Occupation::Zip-code".
user_fields = ('user_id', 'gender', 'age', 'occupation', 'zip')
user_records = []
with open(os.path.join(directory, 'users.dat'), encoding='latin1') as users_file:
    for raw_line in users_file:
        raw_id, gender, age, occupation, zip_code = raw_line.strip().split('::')
        field_values = (int(raw_id), gender, age, occupation, zip_code)
        user_records.append(dict(zip(user_fields, field_values)))
# Every column (including user_id) is stored as a pandas categorical.
users = pd.DataFrame(user_records).astype('category')

In [6]:
# Preview the first rows of the parsed users table.
users.head()

Unnamed: 0,user_id,gender,age,occupation,zip
0,1,F,1,10,48067
1,2,M,56,16,70072
2,3,M,25,15,55117
3,4,M,45,7,2460
4,5,M,25,20,55455


In [12]:
# Sanity check: table dimensions and per-column dtypes
# (the loading cell casts every users column to category).
print(users.shape)
print(users.dtypes)

(6040, 5)
user_id       category
gender        category
age           category
occupation    category
zip           category
dtype: object


In [14]:
# Parse movies.dat ("MovieID::Title::Genres") into one record per movie, with a
# True flag stored under each genre name the movie is tagged with.
title_with_year = re.compile(r'.*\([0-9]{4}\)$')
movie_records = []
with open(os.path.join(directory, 'movies.dat'), encoding='latin1') as movies_file:
    for raw_line in movies_file:
        raw_id, full_title, genre_field = raw_line.strip().split('::')

        # De-duplicate the '|'-separated genre names.
        unique_genres = set(genre_field.split('|'))

        # The title must end with a parenthesised four-digit year; the slices
        # below rely on that fixed-width suffix.
        assert title_with_year.match(full_title)

        record = {
            'movie_id': int(raw_id),
            'title': full_title[:-6].strip(),
            'year': full_title[-5:-1],
        }
        record.update(dict.fromkeys(unique_genres, True))
        movie_records.append(record)
# Only the year column is made categorical; genre flags are cleaned up later.
movies = pd.DataFrame(movie_records).astype({'year': 'category'})

In [15]:
# Preview the first rows of the parsed movies table (genre cells are True or missing).
movies.head()

Unnamed: 0,movie_id,title,year,Animation,Comedy,Children's,Fantasy,Adventure,Romance,Drama,...,Thriller,Action,Horror,Sci-Fi,Documentary,War,Musical,Mystery,Film-Noir,Western
0,1,Toy Story,1995,True,True,True,,,,,...,,,,,,,,,,
1,2,Jumanji,1995,,,True,True,True,,,...,,,,,,,,,,
2,3,Grumpier Old Men,1995,,True,,,,True,,...,,,,,,,,,,
3,4,Waiting to Exhale,1995,,True,,,,,True,...,,,,,,,,,,
4,5,Father of the Bride Part II,1995,,True,,,,,,...,,,,,,,,,,


In [16]:
# Sanity check: table dimensions and per-column dtypes
# (only `year` has been cast to category at this point).
print(movies.shape)
print(movies.dtypes)

(3883, 21)
movie_id          int64
title            object
year           category
Animation        object
Comedy           object
Children's       object
Fantasy          object
Adventure        object
Romance          object
Drama            object
Crime            object
Thriller         object
Action           object
Horror           object
Sci-Fi           object
Documentary      object
War              object
Musical          object
Mystery          object
Film-Noir        object
Western          object
dtype: object


In [17]:
# Parse ratings.dat; each line is "UserID::MovieID::Rating::Timestamp" and
# every field is an integer.
rating_records = []
with open(os.path.join(directory, 'ratings.dat'), encoding='latin1') as ratings_file:
    for raw_line in ratings_file:
        parsed_fields = [int(field) for field in raw_line.split('::')]
        user_id, movie_id, rating, timestamp = parsed_fields
        rating_records.append(dict(
            user_id=user_id,
            movie_id=movie_id,
            rating=rating,
            timestamp=timestamp,
        ))
ratings = pd.DataFrame(rating_records)

In [18]:
# Preview the first rows of the parsed ratings table.
ratings.head()

Unnamed: 0,user_id,movie_id,rating,timestamp
0,1,1193,5,978300760
1,1,661,3,978302109
2,1,914,3,978301968
3,1,3408,4,978300275
4,1,2355,5,978824291


In [19]:
# Sanity check: table dimensions and per-column dtypes of the ratings table.
print(ratings.shape)
print(ratings.dtypes)

(1000209, 4)
user_id      int64
movie_id     int64
rating       int64
timestamp    int64
dtype: object


In [25]:
# Keep only the users and movies that appear at least once in the ratings table.
distinct_users_in_ratings = ratings['user_id'].unique()
distinct_movies_in_ratings = ratings['movie_id'].unique()
# Copy AFTER filtering: this copies only the surviving rows and makes each
# result an independent frame, so later column assignments on it do not
# trigger pandas' SettingWithCopyWarning.
users = users[users['user_id'].isin(distinct_users_in_ratings)].copy()
movies = movies[movies['movie_id'].isin(distinct_movies_in_ratings)].copy()

In [31]:
# Dimensions of the users table (compare with the shape printed before filtering).
users.shape

(6040, 5)

In [27]:
# Dimensions of the movies table (compare with the shape printed before filtering).
movies.shape

(3706, 21)

In [29]:
# Smallest number of rating rows belonging to any single user.
ratings['user_id'].value_counts().min()

20

In [30]:
# Smallest number of rating rows belonging to any single movie.
ratings['movie_id'].value_counts().min()

1

In [32]:
# Genre indicator columns are every column except the id, title and year.
genre_columns = movies.columns.drop(['movie_id', 'title', 'year'])
# Fill the missing genre flags with False and cast them to real booleans.
# Assign by column name rather than `movies.loc[:, genre_columns] = ...`:
# on pandas >= 2.0 the `.loc` form writes into the existing object-dtype
# columns in place, leaving them as `object`, which torch cannot convert
# to a tensor later on.
movies[genre_columns] = movies[genre_columns].fillna(False).astype(bool)
assert (movies[genre_columns].dtypes == bool).all(), 'genre columns must be boolean'
# Drop the free-text title column; `movies` itself keeps the title, which is
# used separately as a textual feature.
movies_categorical = movies.drop('title', axis=1)

In [105]:
# Build graph
# Node tables are registered first, then the edge tables.
graph_builder = PandasGraphBuilder()
# add_entities(table, primary key, name for node)
graph_builder.add_entities(users, 'user_id', 'user')
graph_builder.add_entities(movies_categorical, 'movie_id','movie')
# The same ratings table is registered twice, once per direction:
# 'watched' (user -> movie) and 'watched-by' (movie -> user).
graph_builder.add_binary_relations(ratings, 'user_id', 'movie_id', 'watched')
graph_builder.add_binary_relations(ratings, 'movie_id', 'user_id', 'watched-by')

In [106]:
# Materialise the graph from the entities and relations registered on the builder.
g = graph_builder.build()

In [107]:
# Display the graph summary (node and edge counts per type).
g

Graph(num_nodes={'movie': 3706, 'user': 6040},
      num_edges={('movie', 'watched-by', 'user'): 1000209, ('user', 'watched', 'movie'): 1000209},
      metagraph=[('movie', 'user', 'watched-by'), ('user', 'movie', 'watched')])

In [108]:
# `.cat.codes` maps each value of a category-typed Series to its integer code.
users['gender'].cat.codes

0       0
1       1
2       1
3       1
4       1
       ..
6035    0
6036    0
6037    0
6038    0
6039    1
Length: 6040, dtype: int8

In [109]:
# `.values` exposes the Series' data as a plain NumPy array.
users['gender'].cat.codes.values

array([0, 1, 1, ..., 0, 0, 1], dtype=int8)

In [110]:
# Add user node features: each categorical user column is stored on the
# 'user' nodes as a LongTensor of its integer category codes.
for user_feature in ('gender', 'age', 'occupation', 'zip'):
    category_codes = users[user_feature].cat.codes.values.copy()
    g.nodes['user'].data[user_feature] = torch.LongTensor(category_codes)

In [111]:
# Genre flags as a 2-D array: one row per movie, one column per genre.
movies[genre_columns].values

array([[ True,  True,  True, ..., False, False, False],
       [False, False,  True, ..., False, False, False],
       [False,  True, False, ..., False, False, False],
       ...,
       [False, False, False, ..., False, False, False],
       [False, False, False, ..., False, False, False],
       [False, False, False, ..., False, False, False]])

In [112]:
# Add movie node features: release year as integer category codes, and the
# genre flags as a float matrix with one column per genre.
movie_year_codes = movies['year'].cat.codes.values.copy()
movie_genre_flags = movies[genre_columns].values
g.nodes['movie'].data['year'] = torch.LongTensor(movie_year_codes)
g.nodes['movie'].data['genre'] = torch.FloatTensor(movie_genre_flags)

In [113]:
# Add user-movie edge features: both edge directions carry the same rating
# and timestamp columns taken from the ratings table.
for edge_type in ('watched', 'watched-by'):
    for edge_feature in ('rating', 'timestamp'):
        feature_values = ratings[edge_feature].values
        g.edges[edge_type].data[edge_feature] = torch.LongTensor(feature_values)

## Train / Validation / Test Split by Time

In [114]:
# Split the rating rows into train / validation / test index sets, passing the
# timestamp column and the user-id column to the project helper.
# NOTE(review): presumably the split is chronological within each user and the
# helper lives in data_utils — confirm there.
train_indices, val_indices, test_indices = train_test_split_by_time(ratings, 'timestamp', 'user_id')

  Before: .apply(func)
  After:  .apply(func, meta={'x': 'f8', 'y': 'f8'}) for dataframe result
  or:     .apply(func, meta=('x', 'f8'))            for series result
  df = df.groupby(user, group_keys=False).apply(train_test_split).compute(scheduler='processes').sort_index()


    user_id  movie_id  rating  timestamp  train_mask  val_mask  test_mask
31        1      3186       4  978300019        True     False      False
27        1      1721       4  978300055        True     False      False
37        1      1022       5  978300055        True     False      False
22        1      1270       5  978300055        True     False      False
24        1      2340       3  978300103        True     False      False
36        1      1836       5  978300172        True     False      False
3         1      3408       4  978300275        True     False      False
47        1      1207       4  978300719        True     False      False
7         1      2804       5  978300719        True     False      False
21        1       720       3  978300760        True     False      False
0         1      1193       5  978300760        True     False      False
44        1       260       4  978300760        True     False      False
9         1       919       4  9783013

In [115]:
# NOTE: the train/test/val proportions are decided inside data_utils.py
# --> TODO: still need to understand how.
split_index_sets = (
    ('train', train_indices),
    ('test', test_indices),
    ('val', val_indices),
)
for split_name, split_indices in split_index_sets:
    print(split_name, split_indices.shape)

train (988129,)
test (6040,)
val (6040,)


In [116]:
# Build Train Graph
# Built from the full graph and the training indices, with the user/movie node
# types and both edge types passed explicitly to the project helper.
train_g = build_train_graph(g, train_indices, 'user','movie','watched','watched-by')
# Check that the smallest out-degree over 'watched' edges is at least 1,
# i.e. no node is left without an outgoing 'watched' edge in the train graph.
assert train_g.out_degrees(etype='watched').min() > 0

In [117]:
# Build the validation and test matrices from the held-out indices over the
# user -> movie 'watched' edges.
# NOTE(review): the exact matrix format is defined by the helper — confirm there.
val_matrix, test_matrix = build_val_test_matrix(g, val_indices, test_indices, 'user', 'movie', 'watched')

In [118]:
# Text features: the raw movie titles, keyed by field name.
movie_titles = movies['title'].values
movie_textual_dataset = {'title': movie_titles}
movie_textual_dataset

{'title': array(['Toy Story', 'Jumanji', 'Grumpier Old Men', ..., 'Tigerland',
        'Two Family House', 'Contender, The'], dtype=object)}

## Save preprocessed data

In [119]:
# Bundle everything that gets written to disk into a single dictionary.
dataset = {
    # Training graph plus the held-out validation/test matrices.
    'train-graph': train_g,
    'val-matrix': val_matrix,
    'test-matrix': test_matrix,
    # Item side features: titles only; no image features are provided.
    'item-texts': movie_textual_dataset,
    'item-images': None,
    # Node and edge type names, matching the ones used when building the graph.
    'user-type': 'user',
    'item-type': 'movie',
    'user-to-item-type': 'watched',
    'item-to-user-type': 'watched-by',
    # Name of the edge feature that holds the interaction timestamp.
    'timestamp-edge-column': 'timestamp'}

In [127]:
# Save the preprocessed, graph-structured dataset with pickle.
# Create the destination directory first so the write does not fail with
# FileNotFoundError when it is missing; skip this when output_path has no
# directory component (os.makedirs('') would raise).
output_dir = os.path.dirname(output_path)
if output_dir:
    os.makedirs(output_dir, exist_ok=True)
with open(output_path, 'wb') as f:
    pickle.dump(dataset, f)