To start, simply download the dataset:

`wget http://snap.stanford.edu/data/amazon/productGraph/categoryFiles/reviews_Movies_and_TV_5.json.gz`

In [1]:
import csv
import os
from pathlib import Path

import numpy as np
import pandas as pd
from joblib import Parallel, delayed

In [2]:
# Directory holding the downloaded dataset and receiving the generated files.
# Set the AMAZON_MOVIES_DATA_PATH environment variable to point somewhere else;
# when it is unset the original location is used.
DATA_PATH = Path(
    os.environ.get(
        "AMAZON_MOVIES_DATA_PATH",
        "/home/ubuntu/projects/neural_graph_cf/Data/amazon-movies",
    )
)
# File name of the gzipped, line-delimited JSON reviews dump inside DATA_PATH.
reviews = "reviews_Movies_and_TV_5.json.gz"

In [3]:
# Columns to keep from the raw reviews and the shorter names they are given.
keep_cols = ['reviewerID', 'asin', 'unixReviewTime', 'overall']
new_colnames = ['user', 'item', 'timestamp', 'rating']

# Read the line-delimited JSON, keep only the columns of interest and rename
# them in a single chained expression.
df = (
    pd.read_json(DATA_PATH/reviews, lines=True)
    .loc[:, keep_cols]
    .rename(columns=dict(zip(keep_cols, new_colnames)))
)
df.head()

Unnamed: 0,user,item,timestamp,rating
0,ADZPIG9QOCDG5,5019281,1203984000,4
1,A35947ZP82G7JH,5019281,1388361600,3
2,A3UORV8A9D5L2E,5019281,1388361600,3
3,A1VKW06X1O2X7V,5019281,1202860800,5
4,A3R27T4HADWFFJ,5019281,1387670400,4


In [4]:
# How many reviews there are for each rating value.
df['rating'].value_counts()

5    906608
4    382994
3    201302
1    104219
2    102410
Name: rating, dtype: int64

A lot of people seem to love the movies they watch: there are more 5-star ratings than 1-, 2-, 3- and 4-star ratings combined.

In [5]:
# Map the original user and item ids to contiguous integers (0..n-1),
# numbered in order of first appearance in df.
user_mappings = {orig_id: new_id for new_id, orig_id in enumerate(df['user'].unique())}
item_mappings = {orig_id: new_id for new_id, orig_id in enumerate(df['item'].unique())}

# Lookup tables pairing every original id with its remapped integer id.
user_list = pd.DataFrame({
    'orig_id': list(user_mappings.keys()),
    'remap_id': list(user_mappings.values()),
})
item_list = pd.DataFrame({
    'orig_id': list(item_mappings.keys()),
    'remap_id': list(item_mappings.values()),
})
print(user_list.head())
print(item_list.head())

          orig_id  remap_id
0   ADZPIG9QOCDG5         0
1  A35947ZP82G7JH         1
2  A3UORV8A9D5L2E         2
3  A1VKW06X1O2X7V         3
4  A3R27T4HADWFFJ         4
      orig_id  remap_id
0  0005019281         0
1  0005119367         1
2  0307141985         2
3  0307142469         3
4  0307142477         4


In [8]:
# Persist both id lookup tables as space-separated text files (header row
# included, no index column) next to the dataset.
user_list.to_csv(DATA_PATH.joinpath('user_list.txt'), index=False, sep=" ")
item_list.to_csv(DATA_PATH.joinpath('item_list.txt'), index=False, sep=" ")

In [9]:
# Keep only the (user, item) pairs and swap the original ids for their
# remapped integer ids.
df = (
    df[['user', 'item']]
    .assign(
        user=lambda pairs: pairs['user'].map(user_mappings),
        item=lambda pairs: pairs['item'].map(item_mappings),
    )
    .astype(np.int64)
)
df.head()

Unnamed: 0,user,item
0,0,0
1,1,0
2,2,0
3,3,0
4,4,0


In [10]:
def f(df):
    """Collapse a two-column (user, item) frame into one row per user.

    Parameters
    ----------
    df : pd.DataFrame
        Frame with exactly two columns, the first being 'user' and the
        second holding the item of each interaction.

    Returns
    -------
    pd.DataFrame
        Columns 'user' (the distinct users in ascending order) and 'item'
        (a list with all the items of that user).
    """
    # Sorting by user makes every user's interactions contiguous.
    user_col, item_col = df.sort_values('user').values.T
    # Distinct users plus the position at which each one first shows up.
    distinct_users, first_pos = np.unique(user_col, return_index=True)
    # Cut at every first occurrence except the leading 0: one chunk per user.
    item_chunks = np.split(item_col, first_pos[1:])
    grouped = pd.DataFrame({
        'user': distinct_users,
        'item': [list(chunk) for chunk in item_chunks],
    })
    return grouped

In [11]:
# Collapse the (user, item) pairs into one row per user holding the list of
# that user's items.
interactions_df = f(df)
interactions_df.head()

Unnamed: 0,user,item
0,0,"[0, 5773, 3362, 35303, 522, 537]"
1,1,"[20414, 0, 26966, 14571, 3240]"
2,2,"[0, 47252, 34434, 41076, 21569]"
3,3,"[15583, 46938, 39098, 0, 2132]"
4,4,"[11141, 4998, 28627, 10497, 43878, 9926, 46637..."


In [12]:
def train_test_split(u, i_l, p=0.8):
    """Randomly split one user's items into a train and a test row.

    Parameters
    ----------
    u : int
        User id; it is placed first in both returned rows.
    i_l : sequence of int
        Items the user interacted with.
    p : float, default 0.8
        Fraction of the items assigned to train; floor(len(i_l) * p) items
        are sampled without replacement.

    Returns
    -------
    tuple of (list, list)
        (train, test), where each list is [u, item, item, ...]. The test
        items are the ones not sampled for train, in ascending order.
    """
    n_train = int(np.floor(len(i_l) * p))
    train_items = np.random.choice(i_l, n_train, replace=False)
    # Diff against the sampled items only. Diffing against the full train row
    # (which starts with the user id) would silently drop from test any
    # held-out item whose id happens to equal the user id.
    test_items = np.setdiff1d(i_l, train_items)
    train = [u] + list(train_items)
    test = [u] + list(test_items)
    return (train, test)

In [13]:
# Seed NumPy's global RNG, which train_test_split samples from through
# np.random.choice, so that the split is identical on every fresh run.
SPLIT_SEED = 42
np.random.seed(SPLIT_SEED)

# One (train, test) pair per user. The two columns are iterated directly
# rather than with iterrows(), which builds a Series for every row.
interactions_l = [
    train_test_split(user, items)
    for user, items in zip(interactions_df['user'], interactions_df['item'])
]

In [14]:
# Peek at the first (train, test) pair; each list starts with the user id.
interactions_l[0]

([0, 5773, 0, 537, 35303], [0, 522, 3362])

In [15]:
train_fname = DATA_PATH/'train.txt'
test_fname = DATA_PATH/'test.txt'

# One space-separated line per user: the user id followed by its items.
# The files are opened with newline='' as the csv module requires, so the
# writer's own line terminator is written untouched on every platform.
with open(train_fname, 'w', newline='') as trf, open(test_fname, 'w', newline='') as tef:
    trwrt = csv.writer(trf, delimiter=' ')
    tewrt = csv.writer(tef, delimiter=' ')
    # Each entry of interactions_l is a (train_row, test_row) pair.
    trwrt.writerows(pair[0] for pair in interactions_l)
    tewrt.writerows(pair[1] for pair in interactions_l)