## Amazon Movies Data Preparation

To start, simply download the dataset:

`wget http://snap.stanford.edu/data/amazon/productGraph/categoryFiles/reviews_Movies_and_TV_5.json.gz`

In [1]:
import csv
import os
from pathlib import Path

import numpy as np
import pandas as pd
import scipy.sparse as sp
from tqdm import tqdm

In [2]:
# Directory that holds the downloaded dataset and receives every generated file.
# Set the AMAZON_MOVIES_DATA_PATH environment variable to point the notebook at
# another location; when it is unset the original local path is used.
DATA_PATH = Path(
    os.environ.get(
        "AMAZON_MOVIES_DATA_PATH",
        "/Users/javier/ml_exercises_python/RecoTour/Amazon/neural_graph_cf/Data/amazon-movies/",
    )
)
# File name of the gzipped, line-delimited JSON reviews dump inside DATA_PATH.
reviews = "reviews_Movies_and_TV_5.json.gz"

In [3]:
# Keep only the four interaction columns and give them short, generic names.
keep_cols = ['reviewerID', 'asin', 'unixReviewTime', 'overall']
new_colnames = ['user', 'item', 'timestamp', 'rating']
df = (
    pd.read_json(DATA_PATH / reviews, lines=True)
    .loc[:, keep_cols]
    .rename(columns=dict(zip(keep_cols, new_colnames)))
)
df.head()

Unnamed: 0,user,item,timestamp,rating
0,ADZPIG9QOCDG5,5019281,1203984000,4
1,A35947ZP82G7JH,5019281,1388361600,3
2,A3UORV8A9D5L2E,5019281,1388361600,3
3,A1VKW06X1O2X7V,5019281,1202860800,5
4,A3R27T4HADWFFJ,5019281,1387670400,4


In [4]:
# Number of reviews per rating value.
df['rating'].value_counts()

5    906608
4    382994
3    201302
1    104219
2    102410
Name: rating, dtype: int64

A lot of people seem to love the movies they watch: there are more 5s than 1s, 2s, 3s and 4s combined.

For convenience, let's now sort the reviews by `user` and then by `timestamp`. This will be useful later in the process.

In [5]:
# Order the reviews by user and, within each user, by review time.
# Chained and rebound instead of mutated with inplace=True, so the cell produces
# a fresh frame rather than modifying the one an earlier cell displayed.
df = (
    df
    .sort_values(['user', 'timestamp'], ascending=[True, True])
    .reset_index(drop=True)
)
df.head()

Unnamed: 0,user,item,timestamp,rating
0,A00295401U6S2UG3RAQSZ,0767015533,1353196800,4
1,A00295401U6S2UG3RAQSZ,0792838084,1353196800,4
2,A00295401U6S2UG3RAQSZ,6304484054,1353196800,4
3,A00295401U6S2UG3RAQSZ,6305182205,1353196800,4
4,A00295401U6S2UG3RAQSZ,B00004W22I,1353196800,4


Let's map users and items to contiguous integers

In [6]:
def map_user_items(df, out_dir=None):
    """Remap raw user and item ids to contiguous integers starting at 0.

    Ids are numbered in order of first appearance in ``df``. The two lookup
    tables are also written to ``user_list.txt`` and ``item_list.txt`` as
    space-separated files with the columns ``orig_id`` and ``remap_id``.

    Parameters
    ----------
    df : pd.DataFrame
        Frame with at least a ``user`` and an ``item`` column. It is copied,
        not modified.
    out_dir : str or Path, optional
        Directory the two lookup files are written to. When omitted, the
        module-level ``DATA_PATH`` is used.

    Returns
    -------
    user_mappings : dict
        Original user id -> integer id.
    item_mappings : dict
        Original item id -> integer id.
    dfc : pd.DataFrame
        Copy of ``df`` whose ``user`` and ``item`` columns hold the int64 ids.
    """
    # Resolved lazily so the global is only needed when no directory is given.
    out_dir = DATA_PATH if out_dir is None else Path(out_dir)

    dfc = df.copy()
    # Bracket access on purpose: attribute-style access (``dfc.item``) can be
    # shadowed by DataFrame attributes of the same name.
    user_mappings = {orig: new for new, orig in enumerate(dfc['user'].unique())}
    item_mappings = {orig: new for new, orig in enumerate(dfc['item'].unique())}

    lookups = (('user_list.txt', user_mappings), ('item_list.txt', item_mappings))
    for fname, mapping in lookups:
        # Built from explicit columns so an empty mapping still yields a valid
        # two-column table instead of failing on a column-count mismatch.
        lookup = pd.DataFrame({
            'orig_id': list(mapping.keys()),
            'remap_id': list(mapping.values()),
        })
        lookup.to_csv(out_dir/fname, sep=" ", index=False)

    dfc['user'] = dfc['user'].map(user_mappings).astype(np.int64)
    dfc['item'] = dfc['item'].map(item_mappings).astype(np.int64)

    return user_mappings, item_mappings, dfc

In [7]:
# Remap the raw ids; the call also writes the two lookup tables to disk.
(user_mappings, item_mappings, dfm) = map_user_items(df)
dfm.head()

Unnamed: 0,user,item,timestamp,rating
0,0,0,1353196800,4
1,0,1,1353196800,4
2,0,2,1353196800,4
3,0,3,1353196800,4
4,0,4,1353196800,4


###  Train/Test split

This split is designed to reproduce the one used in the [Xiang Wang et al. 2019](https://arxiv.org/pdf/1905.08108.pdf) paper.

In [8]:
# Only the (user, item) pairs are needed to build the split.
df1 = dfm.loc[:, ['user', 'item']]

In [9]:
def f(df):
    """Collapse a two-column (user, item) frame into one row per user.

    The frame is sorted by ``user`` and its two columns are read positionally:
    the first as the user ids, the second as the item ids.

    Returns a DataFrame with a ``user`` column of unique user ids and an
    ``item`` column holding, for each user, the list of that user's items.
    """
    user_col, item_col = df.sort_values('user').values.T
    # first_pos[k] is the position where the k-th unique user's rows start.
    unique_users, first_pos = np.unique(user_col, return_index=True)
    # Cutting at every start position except the first yields one chunk per user.
    per_user_items = np.split(item_col, first_pos[1:])
    return pd.DataFrame({
        'user': unique_users,
        'item': list(map(list, per_user_items)),
    })

In [10]:
# One row per user, with the list of items that user interacted with.
interactions_df = f(df=df1)
interactions_df.head()

Unnamed: 0,user,item
0,0,"[0, 1, 2, 3, 4, 5]"
1,1,"[6, 7, 8, 9, 10]"
2,2,"[20, 19, 18, 17, 16, 11, 14, 13, 12, 15]"
3,3,"[25, 24, 21, 22, 23]"
4,4,"[34, 32, 31, 30, 33, 28, 27, 26, 29]"


The split strategy we will follow is: 80% training, 20% testing. 

Then, 10% of the training set will be held out as validation to tune parameters. Once tuned, one would merge train+validation and re-train with the best performing params.

In [11]:
def train_test_split(u, i_l, p=0.8, rng=None):
    """Randomly split one user's items into a train part and a held-out part.

    The split is done on positions, not on values, so every element of
    ``i_l`` lands in exactly one of the two parts even when an item id is
    repeated.

    Parameters
    ----------
    u : user id, prepended to both returned lists.
    i_l : sequence of item ids for that user.
    p : float in [0, 1], fraction of items assigned to train;
        ``floor(len(i_l) * p)`` items are selected.
    rng : optional ``np.random.RandomState`` or ``np.random.Generator``.
        Pass a seeded instance for a reproducible split. When omitted, the
        global ``np.random`` state is used.

    Returns
    -------
    tuple of two lists: ``[u] + train_items`` and ``[u] + held_out_items``,
    with the held-out items in ascending order.

    Raises
    ------
    ValueError
        If ``p`` is outside [0, 1].
    """
    if not 0 <= p <= 1:
        raise ValueError("p must be between 0 and 1, got {}".format(p))

    items = np.asarray(i_l)
    n_train = int(np.floor(len(items) * p))

    sampler = np.random if rng is None else rng
    # A single shuffle of the positions gives both parts at once.
    order = sampler.permutation(len(items))

    train = list(items[order[:n_train]])
    # Sorted to keep the ordering the held-out part has always been written in.
    test = list(np.sort(items[order[n_train:]]))
    return ([u] + train, [u] + test)

In [12]:
# One (train_row, test_row) pair per user.
interactions_l = [
    train_test_split(user_id, user_items)
    for user_id, user_items in zip(interactions_df['user'], interactions_df['item'])
]

In [13]:
# Separate the (train_row, test_row) pairs into two parallel lists.
train = [pair[0] for pair in interactions_l]
test = [pair[1] for pair in interactions_l]

In [14]:
# First row of each split; every row is [user, item, item, ...].
print(*(rows[0] for rows in (train, test)))

[0, 4, 1, 3, 0] [0, 2, 5]


Now let's take 10% of the training set (which was 80% of the data) as validation

In [15]:
# Each train row is [user, item, item, ...]; split its items again 90/10.
tr_interactions_l = [
    train_test_split(user_id, user_items, p=0.9)
    for user_id, *user_items in train
]

In [16]:
# Separate the (train_row, valid_row) pairs; `train` now holds the reduced training rows.
train = [pair[0] for pair in tr_interactions_l]
valid = [pair[1] for pair in tr_interactions_l]

In [20]:
# Second row of each of the three splits.
print(*(rows[1] for rows in (train, valid, test)))

[1, 7, 8, 10] [1, 6] [1, 9]


In [18]:
# Smallest number of items any user has in test and in validation
# (the leading user id of each row is not counted).
print(
    min(len(row[1:]) for row in test),
    min(len(row[1:]) for row in valid),
)

1 1


In [19]:
train_fname = DATA_PATH/'train.txt'
valid_fname = DATA_PATH/'valid.txt'
test_fname = DATA_PATH/'test.txt'

# Each split is written as one space-separated row per user: user id, then items.
# newline='' is what the csv module requires for files handed to csv.writer;
# without it, platforms that translate '\n' on write emit '\r\r\n' line endings.
for fname, rows in ((train_fname, train), (valid_fname, valid), (test_fname, test)):
    with open(fname, 'w', newline='') as fh:
        csv.writer(fh, delimiter=' ').writerows(rows)