## Amazon Movies Data Preparation

To start with simply download the dataset:

`wget http://snap.stanford.edu/data/amazon/productGraph/categoryFiles/reviews_Movies_and_TV_5.json.gz`

In [1]:
import numpy as np
import pandas as pd
import scipy.sparse as sp
import csv

from tqdm import tqdm
from pathlib import Path

In [2]:
DATA_PATH = Path("/Users/javier/ml_exercises_python/RecoTour/Amazon/neural_graph_cf/Data/amazon-movies/")
reviews = "reviews_Movies_and_TV_5.json.gz"

In [3]:
df = pd.read_json(DATA_PATH/reviews, lines=True)
keep_cols = ['reviewerID', 'asin', 'unixReviewTime', 'overall']
new_colnames = ['user', 'item', 'timestamp', 'rating']
df = df[keep_cols]
df.columns = new_colnames
df.head()

Unnamed: 0,user,item,timestamp,rating
0,ADZPIG9QOCDG5,5019281,1203984000,4
1,A35947ZP82G7JH,5019281,1388361600,3
2,A3UORV8A9D5L2E,5019281,1388361600,3
3,A1VKW06X1O2X7V,5019281,1202860800,5
4,A3R27T4HADWFFJ,5019281,1387670400,4


In [4]:
df.rating.value_counts()

5    906608
4    382994
3    201302
1    104219
2    102410
Name: rating, dtype: int64

a lot of people seem to love the movies they watch. There are more 5s that 1,2,3 and 4s together. 

For convenience later, let's now sort values based on `timestamp`. This will be useful later in the process.

In [5]:
df.sort_values(['user','timestamp'], ascending=[True,True], inplace=True)
df.reset_index(inplace=True, drop=True)
df.head()

Unnamed: 0,user,item,timestamp,rating
0,A00295401U6S2UG3RAQSZ,0767015533,1353196800,4
1,A00295401U6S2UG3RAQSZ,0792838084,1353196800,4
2,A00295401U6S2UG3RAQSZ,6304484054,1353196800,4
3,A00295401U6S2UG3RAQSZ,6305182205,1353196800,4
4,A00295401U6S2UG3RAQSZ,B00004W22I,1353196800,4


Let's map users and items to continuous integers

In [6]:
def map_user_items(df):

    dfc = df.copy()
    user_mappings = {k:v for v,k in enumerate(dfc.user.unique())}
    item_mappings = {k:v for v,k in enumerate(dfc.item.unique())}

    user_list = pd.DataFrame.from_dict(user_mappings, orient='index').reset_index()
    user_list.columns = ['orig_id', 'remap_id']
    item_list = pd.DataFrame.from_dict(item_mappings, orient='index').reset_index()
    item_list.columns = ['orig_id', 'remap_id']
    user_list.to_csv(DATA_PATH/'user_list.txt', sep=" ", index=False)
    item_list.to_csv(DATA_PATH/'item_list.txt', sep=" ", index=False)    

    dfc['user'] = dfc['user'].map(user_mappings).astype(np.int64)
    dfc['item'] = dfc['item'].map(item_mappings).astype(np.int64)
        
    return user_mappings, item_mappings, dfc

In [7]:
user_mappings, item_mappings, dfm = map_user_items(df)
dfm.head()

Unnamed: 0,user,item,timestamp,rating
0,0,0,1353196800,4
1,0,1,1353196800,4
2,0,2,1353196800,4
3,0,3,1353196800,4
4,0,4,1353196800,4


###  Train/Test split Method 1

This method is designed to reproduce [Xiang Wang et al. 2019](https://arxiv.org/pdf/1905.08108.pdf) paper. 

In [8]:
df1 = dfm[['user', 'item']]

In [9]:
def f(df):
    keys, values = df.sort_values('user').values.T
    ukeys, index = np.unique(keys, True)
    arrays = np.split(values, index[1:])
    df2 = pd.DataFrame({'user':ukeys, 'item':[list(a) for a in arrays]})
    return df2

In [10]:
interactions_df = f(df1)
interactions_df.head()

Unnamed: 0,user,item
0,0,"[0, 1, 2, 3, 4, 5]"
1,1,"[6, 7, 8, 9, 10]"
2,2,"[20, 19, 18, 17, 16, 11, 14, 13, 12, 15]"
3,3,"[25, 24, 21, 22, 23]"
4,4,"[34, 32, 31, 30, 33, 28, 27, 26, 29]"


The split strategy we will follow is: 80% training, 20% testing. 

Then, 10% of the training we'll be as validation to tune parameters. Once tuned, one would merge train+validation and re-train with the best performing params

In [11]:
def train_test_split(u, i_l, p=0.8):
    s = np.floor(len(i_l)*p).astype('int')
    train = list(np.random.choice(i_l, s, replace=False))
    test  = list(np.setdiff1d(i_l, train))
    return ([u]+train, [u]+test)

In [12]:
interactions_l = [train_test_split(r['user'], r['item']) for i,r in interactions_df.iterrows()]

In [13]:
train = [interactions_l[i][0] for i in range(len(interactions_l))]
test =  [interactions_l[i][1] for i in range(len(interactions_l))]

In [14]:
print(train[0], test[0])

[0, 4, 1, 3, 0] [0, 2, 5]


Now let's take 10% of the train (which was 80%) as validation

In [15]:
tr_interactions_l = [train_test_split(t[0], t[1:], p=0.9) for t in train]

In [16]:
train = [tr_interactions_l[i][0] for i in range(len(tr_interactions_l))]
valid = [tr_interactions_l[i][1] for i in range(len(tr_interactions_l))]

In [20]:
print(train[1], valid[1], test[1])

[1, 7, 8, 10] [1, 6] [1, 9]


In [18]:
print(min([len(t[1:]) for t in test]), min([len(v[1:]) for v in valid]))

1 1


In [19]:
train_fname = DATA_PATH/'train.txt'
valid_fname = DATA_PATH/'valid.txt'
test_fname = DATA_PATH/'test.txt'

with open(train_fname, 'w') as trf, open(valid_fname, 'w') as vaf, open(test_fname, 'w') as tef:
    trwrt = csv.writer(trf, delimiter=' ')
    vawrt = csv.writer(vaf, delimiter=' ')
    tewrt = csv.writer(tef, delimiter=' ')
    trwrt.writerows(train)
    vawrt.writerows(valid)
    tewrt.writerows(test)

### Train/Test split Method 2

Based on the code [here](https://github.com/sh0416/bpr/blob/master/preprocess.py).

Let's start from the top since we need the `timestamp` column for this approach 

In [21]:
dfm.head()

Unnamed: 0,user,item,timestamp,rating
0,0,0,1353196800,4
1,0,1,1353196800,4
2,0,2,1353196800,4
3,0,3,1353196800,4
4,0,4,1353196800,4


In [22]:
user_size = len(dfm['user'].unique())
item_size = len(dfm['item'].unique())

In [23]:
def create_user_list(df, user_size):
    user_list = [dict() for u in range(user_size)]
    for idx, row in tqdm(df.iterrows(), total=df.shape[0]):
        # this could be .append()
        user_list[row['user']][row['item']] = row['timestamp']
    return user_list

We are going to create a list of dictionaries, one per user, where the keys are the item_ids and the values are the corresponding `timestamps`.

In [24]:
total_user_list = create_user_list(dfm, user_size)

100%|██████████| 1697533/1697533 [03:35<00:00, 7884.81it/s]


Let's have a look to two users

In [25]:
print(total_user_list[0])
print(total_user_list[950])

{0: 1353196800, 1: 1353196800, 2: 1353196800, 3: 1353196800, 4: 1353196800, 5: 1353628800}
{8660: 1262822400, 9104: 1361923200, 9105: 1362009600, 9106: 1383696000, 9107: 1403136000}


The following function will split train and test following 2 approaches. 

1. Train with the 1st n-1 items and test with the most recent item. In this scenario, one would apply this approach to the train dataset as well for validation/tunning. The merge train+validation and re-train again with the best performing params 
2. Split at random. As before, one has perform hyperparameter optimisation, we could merge train and validation, re-train and test on the "un-touched" test dataset

The function can take a list of dictionaries where the values are the corresponding `timestamsp` or simply a list. In the later case, the list is assumed to be sorted in increasing timestamp order, so that the most recent interaction is the last one

In [26]:
def split_train_test(user_list, p=0.8, time_order=False):

    train_user_list = [None] * len(user_list)
    test_user_list  = [None] * len(user_list)

    for user, item_list in enumerate(user_list):
        if time_order:
            # Choose latest item
            if isinstance(item_list, dict):
                # if dict, sort by timestamp
                items = [i[0] for i in sorted(item_list.items(), key=lambda x: x[1])]
                test_user_list[user]  = items[-1]
                train_user_list[user] = items[:-1]
            elif isinstance(item_list, (list, np.ndarray)):
                print('I assume "user_list" is sorted with the most recent item being the last one.')
                test_user_list[user]  = item_list[-1]
                train_user_list[user] = item_list[:-1]
        else:
            # Random select
            if isinstance(item_list, dict):
                items =list(item_list.keys())
                sz = np.floor(len(items)*p).astype('int')
                train_user_list[user] = np.random.choice(items, sz, replace=False)
                test_user_list[user] = np.setdiff1d(items, train_user_list[user])
            elif isinstance(item_list, (list, np.ndarray)):
                sz = np.floor(len(item_list)*p).astype('int')
                train_user_list[user] = np.random.choice(item_list, sz, replace=False)
                test_user_list[user] = np.setdiff1d(item_list, train_user_list[user])                

    return train_user_list, test_user_list

In [27]:
train_user_list, test_user_list = split_train_test(total_user_list)

Let's have a look

In [28]:
train_user_list[0], test_user_list[1]

(array([4, 5, 1, 2]), array([10]))

Now we repeat the process for train/valid split

In [29]:
train_user_list, valid_user_list = split_train_test(train_user_list, p=0.9)

In [30]:
train_user_list[0], valid_user_list[0]

(array([1, 4, 2]), array([5]))

In [31]:
def create_pair(user_list):
    pair = []
    for user, item_set in enumerate(user_list):
        pair.extend([(user, item) for item in item_set])
    return pair

In [32]:
train_pair = create_pair(train_user_list)

In [33]:
train_pair[:10]

[(0, 1),
 (0, 4),
 (0, 2),
 (1, 9),
 (1, 6),
 (1, 7),
 (2, 13),
 (2, 18),
 (2, 20),
 (2, 12)]

We will save all the information we need for this method in sparse matrices. Given the fact that I took part of this code from [this repo](https://github.com/sh0416/bpr/blob/master/preprocess.py), I am going to honour their naming. 

In [35]:
def fill_sp_mtx(dataset, rows, columns):
    R = sp.dok_matrix((user_size, item_size), dtype=np.float32) 
    for u, itemset in enumerate(dataset):
        for i in itemset:
            R[u, i] = 1
    return R.tocsr()

In [36]:
train_w = fill_sp_mtx(train_user_list, user_size, item_size)

In [37]:
train_w

<123960x50052 sparse matrix of type '<class 'numpy.float32'>'
	with 1121051 stored elements in Compressed Sparse Row format>

In [38]:
valid_w = fill_sp_mtx(valid_user_list, user_size, item_size)
test_w = fill_sp_mtx(test_user_list, user_size, item_size)

In [39]:
valid_w

<123960x50052 sparse matrix of type '<class 'numpy.float32'>'
	with 191075 stored elements in Compressed Sparse Row format>

In [40]:
test_w

<123960x50052 sparse matrix of type '<class 'numpy.float32'>'
	with 385407 stored elements in Compressed Sparse Row format>

In [41]:
np.savez(DATA_PATH/"amazon_movies.npz", train_w=train_w, valid_w=valid_w, test_w=test_w, 
         train_pair=train_pair, n_users=user_size, n_items=item_size)