In [1]:
import os
import sys
import glob
import fileinput
from tqdm import tqdm

import numpy as np
import scipy
import scipy.sparse
import pandas as pd

from src.utils import concatenate_files, train_tune_test_split, filter_rows, split_train_test_proportion

## Constants

In [3]:
# Every constant used by this notebook is defined in this cell.

# Root folders: raw inputs live under one tree, cleaned outputs under another.
_raw_root = './data/raw'
_clean_root = './data/clean'

# Per-dataset raw input directories.
foursquare_raw_DIR = _raw_root + '/foursquare'
gowalla_raw_DIR = _raw_root + '/gowalla'
ml100_raw_DIR = _raw_root + '/ml-100k'
ml25_raw_DIR = _raw_root + '/ml-25m'

# Per-dataset output directories for the processed splits.
foursquare_clean_DIR = _clean_root + '/foursquare'
gowalla_clean_DIR = _clean_root + '/gowalla'
ml100_clean_DIR = _clean_root + '/ml-100k'
ml25_clean_DIR = _clean_root + '/ml-25m'

# Input files read (or produced by concatenation) for each dataset.
foursquare_checkins_file = foursquare_raw_DIR + '/checkins'
foursquare_pois_file = foursquare_raw_DIR + '/pois'
gowalla_checkins_file = gowalla_raw_DIR + '/checkins'
gowalla_pois_file = gowalla_raw_DIR + '/pois'
ml100_ratings_file = ml100_raw_DIR + '/ratings.csv'
ml25_ratings_file = ml25_raw_DIR + '/ratings'

# Maximum number of rows read from each interaction file.
foursquare_clicks = 1_000_000
gowalla_clicks = 1_000_000
ml_25_clicks = 20_000_000

# Auto-reload imported modules before each cell executes, so edits to
# external code (e.g. src.utils) are picked up without restarting the kernel.
%load_ext autoreload
%autoreload 2



The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


## Concatenating input files
- Because of GitHub's file size limit, the datasets are split into small pieces.
- The pieces must first be concatenated into a single file per dataset.
- This only needs to be done once to get one file for each dataset.

In [4]:
# (output file, glob matching its split pieces) for every dataset file that
# is shipped in chunks; processed in the order listed.
pieces_to_merge = [
    (foursquare_checkins_file, f"{foursquare_raw_DIR}/*checkins_0*"),
    (foursquare_pois_file, f"{foursquare_raw_DIR}/*pois_0*"),
    (gowalla_checkins_file, f"{gowalla_raw_DIR}/*checkins_0*"),
    (gowalla_pois_file, f"{gowalla_raw_DIR}/*pois_0*"),
    (ml25_ratings_file, f"{ml25_raw_DIR}/*ratings_0*"),
]

for merged_path, pieces_glob in pieces_to_merge:
    concatenate_files(merged_path, pieces_glob)

## Processing

In [4]:
# Load the (user, item) columns of the first `*_clicks` rows of each check-in file.
#
# `on_bad_lines='warn'` replaces `error_bad_lines=False`, which was deprecated
# in pandas 1.3 and removed in pandas 2.0 (where it raises TypeError).
# 'warn' keeps the old behaviour: malformed rows are skipped and reported.
# Requires pandas >= 1.3.
foursquare_checkins = pd.read_csv(
    foursquare_checkins_file,
    sep='\t',  # the foursquare file is read as tab-separated
    usecols=[0, 1],
    names=['user', 'item'],
    nrows=foursquare_clicks,
    on_bad_lines='warn',
)
gowalla_checkins = pd.read_csv(
    gowalla_checkins_file,
    usecols=[0, 1],
    names=['user', 'item'],
    nrows=gowalla_clicks,
    on_bad_lines='warn',
)

# The MovieLens loads are disabled. Uncomment them before using 'ml25' / 'ml100'
# in the cells below, which reference `ml25_ratings` and `ml100_ratings`.
# ml25_ratings = pd.read_csv(ml25_ratings_file, on_bad_lines='warn', header=0, nrows=ml_25_clicks, usecols=[0, 1, 2], names=['user', 'item', 'rating'])
# ml100_ratings = pd.read_csv(ml100_ratings_file, on_bad_lines='warn', header=0, usecols=[0, 1, 2], names=['user', 'item', 'rating'])

In [73]:
# Keep only the rows whose rating is strictly above the threshold.
#
# The MovieLens frames exist only when their (currently commented-out)
# read_csv calls have been run, so each filter is skipped when its frame is
# missing instead of raising NameError on a fresh kernel.
RATING_THRESHOLD = 3.5

if 'ml100_ratings' in globals():
    ml100_ratings = ml100_ratings[ml100_ratings['rating'] > RATING_THRESHOLD]
if 'ml25_ratings' in globals():
    ml25_ratings = ml25_ratings[ml25_ratings['rating'] > RATING_THRESHOLD]

In [5]:
def get_df(df_name):
    """Return ``[dataframe, clean_dir]`` for a dataset name, or ``None`` if unknown.

    The lookup is lazy: only the globals belonging to the requested dataset
    are read, so 'foursquare' or 'gowalla' can be fetched even when the
    MovieLens frames have not been loaded into the notebook namespace.

    Args:
        df_name: one of 'foursquare', 'gowalla', 'ml25', 'ml100'.

    Returns:
        A two-element list ``[interactions_dataframe, clean_output_dir]``, or
        ``None`` when ``df_name`` is not one of the known dataset names.

    Raises:
        NameError: if the requested dataset's dataframe (or its directory
            constant) has not been defined yet.
    """
    # Each entry is a thunk so that names are resolved only when selected.
    dfs = {
        'foursquare': lambda: [foursquare_checkins, foursquare_clean_DIR],
        'gowalla': lambda: [gowalla_checkins, gowalla_clean_DIR],
        'ml25': lambda: [ml25_ratings, ml25_clean_DIR],
        'ml100': lambda: [ml100_ratings, ml100_clean_DIR],
    }
    resolve = dfs.get(df_name)
    return None if resolve is None else resolve()

In [7]:
# Number of users held out for validation, and the same number again for
# test, per dataset. 'ml25' has no entry: add one before enabling it below.
heldout_users_by_dataset = {'foursquare': 2000, 'gowalla': 500, 'ml100': 100}

# for df_name in tqdm(['foursquare', 'gowalla', 'ml25', 'ml100']):
for df_name in tqdm(['foursquare', 'gowalla']):
    df_info = get_df(df_name)
    df, clean_dir = df_info[0], df_info[1]

    # Fail early with a clear message instead of silently reusing the
    # held-out size left over from the previous dataset.
    if df_name not in heldout_users_by_dataset:
        raise ValueError(f"No held-out user count configured for dataset '{df_name}'")
    n_heldout_users = heldout_users_by_dataset[df_name]

    # filter_rows comes from src.utils; it is used here as returning the
    # filtered interactions plus per-user and per-item tables
    # (presumably activity / popularity counts - confirm in src.utils).
    raw_data, user_activity, item_popularity = filter_rows(df)

    # Fraction of the user x item matrix that is filled (reported as "sparsity").
    sparsity = 1. * raw_data.shape[0] / (user_activity.shape[0] * item_popularity.shape[0])

    print(f"In {df_name}, after filtering, there are {raw_data.shape[0]:d} watching events "
          f"from {user_activity.shape[0]:d} users and {item_popularity.shape[0]:d} items "
          f"(sparsity: {sparsity * 100:.3f}%)")

    unique_uid = user_activity.index

    # Re-seed for every dataset so each shuffle is reproducible on its own.
    np.random.seed(98765)
    idx_perm = np.random.permutation(unique_uid.size)
    unique_uid = unique_uid[idx_perm]

    # create train/validation/test users
    n_users = unique_uid.size

    # With too few users the slice bounds below go negative and the three
    # groups would overlap or be wrong, which the size asserts cannot detect.
    if n_users <= 2 * n_heldout_users:
        raise ValueError(
            f"Dataset '{df_name}' has {n_users} users, which is not enough to hold out "
            f"2 x {n_heldout_users} users for validation and test"
        )

    tr_users = unique_uid[:(n_users - n_heldout_users * 2)]
    vd_users = unique_uid[(n_users - n_heldout_users * 2): (n_users - n_heldout_users)]
    te_users = unique_uid[(n_users - n_heldout_users):]

    assert tr_users.shape[0] > 0
    assert vd_users.shape[0] > 0
    assert te_users.shape[0] > 0

    train_plays = raw_data.loc[raw_data['user'].isin(tr_users)]
    # The item vocabulary is defined by the training users only.
    unique_sid = pd.unique(train_plays['item'])

    show2id = dict((sid, i) for (i, sid) in enumerate(unique_sid))
    profile2id = dict((pid, i) for (i, pid) in enumerate(unique_uid))

    def numerize(tp):
        """Map the raw user/item ids in ``tp`` to their integer indices.

        Uses the ``profile2id`` / ``show2id`` maps of the current dataset and
        raises KeyError if ``tp`` holds an id missing from those maps.
        """
        uid = [profile2id[x] for x in tp['user']]
        sid = [show2id[x] for x in tp['item']]
        return pd.DataFrame(data={'uid': uid, 'sid': sid}, columns=['uid', 'sid'])

    # The output directory may not exist yet on a fresh checkout.
    os.makedirs(clean_dir, exist_ok=True)

    with open(os.path.join(clean_dir, 'unique_sid.txt'), 'w') as f:
        for sid in unique_sid:
            f.write('%s\n' % sid)

    # Validation / test users keep only items that appear in training, so
    # every item id can be numerized.
    vad_plays = raw_data.loc[raw_data['user'].isin(vd_users)]
    vad_plays = vad_plays.loc[vad_plays['item'].isin(unique_sid)]

    # split_train_test_proportion comes from src.utils; presumably it splits
    # each user's interactions into two parts - confirm in src.utils.
    vad_plays_tr, vad_plays_te = split_train_test_proportion(vad_plays)

    test_plays = raw_data.loc[raw_data['user'].isin(te_users)]
    test_plays = test_plays.loc[test_plays['item'].isin(unique_sid)]

    test_plays_tr, test_plays_te = split_train_test_proportion(test_plays)

    train_data = numerize(train_plays)
    train_data.to_csv(os.path.join(clean_dir, 'train.csv'), index=False)

    vad_data_tr = numerize(vad_plays_tr)
    vad_data_tr.to_csv(os.path.join(clean_dir, 'validation_tr.csv'), index=False)

    vad_data_te = numerize(vad_plays_te)
    vad_data_te.to_csv(os.path.join(clean_dir, 'validation_te.csv'), index=False)

    test_data_tr = numerize(test_plays_tr)
    test_data_tr.to_csv(os.path.join(clean_dir, 'test_tr.csv'), index=False)

    test_data_te = numerize(test_plays_te)
    test_data_te.to_csv(os.path.join(clean_dir, 'test_te.csv'), index=False)
    
    
    

  0%|          | 0/2 [00:00<?, ?it/s]

In foursquare, after filtering, there are 286433 watching events from 28932 users and 35804 items (sparsity: 0.028%)
0 users sampled
1000 users sampled
0 users sampled
1000 users sampled


 50%|█████     | 1/2 [00:05<00:05,  5.61s/it]

In gowalla, after filtering, there are 548990 watching events from 5815 users and 40504 items (sparsity: 0.233%)
0 users sampled
0 users sampled


100%|██████████| 2/2 [00:07<00:00,  3.70s/it]
