In [1]:
import os
import sys
import glob
import fileinput
from tqdm import tqdm

import numpy as np
import scipy
import scipy.sparse
import pandas as pd

from src.utils import concatenate_files, train_tune_test_split, filter_rows, split_train_test_proportion

## Constants

In [2]:
# All the constants of this notebook are defined in this cell

foursquare_raw_DIR = './data/raw/foursquare'
gowalla_raw_DIR = './data/raw/gowalla'
ml100_raw_DIR = './data/raw/ml-100k'
ml25_raw_DIR = './data/raw/ml-25m'
ml20_raw_DIR = './data/raw/ml-20m'

foursquare_clean_DIR = './data/clean/foursquare'
gowalla_clean_DIR = './data/clean/gowalla'
ml100_clean_DIR = './data/clean/ml-100k'
ml25_clean_DIR = './data/clean/ml-25m'
ml20_clean_DIR = './data/clean/ml-20m'

foursquare_checkins_file = f"{foursquare_raw_DIR}/checkins"
foursquare_pois_file = f"{foursquare_raw_DIR}/pois"
gowalla_checkins_file = f"{gowalla_raw_DIR}/checkins"
gowalla_pois_file = f"{gowalla_raw_DIR}/pois"
ml100_ratings_file = f"{ml100_raw_DIR}/ratings.csv"
ml25_ratings_file = f"{ml25_raw_DIR}/ratings"
ml20_ratings_file = f"{ml20_raw_DIR}/ratings.csv"

foursquare_clicks = 20000000
gowalla_clicks = 20000000
ml_25_clicks = 20000000

# for auto-reloading external modules
%load_ext autoreload
%autoreload 2

# France bounding box
lon_min, lat_min, lon_max, lat_max = -10, 35, 15, 55

## Concatenating input files
- because of file size limitation of github the datasets are splitted in small pieces.
- they must be first concatenating into one file
- so just do this once to get one file for each dataset

In [15]:
concatenate_files(foursquare_checkins_file, f"{foursquare_raw_DIR}/*checkins_0*")
concatenate_files(foursquare_pois_file, f"{foursquare_raw_DIR}/*pois_0*")

concatenate_files(gowalla_checkins_file, f"{gowalla_raw_DIR}/*checkins_0*")
concatenate_files(gowalla_pois_file, f"{gowalla_raw_DIR}/*pois_0*")

concatenate_files(ml25_ratings_file, f"{ml25_raw_DIR}/*ratings_0*")
concatenate_files(ml20_ratings_file, f"{ml20_raw_DIR}/*ratings*")

20000264it [00:19, 1015985.26it/s]


## Processing

In [9]:
foursquare_checkins = pd.read_csv(foursquare_checkins_file, error_bad_lines=False, nrows=foursquare_clicks, sep='\t', usecols=[0,1], names=['user', 'item'])
gowalla_checkins = pd.read_csv(gowalla_checkins_file, error_bad_lines=False, nrows=gowalla_clicks, usecols=[0,1], names=['user', 'item'])
# ml25_ratings = pd.read_csv(ml25_ratings_file, error_bad_lines=False, header=0, nrows=ml_25_clicks, usecols=[0, 1, 2], names=['user', 'item', 'rating'])
ml20_ratings = pd.read_csv(ml20_ratings_file, error_bad_lines=False, header=0, usecols=[0, 1, 2], names=['user', 'item', 'rating'])

  interactivity=interactivity, compiler=compiler, result=result)


In [112]:
# To get latitude and longitude we also load the 'items' features (including latitudes, longitudes)
foursquare_pois = pd.read_csv(foursquare_pois_file, error_bad_lines=False, sep='\t', usecols=[0,1, 2], names=['item', 'lat', 'lon'])
# To get latitude and longitude we also load the 'items' features (including latitudes, longitudes)
gowalla_pois = pd.read_csv(gowalla_pois_file, error_bad_lines=False, usecols=[0,2,3], names=['item', 'lon', 'lat'])



  interactivity=interactivity, compiler=compiler, result=result)


In [4]:
gowalla_checkins.head()

Unnamed: 0,user,item
0,173812,452177
1,173812,829825
2,173812,196503
3,173812,1489546
4,173812,264217


In [5]:
gowalla_checkins.shape

(20000000, 2)

In [6]:
gowalla_checkins = gowalla_checkins.dropna()

In [131]:
gowalla_checkins.shape

(29712257, 4)

In [119]:
# we merge the checkins df with the pois features df
gowalla_checkins = pd.merge(left=gowalla_checkins, right=gowalla_pois, left_on='item', right_on='item')

In [122]:
gowalla_checkins[gowalla_checkins.lon.apply(lambda x: x.isnumeric())]

AttributeError: 'float' object has no attribute 'isnumeric'

In [7]:
gowalla_checkins['lon'] = pd.to_numeric(gowalla_checkins['lon'], errors='coerce')

KeyError: 'lon'

In [133]:
# we select checkins in France only in order to reduce the dataframe size
france_gowalla_checkins = gowalla_checkins[(gowalla_checkins['lon']>lon_min) & 
           (gowalla_checkins['lon'] < lon_max) & 
           (gowalla_checkins['lat'] > lat_min) & 
           (gowalla_checkins['lat'] < lat_max)]

In [134]:
# we merge the checkins df with the pois features df
foursquare_checkins = pd.merge(left=foursquare_checkins, right=foursquare_pois, left_on='item', right_on='item')

In [49]:
interactions = foursquare_checkins.shape[0]
print(f"Total interactions count in Foursquare: {interactions}")

Total interactions count in Foursquare: 24999920


In [50]:
foursquare_checkins.head()

Unnamed: 0,user,item,lat,lon
0,63752,4cc5a02c1e596dcbc639d967,19.428402,-99.161847
1,93757,4cc5a02c1e596dcbc639d967,19.428402,-99.161847
2,63752,4cc5a02c1e596dcbc639d967,19.428402,-99.161847
3,93757,4cc5a02c1e596dcbc639d967,19.428402,-99.161847
4,63752,4cc5a02c1e596dcbc639d967,19.428402,-99.161847


In [94]:
gowalla_checkins.head()

Unnamed: 0,user,item,lon,lat
0,173812,452177,-98.146598,29.721601
1,173812,452177,-98.146598,29.721601
2,173812,452177,-98.146598,29.721601
3,173812,452177,-98.146598,29.721601
4,173812,452177,-98.146598,29.721601


In [135]:
# we select checkins in France only in order to reduce the dataframe size
france_foursquare_checkins = foursquare_checkins[(foursquare_checkins['lon']>lon_min) & 
           (foursquare_checkins['lon'] < lon_max) & 
           (foursquare_checkins['lat'] > lat_min) & 
           (foursquare_checkins['lat'] < lat_max)]

users = france_foursquare_checkins['user'].nunique()
items = france_foursquare_checkins['item'].nunique()

print(f"In France we have:\n\t- distinct users count: {users}\n\t- distinct items count: {items}")

In France we have:
	- distinct users count: 33130
	- distinct items count: 260160


In [18]:
# ml100_ratings = ml100_ratings[ml100_ratings['rating'] > 3.5]
# ml25_ratings = ml25_ratings[ml25_ratings['rating'] > 3.5]
ml20_ratings['rating'] = pd.to_numeric(ml20_ratings['rating'], errors='coerce')
ml20_ratings = ml20_ratings[ml20_ratings['rating'] > 3.5]

In [10]:
def get_df(df_name):
    dfs = {
        'foursquare': [foursquare_checkins, foursquare_clean_DIR],
        'gowalla': [gowalla_checkins, gowalla_clean_DIR],
        'ml20': [ml20_ratings, ml20_clean_DIR],
#         'ml25': [ml25_ratings, ml25_clean_DIR],
#         'ml100': [ml100_ratings, ml100_clean_DIR ]
    }
    return dfs.get(df_name)

In [24]:
df_name = 'ml20'
df_info = get_df(df_name)
df, clean_dir = df_info[0], df_info[1] 

min_uc=55
min_sc=10


raw_data, user_activity, item_popularity = filter_rows(df, min_uc=min_uc, min_sc=min_sc)
sparsity = 1. * raw_data.shape[0] / (user_activity.shape[0] * item_popularity.shape[0])

print(f"In {df_name}, after filtering, there are %d watching events from %d users and %d items (sparsity: %.3f%%)" % 
      (raw_data.shape[0], user_activity.shape[0], item_popularity.shape[0], sparsity * 100))
    

In ml20, after filtering, there are 7811179 watching events from 51700 users and 12890 items (sparsity: 1.172%)


In [11]:
for df_name in tqdm(['ml20']):
# for df_name in tqdm(['foursquare', 'gowalla']):
    df_info = get_df(df_name)
    df, clean_dir = df_info[0], df_info[1] 
    
    min_uc=5
    min_sc=5
    if df_name == 'foursquare':
        n_heldout_users = 10000
        min_uc=38
        min_sc=80
    if df_name == 'gowalla':
        n_heldout_users = 10000
        min_uc=15
        min_sc=95
    if df_name == 'ml20':
        n_heldout_users = 10000
        min_uc=55
        min_sc=10
        
    raw_data, user_activity, item_popularity = filter_rows(df, min_uc=min_uc, min_sc=min_sc)
    sparsity = 1. * raw_data.shape[0] / (user_activity.shape[0] * item_popularity.shape[0])

    print(f"In {df_name}, after filtering, there are %d watching events from %d users and %d items (sparsity: %.3f%%)" % 
      (raw_data.shape[0], user_activity.shape[0], item_popularity.shape[0], sparsity * 100))
    
    unique_uid = user_activity.index

    np.random.seed(98765)
    idx_perm = np.random.permutation(unique_uid.size)
    unique_uid = unique_uid[idx_perm]
    
    # create train/validation/test users
    n_users = unique_uid.size  
    
    tr_users = unique_uid

#     tr_users = unique_uid[:(n_users - n_heldout_users * 2)]
#     te_users = unique_uid[(n_users - n_heldout_users):]
    
    print(f"tr_users: {tr_users.shape[0]}")
    
    assert tr_users.shape[0] > 0
    
    train_plays = raw_data.loc[raw_data['user'].isin(tr_users)]
    unique_sid = pd.unique(train_plays['item'])
    train_plays_tr, train_plays_te = split_train_test_proportion(train_plays)
    
    train_data_tr = numerize(train_plays_tr)
    train_data_tr.to_csv(os.path.join(clean_dir, 'train_tr.csv'), index=False)
    train_data_te = numerize(train_plays_te)
    train_data_te.to_csv(os.path.join(clean_dir, 'train_te.csv'), index=False)
        
#     vad_plays = raw_data.loc[raw_data['user'].isin(vd_users)]
#     vad_plays = vad_plays.loc[vad_plays['item'].isin(unique_sid)]
    
#     vad_plays_tr, vad_plays_te = split_train_test_proportion(vad_plays)
    
#     test_plays = raw_data.loc[raw_data['user'].isin(te_users)]
#     test_plays = test_plays.loc[test_plays['item'].isin(unique_sid)]
    
#     test_plays_tr, test_plays_te = split_train_test_proportion(test_plays)
         
    show2id = dict((sid, i) for (i, sid) in enumerate(unique_sid))
    profile2id = dict((pid, i) for (i, pid) in enumerate(unique_uid))
    
    def numerize(tp):
        uid = list(map(lambda x: profile2id[x], tp['user']))
        sid = list(map(lambda x: show2id[x], tp['item']))
        return pd.DataFrame(data={'uid': uid, 'sid': sid}, columns=['uid', 'sid'])

    with open(os.path.join(clean_dir, 'unique_sid.txt'), 'w') as f:
        for sid in unique_sid:
            f.write('%s\n' % sid)
            
#     vad_data_tr = numerize(vad_plays_tr)
#     vad_data_tr.to_csv(os.path.join(clean_dir, 'validation_tr.csv'), index=False)

#     vad_data_te = numerize(vad_plays_te)
#     vad_data_te.to_csv(os.path.join(clean_dir, 'validation_te.csv'), index=False)
    
#     test_data_tr = numerize(test_plays_tr)
#     test_data_tr.to_csv(os.path.join(clean_dir, 'test_tr.csv'), index=False)
    
#     test_data_te = numerize(test_plays_te)
#     test_data_te.to_csv(os.path.join(clean_dir, 'test_te.csv'), index=False)
    
    
    

  0%|          | 0/1 [00:00<?, ?it/s]

In ml20, after filtering, there are 18009017 watching events from 80000 users and 18028 items (sparsity: 1.249%)
tr_users: 80000
0 users sampled
1000 users sampled
2000 users sampled
3000 users sampled
4000 users sampled
5000 users sampled
6000 users sampled
7000 users sampled
8000 users sampled
9000 users sampled
10000 users sampled
11000 users sampled
12000 users sampled
13000 users sampled
14000 users sampled
15000 users sampled
16000 users sampled
17000 users sampled
18000 users sampled
19000 users sampled
20000 users sampled
21000 users sampled
22000 users sampled
23000 users sampled
24000 users sampled
25000 users sampled
26000 users sampled
27000 users sampled
28000 users sampled
29000 users sampled
30000 users sampled
31000 users sampled
32000 users sampled
33000 users sampled
34000 users sampled
35000 users sampled
36000 users sampled
37000 users sampled
38000 users sampled
39000 users sampled
40000 users sampled
41000 users sampled
42000 users sampled
43000 users sampled
4400

  0%|          | 0/1 [01:50<?, ?it/s]


KeyError: 22