In [17]:
import os
import sys
import glob
import fileinput
from tqdm import tqdm

import numpy as np
import scipy
import pandas as pd

from src.utils import concatenate_files, train_tune_test_split

## Constants

In [7]:
# Notebook-wide configuration: every path and tunable value used below is set here.

# Folders holding the raw input of each dataset
foursquare_raw_DIR = './data/raw/foursquare'
gowalla_raw_DIR = './data/raw/gowalla'
ml100_raw_DIR = './data/raw/ml-100k'
ml25_raw_DIR = './data/raw/ml-25m'

# Folders receiving the processed output of each dataset
foursquare_clean_DIR = './data/clean/foursquare'
gowalla_clean_DIR = './data/clean/gowalla'
ml100_clean_DIR = './data/clean/ml-100k'
ml25_clean_DIR = './data/clean/ml-25m'

# Raw input files, located inside the raw folders above
foursquare_checkins_file = foursquare_raw_DIR + '/checkins'
foursquare_pois_file = foursquare_raw_DIR + '/pois'
gowalla_checkins_file = gowalla_raw_DIR + '/checkins'
gowalla_pois_file = gowalla_raw_DIR + '/pois'
ml100_ratings_file = ml100_raw_DIR + '/ratings.csv'
ml25_ratings_file = ml25_raw_DIR + '/ratings'

# Maximum number of rows read from the large raw files (passed as `nrows` to pd.read_csv)
foursquare_clicks = 1_000_000
gowalla_clicks = 1_000_000
ml_25_clicks = 1_000_000

# France bounding box (longitude / latitude limits)
lon_min = -5
lat_min = 40
lon_max = 10
lat_max = 52

# Enable IPython's autoreload extension in mode 2 so that edits made to external
# modules (such as the helpers imported from src.utils) are picked up without
# restarting the kernel
%load_ext autoreload
%autoreload 2

## Concatenating input files
- Because of GitHub's file size limit, the datasets are split into small pieces.
- The pieces must first be concatenated into one file.
- Run this step only once to get a single file for each dataset.

In [8]:
# Rebuild every raw file from its numbered pieces, in a fixed order.
# Each entry pairs the first argument of concatenate_files (the single-file path)
# with the second one (a glob pattern selecting the pieces of that file).
# NOTE(review): argument roles are inferred from the names and the section text —
# confirm against src.utils.concatenate_files.
for merged_file, pieces_pattern in [
    (foursquare_checkins_file, f"{foursquare_raw_DIR}/*checkins_0*"),
    (foursquare_pois_file, f"{foursquare_raw_DIR}/*pois_0*"),
    (gowalla_checkins_file, f"{gowalla_raw_DIR}/*checkins_0*"),
    (gowalla_pois_file, f"{gowalla_raw_DIR}/*pois_0*"),
    (ml25_ratings_file, f"{ml25_raw_DIR}/*ratings_0*"),
]:
    concatenate_files(merged_file, pieces_pattern)

## Processing
- for each of the 4 datasets loaded, the outputs of the processing are tab-separated values (`.tsv`) files written to the corresponding `./data/clean/<dataset>` folder
- the `.tsv` files are:
    * `train.tsv` for training
    * `tune.tsv` for validation
    * `test.tsv` for test

### Foursquare

In [9]:
# Load the Foursquare check-ins into a pandas dataframe.
# - the file is tab-separated and, because `names` is given without `header`,
#   its first line is read as data
# - only the first two columns are kept and named 'user' and 'item'
# - at most `foursquare_clicks` rows are read
# - malformed lines are skipped: `on_bad_lines='skip'` replaces the former
#   `error_bad_lines=False`, which was deprecated in pandas 1.3 and removed in 2.0
foursquare_checkins = pd.read_csv(
    foursquare_checkins_file,
    sep='\t',
    usecols=[0, 1],
    names=['user', 'item'],
    nrows=foursquare_clicks,
    on_bad_lines='skip',
)

In [10]:
# Load the 'items' features so that each item can be located on the map.
# - the file is tab-separated and its first line is read as data (`names` is given)
# - only the first three columns are kept and named 'item', 'lat' and 'lon'
# - malformed lines are skipped: `on_bad_lines='skip'` replaces the former
#   `error_bad_lines=False`, which was deprecated in pandas 1.3 and removed in 2.0
foursquare_pois = pd.read_csv(
    foursquare_pois_file,
    sep='\t',
    usecols=[0, 1, 2],
    names=['item', 'lat', 'lon'],
    on_bad_lines='skip',
)

In [11]:
# Only the existence of an interaction matters, so every repeated row is removed
# and the first occurrence of each one is kept
foursquare_checkins = foursquare_checkins.loc[~foursquare_checkins.duplicated(keep='first')]

In [12]:
# Attach the POI features (lat, lon) to every check-in through an inner join on 'item'
foursquare_checkins = foursquare_checkins.merge(foursquare_pois, how='inner', on='item')

In [13]:
# Number of rows left in the merged check-ins dataframe
interactions = len(foursquare_checkins)
print(f"Total interactions count in Foursquare: {interactions}")

Total interactions count in Foursquare: 751716


In [14]:
# Reduce the dataframe size by keeping only the check-ins lying strictly inside
# the France bounding box
france_foursquare_checkins = foursquare_checkins.loc[
    foursquare_checkins['lon'].gt(lon_min)
    & foursquare_checkins['lon'].lt(lon_max)
    & foursquare_checkins['lat'].gt(lat_min)
    & foursquare_checkins['lat'].lt(lat_max)
]

# Distinct users and items remaining after the geographic filter
users = france_foursquare_checkins['user'].nunique()
items = france_foursquare_checkins['item'].nunique()

print(f"In France we have:\n\t- distinct users count: {users}\n\t- distinct items count: {items}")

In France we have:
	- distinct users count: 5801
	- distinct items count: 16726


In [18]:
# Turn the dataframe into a SciPy sparse matrix:
# count the rows of each (user, item) pair, pivot the items into columns
# (0 where a pair never occurs), then store the table in CSR format
pivotable = (
    france_foursquare_checkins
    .groupby(['user', 'item'], sort=False)
    .size()
    .unstack(fill_value=0)
)
X = scipy.sparse.csr_matrix(pivotable)
users, items = X.shape

# Share of the matrix cells holding a stored (non-zero) entry
density = X.getnnz() / np.prod(X.shape)

print(f"- users: {users}\n- items: {items}")
print(f"- density = {float(round(density * 100, 2))}%")

- users: 5801
- items: 16726
- density = 0.02%


In [21]:
# Finally, create the train, tune and test files for Foursquare from the sparse
# matrix X; the call receives foursquare_clean_DIR, the dataset's clean-data folder.
# NOTE(review): the meaning of the literal 15 is defined by train_tune_test_split
# in src.utils and is not visible in this notebook — confirm and consider naming it.
train_tune_test_split(foursquare_clean_DIR, X, 15)

5801it [00:01, 5481.63it/s]


### gowalla

In [22]:
# Load the Gowalla check-ins into a pandas dataframe.
# - the file uses the default comma separator and, because `names` is given
#   without `header`, its first line is read as data
# - only the first two columns are kept and named 'user' and 'item'
# - at most `gowalla_clicks` rows are read
# - malformed lines are skipped: `on_bad_lines='skip'` replaces the former
#   `error_bad_lines=False`, which was deprecated in pandas 1.3 and removed in 2.0
gowalla_checkins = pd.read_csv(
    gowalla_checkins_file,
    usecols=[0, 1],
    names=['user', 'item'],
    nrows=gowalla_clicks,
    on_bad_lines='skip',
)

In [23]:
# Load the 'items' features so that each item can be located on the map.
# - the file uses the default comma separator and its first line is read as data
# - columns 0, 2 and 3 are kept and named 'item', 'lon' and 'lat'
# - malformed lines are skipped: `on_bad_lines='skip'` replaces the former
#   `error_bad_lines=False`, which was deprecated in pandas 1.3 and removed in 2.0
gowalla_pois = pd.read_csv(
    gowalla_pois_file,
    usecols=[0, 2, 3],
    names=['item', 'lon', 'lat'],
    on_bad_lines='skip',
)

  interactivity=interactivity, compiler=compiler, result=result)


In [24]:
# Same as for Foursquare: every repeated row is removed and the first occurrence
# of each one is kept
gowalla_checkins = gowalla_checkins.loc[~gowalla_checkins.duplicated(keep='first')]

In [25]:
# Attach the POI features (lon, lat) to every check-in through an inner join on
# 'item', then convert the 'lon' column to a numeric dtype
gowalla_checkins = (
    gowalla_checkins
    .merge(gowalla_pois, how='inner', on='item')
    .assign(lon=lambda merged: pd.to_numeric(merged['lon']))
)

In [26]:
# Number of rows left in the merged check-ins dataframe
interactions = len(gowalla_checkins)
print(f"Total interactions count: {interactions}")

Total interactions count: 545233


In [27]:
# Reduce the dataframe size by keeping only the check-ins lying strictly inside
# the France bounding box
france_gowalla_checkins = gowalla_checkins.loc[
    gowalla_checkins['lon'].gt(lon_min)
    & gowalla_checkins['lon'].lt(lon_max)
    & gowalla_checkins['lat'].gt(lat_min)
    & gowalla_checkins['lat'].lt(lat_max)
]

In [28]:
# Turn the dataframe into a SciPy sparse matrix:
# count the rows of each (user, item) pair, pivot the items into columns
# (0 where a pair never occurs), then store the table in CSR format
pivotable = (
    france_gowalla_checkins
    .groupby(['user', 'item'], sort=False)
    .size()
    .unstack(fill_value=0)
)
X = scipy.sparse.csr_matrix(pivotable)
users, items = X.shape

# Share of the matrix cells holding a stored (non-zero) entry
density = X.getnnz() / np.prod(X.shape)

print(f"- users: {users}\n- items: {items}")
print(f"- density = {float(round(density * 100, 2))}%")

- users: 1167
- items: 24504
- density = 0.12%


In [29]:
# Finally, create the train, tune and test files for Gowalla from the sparse
# matrix X; the call receives gowalla_clean_DIR, the dataset's clean-data folder.
# NOTE(review): the meaning of the literal 15 is defined by train_tune_test_split
# in src.utils and is not visible in this notebook — confirm and consider naming it.
train_tune_test_split(gowalla_clean_DIR, X, 15)

1167it [00:00, 4831.28it/s]


### ml-100

In [30]:
# Load the ml-100k ratings into a pandas dataframe.
# - `header=0` together with `names` makes the file's first line a header row
#   whose labels are replaced by 'user', 'item' and 'rating'
# - only the first three columns are kept
# - malformed lines are skipped: `on_bad_lines='skip'` replaces the former
#   `error_bad_lines=False`, which was deprecated in pandas 1.3 and removed in 2.0
ml100_ratings = pd.read_csv(
    ml100_ratings_file,
    header=0,
    usecols=[0, 1, 2],
    names=['user', 'item', 'rating'],
    on_bad_lines='skip',
)

In [31]:
# Binarize the data: a rating counts as an interaction only when it is strictly
# above 3.5 (only keep ratings >= 4)
ml100_ratings = ml100_ratings.loc[ml100_ratings['rating'].gt(3.5)]

In [32]:
# As above, keep a single row per (user, item) pair.
# The comparison is restricted to these two columns: comparing whole rows would
# also take 'rating' into account, so a pair rated several times with different
# scores would survive and be counted as several interactions.
ml100_ratings = ml100_ratings.drop_duplicates(subset=['user', 'item'])
interactions = ml100_ratings.shape[0]
print(f"Total interactions count: {interactions}")

Total interactions count: 48580


In [33]:
# Turn the dataframe into a SciPy sparse matrix:
# count the rows of each (user, item) pair, pivot the items into columns
# (0 where a pair never occurs), then store the table in CSR format
pivotable = (
    ml100_ratings
    .groupby(['user', 'item'], sort=False)
    .size()
    .unstack(fill_value=0)
)
X = scipy.sparse.csr_matrix(pivotable)
users, items = X.shape

# Share of the matrix cells holding a stored (non-zero) entry
density = X.getnnz() / np.prod(X.shape)

print(f"- users: {users}\n- items: {items}")
print(f"- density = {float(round(density * 100, 2))}%")

- users: 609
- items: 6298
- density = 1.27%


In [34]:
# Finally, create the train, tune and test files for ml-100k from the sparse
# matrix X; the call receives ml100_clean_DIR, the dataset's clean-data folder.
# NOTE(review): the meaning of the literal 15 is defined by train_tune_test_split
# in src.utils and is not visible in this notebook — confirm and consider naming it.
train_tune_test_split(ml100_clean_DIR, X, 15)

609it [00:00, 4093.49it/s]


### ml-25

In [35]:
# Load the ml-25m ratings into a pandas dataframe.
# - `header=0` together with `names` makes the file's first line a header row
#   whose labels are replaced by 'user', 'item' and 'rating'
# - only the first three columns are kept
# - at most `ml_25_clicks` rows are read
# - malformed lines are skipped: `on_bad_lines='skip'` replaces the former
#   `error_bad_lines=False`, which was deprecated in pandas 1.3 and removed in 2.0
ml25_ratings = pd.read_csv(
    ml25_ratings_file,
    header=0,
    usecols=[0, 1, 2],
    names=['user', 'item', 'rating'],
    nrows=ml_25_clicks,
    on_bad_lines='skip',
)

In [36]:
# Binarize the data: a rating counts as an interaction only when it is strictly
# above 3.5 (only keep ratings >= 4)
ml25_ratings = ml25_ratings.loc[ml25_ratings['rating'].gt(3.5)]

In [37]:
# As above, keep a single row per (user, item) pair.
# The comparison is restricted to these two columns: comparing whole rows would
# also take 'rating' into account, so a pair rated several times with different
# scores would survive and be counted as several interactions.
ml25_ratings = ml25_ratings.drop_duplicates(subset=['user', 'item'])
interactions = ml25_ratings.shape[0]
print(f"Total interactions count: {interactions}")

Total interactions count: 499262


In [38]:
# Turn the dataframe into a SciPy sparse matrix:
# count the rows of each (user, item) pair, pivot the items into columns
# (0 where a pair never occurs), then store the table in CSR format
pivotable = (
    ml25_ratings
    .groupby(['user', 'item'], sort=False)
    .size()
    .unstack(fill_value=0)
)
X = scipy.sparse.csr_matrix(pivotable)
users, items = X.shape

# Share of the matrix cells holding a stored (non-zero) entry
density = X.getnnz() / np.prod(X.shape)

print(f"- users: {users}\n- items: {items}")
print(f"- density = {float(round(density * 100, 2))}%")

- users: 6370
- items: 15814
- density = 0.5%


In [39]:
# Finally, create the train, tune and test files for ml-25m from the sparse
# matrix X; the call receives ml25_clean_DIR, the dataset's clean-data folder.
# NOTE(review): the meaning of the literal 15 is defined by train_tune_test_split
# in src.utils and is not visible in this notebook — confirm and consider naming it.
train_tune_test_split(ml25_clean_DIR, X, 15)

6370it [00:01, 4287.84it/s]
