In [175]:
import os
import glob
import fileinput
from tqdm import tqdm
import shutil
import sys

import numpy as np
from scipy import sparse

import pandas as pd

import scipy.sparse

# Data Preprocessing

## Concatenating small files into one
- just do this once to get one file for each dataset

In [77]:
foursquare_DIR = './data/raw/foursquare'
gowalla_DIR = './data/raw/gowalla'
ml100_DIR = './data/raw/ml-100k'
ml25_DIR = './data/raw/ml-25m'

In [146]:
foursquare_checkins_file = f"{foursquare_DIR}/checkins"
foursquare_pois_file = f"{foursquare_DIR}/pois"

gowalla_checkins_file = f"{gowalla_DIR}/checkins"
gowalla_pois_file = f"{gowalla_DIR}/pois"

ml100_ratings_file = f"{ml100_DIR}/ratings.csv"
ml25_ratings_file = f"{ml25_DIR}/ratings"

In [147]:
def concatenate_files(file, pattern):
    """
        If the ratings/checkins file exists for each dataset, we do nothing,
        otherwise we just concatenate the pieces of files
    """
    if not os.path.exists(file):
        checkins_files = glob.glob(pattern)
        with open(file, 'w') as out_file:
            input_lines = fileinput.input(checkins_files)
            prev_line = None
            for line in input_lines:
                if not fileinput.isfirstline():  # first lines are corrupted
                    if prev_line is not None:    # last lines are corrupted
                        out_file.write(prev_line)
                    prev_line = line

In [148]:
concatenate_files(foursquare_checkins_file, f"{foursquare_DIR}/*checkins_0*")
concatenate_files(foursquare_pois_file, f"{foursquare_DIR}/*pois_0*")

concatenate_files(gowalla_checkins_file, f"{gowalla_DIR}/*checkins_0*")
concatenate_files(gowalla_pois_file, f"{gowalla_DIR}/*pois_0*")

concatenate_files(ml25_ratings_file, f"{ml25_DIR}/*ratings_0*")

## Loading datasets

In [382]:
foursquare_checkins = pd.read_csv(foursquare_checkins_file, error_bad_lines=False, nrows=1000000, sep='\t', usecols=[0,1], names=['user', 'item'])
foursquare_pois = pd.read_csv(foursquare_pois_file, error_bad_lines=False, sep='\t', usecols=[0,1, 2], names=['item', 'lat', 'lon'])

gowalla_checkins = pd.read_csv(gowalla_checkins_file, error_bad_lines=False, nrows=10000000, usecols=[0,1], names=['user', 'item'])
gowalla_pois = pd.read_csv(gowalla_pois_file, error_bad_lines=False, usecols=[0,2,3], names=['item', 'lon', 'lat'])

ml100_ratings = pd.read_csv(ml100_ratings_file, error_bad_lines=False, header=0, usecols=[0, 1, 2])
ml25_ratings = pd.read_csv(ml25_ratings_file, error_bad_lines=False, header=0, nrows=10000000, usecols=[0, 1, 2], names=['user', 'item', 'rating'])

## Processing

### processing foursquare

In [367]:
foursquare_checkins = foursquare_checkins.drop_duplicates()
foursquare_checkins = pd.merge(left=foursquare_checkins, right=foursquare_pois, left_on='item', right_on='item')
interactions, _ = foursquare_checkins.shape
print(f"total interactions count: {interactions}")

total interactions count: 751716


In [368]:
lon_min, lat_min, lon_max, lat_max = -5, 40, 10, 52
france_foursquare_checkins = foursquare_checkins[(foursquare_checkins['lon']>lon_min) & 
           (foursquare_checkins['lon'] < lon_max) & 
           (foursquare_checkins['lat'] > lat_min) & 
           (foursquare_checkins['lat'] < lat_max)]
users = france_foursquare_checkins['user'].nunique()
items = france_foursquare_checkins['item'].nunique()

print(f"In France we have:\n\t- distinct users count: {users}\n\t- distinct items count: {items}")

In France we have:
	- distinct users count: 5801
	- distinct items count: 16726


In [369]:
pivotable = france_foursquare_checkins.groupby(['user', 'item'], sort=False).size().unstack(fill_value=0)
X = scipy.sparse.csr_matrix(pivotable)
users, items = X.shape
print(f"users: {users}\nitems: {items}")
print(f"density = {float(round((X.getnnz() / np.prod(X.shape))*100,2))}%")

users: 5801
items: 16726
density = 0.02%


In [371]:
c = 0
for row in X:
    s = row.sum()
    if s > 9:
        print(f"{c},{s}")
        print(row.nonzero()[1])
    c += 1

2,25
[    2   133   175   176   180   182   219   245   249   354   369   371
   409   426  1086  1110  3552  3998  4000  4093  4098  4534  4573  4685
 12883]
3,11
[    3  1154  1444  3337  4965  5310 10440 10864 14673 15095 16319]
10,20
[    9   615   619   844  1393  5444  9486  9525  9616  9932  9934 11179
 11206 11210 11219 11280 11387 12229 12821 13179]
13,16
[    9   495  2757  3236  3296  3420  3828  4576  4767  6110  7140  7392
 12639 13289 15414 15780]
20,11
[   9 2946 3176 3189 4441 4592 4733 5235 5442 8112 9798]
29,16
[    9   978  1569  2156  2160  2340  2341  7939  7941  9764  9772 12161
 12165 12364 12369 14742]
33,16
[   10  1184  1425  1426  2323  2455  3257  4282  4806  5375  6577  7727
  7777  7877 13634 16621]
36,11
[   13   191  1009  3315  6666  7628  7723  7726  8777 13762 15431]
37,17
[   14  2524  2535  2579  3032  3037  3039  3099  3105  3380  3404  3474
  4673  5943  6272 11391 13439]
38,45
[   14   392   408   486   597  1024  2030  2063  2090  2095  2207  31

[  444  1166  1912  3424  5389  5698  9055  9113  9521 10129 11338 13922
 15654]
952,13
[  445   757   797  1876  3061  3447  3832  4233 12271 12425 12480 12495
 13319]
959,11
[  449  2128  2488  2682  2729  8952  9996 10247 14512 15165 15966]
966,15
[  452  1234  1241  4031  4202  4749  5244  5824  5865  6479  6528  8773
 11443 14071 14887]
1012,19
[  487  2576  3288  3597  3601  4305  4828  5095  5258  6845  6931  7057
  7100  7533 11649 13250 13425 14729 14737]
1020,13
[  488  1104  2125  3000  3997  7615  7868  9478 10675 10888 11591 14377
 14470]
1021,12
[  488   822  2709  3056  3647  4117  6042  8610 11278 11953 11955 14246]
1024,13
[ 491 1067 2052 2369 2908 3124 3307 3354 3637 3665 3963 4138 8714]
1054,24
[  498   901  1024  1113  3282  4089  4129  4945  5337  5709  7326  8185
  8516  8583  9431  9714  9995 10098 10855 12649 13029 14122 16628 16687]
1064,13
[  498  3753  5248  7701  7801  8321  9044 10228 12433 13764 14135 15106
 15225]
1067,13
[  498   502   641  1040  1078  1

2351,15
[ 1619  4186  4414  5476  7536  7561  7628  7851  8914  9232 10429 12292
 13288 13533 15754]
2359,11
[ 1631  3894  4542  5635  5637  5643  5650  5651  5721  5723 13519]
2426,13
[ 1694  1823  5076  6541 11558 14103 14183 14270 14519 14619 16351 16359
 16369]
2427,15
[ 1695  1794  2920  4036  4611  7697  7905  8051  9088 12102 13274 15160
 15194 15378 16528]
2431,13
[ 1702  4440  4565  6612  7483  7968  8022  9386 10060 10235 12994 14065
 14070]
2440,13
[ 1718  1726  2237  2245  4473  4476  5097  5099  8563  9006 14298 14301
 15161]
2457,13
[ 1749  1873  3332  6987  7023 10083 11723 13367 13621 13630 14419 16072
 16432]
2469,12
[ 1778  1939  2589  3479  5887  7713  8487  8510  9554  9648 13472 14954]
2473,29
[ 1781  1978  3025  3442  3576  4317  4487  5502  6017  7129  7327  7975
  8168  8676  8889  8896  8900  8905  9310  9331  9593 10886 11447 11481
 12357 12408 12614 14831 16451]
2475,19
[ 1781  1871  2015  2383  2623  2919  3025  5016  7480 10088 11445 11456
 13635 13638 1372

### processing gowalla

In [383]:
gowalla_checkins = gowalla_checkins.drop_duplicates()
gowalla_checkins = pd.merge(left=gowalla_checkins, right=gowalla_pois, left_on='item', right_on='item')
interactions, _ = gowalla_checkins.shape
print(f"total interactions count: {interactions}")
gowalla_checkins['lon'] = pd.to_numeric(gowalla_checkins['lon'])

total interactions count: 5473023


In [398]:
lon_min, lat_min, lon_max, lat_max = -5, 40, 10, 52
france_gowalla_checkins = gowalla_checkins[(gowalla_checkins['lon']>lon_min) & 
           (gowalla_checkins['lon'] < lon_max) & 
           (gowalla_checkins['lat'] > lat_min) & 
           (gowalla_checkins['lat'] < lat_max)]
users = france_gowalla_checkins['user'].nunique()
items = france_gowalla_checkins['item'].nunique()

print(f"In France we have:\n\t- distinct users count: {users}\n\t- distinct items count: {items}")

In France we have:
	- distinct users count: 19772
	- distinct items count: 182279


In [400]:
pivotable = france_gowalla_checkins.groupby(['user', 'item'], sort=False).size().unstack(fill_value=0)
X = scipy.sparse.csr_matrix(pivotable)
users, items = X.shape
print(f"users: {users}\nitems: {items}")
print(f"density = {float(round((X.getnnz() / np.prod(X.shape))*100,2))}%")

ValueError: Unstacked DataFrame is too big, causing int32 overflow

### processing ml-100

In [113]:
# binarize the data (only keep ratings >= 4)
ml100_ratings = ml100_ratings[ml100_ratings['rating'] > 3.5]

### processing ml-25

In [123]:
# binarize the data (only keep ratings >= 4)
ml25_ratings = ml25_ratings[ml25_ratings['rating'] > 3.5]