In [18]:
import os
import glob
import fileinput
from tqdm import tqdm
import shutil
import sys

import numpy as np
from scipy import sparse

import pandas as pd

# Data Preprocessing

## Concatenating small files into one
- This only needs to run once per dataset: it merges the small pieces into a single file, and is skipped when that file already exists.

In [77]:
# Directories holding the raw files of each dataset (relative paths).
foursquare_DIR = "./data/raw/foursquare"  # Foursquare
gowalla_DIR = "./data/raw/gowalla"        # Gowalla
ml100_DIR = "./data/raw/ml-100k"          # ml-100k
ml25_DIR = "./data/raw/ml-25m"            # ml-25m

In [78]:
# Path of the single checkins/ratings file used for each dataset.
foursquare_checkins_file = foursquare_DIR + "/checkins"
gowalla_checkins_file = gowalla_DIR + "/checkins"
ml100_ratings_file = ml100_DIR + "/ratings.csv"
ml25_ratings_file = ml25_DIR + "/ratings"

In [108]:
def concatenate_files(checkins_file, pattern):
    """
        Build one ratings/checkins file out of the pieces matching `pattern`.

        If `checkins_file` already exists, nothing is done. Otherwise every
        file matching the glob `pattern` is appended to `checkins_file`, in
        sorted filename order, with the first and the last line of *each*
        piece dropped because they are corrupted.

        Parameters
        ----------
        checkins_file : str
            Path of the concatenated file to create.
        pattern : str
            Glob pattern selecting the pieces to concatenate.

        Raises
        ------
        FileNotFoundError
            If `checkins_file` does not exist and no file matches `pattern`.
            Nothing is written in that case, so a later call can retry.
    """
    if os.path.exists(checkins_file):
        return

    # glob.glob returns matches in arbitrary order; sort them so the
    # concatenated file is the same on every run and every machine.
    piece_files = sorted(glob.glob(pattern))
    if not piece_files:
        # Fail before creating an empty output file, which would make every
        # later call believe the concatenation had already been done.
        raise FileNotFoundError(
            f"no files matching {pattern!r} to build {checkins_file!r}"
        )

    with open(checkins_file, 'w') as out_file:
        for piece_file in piece_files:
            with open(piece_file) as in_file:
                next(in_file, None)  # first lines are corrupted
                # Write each line one step late so that the last line of
                # this piece (also corrupted) is never written. The buffer
                # is reset per piece so it cannot leak into the next one.
                prev_line = None
                for line in in_file:
                    if prev_line is not None:
                        out_file.write(prev_line)
                    prev_line = line

In [79]:
# Build the single checkins/ratings file of each dataset from its pieces.
for target_file, piece_pattern in (
    (foursquare_checkins_file, f"{foursquare_DIR}/*checkins_0*"),
    (gowalla_checkins_file, f"{gowalla_DIR}/*checkins_0*"),
    (ml25_ratings_file, f"{ml25_DIR}/*ratings_0*"),
):
    concatenate_files(target_file, piece_pattern)

## Loading datasets

In [121]:
# Load the first 100 rows of every dataset.
# `on_bad_lines='skip'` drops malformed rows instead of raising; it replaces
# `error_bad_lines=False`, which was deprecated in pandas 1.3 and removed in 2.0.

# Tab-separated, no header row: first two columns are read as user / item.
foursquare_checkins = pd.read_csv(
    foursquare_checkins_file,
    on_bad_lines='skip',
    nrows=100,
    sep='\t',
    usecols=[0, 1],
    names=['user', 'item'],
)

# Comma-separated (pandas default), no header row: first two columns as user / item.
gowalla_checkins = pd.read_csv(
    gowalla_checkins_file,
    on_bad_lines='skip',
    nrows=100,
    usecols=[0, 1],
    names=['user', 'item'],
)

# First row is the header; column names are taken from the file.
ml100_ratings = pd.read_csv(
    ml100_ratings_file,
    on_bad_lines='skip',
    header=0,
    nrows=100,
    usecols=[0, 1, 2],
)

# First row is treated as a header and replaced by the explicit names.
ml25_ratings = pd.read_csv(
    ml25_ratings_file,
    on_bad_lines='skip',
    header=0,
    nrows=100,
    usecols=[0, 1, 2],
    names=['user', 'item', 'rating'],
)

## Processing

### processing foursquare

### processing gowalla

### processing ml-100

In [113]:
# binarize the data: keep only the rows whose rating is strictly above 3.5
ml100_ratings = ml100_ratings.loc[ml100_ratings['rating'].gt(3.5)]

### processing ml-25

In [123]:
# binarize the data: keep only the rows whose rating is strictly above 3.5
ml25_ratings = ml25_ratings.loc[ml25_ratings['rating'].gt(3.5)]