In [14]:
import numpy as np
import pandas as pd
from sklearn.preprocessing import OneHotEncoder
import json
import datetime
import time

import warnings
warnings.filterwarnings('ignore')

### Helper Functions

In [63]:
def checkinPreprocessing(checkins):
    #date and time processing

    #create date-time feature
    dateTime = checkins['datetime']
    dateTime.head()

    #split into date and time column 
    #year format: YYYY-MM-DD 
    #time format: HH:MM:SS

    hourOfCheckin = list()
    dayOfCheckin = list()
    weekdayOrWeekendCheckin = list()
    dateList = list()
    timeList = list()

    for index in range(0, dateTime.shape[0]):
        
        #convert date to date object
        date = dateTime[index][:dateTime[index].find("T")] #extract date from datetime string 
        year = int(date[:date.find("-")])
        month = int(date[date.find("-")+1:date.rfind("-")])
        day = int(date[date.rfind("-")+1:])
        date = datetime.date(year, month, day) #create a date object
        dateList.append(date)
        #convert time to time object
        time = dateTime[index][dateTime[index].find("T")+1:-1] #extract time as string 
        hour = int(time[:time.find(":")])
        minute = int(time[time.find(":")+1:time.rfind(":")])
        second = int(time[time.rfind(":")+1:])
        time = datetime.time(hour=hour, minute=minute, second=second)
        timeList.append(time)
        
        #from date object extract day of the week
        day = date.weekday() #integer representation of day of the week - 0 is Monday and 6 is Sunday
        dayOfCheckin.append(day)
        
        #hour of checkin - one hot encode this feature as 24 slots  
        hourOfCheckin.append(hour)
        
        #weekday or weekend - FEATURE
        if day == 5 or day == 6: #saturday or sunday 
            weekdayOrWeekendCheckin.append(1) #weekened checkin
        else:
            weekdayOrWeekendCheckin.append(0) #weekday checkin

    #add dates and times to the dataframe
    checkins['Date'] = dateList
    checkins['Time'] = timeList
    checkins['WeekdayOrWeekend'] = weekdayOrWeekendCheckin #add weekday or weekend field

    checkins.drop("datetime", axis=1, inplace=True)

    return checkins, dateList

def locationPreprocessing(placesDf):

    #extract the location information 
    locationInformation = list()
    for rowIndex in placesDf.index:
        
        rowInstance = placesDf['spot_categories'][rowIndex]
        firstIndex = rowInstance.rfind(":") + 3
        secondIndex = rowInstance.rfind("'")
        
        if len(locationInformation) == 0:
            locationInformation = [rowInstance[firstIndex:secondIndex].strip()]
        else:
            locationInformation.append(rowInstance[firstIndex:secondIndex].strip())
            
    #dataframe created for checking purposes
    placesDf['Locations'] = pd.DataFrame(locationInformation, columns=['Locations'])

    #use locationDf and the read in location category relationship df
    locationCategoryDf = pd.read_csv("/home/usermobilitymining/Notebooks/volume/Hasnain/Data/LocationCategoryRelationshipsData.csv", usecols=['Specific Category', 'Main Category'])

    #connect the locationDf and the location category relationship df to align the POI categories with the locations
    placesDf = placesDf.join(locationCategoryDf.set_index('Specific Category'), on='Locations')

    #drop any unecessary columns
    placesDf.drop("created_at", axis=1, inplace=True)
    placesDf.drop("spot_categories", axis=1, inplace=True)

    placesDf.index = range(0, placesDf.shape[0]) #reindex so join is succesful
    
    return placesDf

### Main Code

In [64]:
#LOCATIONS
placesDf = pd.read_csv("/home/usermobilitymining/Notebooks/volume/Hasnain/Data/gowalla/gowalla_spots_subset1.csv")
print("Locations Loaded")
placesDf = locationPreprocessing(placesDf)
print("Locations Processed")

Locations Loaded
Locations Processed


In [69]:
chunksize = 10 ** 6
firstIteration = True
index = 0

for chunk in pd.read_csv("/home/usermobilitymining/Notebooks/volume/Hasnain/Data/gowalla/gowalla_checkins.csv", chunksize=chunksize):
    #each chunk is 1 million rows
    
    print("Start Iteration ", str(index))
    
    #checkin process chunk
    chunk.index = range(0,chunk.shape[0])
    checkins,dateList = checkinPreprocessing(chunk)
    
    #Join the locationDf and the checkinsDf
    checkins = checkins.join(placesDf.set_index('id'), on='placeid')
    
    #write out prior to aligning 
    if firstIteration:
        checkins.to_csv("/home/usermobilitymining/Notebooks/volume/Hasnain/Processed Data/processedCheckins.csv", header=True, index=False)
        firstIteration = False
    else:
        checkins.to_csv("/home/usermobilitymining/Notebooks/volume/Hasnain/Processed Data/processedCheckins.csv",  mode='a', header=False, index=False)
        
    print("Finished Iteration ", str(index))
    index += 1 
    

Start Iteration  0
Finished Iteration  0
Start Iteration  1
Finished Iteration  1
Start Iteration  2
Finished Iteration  2
Start Iteration  3
Finished Iteration  3
Start Iteration  4
Finished Iteration  4
Start Iteration  5
Finished Iteration  5
Start Iteration  6
Finished Iteration  6
Start Iteration  7
Finished Iteration  7
Start Iteration  8
Finished Iteration  8
Start Iteration  9
Finished Iteration  9
Start Iteration  10
Finished Iteration  10
Start Iteration  11
Finished Iteration  11
Start Iteration  12
Finished Iteration  12
Start Iteration  13
Finished Iteration  13
Start Iteration  14
Finished Iteration  14
Start Iteration  15
Finished Iteration  15
Start Iteration  16
Finished Iteration  16
Start Iteration  17
Finished Iteration  17
Start Iteration  18
Finished Iteration  18
Start Iteration  19
Finished Iteration  19
Start Iteration  20
Finished Iteration  20
Start Iteration  21
Finished Iteration  21
Start Iteration  22
Finished Iteration  22
Start Iteration  23
Finished It

ValueError: Length mismatch: Expected axis has 1959 elements, new values have 1000000 elements

In [71]:
chunk.index = range(0,chunk.shape[0])
    checkins,dateList = checkinPreprocessing(chunk)
    
    #Join the locationDf and the checkinsDf
    checkins = checkins.join(placesDf.set_index('id'), on='placeid')
    
    #write out prior to aligning 
    if firstIteration:
        checkins.to_csv("/home/usermobilitymining/Notebooks/volume/Hasnain/Processed Data/processedCheckins.csv", header=True, index=False)
        firstIteration = False
    else:
        checkins.to_csv("/home/usermobilitymining/Notebooks/volume/Hasnain/Processed Data/processedCheckins.csv",  mode='a', header=False, index=False)
        
    print("Finished Iteration ", str(index))
    index += 1 

(1959, 3)