In [1]:
import numpy as np
import pandas as pd
from sklearn.preprocessing import StandardScaler, OneHotEncoder, LabelEncoder
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
import datetime
import copy
from xgboost import XGBClassifier

#disable warnings
import warnings
warnings.filterwarnings("ignore")

### Helper Functions

In [2]:
def alignPastCurrentPOI(checkins, dateList):
    
    #sort the checkins and then al

    #sort the checkins 
    #group by userId, then checkin date and then order by time 
    uniqueUserIds = np.unique(checkins['userid'])
    uniqueCheckinDates = sorted(np.unique(dateList))

    #sort checkins by userid and date
    sortedCheckins = pd.DataFrame([])

    for userId in uniqueUserIds:
        for checkinDate in uniqueCheckinDates:

            #group by userId and checkin date 
            groupedByUserId = checkins[userId==checkins['userid']]
            groupedByCheckinDate = groupedByUserId[groupedByUserId['Date'] == checkinDate]
            if groupedByCheckinDate.shape[0] == 0:
                continue

            #sort by time 
            groupedByCheckinDate.sort_values(by ='Time',ascending=True, inplace=True)
            groupedByCheckinDate.index = range(0, groupedByCheckinDate.shape[0])

            #create list of sorted checkins 
            sortedCheckins = pd.concat([sortedCheckins, groupedByCheckinDate], axis=0, ignore_index=True)
            
    #align past and current checkins - features and labels entire df
    pastCurrentCheckins = pd.DataFrame([])

    for userId in uniqueUserIds:

        #group by userId
        groupedByUserId = sortedCheckins[userId==checkins['userid']]
        tempDf = pd.DataFrame([]) #holds the current user and instance

        for rowIndex in groupedByUserId.index:

            try:
                pastPOI = groupedByUserId.loc[rowIndex][['Community','Entertainment','Food','Nightlife','Outdoors','Shopping','Travel']]
                currentPOI = groupedByUserId.loc[rowIndex+1]
                poiInstance = list(currentPOI) + list(pastPOI)
                tempDf = pd.concat([tempDf, pd.DataFrame(poiInstance).T], axis=0)
            except KeyError:
                break


        #add the first row
        firstRow = groupedByUserId.loc[groupedByUserId.index[0]]
        poiInstance = list(firstRow) + [0,0,0,0,0,0,0] #0's represent no POI location
        tempDf = pd.concat([pd.DataFrame(poiInstance).T,tempDf], axis=0)
        pastCurrentCheckins = pd.concat([pastCurrentCheckins, tempDf], axis=0)   

    #rename fields 
    pastCurrentCheckinDfColumnNames = ['Current ' + name for name in sortedCheckins.columns] + ['Past ' + name for name in ['Community','Entertainment','Food','Nightlife','Outdoors','Shopping','Travel']]
    pastCurrentCheckins.columns = pastCurrentCheckinDfColumnNames
    
    return pastCurrentCheckins

def preprocessing():
    
    #import data
    checkinsDf = pd.read_csv("/home/usermobilitymining/Notebooks/volume/Hasnain/Processed Data/processedCheckins.csv")
    nzCheckins = pd.read_csv("/home/usermobilitymining/Notebooks/volume/Hasnain/Processed Data/nzCheckinsWithGridTokens.csv")

    #join checkins df and nz checkins to only extract NZ checkins 
    checkinsDf['id'] = checkinsDf.index #create id field
    nzCheckins.drop("lat", axis=1, inplace=True) #drop lat so no error
    nzCheckins.drop("lng", axis=1, inplace=True) #drop lng so no error
    checkinsDf = nzCheckins.join(checkinsDf.set_index('id'), on='id')
    checkinsDf.drop("Unnamed: 0", axis=1, inplace=True)
    
    #convert date and time to datetime variables
    #extract day of the week and hour of checkin
    checkinDate = [datetime.datetime.strptime(date, "%Y-%m-%d").date() for date in checkinsDf['Date']]
    checkinDay = [date.weekday() for date in checkinDate]
    checkinTime = [datetime.datetime.strptime(time, '%H:%M:%S').time() for time in checkinsDf['Time']]
    checkinHour = [time.hour for time in checkinTime]
    
    #one hot encode the time variable
    ohe = OneHotEncoder(dtype=np.int8, n_values=24)
    hourOfCheckinEncoded = ohe.fit_transform(np.array(checkinHour).reshape(-1,1))
    timeColumns = ['Time ' + str(num) for num in np.arange(0,24)]
    timeDf = pd.DataFrame(hourOfCheckinEncoded.toarray(), columns=timeColumns)

    #one hot encode the day of the week variable
    ohe = OneHotEncoder(dtype=np.int8, n_values=7)
    dayOfTheWeekEncoded = ohe.fit_transform(np.array(checkinDay).reshape(-1,1))
    dayColumns = ['DayOfWeek ' + str(num) for num in np.arange(0,7)]
    dayDf = pd.DataFrame(dayOfTheWeekEncoded.toarray(), columns=dayColumns)

    #one hot encode the main category
    numberOfMainCategories = len(np.unique(checkinsDf['Main Category']))
    labelEncoder = LabelEncoder()
    mainCategoriesEncoded = labelEncoder.fit_transform(checkinsDf['Main Category'].values.reshape(-1,1))
    ohe = OneHotEncoder(dtype=np.int8, n_values=numberOfMainCategories)
    mainCategoriesEncoded = ohe.fit_transform(mainCategoriesEncoded.reshape(-1,1))
    categoryDf = pd.DataFrame(mainCategoriesEncoded.toarray(), columns=labelEncoder.classes_)
    
    #concat fields
    checkinsDf = pd.concat([checkinsDf, timeDf], axis=1,copy=False) 
    checkinsDf = pd.concat([checkinsDf, dayDf], axis=1,copy=False) 
    checkinsDf = pd.concat([checkinsDf, categoryDf], axis=1,copy=False) 
    
    #past present align
    dateList = np.unique(checkinsDf['Date'])
    checkinsDf = alignPastCurrentPOI(checkinsDf, dateList)
    
    #drop fields 
    checkinsDf.drop("Current Date", axis=1, inplace=True)
    checkinsDf.drop("Current Time", axis=1, inplace=True)
    checkinsDf.drop("Current Main Category", axis=1, inplace=True)
    checkinsDf.drop("Current Locations", axis=1, inplace=True)
    
    #label feature split
    labelDf = checkinsDf[['Current Community','Current Entertainment','Current Food',
                          'Current Nightlife','Current Outdoors','Current Shopping','Current Travel']]
    featuresDf = checkinsDf.drop(['Current Community','Current Entertainment','Current Food','Current Nightlife',
                                  'Current Outdoors','Current Shopping','Current Travel'], axis=1)

    

    return featuresDf, labelDf

### Preprocessing

In [3]:
featuresDf, labelDf = preprocessing()

In [20]:
#integer encode the labels
labels = list()
for i in range(0, labelDf.shape[0]):
    labels.append(np.argmax(labelDf.values[i]))
    
labelDf = np.array(labels)

In [32]:
#convert dtype
featuresDf = featuresDf.astype(float)
labelDf = labelDf.astype(float)

In [33]:
#train test split
featuresTrain, featuresTest, labelTrain, labelTest = train_test_split(featuresDf, labelDf, test_size=0.2, random_state=42)

### Model

In [34]:
#train model
clf = XGBClassifier(n_estimators=1000, n_jobs=-1)
clf.fit(featuresTrain, labelTrain)

XGBClassifier(base_score=0.5, booster='gbtree', colsample_bylevel=1,
              colsample_bynode=1, colsample_bytree=1, gamma=0,
              learning_rate=0.1, max_delta_step=0, max_depth=3,
              min_child_weight=1, missing=None, n_estimators=1000, n_jobs=-1,
              nthread=None, objective='multi:softprob', random_state=0,
              reg_alpha=0, reg_lambda=1, scale_pos_weight=1, seed=None,
              silent=None, subsample=1, verbosity=1)

In [36]:
#use model to predict
labelPredicted = clf.predict(featuresTest)

In [44]:
#accuracy
a = accuracy_score(labelPredicted,labelTest)
print("Accuracy: ", a*100)

Accuracy:  88.1406288685318
