<a href="https://colab.research.google.com/github/has-c/UserMobilityMining/blob/master/Gowalla%20ANN.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
import numpy as np
import pandas as pd
import keras
from sklearn.preprocessing import StandardScaler, OneHotEncoder, LabelEncoder
from sklearn.model_selection import train_test_split
from keras.models import Sequential
from keras.layers import Dense, Embedding, Softmax
from sklearn.metrics import accuracy_score
import datetime

#disable warnings
import warnings
warnings.filterwarnings("ignore")

Using TensorFlow backend.
  _np_qint8 = np.dtype([("qint8", np.int8, 1)])
  _np_quint8 = np.dtype([("quint8", np.uint8, 1)])
  _np_qint16 = np.dtype([("qint16", np.int16, 1)])
  _np_quint16 = np.dtype([("quint16", np.uint16, 1)])
  _np_qint32 = np.dtype([("qint32", np.int32, 1)])
  np_resource = np.dtype([("resource", np.ubyte, 1)])
  _np_qint8 = np.dtype([("qint8", np.int8, 1)])
  _np_quint8 = np.dtype([("quint8", np.uint8, 1)])
  _np_qint16 = np.dtype([("qint16", np.int16, 1)])
  _np_quint16 = np.dtype([("quint16", np.uint16, 1)])
  _np_qint32 = np.dtype([("qint32", np.int32, 1)])
  np_resource = np.dtype([("resource", np.ubyte, 1)])


## Helper Functions

In [47]:
def alignPastCurrentPOI(checkins, dateList):
    """Pair every check-in with the same user's immediately preceding check-in.

    Check-ins are ordered by 'userid', then 'Date', then 'Time'. Within each
    user, every consecutive (past, current) pair of rows becomes one output row
    holding the current check-in's fields followed by the past check-in's
    fields. Pairs may span different dates but never different users.

    Parameters
    ----------
    checkins : pandas.DataFrame
        Must contain 'userid', 'Date' and 'Time' columns; column labels must be
        strings because they are prefixed with 'Current ' / 'Past '.
    dateList : iterable
        Dates to keep. Rows whose 'Date' is not in this collection are dropped
        before pairing. Values must be comparable with the 'Date' column.

    Returns
    -------
    pandas.DataFrame
        One row per consecutive pair, with columns
        ['Current <col>', ...] + ['Past <col>', ...] and a fresh RangeIndex.
        Empty (but with those columns) when no user has two or more check-ins.
    """
    #np.unique already returns the distinct dates in sorted order
    uniqueCheckinDates = np.unique(dateList)

    #keep only the requested dates, then order by user, date and time in one
    #stable sort instead of concatenating one (user, date) slice at a time
    sortedCheckins = (
        checkins[checkins['Date'].isin(uniqueCheckinDates)]
        .sort_values(by=['userid', 'Date', 'Time'], kind='mergesort')
        .reset_index(drop=True)
    )

    #output field names: current checkin fields first, then past checkin fields
    pastCurrentCheckinDfColumnNames = (
        ['Current ' + name for name in sortedCheckins.columns]
        + ['Past ' + name for name in sortedCheckins.columns]
    )

    #align past and current checkins - group the *sorted* frame by user so the
    #selection always matches the rows being paired
    pairedRows = []
    for _, userCheckins in sortedCheckins.groupby('userid', sort=True):
        userRows = list(userCheckins.itertuples(index=False, name=None))
        for pastPOI, currentPOI in zip(userRows[:-1], userRows[1:]):
            pairedRows.append(currentPOI + pastPOI)

    #build the result once; this also yields a well-formed empty frame
    return pd.DataFrame(pairedRows, columns=pastCurrentCheckinDfColumnNames)

### Preprocessing 
<ul>
    <li> One hot encoding </li>
    <li> Standard scaling </li>
</ul>

In [4]:
#load the two processed checkin tables from disk
#NOTE(review): absolute, machine-specific paths - the notebook only runs where this volume is mounted
nzCheckins = pd.read_csv("/home/usermobilitymining/Notebooks/volume/Hasnain/Processed Data/nzCheckinsWithGridTokens.csv")
checkinsDf = pd.read_csv("/home/usermobilitymining/Notebooks/volume/Hasnain/Processed Data/processedCheckins.csv")

In [6]:
#restrict checkinsDf to the rows referenced by nzCheckins, matching on 'id'
#expose the row index of checkinsDf as an explicit 'id' key
checkinsDf['id'] = checkinsDf.index
#remove lat/lng from nzCheckins before joining (presumably they would clash
#with the same columns in checkinsDf - confirm against the CSV headers)
nzCheckins.drop(["lat", "lng"], axis=1, inplace=True)
#left join keeps one row per nzCheckins entry; then discard the leftover
#"Unnamed: 0" column
checkinsDf = (
    nzCheckins
    .join(checkinsDf.set_index('id'), on='id')
    .drop(columns="Unnamed: 0")
)

In [13]:
#parse the 'Date' strings (YYYY-MM-DD) and derive the weekday number of each checkin
checkinDate = list(map(
    lambda rawDate: datetime.datetime.strptime(rawDate, "%Y-%m-%d").date(),
    checkinsDf['Date'],
))
checkinDay = [parsedDate.weekday() for parsedDate in checkinDate]

#parse the 'Time' strings (HH:MM:SS) and derive the hour of each checkin
checkinTime = list(map(
    lambda rawTime: datetime.datetime.strptime(rawTime, '%H:%M:%S').time(),
    checkinsDf['Time'],
))
checkinHour = [parsedTime.hour for parsedTime in checkinTime]

In [15]:
#NOTE: OneHotEncoder's `n_values` argument was deprecated in scikit-learn 0.20
#and removed in 0.22; `categories` (available since 0.20) is used instead and
#still guarantees a fixed number of output columns even when some values never occur

#one hot encode the time variable (always 24 columns, one per hour 0-23)
ohe = OneHotEncoder(dtype=np.int8, categories=[np.arange(24)])
hourOfCheckinEncoded = ohe.fit_transform(np.array(checkinHour).reshape(-1,1))
timeColumns = ['Time ' + str(num) for num in np.arange(0,24)]
timeDf = pd.DataFrame(hourOfCheckinEncoded.toarray(), columns=timeColumns)

#one hot encode the day of the week variable (always 7 columns, one per weekday number 0-6)
ohe = OneHotEncoder(dtype=np.int8, categories=[np.arange(7)])
dayOfTheWeekEncoded = ohe.fit_transform(np.array(checkinDay).reshape(-1,1))
dayColumns = ['DayOfWeek ' + str(num) for num in np.arange(0,7)]
dayDf = pd.DataFrame(dayOfTheWeekEncoded.toarray(), columns=dayColumns)

#one hot encode the main category
numberOfMainCategories = len(np.unique(checkinsDf['Main Category']))
labelEncoder = LabelEncoder()
#LabelEncoder expects a 1-D array of labels, so no reshape here
mainCategoriesEncoded = labelEncoder.fit_transform(checkinsDf['Main Category'].values)
ohe = OneHotEncoder(dtype=np.int8, categories=[np.arange(numberOfMainCategories)])
mainCategoriesEncoded = ohe.fit_transform(mainCategoriesEncoded.reshape(-1,1))
#integer code i corresponds to labelEncoder.classes_[i], so classes_ names the columns
categoryDf = pd.DataFrame(mainCategoriesEncoded.toarray(), columns=labelEncoder.classes_)

In [16]:
#append the hour, weekday and category one-hot columns to the checkins frame
#NOTE(review): rows are matched by index label - assumes checkinsDf still has
#the same default index as timeDf/dayDf/categoryDf; confirm
checkinsDf = pd.concat([checkinsDf, timeDf, dayDf, categoryDf], axis=1, copy=False)

In [17]:
#remove the raw columns that are no longer wanted now that the one-hot columns exist
checkinsDf.drop(["Date", "Time", "Main Category", "Locations"], axis=1, inplace=True)

In [18]:
#separate the one-hot category columns (labels) from every other column (features)
labelColumns = ['Community','Entertainment','Food','Nightlife','Outdoors','Shopping','Travel']
labelDf = checkinsDf[labelColumns]
featuresDf = checkinsDf.drop(columns=labelColumns)

#hold out 20% of the rows for testing; fixed random_state keeps the split repeatable
featuresTrain, featuresTest, labelTrain, labelTest = train_test_split(
    featuresDf,
    labelDf,
    test_size=0.2,
    random_state=42,
)

### Neural Net

In [34]:
#fully connected classifier: 45 input features -> 35 -> 20 -> 7-way softmax
#NOTE(review): input_dim=45 is hard-coded and must equal the number of columns in featuresDf
model = Sequential([
    Dense(35, input_dim=45, activation='relu'),
    Dense(20, activation='relu'),
    Dense(7, activation='softmax'),
])

#categorical cross-entropy matches the one-hot encoded labels
model.compile(
    optimizer='adam',
    loss='categorical_crossentropy',
    metrics=['accuracy'],
)

In [38]:
#train on the training split for 100 epochs and keep fit's return value in `history`
history = model.fit(x=featuresTrain, y=labelTrain, epochs=100)

Epoch 1/100
Epoch 2/100
Epoch 3/100
Epoch 4/100
Epoch 5/100
Epoch 6/100
Epoch 7/100
Epoch 8/100
Epoch 9/100
Epoch 10/100
Epoch 11/100
Epoch 12/100
Epoch 13/100
Epoch 14/100
Epoch 15/100
Epoch 16/100
Epoch 17/100
Epoch 18/100
Epoch 19/100
Epoch 20/100
Epoch 21/100
Epoch 22/100
Epoch 23/100
Epoch 24/100
Epoch 25/100
Epoch 26/100
Epoch 27/100
Epoch 28/100
Epoch 29/100
Epoch 30/100
Epoch 31/100
Epoch 32/100
Epoch 33/100
Epoch 34/100
Epoch 35/100
Epoch 36/100
Epoch 37/100
Epoch 38/100
Epoch 39/100
Epoch 40/100
Epoch 41/100
Epoch 42/100
Epoch 43/100
Epoch 44/100
Epoch 45/100
Epoch 46/100
Epoch 47/100
Epoch 48/100
Epoch 49/100
Epoch 50/100
Epoch 51/100
Epoch 52/100
Epoch 53/100
Epoch 54/100
Epoch 55/100
Epoch 56/100
Epoch 57/100
Epoch 58/100
Epoch 59/100
Epoch 60/100
Epoch 61/100
Epoch 62/100
Epoch 63/100
Epoch 64/100
Epoch 65/100
Epoch 66/100
Epoch 67/100
Epoch 68/100
Epoch 69/100
Epoch 70/100
Epoch 71/100
Epoch 72/100
Epoch 73/100
Epoch 74/100
Epoch 75/100
Epoch 76/100
Epoch 77/100
Epoch 78/100

Epoch 79/100
Epoch 80/100
Epoch 81/100
Epoch 82/100
Epoch 83/100
Epoch 84/100
Epoch 85/100
Epoch 86/100
Epoch 87/100
Epoch 88/100
Epoch 89/100
Epoch 90/100
Epoch 91/100
Epoch 92/100
Epoch 93/100
Epoch 94/100
Epoch 95/100
Epoch 96/100
Epoch 97/100
Epoch 98/100
Epoch 99/100
Epoch 100/100


In [24]:
#run the network on the held-out test features
labelPredicted = model.predict(x=featuresTest)