<a href="https://colab.research.google.com/github/has-c/UserMobilityMining/blob/master/Gowalla%20ANN.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
import numpy as np
import pandas as pd
import keras
from sklearn.preprocessing import StandardScaler, OneHotEncoder, LabelEncoder
from sklearn.model_selection import train_test_split
from keras.models import Sequential
from keras.layers import Dense, Embedding, Softmax
from sklearn.metrics import accuracy_score
import datetime
import copy

#disable warnings
import warnings
warnings.filterwarnings("ignore")

Using TensorFlow backend.
  _np_qint8 = np.dtype([("qint8", np.int8, 1)])
  _np_quint8 = np.dtype([("quint8", np.uint8, 1)])
  _np_qint16 = np.dtype([("qint16", np.int16, 1)])
  _np_quint16 = np.dtype([("quint16", np.uint16, 1)])
  _np_qint32 = np.dtype([("qint32", np.int32, 1)])
  np_resource = np.dtype([("resource", np.ubyte, 1)])
  _np_qint8 = np.dtype([("qint8", np.int8, 1)])
  _np_quint8 = np.dtype([("quint8", np.uint8, 1)])
  _np_qint16 = np.dtype([("qint16", np.int16, 1)])
  _np_quint16 = np.dtype([("quint16", np.uint16, 1)])
  _np_qint32 = np.dtype([("qint32", np.int32, 1)])
  np_resource = np.dtype([("resource", np.ubyte, 1)])


## Helper Functions

In [2]:
def alignPastCurrentPOI(checkins, dateList, categoryColumns=None):
    """Pair every check-in with the POI category of the same user's previous check-in.

    Check-ins are ordered by ``userid``, then ``Date``, then ``Time``. Each
    output row holds all columns of one check-in (prefixed ``'Current '``)
    followed by the category columns of the check-in that precedes it for the
    same user (prefixed ``'Past '``). A user's first check-in has no
    predecessor, so its ``'Past '`` columns are all 0.

    Parameters
    ----------
    checkins : pandas.DataFrame
        Must contain the columns ``'userid'``, ``'Date'``, ``'Time'`` and every
        column named in ``categoryColumns``. It is not modified.
    dateList : iterable
        Dates to keep; check-ins whose ``'Date'`` is not in it are discarded.
    categoryColumns : sequence of str, optional
        Names of the category indicator columns to carry over from the previous
        check-in. Defaults to the seven main categories
        (Community, Entertainment, Food, Nightlife, Outdoors, Shopping, Travel).

    Returns
    -------
    pandas.DataFrame
        One row per retained check-in, with a fresh 0..n-1 index and columns
        ``['Current ' + c for c in checkins.columns]`` followed by
        ``['Past ' + c for c in categoryColumns]``.
    """
    if categoryColumns is None:
        categoryColumns = ['Community', 'Entertainment', 'Food', 'Nightlife',
                           'Outdoors', 'Shopping', 'Travel']
    categoryColumns = list(categoryColumns)

    #keep only the requested dates, then order by user, date and time of day
    onRequestedDate = checkins['Date'].isin(dateList)
    sortedCheckins = (checkins[onRequestedDate]
                      .sort_values(by=['userid', 'Date', 'Time'])
                      .reset_index(drop=True))

    #a row starts a new user when its userid differs from the row above it;
    #the mask is derived from the sorted frame so it lines up with its rows
    userIds = sortedCheckins['userid']
    isFirstOfUser = (userIds != userIds.shift(1)).values

    #previous row's categories; the first row of each user gets 0's (no past POI)
    originalDtypes = sortedCheckins[categoryColumns].dtypes.to_dict()
    pastPOI = sortedCheckins[categoryColumns].shift(1)
    pastPOI.loc[isFirstOfUser, :] = 0
    pastPOI = pastPOI.astype(originalDtypes)  #shift() upcasts to float because of the NaN it inserts
    pastPOI.columns = ['Past ' + name for name in categoryColumns]

    #rename fields
    currentCheckins = sortedCheckins.copy()
    currentCheckins.columns = ['Current ' + name for name in sortedCheckins.columns]

    return pd.concat([currentCheckins, pastPOI], axis=1)


### Preprocessing 
<ul>
    <li> One hot encoding </li>
    <li> Standard scaling </li>
</ul>

In [24]:
#load the two processed check-in tables from disk
processedCheckinsPath = "/home/usermobilitymining/Notebooks/volume/Hasnain/Processed Data/processedCheckins.csv"
nzCheckinsPath = "/home/usermobilitymining/Notebooks/volume/Hasnain/Processed Data/nzCheckinsWithGridTokens.csv"

checkinsDf = pd.read_csv(processedCheckinsPath)
nzCheckins = pd.read_csv(nzCheckinsPath)

In [25]:
#keep only the NZ check-ins: look up each NZ row's 'id' in the full check-in table
#drop lat/lng from the NZ table first so the join does not error
nzCheckins.drop(["lat", "lng"], axis=1, inplace=True)

#the row position of the full table serves as its id
checkinsDf['id'] = checkinsDf.index
checkinsDf = nzCheckins.join(checkinsDf.set_index('id'), on='id')
checkinsDf.drop("Unnamed: 0", axis=1, inplace=True)

In [26]:
#parse the 'Date' (YYYY-MM-DD) and 'Time' (HH:MM:SS) strings,
#then derive the weekday number and the hour of each checkin
checkinDate = []
checkinDay = []
for dateText in checkinsDf['Date']:
    parsedDate = datetime.datetime.strptime(dateText, "%Y-%m-%d").date()
    checkinDate.append(parsedDate)
    checkinDay.append(parsedDate.weekday())

checkinTime = []
checkinHour = []
for timeText in checkinsDf['Time']:
    parsedTime = datetime.datetime.strptime(timeText, '%H:%M:%S').time()
    checkinTime.append(parsedTime)
    checkinHour.append(parsedTime.hour)

In [27]:
#one hot encode the time variable
#`categories` fixes the encoded columns to all 24 hours, whether or not each hour occurs
#(it replaces the deprecated `n_values` argument)
ohe = OneHotEncoder(dtype=np.int8, categories=[np.arange(24)])
hourOfCheckinEncoded = ohe.fit_transform(np.array(checkinHour).reshape(-1,1))
timeColumns = ['Time ' + str(num) for num in np.arange(0,24)]
#reuse checkinsDf's index so a later column-wise concat lines the rows up
timeDf = pd.DataFrame(hourOfCheckinEncoded.toarray(), columns=timeColumns, index=checkinsDf.index)

#one hot encode the day of the week variable (weekday() gives 0-6)
ohe = OneHotEncoder(dtype=np.int8, categories=[np.arange(7)])
dayOfTheWeekEncoded = ohe.fit_transform(np.array(checkinDay).reshape(-1,1))
dayColumns = ['DayOfWeek ' + str(num) for num in np.arange(0,7)]
dayDf = pd.DataFrame(dayOfTheWeekEncoded.toarray(), columns=dayColumns, index=checkinsDf.index)

#one hot encode the main category
#LabelEncoder expects a 1-D array of labels; it maps each category name to an integer code
labelEncoder = LabelEncoder()
mainCategoriesEncoded = labelEncoder.fit_transform(checkinsDf['Main Category'].values)
numberOfMainCategories = len(labelEncoder.classes_)
ohe = OneHotEncoder(dtype=np.int8, categories=[np.arange(numberOfMainCategories)])
mainCategoriesEncoded = ohe.fit_transform(mainCategoriesEncoded.reshape(-1,1))
#columns are named after the original category labels, in encoder order
categoryDf = pd.DataFrame(mainCategoriesEncoded.toarray(), columns=labelEncoder.classes_, index=checkinsDf.index)

In [None]:
#append the hour, weekday and category indicator columns to the checkins
encodedFrames = [timeDf, dayDf, categoryDf]
checkinsDf = pd.concat([checkinsDf] + encodedFrames, axis=1, copy=False)

In [None]:
#every distinct checkin date is passed on, so no checkin is filtered out by date
dateList = np.unique(checkinsDf['Date'].values)
checkinsDf = alignPastCurrentPOI(checkins=checkinsDf, dateList=dateList)

In [None]:
#remove the raw date/time/category/location fields in a single call
fieldsToDrop = ["Current Date", "Current Time", "Current Main Category", "Current Locations"]
checkinsDf.drop(fieldsToDrop, axis=1, inplace=True)

In [None]:
#separate the labels (category of the current checkin) from the remaining feature columns
labelColumns = [
    'Current Community',
    'Current Entertainment',
    'Current Food',
    'Current Nightlife',
    'Current Outdoors',
    'Current Shopping',
    'Current Travel',
]
labelDf = checkinsDf[labelColumns]
featuresDf = checkinsDf.drop(labelColumns, axis=1)



In [None]:
#standardize features
#remember the column names first: fit_transform returns a bare array without them
header = list(featuresDf.columns)

#NOTE(review): the scaler is fitted on every row before the train/test split,
#so the test rows contribute to the scaling statistics
scaler = StandardScaler()
scaledFeatures = scaler.fit_transform(featuresDf)

#add header
featuresDf = pd.DataFrame(scaledFeatures, columns=header)

In [None]:
#hold out 20% of the rows for testing; the fixed random_state keeps the split repeatable
splitParts = train_test_split(featuresDf, labelDf, random_state=42, test_size=0.2)
featuresTrain, featuresTest, labelTrain, labelTest = splitParts

### Neural Net

In [18]:
#size the network from the data rather than hard-coding the feature/class counts
numberOfFeatures = featuresTrain.shape[1]
numberOfClasses = labelTrain.shape[1]

#fully connected network: four ReLU hidden layers, then a softmax over the label columns
model = Sequential()
model.add(Dense(100, input_dim=numberOfFeatures, activation='relu'))
model.add(Dense(50, activation='relu'))
model.add(Dense(50, activation='relu'))
model.add(Dense(20, activation='relu'))
model.add(Dense(numberOfClasses, activation='softmax'))

#the labels are one-hot rows, hence categorical cross-entropy
model.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy'])

In [19]:
#train on the training split for 100 epochs and keep the training history
history = model.fit(x=featuresTrain, y=labelTrain, epochs=100)


Epoch 1/100
Epoch 2/100
Epoch 3/100
Epoch 4/100
Epoch 5/100
Epoch 6/100
Epoch 7/100
Epoch 8/100
Epoch 9/100
Epoch 10/100
Epoch 11/100
Epoch 12/100
Epoch 13/100
Epoch 14/100
Epoch 15/100
Epoch 16/100
Epoch 17/100
Epoch 18/100
Epoch 19/100
Epoch 20/100
Epoch 21/100
Epoch 22/100
Epoch 23/100
Epoch 24/100
Epoch 25/100
Epoch 26/100
Epoch 27/100
Epoch 28/100
Epoch 29/100
Epoch 30/100
Epoch 31/100
Epoch 32/100

ERROR:root:Internal Python error in the inspect module.
Below is the traceback from this internal error.



Traceback (most recent call last):
  File "/usr/lib/python3/dist-packages/IPython/core/interactiveshell.py", line 2882, in run_code
    exec(code_obj, self.user_global_ns, self.user_ns)
  File "<ipython-input-19-a65b59d612ff>", line 2, in <module>
    history = model.fit(featuresTrain, labelTrain, epochs=100)
  File "/usr/local/lib/python3.6/dist-packages/keras/engine/training.py", line 1239, in fit
    validation_freq=validation_freq)
  File "/usr/local/lib/python3.6/dist-packages/keras/engine/training_arrays.py", line 216, in fit_loop
    callbacks.on_epoch_end(epoch, epoch_logs)
  File "/usr/local/lib/python3.6/dist-packages/keras/callbacks/callbacks.py", line 152, in on_epoch_end
    callback.on_epoch_end(epoch, logs)
  File "/usr/local/lib/python3.6/dist-packages/keras/callbacks/callbacks.py", line 611, in on_epoch_end
    self.progbar.update(self.seen, self.log_values)
  File "/usr/local/lib/python3.6/dist-packages/keras/utils/generic_utils.py", line 437, in update
    self._valu

KeyboardInterrupt: 

In [83]:
#model output for each row of the test features
labelPredicted = model.predict(x=featuresTest)

In [88]:
#evaluate and find accuracy

#Converting predictions to label: index of the largest output in each row
pred = list(np.argmax(np.asarray(labelPredicted), axis=1))

#Converting one hot encoded test label to label
#labelTest comes out of train_test_split and may be a DataFrame, where labelTest[i]
#would look up a column; convert to an array and take the argmax row by row
test = list(np.argmax(np.asarray(labelTest, dtype=float), axis=1))

#accuracy (true labels first, predictions second)
a = accuracy_score(test, pred)
print("Accuracy: ", a*100)

array([9.8425148e+01, 1.3858372e+00, 1.8206075e-01, 1.8255763e-03,
       5.0743353e-03, 4.6015764e-05, 6.0991567e-09], dtype=float32)