# Classification - Build model, predict result

In [1]:
import requests # library to handle requests
import pandas as pd # library for data analsysis
import numpy as np # library to handle data in a vectorized manner
import random # library for random number generation

from geopy.geocoders import Nominatim # module to convert an address into latitude and longitude values

# libraries for displaying images
from IPython.display import Image 
from IPython.core.display import HTML 
    
# tranforming json file into a pandas dataframe library
from pandas.io.json import json_normalize

import folium # plotting library

pd.set_option('display.max_columns', None)
pd.set_option('display.max_rows', None)

## Now, we use our trained model to see the result

1. get the coordinate
1. get the near venues from Foursquare
1. according to the near venues and our trained model, predict if the result below to Type 0, 1, or 2

Foursquare ID info

In [2]:
CLIENT_ID = '5P4U2KHYRYF4YZIKIGSGT45I2ILKGBKE5TKNH2LIDZAPLO3J' # your Foursquare ID
CLIENT_SECRET = 'EF2SFFD0UYUVDKCLI0BVKPCTZGR50BMKTE5KTNQIUF4CWZHB' # your Foursquare Secret
VERSION = '20180604'
LIMIT = 100
print('Your credentails:')
print('CLIENT_ID: ' + CLIENT_ID)
print('CLIENT_SECRET:' + CLIENT_SECRET)

Your credentails:
CLIENT_ID: 5P4U2KHYRYF4YZIKIGSGT45I2ILKGBKE5TKNH2LIDZAPLO3J
CLIENT_SECRET:EF2SFFD0UYUVDKCLI0BVKPCTZGR50BMKTE5KTNQIUF4CWZHB


In [3]:
def get_geo_coordinates(address):
    geolocator = Nominatim(user_agent="mtr_agent")
    location = geolocator.geocode(address)
    if location == None:
        return (np.nan, np.nan)
    latitude = location.latitude
    longitude = location.longitude
    return (latitude, longitude)

addr = 'San Ma Tau St, Ma Tau Kok, To Kwa Wan, Hong Kong, China'
lat, long = get_geo_coordinates(addr)
lat, long

(22.3188727, 114.1920097)

In [5]:
# Function to get the top 100 venues that are in a neighborhood within a radius of 500 meters

LIMIT = 100 # top 100 venues

def getNearbyVenues(names, latitudes, longitudes, radius=500):
    
    venues_list=[]
    for name, lat, lng in zip(names, latitudes, longitudes):
            
        # create the API request URL
        url = 'https://api.foursquare.com/v2/venues/explore?&client_id={}&client_secret={}&v={}&ll={},{}&radius={}&limit={}'.format(
            CLIENT_ID, 
            CLIENT_SECRET, 
            VERSION, 
            lat, 
            lng, 
            radius, 
            LIMIT)
            
        # make the GET request
        results = requests.get(url).json()["response"]['groups'][0]['items']
        
        # return only relevant information for each nearby venue
        venues_list.append([(
            name, 
            lat, 
            lng, 
            v['venue']['name'], 
            v['venue']['location']['lat'], 
            v['venue']['location']['lng'],  
            v['venue']['categories'][0]['name']) for v in results])

    nearby_venues = pd.DataFrame([item for venue_list in venues_list for item in venue_list])
    nearby_venues.columns = ['Type', 
                  'Latitude', 
                  'Longitude', 
                  'Venue', 
                  'Venue Latitude', 
                  'Venue Longitude', 
                  'Venue Category']
    
    return(nearby_venues)



In [6]:
# Run the above function on each spots and create a new dataframe called venues

venues = getNearbyVenues(
                                names=[np.nan],
                                latitudes=[lat],
                                longitudes=[long]
                                )

We add the above venues data into the venues list near the locations of McDonald's and Starbucks 

In [7]:
all_venues = pd.read_excel('venues.xlsx', index_col=0)
all_venues = all_venues.append(venues)

In [8]:
all_venues[np.isnan(all_venues['Type'])]

Unnamed: 0,Type,Latitude,Longitude,Venue,Venue Latitude,Venue Longitude,Venue Category
0,,22.318873,114.19201,The Mahjong Hostel (麻雀客棧),22.320208,114.187643,Hostel
1,,22.318873,114.19201,Victoria Harbour Restaurant 海港酒家,22.316943,114.189593,Chinese Restaurant
2,,22.318873,114.19201,鴻福海鮮四季火鍋,22.315963,114.189806,Chinese Restaurant
3,,22.318873,114.19201,The Great Restaurant (一品雞煲火鍋),22.322365,114.189045,Chinese Restaurant
4,,22.318873,114.19201,Cattle Depot Artist Village (Ex-Ma Tau Kok Ani...,22.321037,114.191651,Art Gallery
5,,22.318873,114.19201,Tsui Wah Restaurant (翠華餐廳),22.321261,114.188455,Hong Kong Restaurant
6,,22.318873,114.19201,Harbour Plaza 8 Degrees (8度海逸酒店),22.322952,114.190646,Hotel
7,,22.318873,114.19201,McDonald's 麥當勞,22.31893,114.187754,Fast Food Restaurant
8,,22.318873,114.19201,To Kwa Wan Market 土瓜灣街巿,22.317467,114.188206,Food & Drink Shop
9,,22.318873,114.19201,Goteborg Restaurant (哥登堡餐廳),22.317019,114.189571,Steakhouse


create one more column to identify spots to group data

In [9]:
# one hot encoding
onehot = pd.get_dummies(all_venues[['Venue Category']], prefix="", prefix_sep="")

# add neighborhood column back to dataframe
onehot['Type'] = all_venues['Type'] 
onehot['Latitude'] = all_venues['Latitude']
onehot['Longitude'] = all_venues['Longitude']


In [10]:
onehot['dummyID'] = onehot['Latitude'].astype(str) + onehot['Longitude'].astype(str)

In [11]:
# Next, let's group rows by neighborhood and by taking the mean of the frequency of occurrence of each category

grouped = onehot.groupby('dummyID').mean().reset_index()


In [12]:
data = grouped[~np.isnan(grouped['Type'])]
result = grouped[np.isnan(grouped['Type'])]
Y = data['Type']
X = data.drop(columns=['Type', 'Latitude', 'Longitude','dummyID'])

X_result = result.drop(columns=['Type', 'Latitude', 'Longitude','dummyID'])

In [13]:
from sklearn.neighbors import KNeighborsClassifier

### Train Test Split  

In [34]:
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split( X, Y, test_size=0.2)
print ('Train set:', X_train.shape,  y_train.shape)
print ('Test set:', X_test.shape,  y_test.shape)

Train set: (229, 307) (229,)
Test set: (58, 307) (58,)


<div id="classification">
    <h2>Classification</h2>
</div>

<h3>K nearest neighbor (KNN)</h3>

#### Import library 

Classifier implementing the k-nearest neighbors vote.

### Training

Lets start the algorithm with k=4 for now:

In [35]:
k = 4
#Train Model and Predict  
neigh = KNeighborsClassifier(n_neighbors = k).fit(X_train,y_train)
neigh

KNeighborsClassifier(algorithm='auto', leaf_size=30, metric='minkowski',
           metric_params=None, n_jobs=None, n_neighbors=4, p=2,
           weights='uniform')

### Predicting
we can use the model to predict the test set:

In [36]:
yhat = neigh.predict(X_test)
yhat

array([2., 0., 2., 0., 0., 2., 1., 2., 1., 1., 2., 1., 1., 0., 0., 1., 2.,
       2., 0., 2., 1., 2., 2., 2., 0., 2., 2., 2., 0., 1., 0., 2., 0., 1.,
       1., 2., 0., 2., 2., 2., 2., 0., 2., 1., 2., 0., 2., 2., 2., 0., 2.,
       1., 2., 0., 2., 1., 2., 2.])

### Accuracy evaluation
In multilabel classification, __accuracy classification score__ is a function that computes subset accuracy. This function is equal to the jaccard_similarity_score function. Essentially, it calculates how closely the actual labels and predicted labels are matched in the test set.

In [37]:
from sklearn import metrics
print("Train set Accuracy: ", metrics.accuracy_score(y_train, neigh.predict(X_train)))
print("Test set Accuracy: ", metrics.accuracy_score(y_test, yhat))

Train set Accuracy:  0.7161572052401747
Test set Accuracy:  0.5862068965517241


#### What about other K?

In [38]:
from sklearn.metrics import jaccard_similarity_score

Ks = 10
mean_acc = np.zeros((Ks-1))
std_acc = np.zeros((Ks-1))
jaccard = np.zeros((Ks-1))
ConfustionMx = [];
for n in range(1,Ks):
    
    #Train Model and Predict  
    neigh = KNeighborsClassifier(n_neighbors = n).fit(X_train,y_train)
    yhat=neigh.predict(X_test)
    mean_acc[n-1] = metrics.accuracy_score(y_test, yhat)


mean_acc

array([0.67241379, 0.63793103, 0.67241379, 0.5862069 , 0.62068966,
       0.5862069 , 0.62068966, 0.60344828, 0.60344828])

According to the above results, best K will be k=3

In [39]:
k = 3
#Train Model and Predict  
result_type = KNeighborsClassifier(n_neighbors = k).fit(X_train,y_train).predict(X_result)
result_type

array([0.])