# Classification - Build model, predict result

In [1]:
import os # library to read settings (API credentials) from environment variables
import random # library for random number generation

import folium # plotting library
import numpy as np # library to handle data in a vectorized manner
import pandas as pd # library for data analsysis
import requests # library to handle requests

from geopy.geocoders import Nominatim # module to convert an address into latitude and longitude values

# libraries for displaying images
from IPython.display import Image 
from IPython.core.display import HTML 

# tranforming json file into a pandas dataframe library
from pandas.io.json import json_normalize

# Disable truncation when DataFrames are displayed: show every column and every row
pd.options.display.max_columns = None
pd.options.display.max_rows = None

## Now, we use our trained model to see the result

1. Get the coordinates of the address
1. Get the nearby venues from Foursquare
1. Using the nearby venues and our trained model, predict whether the location belongs to Type 0, 1, or 2

Foursquare ID info

In [2]:
# Foursquare API credentials are read from the environment so that no secret is
# stored in the notebook source. Set FOURSQUARE_CLIENT_ID and
# FOURSQUARE_CLIENT_SECRET before starting the kernel. Never paste the keys here:
# anything saved in the notebook (code or printed output) is visible to every reader.
CLIENT_ID = os.environ.get('FOURSQUARE_CLIENT_ID', '') # your Foursquare ID
CLIENT_SECRET = os.environ.get('FOURSQUARE_CLIENT_SECRET', '') # your Foursquare Secret
VERSION = '20180604'
LIMIT = 100

# Report only whether each credential is available; the values themselves are
# deliberately not echoed into the cell output.
print('Your credentails:')
print('CLIENT_ID: ' + ('********' if CLIENT_ID else '<not set>'))
print('CLIENT_SECRET:' + ('********' if CLIENT_SECRET else '<not set>'))

Your credentails:
CLIENT_ID: ********
CLIENT_SECRET:********


In [3]:
def get_geo_coordinates(address):
    """Geocode an address with Nominatim.

    Parameters
    ----------
    address : str
        Free-text address passed to the geocoder.

    Returns
    -------
    tuple
        ``(latitude, longitude)`` of the geocoder's result, or
        ``(np.nan, np.nan)`` when the geocoder returns no result.
    """
    geolocator = Nominatim(user_agent="mtr_agent")
    location = geolocator.geocode(address)
    # Identity test: "no result" is signalled by None itself, so the check must
    # not depend on how the returned location object implements equality.
    if location is None:
        return (np.nan, np.nan)
    return (location.latitude, location.longitude)

addr = 'San Ma Tau St, Ma Tau Kok, To Kwa Wan, Hong Kong, China'
lat, long = get_geo_coordinates(addr)

# get_geo_coordinates returns (nan, nan) when the address cannot be geocoded;
# stop here instead of querying Foursquare with NaN coordinates in a later cell
if np.isnan(lat) or np.isnan(long):
    raise ValueError('Could not geocode address: ' + addr)

lat, long

(22.3188727, 114.1920097)

In [4]:
# Function to get the top 100 venues that are in a neighborhood within a radius of 500 meters

LIMIT = 100 # top 100 venues

def getNearbyVenues(names, latitudes, longitudes, radius=500):
    """Query the Foursquare "explore" endpoint around each location.

    Parameters
    ----------
    names, latitudes, longitudes : iterables
        Parallel sequences; one request is made per (name, lat, lng) triple.
        ``name`` is stored in the 'Type' column of every venue row found for
        that location.
    radius : int, default 500
        Search radius sent to the API as the ``radius`` parameter.

    Returns
    -------
    pandas.DataFrame
        One row per venue with the columns 'm_id', 's_id', 'Type', 'Latitude',
        'Longitude', 'Venue ID', 'Venue', 'Venue Latitude', 'Venue Longitude'
        and 'Venue Category'. 'm_id', 's_id' and 'Venue ID' are always NaN.
        The frame is empty (but has all columns) when no venue is returned.

    Raises
    ------
    requests.HTTPError
        If the API answers with an error status (e.g. invalid credentials).
    KeyError
        If the response body does not contain ``response.groups``.
    """
    columns = [
        'm_id',
        's_id',
        'Type',
        'Latitude',
        'Longitude',
        'Venue ID',
        'Venue',
        'Venue Latitude',
        'Venue Longitude',
        'Venue Category',
    ]

    rows = []
    for name, lat, lng in zip(names, latitudes, longitudes):

        # make the GET request; `params` takes care of building/escaping the query string
        response = requests.get(
            'https://api.foursquare.com/v2/venues/explore',
            params={
                'client_id': CLIENT_ID,
                'client_secret': CLIENT_SECRET,
                'v': VERSION,
                'll': '{},{}'.format(lat, lng),
                'radius': radius,
                'limit': LIMIT,
            },
            timeout=30,  # do not let a stalled connection hang the notebook
        )
        # surface API errors as an HTTP error instead of a KeyError further down
        response.raise_for_status()

        groups = response.json()['response']['groups']
        items = groups[0]['items'] if groups else []

        # keep only the relevant information for each nearby venue
        for item in items:
            venue = item['venue']
            # a venue may come without any category; record NaN instead of failing
            categories = venue.get('categories') or []
            category = categories[0]['name'] if categories else np.nan
            rows.append((
                np.nan,
                np.nan,
                name,
                lat,
                lng,
                np.nan,
                venue['name'],
                venue['location']['lat'],
                venue['location']['lng'],
                category,
            ))

    # passing `columns` here (rather than assigning them afterwards) keeps the
    # schema intact even when no venue was found
    return pd.DataFrame(rows, columns=columns)



In [5]:
# Fetch the venues around the geocoded address. The spot has no known type yet,
# so NaN is passed as its name (it ends up in the 'Type' column of `venues`).
venues = getNearbyVenues(names=[np.nan], latitudes=[lat], longitudes=[long])

We append the venues found above to the existing list of venues near McDonald's and Starbucks locations.

In [6]:
# Previously collected venues, read from the workbook with its first column as index
known_venues = pd.read_excel('venues.xlsx', index_col=0)

# Stack the venues of the new address underneath them. pd.concat replaces
# DataFrame.append (removed in pandas 2.0) and, like append, keeps each frame's
# own index labels.
all_venues = pd.concat([known_venues, venues])

In [7]:
# Rows whose 'Type' is NaN, i.e. the venues of the address to classify
has_no_type = np.isnan(all_venues['Type'])
all_venues.loc[has_no_type]

Unnamed: 0,m_id,s_id,Type,Latitude,Longitude,Venue ID,Venue,Venue Latitude,Venue Longitude,Venue Category
0,,,,22.318873,114.19201,,The Mahjong Hostel (麻雀客棧),22.320208,114.187643,Hostel
1,,,,22.318873,114.19201,,鴻福海鮮四季火鍋,22.315963,114.189806,Chinese Restaurant
2,,,,22.318873,114.19201,,Victoria Harbour Restaurant 海港酒家,22.316943,114.189593,Chinese Restaurant
3,,,,22.318873,114.19201,,The Great Restaurant (一品雞煲火鍋),22.322365,114.189045,Chinese Restaurant
4,,,,22.318873,114.19201,,Cattle Depot Artist Village (Ex-Ma Tau Kok Ani...,22.321037,114.191651,Art Gallery
5,,,,22.318873,114.19201,,Tsui Wah Restaurant (翠華餐廳),22.321261,114.188455,Hong Kong Restaurant
6,,,,22.318873,114.19201,,Grand Waterfront Plaza 翔龍灣廣場,22.319335,114.192903,Shopping Mall
7,,,,22.318873,114.19201,,North Viet Pho (北越牛肉粉專門店),22.319042,114.192967,Vietnamese Restaurant
8,,,,22.318873,114.19201,,Kowloon City Ferry Pier Bus Stop (九龍城碼頭巴士站),22.318478,114.193269,Bus Stop
9,,,,22.318873,114.19201,,Fu Wah Café (富華餐廳),22.31946,114.193032,Cha Chaan Teng


Create one more column that identifies each spot, so that the data can be grouped by spot.

In [8]:
# One-hot encode the venue category: one indicator column per category,
# named after the category itself (no prefix)
onehot = pd.get_dummies(all_venues[['Venue Category']], prefix="", prefix_sep="")

# Copy the spot's type and coordinates back next to the indicator columns
for spot_column in ('Type', 'Latitude', 'Longitude'):
    onehot[spot_column] = all_venues[spot_column]


In [9]:
# Build a per-spot key by gluing the textual latitude and longitude together
latitude_text = onehot['Latitude'].astype(str)
longitude_text = onehot['Longitude'].astype(str)
onehot['dummyID'] = latitude_text + longitude_text

In [10]:
# Collapse the venue rows to one row per spot (dummyID): every column becomes
# its mean, so each category column holds that category's frequency at the spot
rows_per_spot = onehot.groupby('dummyID')
grouped = rows_per_spot.mean().reset_index()


In [11]:
# Spots with a known 'Type' are the training data; the spot without one (NaN)
# is the location we want to classify
type_is_missing = np.isnan(grouped['Type'])
non_feature_columns = ['Type', 'Latitude', 'Longitude','dummyID']

data = grouped[~type_is_missing]
result = grouped[type_is_missing]

# Target and feature matrix for training, plus the features of the spot to predict
Y = data['Type']
X = data.drop(columns=non_feature_columns)
X_result = result.drop(columns=non_feature_columns)

In [12]:
from sklearn.neighbors import KNeighborsClassifier

According to the above results, the best value of K is k=8.

In [16]:
k = 8

# Train the k-nearest-neighbours model on the labelled spots ...
knn_model = KNeighborsClassifier(n_neighbors=k)
knn_model.fit(X, Y)

# ... and predict the type of the unlabelled spot
result_type = knn_model.predict(X_result)

In [18]:
# Report the prediction for the (single) spot that was classified
predicted_type = result_type[0]
print('The predicted type of the place "', addr, '" is ', predicted_type)

The predicted type of the place " San Ma Tau St, Ma Tau Kok, To Kwa Wan, Hong Kong, China " is  0.0


In [19]:
# Save the per-spot category frequencies (training spots and the new spot) to a workbook
grouped.to_excel(excel_writer='grouped.xlsx')