### Import necessary libary

In [1]:
import numpy as np
import pandas as pd
import json
from geopy.geocoders import Nominatim
from sklearn.cluster import KMeans

!conda install -c conda-forge folium=0.5.0 --yes 
import folium # map rendering library

print('Libraries imported.')

Fetching package metadata .............
Solving package specifications: .

Package plan for installation in environment /opt/conda/envs/DSX-Python35:

The following NEW packages will be INSTALLED:

    altair:  2.2.2-py35_1 conda-forge
    branca:  0.3.1-py_0   conda-forge
    folium:  0.5.0-py_0   conda-forge
    vincent: 0.4.4-py_1   conda-forge

altair-2.2.2-p 100% |################################| Time: 0:00:00  54.92 MB/s
branca-0.3.1-p 100% |################################| Time: 0:00:00  16.22 MB/s
vincent-0.4.4- 100% |################################| Time: 0:00:00  30.37 MB/s
folium-0.5.0-p 100% |################################| Time: 0:00:00  46.64 MB/s
Libraries imported.


### Scratch Neighborhood data from wiki website

In [2]:
from bs4 import BeautifulSoup
!wget -q -O 'html_doc' https://en.wikipedia.org/wiki/List_of_postal_codes_of_Canada:_M
print('Data downloaded!')
with open('html_doc') as html_doc:
    soup = BeautifulSoup(html_doc)

Data downloaded!




 BeautifulSoup(YOUR_MARKUP})

to this:

 BeautifulSoup(YOUR_MARKUP, "lxml")

  markup_type=markup_type))


### Parse the data that scratched from website

In [3]:
table = soup.tbody
data = []

table = soup.find('table', attrs={'class':'wikitable sortable'})
table_body = table.find('tbody')
rows = table_body.find_all('tr')
for row in rows:
    cols = row.find_all('td')
    cols = [ele.text.strip() for ele in cols]
    if len(cols)>0:
        data.append([ele for ele in cols if ele])
df=pd.DataFrame(data)

df.columns=['PostCode','Borough','Neighborhood']
df.head()
df.shape        
        

(289, 3)

### Clean data to ignore 'Not assigned' Boroughs or Neighborhoods

In [4]:
df_tt=df[df['Borough'] != 'Not assigned'].reset_index(drop=True)
df_tt.head()
df_tt.shape

(212, 3)

In [5]:
for i, index in enumerate(df_tt.index.values):
    nerb = df_tt.loc[index, 'Neighborhood']

    if nerb == 'Not assigned':
        df_tt.loc[index,'Neighborhood'] = df_tt.loc[index,'Borough']
df_tt.shape

(212, 3)

### Group Neighborhood by postcode and borough

In [6]:
g=df_tt.groupby(['PostCode', 'Borough'])
df_gp =g.apply( lambda x:','.join(x['Neighborhood'])).reset_index(name='Neighborhood')
df_gp.shape

(103, 3)

### Load Geograph data from csv file.

In [7]:
df_geo = pd.read_csv('http://cocl.us/Geospatial_data', index_col=0)
print('Geo data loaded')


Geo data loaded


### Add the latitude and longitude data into neighborhood data

In [8]:
for index, row in df_gp.iterrows():
    pc = row['PostCode']
    df_gp.loc[index, 'Latitude'] = df_geo.loc[pc]['Latitude']
    df_gp.loc[index, 'Longitude'] = df_geo.loc[pc]['Longitude']
    


In [9]:
print('The dataframe has {} postcodes, {} boroughs and {} neighborhoods.'.format(
        len(df_gp['PostCode'].unique()),
        len(df_gp['Borough'].unique()),
        df_gp.shape[0]
    )
)



The dataframe has 103 postcodes, 11 boroughs and 103 neighborhoods.


In [10]:
toronto_data = df_gp

In [11]:
'''
toronto_data = df_gp[df_gp['Borough'].str.contains('Toronto') ].reset_index(drop=True)
print('The dataframe has {} postcodes, {} boroughs and {} neighborhoods which contains world Toronto in its Borough.'.format(
        len(toronto_data['PostCode'].unique()),
        len(toronto_data['Borough'].unique()),
        toronto_data.shape[0]
    )
)
'''

"\ntoronto_data = df_gp[df_gp['Borough'].str.contains('Toronto') ].reset_index(drop=True)\nprint('The dataframe has {} postcodes, {} boroughs and {} neighborhoods which contains world Toronto in its Borough.'.format(\n        len(toronto_data['PostCode'].unique()),\n        len(toronto_data['Borough'].unique()),\n        toronto_data.shape[0]\n    )\n)\n"

### To visualize neighborhoods with folium map

In [12]:
address = 'Toronto, CA'

geolocator = Nominatim()
location = geolocator.geocode(address)
latitude = location.latitude
longitude = location.longitude
print('The geograpical coordinate of Manhattan are {}, {}.'.format(latitude, longitude))


The geograpical coordinate of Manhattan are 43.653963, -79.387207.


In [29]:
# create map of Manhattan using latitude and longitude values
map_toronto = folium.Map(location=[latitude, longitude], zoom_start=11)

# add markers to map
for lat, lng, label in zip(toronto_data['Latitude'], toronto_data['Longitude'], toronto_data['Neighborhood']):
    label = folium.Popup(label, parse_html=True)
    folium.CircleMarker(
        [lat, lng],
        radius=5,
        popup=label,
        color='blue',
        fill=True,
        fill_color='#3186cc',
        fill_opacity=0.7,
        parse_html=False).add_to(map_toronto)  
    
map_toronto

In [14]:
toronto_data.shape

(103, 5)

### Use Foursqure API to get popular venues for each neighborhoods.

In [15]:
import requests

CLIENT_ID = 'RFIRBIWLR1J32AU5152LC5R0IEWENTABIYGZI5SZBZVYOA5S' # your Foursquare ID
CLIENT_SECRET = 'FQXBLNXERCFKIE53S3HZR4VVZYIO4RRMO2MWHRUS1G4GYJN0' # your Foursquare Secret
VERSION = '20181227' # Foursquare API version

print('Your credentails:')
print('CLIENT_ID: ' + CLIENT_ID)
print('CLIENT_SECRET:' + CLIENT_SECRET)

neighborhood_latitude = toronto_data.loc[0, 'Latitude'] # neighborhood latitude value
neighborhood_longitude = toronto_data.loc[0, 'Longitude'] # neighborhood longitude value

neighborhood_name = toronto_data.loc[1, 'Neighborhood'] # neighborhood name

print('Latitude and longitude values of {} are {}, {}.'.format(neighborhood_name, 
                                                               neighborhood_latitude, 
                                                               neighborhood_longitude))

radius=500

LIMIT=100

url="https://api.foursquare.com/v2/venues/explore?client_id={}&client_secret={}&v={}&ll={},{}&radius={}&limit={}".format(
    CLIENT_ID,CLIENT_SECRET,VERSION,neighborhood_latitude,neighborhood_longitude,radius,LIMIT
)

results = requests.get(url).json()

Your credentails:
CLIENT_ID: RFIRBIWLR1J32AU5152LC5R0IEWENTABIYGZI5SZBZVYOA5S
CLIENT_SECRET:FQXBLNXERCFKIE53S3HZR4VVZYIO4RRMO2MWHRUS1G4GYJN0
Latitude and longitude values of Highland Creek,Rouge Hill,Port Union are 43.806686299999996, -79.19435340000001.


In [16]:
# function that extracts the category of the venue
def get_category_type(row):
    try:
        categories_list = row['categories']
    except:
        categories_list = row['venue.categories']
        
    if len(categories_list) == 0:
        return None
    else:
        return categories_list[0]['name']

### Fetch data from Foursqure API

In [17]:
from pandas.io.json import json_normalize

venues = results['response']['groups'][0]['items']
    
nearby_venues = json_normalize(venues) # flatten JSON

# filter columns
filtered_columns = ['venue.name', 'venue.categories', 'venue.location.lat', 'venue.location.lng']
nearby_venues =nearby_venues.loc[:, filtered_columns]

# filter the category for each row
nearby_venues['venue.categories'] = nearby_venues.apply(get_category_type, axis=1)

# clean columns
nearby_venues.columns = [col.split(".")[-1] for col in nearby_venues.columns]

nearby_venues.head()

Unnamed: 0,name,categories,lat,lng
0,Wendy's,Fast Food Restaurant,43.807448,-79.199056
1,Interprovincial Group,Print Shop,43.80563,-79.200378


In [18]:
def getNearbyVenues(names, latitudes, longitudes, radius=500):
    
    venues_list=[]
    for name, lat, lng in zip(names, latitudes, longitudes):
    #    print(name)
            
        # create the API request URL
        url = 'https://api.foursquare.com/v2/venues/explore?&client_id={}&client_secret={}&v={}&ll={},{}&radius={}&limit={}'.format(
            CLIENT_ID, 
            CLIENT_SECRET, 
            VERSION, 
            lat, 
            lng, 
            radius, 
            LIMIT)
            
        # make the GET request
        results = requests.get(url).json()["response"]['groups'][0]['items']
        
        # return only relevant information for each nearby venue
        venues_list.append([(
            name, 
            lat, 
            lng, 
            v['venue']['name'], 
            v['venue']['location']['lat'], 
            v['venue']['location']['lng'],  
            v['venue']['categories'][0]['name']) for v in results])

    nearby_venues = pd.DataFrame([item for venue_list in venues_list for item in venue_list])
    nearby_venues.columns = ['Neighborhood', 
                  'Neighborhood Latitude', 
                  'Neighborhood Longitude', 
                  'Venue', 
                  'Venue Latitude', 
                  'Venue Longitude', 
                  'Venue Category']
    
    return(nearby_venues)
toronto_venues = getNearbyVenues(names=toronto_data['Neighborhood'],
                                   latitudes=toronto_data['Latitude'],
                                   longitudes=toronto_data['Longitude']
                                  )

In [19]:
venue_list = toronto_venues['Venue Category'].tolist()

### Select store and coffee shops from venue data for each neighborhood

In [20]:
s = set(venue_list)
venues = list(s)

stores_malls = []
coffee_shops = []
for l in venues:
    if l.find('Store') > -1 or l.find('Mall') > -1:
        stores_malls.append([l, 'Store'])
    elif l.find('Caf') > -1 or l.find('Coffee') > -1:
        coffee_shops.append([l, 'Coffee Shop'])
    else:
        continue
df_stores_malls = pd.DataFrame(stores_malls)
df_stores_malls.columns=['Venue Category', 'Type']
df_coffee_shops = pd.DataFrame(coffee_shops)
df_coffee_shops.columns=['Venue Category', 'Type']

In [21]:
df_v = df_stores_malls.append(other = df_coffee_shops, ignore_index=True)

### Combine venue data with type of store and coffee shops

In [22]:
toronto_venues_flag = toronto_venues.join(df_v.set_index('Venue Category'), how = 'inner', on='Venue Category').reset_index(drop=True)
toronto_venues_flag.head()

Unnamed: 0,Neighborhood,Neighborhood Latitude,Neighborhood Longitude,Venue,Venue Latitude,Venue Longitude,Venue Category,Type
0,"Guildwood,Morningside,West Hill",43.763573,-79.188711,G & G Electronics,43.765309,-79.191537,Electronics Store,Store
1,"Fairview,Henry Farm,Oriole",43.778517,-79.346556,Apple Fairview,43.777952,-79.343582,Electronics Store,Store
2,Willowdale South,43.77012,-79.408493,Best Buy,43.768115,-79.412608,Electronics Store,Store
3,"CFB Toronto,Downsview East",43.737473,-79.464763,First Class Realty Ltd,43.737133,-79.463298,Electronics Store,Store
4,Leaside,43.70906,-79.363452,Best Buy,43.709255,-79.36168,Electronics Store,Store


### Add one hot dummies for type of stores/coffee shops.

In [30]:
toronto_venues_flag_onehot = pd.get_dummies(toronto_venues_flag[['Type']], prefix="", prefix_sep="")
toronto_venues_flag_onehot['Neighborhood'] = toronto_venues_flag['Neighborhood'] 
# move neighborhood column to the first column
fixed_columns = [toronto_venues_flag_onehot.columns[-1]] + list(toronto_venues_flag_onehot.columns[:-1])
toronto_venues_flag_onehot = toronto_venues_flag_onehot[fixed_columns]

t_cs= toronto_venues_flag_onehot.groupby(['Neighborhood'] ).sum()

### Calculate two indicators: Rate and Diff for data analysis.

In [33]:

for index,row in t_cs.iterrows():
    v = row['Store']/row['Coffee Shop']
    t_cs.loc[index,'Rate'] = v
    t_cs.loc[index,'Diff'] = row['Store'] - row['Coffee Shop']

  app.launch_new_instance()


In [37]:
t_cs.sort_values('Rate', ascending=False, inplace=True)
t_cs.reset_index()

Unnamed: 0,Neighborhood,Coffee Shop,Store,Rate,Diff
0,"Forest Hill North,Forest Hill West",0,1,inf,1.0
1,East Toronto,0,1,inf,1.0
2,"Del Ray,Keelesdale,Mount Dennis,Silverthorn",0,1,inf,1.0
3,Downsview Northwest,0,3,inf,3.0
4,Caledonia-Fairbanks,0,1,inf,1.0
5,"Kingsway Park South West,Mimico NW,The Queensw...",0,3,inf,3.0
6,"CFB Toronto,Downsview East",0,1,inf,1.0
7,"Clarks Corners,Sullivan,Tam O'Shanter",0,1,inf,1.0
8,"The Junction North,Runnymede",0,2,inf,2.0
9,Downsview West,0,2,inf,2.0


### Combine neighborhood data with venue of stores and coffee shops data

In [38]:
toronto_merged = toronto_data.join(t_cs, how='right', on='Neighborhood', sort = False)
toronto_merged.sort_values('Rate', ascending=False, inplace=True)
toronto_merged.replace(np.inf, '10', inplace=True)
toronto_merged['Rate']= pd.to_numeric(toronto_merged['Rate'])

In [39]:
toronto_merged.reset_index(drop=True)

Unnamed: 0,PostCode,Borough,Neighborhood,Latitude,Longitude,Coffee Shop,Store,Rate,Diff
0,M1E,Scarborough,"Guildwood,Morningside,West Hill",43.763573,-79.188711,0,1,10.000000,1.0
1,M6M,York,"Del Ray,Keelesdale,Mount Dennis,Silverthorn",43.691116,-79.476013,0,1,10.000000,1.0
2,M3N,North York,Downsview Northwest,43.761631,-79.520999,0,3,10.000000,3.0
3,M3L,North York,Downsview West,43.739015,-79.506944,0,2,10.000000,2.0
4,M3K,North York,"CFB Toronto,Downsview East",43.737473,-79.464763,0,1,10.000000,1.0
5,M4J,East York,East Toronto,43.685347,-79.338106,0,1,10.000000,1.0
6,M4L,East Toronto,"The Beaches West,India Bazaar",43.668999,-79.315572,0,2,10.000000,2.0
7,M5P,Central Toronto,"Forest Hill North,Forest Hill West",43.696948,-79.411307,0,1,10.000000,1.0
8,M4C,East York,Woodbine Heights,43.695344,-79.318389,0,1,10.000000,1.0
9,M6E,York,Caledonia-Fairbanks,43.689026,-79.453512,0,1,10.000000,1.0


### Visualize the neighborhood with marking selected high business potential with red and bigger markers.

In [41]:

# create map of Manhattan using latitude and longitude values
map_toronto_merged = folium.Map(location=[latitude, longitude], zoom_start=11)

# add markers to map
for lat, lng, label, c, s, r, d in zip(toronto_merged['Latitude'], toronto_merged['Longitude'], toronto_merged['Neighborhood'], 
                                 toronto_merged['Coffee Shop'], toronto_merged['Store'], 
                                       toronto_merged['Rate'], toronto_merged['Diff']):
    label = folium.Popup(label, parse_html=True)
    co = 'blue'
    ra = 1
    if r >= 1 and d >=3:
        co = 'Red'
        ra = d
    folium.CircleMarker(
        [lat, lng],
        radius=ra,
        popup=label,
        color=co,
        fill=True,
        fill_color='#3186cc',
        fill_opacity=0.7,
        parse_html=False).add_to(map_toronto_merged)  
    
map_toronto_merged