# IBM Capstone Project

## Part 1: Segmenting and Clustering Toronto Neighborhood


### Installing package & libraries

In [1]:
!pip install folium



In [2]:
!pip install beautifulsoup4



In [3]:
import numpy as np
import pandas as pd

import requests
import matplotlib.cm as cm
import matplotlib.colors as colors
from geopy.geocoders import Nominatim
import folium
from pandas.io.json import json_normalize

#Import data from HTML
from bs4 import BeautifulSoup

#Data Analysis 
from sklearn import linear_model
from sklearn.linear_model import LogisticRegression
from sklearn.cluster import KMeans
from sklearn.metrics import jaccard_similarity_score
from sklearn.metrics import classification_report, confusion_matrix
from sklearn.metrics import log_loss
from sklearn.metrics import r2_score
from sklearn.metrics import mean_squared_error, r2_score
import itertools


print('Libraries imported!')

Libraries imported!


### Scrap the HTML 

We examine the HTML file of the website to find the data table, and use BeautifulSoup to scrap the data

In [4]:
URL = "https://en.wikipedia.org/wiki/List_of_postal_codes_of_Canada:_M"
r = requests.get(URL).text 
soup = BeautifulSoup(r, 'html.parser')

In [5]:
#Read HTML file to find the data table
print(soup.prettify())

<!DOCTYPE html>
<html class="client-nojs" dir="ltr" lang="en">
 <head>
  <meta charset="utf-8"/>
  <title>
   List of postal codes of Canada: M - Wikipedia
  </title>
  <script>
   document.documentElement.className="client-js";RLCONF={"wgBreakFrames":!1,"wgSeparatorTransformTable":["",""],"wgDigitTransformTable":["",""],"wgDefaultDateFormat":"dmy","wgMonthNames":["","January","February","March","April","May","June","July","August","September","October","November","December"],"wgMonthNamesShort":["","Jan","Feb","Mar","Apr","May","Jun","Jul","Aug","Sep","Oct","Nov","Dec"],"wgRequestId":"XnJWpApAIHkAABsKSIwAAAAE","wgCSPNonce":!1,"wgCanonicalNamespace":"","wgCanonicalSpecialPageName":!1,"wgNamespaceNumber":0,"wgPageName":"List_of_postal_codes_of_Canada:_M","wgTitle":"List of postal codes of Canada: M","wgCurRevisionId":946188405,"wgRevisionId":946188405,"wgArticleId":539066,"wgIsArticle":!0,"wgIsRedirect":!1,"wgAction":"view","wgUserName":null,"wgUserGroups":["*"],"wgCategories":["Article

In [99]:
#create a blank dataset called records
records = []

#find data table
code_table = soup.find('table')

#find p items --data lie between <p> code
name_codes_item = code_table.find_all('p')

for mysource in name_codes_item:    
    
    myPostaCode = mysource.find('b').contents[0]
    # <b>M4A</b><br/> -> M4A
    
   
    myBorough_myNeighborhood = mysource.find_all('span')[0].text
    # <a href="/wiki/North_York" title="North York">North York</a> (<a href="/wiki/Victoria_Village" title="Victoria Village">Victoria Village</a>)
    
    first_part = myBorough_myNeighborhood.replace( ')', '') 
    first_part1 = first_part.replace( '/', ', ')
    #first_part1
    new_base = first_part1.split('(')
    
    myBorough = new_base[0:1]
    myBorough = ','.join(myBorough)
    
    myNeighborhood = new_base[1:]
    myNeighborhood = ','.join(myNeighborhood)
    
    # clone Borough's value if no value to Neighborhood
    if myNeighborhood == '':
        myNeighborhood = myBorough 
    else:
        myNeighborhood = myNeighborhood
    
    
    #print( myPostaCode, myBorough,myNeighborhood ) in the dataset
    records.append((myPostaCode,myBorough, myNeighborhood))

In [7]:
#transfer the data into the dataframe
df_toronto = pd.DataFrame(records, columns=['PostalCode', 'Borough', 'Neighborhood'])
df_toronto.head()

Unnamed: 0,PostalCode,Borough,Neighborhood
0,M1A,Not assigned,Not assigned
1,M2A,Not assigned,Not assigned
2,M3A,North York,Parkwoods
3,M4A,North York,Victoria Village
4,M5A,Downtown Toronto,"Regent Park , Harbourfront"


### Remove non assigned rows

In [8]:
df_toronto.replace("Not assigned", np.nan, inplace=True)

In [9]:
df_toronto.dropna(inplace=True)
df_toronto.reset_index(drop = True, inplace = True)
df_toronto.head()

Unnamed: 0,PostalCode,Borough,Neighborhood
0,M3A,North York,Parkwoods
1,M4A,North York,Victoria Village
2,M5A,Downtown Toronto,"Regent Park , Harbourfront"
3,M6A,North York,"Lawrence Manor , Lawrence Heights"
4,M7A,"Queen's Park , Ontario Provincial Government","Queen's Park , Ontario Provincial Government"
5,M9A,Etobicoke,Islington Avenue
6,M1B,Scarborough,"Malvern , Rouge"
7,M3B,North York,Don MillsNorth
8,M4B,East York,"Parkview Hill , Woodbine Gardens"
9,M5B,Downtown Toronto,"Garden District, Ryerson"


### Group neighborhood by postal code and borough 

In [10]:
df_toronto2 = df_toronto.groupby(['PostalCode','Borough'], as_index=False).agg(lambda x: ','.join(x))
df_toronto2.head()

Unnamed: 0,PostalCode,Borough,Neighborhood
0,M1B,Scarborough,"Malvern , Rouge"
1,M1C,Scarborough,"Rouge Hill , Port Union , Highland Creek"
2,M1E,Scarborough,"Guildwood , Morningside , West Hill"
3,M1G,Scarborough,Woburn
4,M1H,Scarborough,Cedarbrae


### Deal with non-assigned neighborhood 

In [11]:
#Replace Neighborhood name with Borough name
na_neigh = df_toronto2.Neighborhood == 'Not assigned'
df_toronto2.loc[na_neigh, 'Neighborhood'] = df_toronto2.loc[na_neigh, 'Borough'
                                                           ]
#Check if any non-assigned table left
df_toronto2[na_neigh]

Unnamed: 0,PostalCode,Borough,Neighborhood


In [100]:
#Check the final data table
df_toronto2.shape
df_toronto2

Unnamed: 0,PostalCode,Borough,Neighborhood
0,M1B,Scarborough,"Malvern , Rouge"
1,M1C,Scarborough,"Rouge Hill , Port Union , Highland Creek"
2,M1E,Scarborough,"Guildwood , Morningside , West Hill"
3,M1G,Scarborough,Woburn
4,M1H,Scarborough,Cedarbrae
5,M1J,Scarborough,Scarborough Village
6,M1K,Scarborough,"Kennedy Park , Ionview , East Birchmount Park"
7,M1L,Scarborough,"Golden Mile , Clairlea , Oakridge"
8,M1M,Scarborough,"Cliffside , Cliffcrest , Scarborough Village..."
9,M1N,Scarborough,"Birch Cliff , Cliffside West"


In [14]:
#save data table to csv file
df_toronto2.to_csv('toronto.csv')

## Part 2: Adding latitude and longitude to the database

In [15]:
#Get latitude and longitude data
!wget -q -O "toronto_coordinates.csv" http://cocl.us/Geospatial_data
print('Coordinates downloaded!')
coors = pd.read_csv('toronto_coordinates.csv')

Coordinates downloaded!


In [16]:
print(coors.shape)
coors.head()

(103, 3)


Unnamed: 0,Postal Code,Latitude,Longitude
0,M1B,43.806686,-79.194353
1,M1C,43.784535,-79.160497
2,M1E,43.763573,-79.188711
3,M1G,43.770992,-79.216917
4,M1H,43.773136,-79.239476


In [17]:
#merge two datasets
df_toronto3 = df_toronto2.set_index('PostalCode')
coors2 = coors.set_index('Postal Code')
toronto_coors = pd.concat([df_toronto3, coors2], axis=1, join='inner')

toronto_coors.index.name = 'Postal Code'
toronto_coors.reset_index(inplace=True)

In [18]:
#Check the new dataframe
print(toronto_coors.shape)
toronto_coors.head()

(103, 5)


Unnamed: 0,Postal Code,Borough,Neighborhood,Latitude,Longitude
0,M1B,Scarborough,"Malvern , Rouge",43.806686,-79.194353
1,M1C,Scarborough,"Rouge Hill , Port Union , Highland Creek",43.784535,-79.160497
2,M1E,Scarborough,"Guildwood , Morningside , West Hill",43.763573,-79.188711
3,M1G,Scarborough,Woburn,43.770992,-79.216917
4,M1H,Scarborough,Cedarbrae,43.773136,-79.239476


## Part 3: Explore and Analyze the Data

### Get Toronto-only location information 

In [98]:
#Get Toronto-only dataset
toronto_only= toronto_coors[toronto_coors['Borough'].str.contains('Toronto', na = False)].reset_index(drop=True)
print(toronto_only.shape)
toronto_only.head()

(39, 5)


Unnamed: 0,Postal Code,Borough,Neighborhood,Latitude,Longitude
0,M4E,East Toronto,The Beaches,43.676357,-79.293031
1,M4J,East YorkEast Toronto,The Danforth East,43.685347,-79.338106
2,M4K,East Toronto,"The Danforth West , Riverdale",43.679557,-79.352188
3,M4L,East Toronto,"India Bazaar , The Beaches West",43.668999,-79.315572
4,M4M,East Toronto,Studio District,43.659526,-79.340923


### Create a location map

In [20]:
#Toronto location
latitude = 43.6532
longitude= -79.3832

# create map of TORONTO using latitude and longitude values above with folium
toronto_map = folium.Map(location=[latitude, longitude], zoom_start=11)

# add markers to map
for lat, lng, label in zip(toronto_only['Latitude'], toronto_only['Longitude'], toronto_only['Neighborhood']):
    label = folium.Popup(label, parse_html=True)
    folium.CircleMarker(
        [lat, lng],
        radius=5,
        popup=label,
        color='blue',
        fill=True,
        fill_color='#3186cc',
        fill_opacity=0.7,
        parse_html=False).add_to(toronto_map)  
    
toronto_map

## Part 3: Using Foursquare API to explore the restaurant in the Toronto Area

The basis of this study is to help a small group of investors planning to open their first U.S. based brewery / restaurant expansion in Toronto. Being that Toronto is the most populated city in Canada, and continually ranks as an important global city based on a high quality of living, the choice to expand into the neighbor of the north market was an easy selection for the investing group. However, with limited knowledge of the Toronto market, the group of investors have selected us to assist in the selection of which areas of Toronto will facilitate a launch of their brewery / restaurant expansion.

We will use Foursquare API to explore the demographic of each area in the Toronto Area. 
Then we will statistical modeling and machine learning model to find which areas are the most suitable for their brewery/restaurant launch.

### 1.  Gather the data: Using Foursquare API

### Define Foursquare credentials and version

In [21]:
CLIENT_ID = 'VL3N5Q5SBW2W2ENROYEIFUFLDGY5QQPKPQBZHW1AEL1XNXLA'
CLIENT_SECRET = '1TXO0THBTYKXGXEEQQXUCZR5KQIOTSZOIZWSADG2JFMNJXST'
VERSION = '20180605'

print('Your credentails:')
print('CLIENT_ID: ' + CLIENT_ID)
print('CLIENT_SECRET:' + CLIENT_SECRET)


LIMIT = 100 # limit of number of venues returned by Foursquare API
radius = 500 # define radius

ll=43.6532,-79.3832


#Create URLs
url = 'https://api.foursquare.com/v2/venues/explore?&client_id={}&client_secret={}&v={}&ll={},{}&radius={}&limit={}'.format(
    CLIENT_ID, 
    CLIENT_SECRET, 
    VERSION, 
    latitude, 
    longitude, 
    radius, 
    LIMIT)

print(url)

Your credentails:
CLIENT_ID: VL3N5Q5SBW2W2ENROYEIFUFLDGY5QQPKPQBZHW1AEL1XNXLA
CLIENT_SECRET:1TXO0THBTYKXGXEEQQXUCZR5KQIOTSZOIZWSADG2JFMNJXST
https://api.foursquare.com/v2/venues/explore?&client_id=VL3N5Q5SBW2W2ENROYEIFUFLDGY5QQPKPQBZHW1AEL1XNXLA&client_secret=1TXO0THBTYKXGXEEQQXUCZR5KQIOTSZOIZWSADG2JFMNJXST&v=20180605&ll=43.6532,-79.3832&radius=500&limit=100


### Using API to get the venue category data

In [23]:
# scrape the data from the generated URL

results = requests.get(url).json()
results

# extract the category of the venue
def get_category_type(row):
    try:
        categories_list = row['categories']
    except:
        categories_list = row['venue.categories']
        
    if len(categories_list) == 0:
        return None
    else:
        return categories_list[0]['name']
    
#record the data    
    
venues = results['response']['groups'][0]['items']
toronto_venues = json_normalize(venues) # flatten JSON

# filter columns
filtered_columns = ['venue.name', 'venue.categories', 'venue.location.lat', 
                    'venue.location.lng', 'venue.id']
toronto_venues =toronto_venues.loc[:, filtered_columns]

# filter the category for each row
toronto_venues['venue.categories'] = toronto_venues.apply(get_category_type, axis=1)

# clean columns
toronto_venues.columns = [col.split(".")[-1] for col in toronto_venues.columns]


print('{} venues were returned by Foursquare.'.format(toronto_venues.shape[0]))

100 venues were returned by Foursquare.


In [26]:
#Check the categories of 100 venues in Toronto
toronto_venues['categories'].unique()

array(['Neighborhood', 'Plaza', 'Bookstore', 'Cosmetics Shop',
       'Breakfast Spot', 'Monument / Landmark', 'Shopping Mall',
       'Coffee Shop', 'Sushi Restaurant', 'Asian Restaurant',
       'Restaurant', 'Electronics Store', 'Department Store', 'Bakery',
       'Poke Place', 'Fast Food Restaurant', 'Italian Restaurant',
       'Clothing Store', 'Office', 'Concert Hall', 'American Restaurant',
       'Bubble Tea Shop', 'Theater', 'Gym', 'Furniture / Home Store',
       'Lingerie Store', 'Vegetarian / Vegan Restaurant',
       'Seafood Restaurant', 'Food Court', 'Miscellaneous Shop',
       'Toy / Game Store', 'Greek Restaurant', 'Bar',
       'Distribution Center', "Women's Store", 'Tanning Salon',
       'Art Museum', 'New American Restaurant', 'Ramen Restaurant',
       'Chinese Restaurant', 'Tea Room', 'Latin American Restaurant',
       'Opera House', 'Bank', 'Pizza Place', 'Comic Shop', 'Steakhouse',
       'Café', 'Ice Cream Shop', 'Hotel', 'Music Venue',
       'Japanese R

In [27]:
#remove non-food-related venues categories
removal_list = ['Neighborhood', 'Plaza', 'Bookstore', 'Cosmetics Shop',
       'Monument / Landmark', 'Shopping Mall', 'Electronics Store', 'Department Store',
       'Clothing Store', 'Office', 'Concert Hall', 'Theater', 'Gym', 'Furniture / Home Store',
       'Lingerie Store',  'Miscellaneous Shop','Toy / Game Store',
       'Distribution Center', "Women's Store", 'Tanning Salon',
       'Art Museum','Opera House', 'Bank', 'Comic Shop', 'Hotel', 'Music Venue','Gym / Fitness Center',
       'General Travel', 'Movie Theater', 'Shoe Store']

toronto_restaurant = toronto_venues[~toronto_venues['categories'].isin(removal_list)]

toronto_restaurant['categories'].unique()
        

array(['Breakfast Spot', 'Coffee Shop', 'Sushi Restaurant',
       'Asian Restaurant', 'Restaurant', 'Bakery', 'Poke Place',
       'Fast Food Restaurant', 'Italian Restaurant',
       'American Restaurant', 'Bubble Tea Shop',
       'Vegetarian / Vegan Restaurant', 'Seafood Restaurant',
       'Food Court', 'Greek Restaurant', 'Bar', 'New American Restaurant',
       'Ramen Restaurant', 'Chinese Restaurant', 'Tea Room',
       'Latin American Restaurant', 'Pizza Place', 'Steakhouse', 'Café',
       'Ice Cream Shop', 'Japanese Restaurant', 'Dessert Shop',
       'Food & Drink Shop', 'Gastropub', 'Modern European Restaurant',
       'Beer Bar', 'Diner', 'Burger Joint', 'Juice Bar',
       'Middle Eastern Restaurant', 'Deli / Bodega'], dtype=object)

### Use API to get the venue likes data

In [28]:
url_list = []
like_list = []
json_list = []

for i in list(toronto_restaurant.id):
    venue_url = 'https://api.foursquare.com/v2/venues/{}/likes?client_id={}&client_secret={}&v={}'.format(i, CLIENT_ID, CLIENT_SECRET, VERSION)
    url_list.append(venue_url)
for link in url_list:
    result = requests.get(link).json()
    likes = result['response']['likes']['count']
    like_list.append(likes)
print(like_list)


toronto_restaurant['likes'] = like_list
toronto_restaurant.head(10)

[212, 32, 161, 22, 9, 11, 31, 8, 105, 219, 250, 199, 270, 29, 21, 4, 67, 54, 35, 136, 141, 206, 202, 33, 81, 72, 76, 16, 43, 355, 449, 17, 21, 27, 199, 35, 21, 41, 55, 233, 86, 11, 28, 391, 183, 232, 29, 97, 9, 201, 15, 62, 5, 69, 135, 14]


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy


Unnamed: 0,name,categories,lat,lng,id,likes
4,Eggspectation Bell Trinity Square,Breakfast Spot,43.653144,-79.38198,537773d1498e74a75bb75c1e,212
7,M Square Coffee Co,Coffee Shop,43.651218,-79.383555,54132b3b498ee9ca9332e189,32
8,Japango,Sushi Restaurant,43.655268,-79.385165,4ae7b27df964a52068ad21e3,161
10,Noodle King,Asian Restaurant,43.651706,-79.383046,4b8d5856f964a520f4f532e3,22
11,JOEY Eaton Centre,Restaurant,43.655404,-79.381929,59246b5aad1789316b35d66c,9


In [30]:
restaurant = toronto_restaurant
restaurant.head(10)

Unnamed: 0,name,categories,lat,lng,id,likes
4,Eggspectation Bell Trinity Square,Breakfast Spot,43.653144,-79.38198,537773d1498e74a75bb75c1e,212
7,M Square Coffee Co,Coffee Shop,43.651218,-79.383555,54132b3b498ee9ca9332e189,32
8,Japango,Sushi Restaurant,43.655268,-79.385165,4ae7b27df964a52068ad21e3,161
10,Noodle King,Asian Restaurant,43.651706,-79.383046,4b8d5856f964a520f4f532e3,22
11,JOEY Eaton Centre,Restaurant,43.655404,-79.381929,59246b5aad1789316b35d66c,9
14,Danish Pastry House,Bakery,43.654574,-79.38074,5c2151463362730039c4ef0b,11
15,Poke Guys,Poke Place,43.654895,-79.385052,57bcd3b7498e652a678d0378,31
16,Crepe Delicious,Fast Food Restaurant,43.654536,-79.380889,4e5d8181a8092f63968617ee,8
17,Trattoria Mercatto,Italian Restaurant,43.654453,-79.380974,4d306dd82748b60c62b6dba0,105
19,Bannock,Restaurant,43.652101,-79.381178,4dfe1cf0a809d61e2fc568ce,219


In [31]:
#Check the restaurant categories
restaurant['categories'].unique()

array(['Breakfast Spot', 'Coffee Shop', 'Sushi Restaurant',
       'Asian Restaurant', 'Restaurant', 'Bakery', 'Poke Place',
       'Fast Food Restaurant', 'Italian Restaurant',
       'American Restaurant', 'Bubble Tea Shop',
       'Vegetarian / Vegan Restaurant', 'Seafood Restaurant',
       'Food Court', 'Greek Restaurant', 'Bar', 'New American Restaurant',
       'Ramen Restaurant', 'Chinese Restaurant', 'Tea Room',
       'Latin American Restaurant', 'Pizza Place', 'Steakhouse', 'Café',
       'Ice Cream Shop', 'Japanese Restaurant', 'Dessert Shop',
       'Food & Drink Shop', 'Gastropub', 'Modern European Restaurant',
       'Beer Bar', 'Diner', 'Burger Joint', 'Juice Bar',
       'Middle Eastern Restaurant', 'Deli / Bodega'], dtype=object)

###  2. Prepare the data: Create better categorical variables

#### Venue Categories

In [34]:
# we can group some cuisines together to make a better categorical variable

asia_pacific = ['Asian Restaurant','Sushi Restaurant','Ramen Restaurant', 'Chinese Restaurant','Poke Place','Japanese Restaurant']    
        
drink_dessert = ['Coffee Shop', 'Bubble Tea Shop','Tea Room', 'Café',  'Ice Cream Shop','Dessert Shop','Juice Bar']
        
casual = ['Breakfast Spot','Restaurant', 'Bakery', 'Fast Food Restaurant', 'Vegetarian / Vegan Restaurant', 'Seafood Restaurant',
       'Food Court','Steakhouse','Food & Drink Shop','Deli / Bodega','Burger Joint', 'Diner']    
        
bar = ['Bar','Beer Bar','Gastropub']        
        
euro = [ 'Italian Restaurant','Greek Restaurant', 'Pizza Place','Modern European Restaurant']        
        
american = ['American Restaurant','New American Restaurant']
        
latin = ['Latin American Restaurant']    
        
middle_eastern = ['Middle Eastern Restaurant']        


def conditions(s):
    if s['categories'] in euro:
        return 'euro'
    if s['categories'] in latin:
        return 'latin'
    if s['categories'] in asia_pacific:
        return 'asia_pacific'
    if s['categories'] in casual:
        return 'casual'
    if s['categories'] in american:
        return 'american'
    if s['categories'] in bar:
        return 'bar'
    if s['categories'] in middle_eastern:
        return 'middle_eastern'
    if s['categories'] in drink_dessert:
        return 'drink_dessert'


restaurant['categories_classified']=restaurant.apply(conditions, axis=1)
restaurant

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy


Unnamed: 0,name,categories,lat,lng,id,likes,categories_classified
4,Eggspectation Bell Trinity Square,Breakfast Spot,43.653144,-79.38198,537773d1498e74a75bb75c1e,212,casual
7,M Square Coffee Co,Coffee Shop,43.651218,-79.383555,54132b3b498ee9ca9332e189,32,drink_dessert
8,Japango,Sushi Restaurant,43.655268,-79.385165,4ae7b27df964a52068ad21e3,161,asia_pacific
10,Noodle King,Asian Restaurant,43.651706,-79.383046,4b8d5856f964a520f4f532e3,22,asia_pacific
11,JOEY Eaton Centre,Restaurant,43.655404,-79.381929,59246b5aad1789316b35d66c,9,casual
14,Danish Pastry House,Bakery,43.654574,-79.38074,5c2151463362730039c4ef0b,11,casual
15,Poke Guys,Poke Place,43.654895,-79.385052,57bcd3b7498e652a678d0378,31,asia_pacific
16,Crepe Delicious,Fast Food Restaurant,43.654536,-79.380889,4e5d8181a8092f63968617ee,8,casual
17,Trattoria Mercatto,Italian Restaurant,43.654453,-79.380974,4d306dd82748b60c62b6dba0,105,euro
19,Bannock,Restaurant,43.652101,-79.381178,4dfe1cf0a809d61e2fc568ce,219,casual


In [36]:
#Number of restaurants in each category
pd.crosstab(index=restaurant["categories_classified"],
            columns="count")

col_0,count
categories_classified,Unnamed: 1_level_1
american,4
asia_pacific,7
bar,4
casual,23
drink_dessert,11
euro,5
latin,1
middle_eastern,1


Casual restaurants are the most popular group.

In [70]:
average_likes= restaurant.groupby('categories_classified').mean()
print(average_likes)

                             lat        lng       likes   ranking
categories_classified                                            
american               43.652042 -79.380559  237.000000  3.750000
asia_pacific           43.654809 -79.385187   72.714286  2.285714
bar                    43.653269 -79.382006  189.000000  3.250000
casual                 43.652420 -79.381696   82.260870  2.260870
drink_dessert          43.654142 -79.382161   95.909091  2.181818
euro                   43.654138 -79.381468   66.400000  2.600000
latin                  43.651722 -79.379205   72.000000  3.000000
middle_eastern         43.655029 -79.380245  201.000000  4.000000


However, American restaurant, Middle Eastern restaurant and Bar are the top 3 restaurant which have most likes. 

#### Create a new categorical variable for 'Ranking' from 'Likes'

In [43]:
#Descriptive statistics
restaurant.describe()

#print(np.percentile(restaurant['likes'], 25))
#print(np.percentile(restaurant['likes'], 50))
#print(np.percentile(restaurant['likes'], 75))

Unnamed: 0,lat,lng,likes
count,56.0,56.0,56.0
mean,43.653278,-79.382074,102.946429
std,0.002589,0.002481,106.362232
min,43.648832,-79.387424,4.0
25%,43.650643,-79.383766,21.75
50%,43.653704,-79.381774,58.5
75%,43.655382,-79.380188,187.0
max,43.657472,-79.378055,449.0


In [101]:
def ranking (s):
    if s['likes']<=21.75:
        return 1
    if s['likes']<=58.5:
        return 2
    if s['likes']<=187:
        return 3
    if s['likes']>187:
        return 4
    
restaurant['ranking']= restaurant.apply(rankings, axis=1)
restaurant.head()

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy


Unnamed: 0,name,categories,lat,lng,id,likes,categories_classified,ranking
4,Eggspectation Bell Trinity Square,Breakfast Spot,43.653144,-79.38198,537773d1498e74a75bb75c1e,212,casual,4
7,M Square Coffee Co,Coffee Shop,43.651218,-79.383555,54132b3b498ee9ca9332e189,32,drink_dessert,2
8,Japango,Sushi Restaurant,43.655268,-79.385165,4ae7b27df964a52068ad21e3,161,asia_pacific,3
10,Noodle King,Asian Restaurant,43.651706,-79.383046,4b8d5856f964a520f4f532e3,22,asia_pacific,2
11,JOEY Eaton Centre,Restaurant,43.655404,-79.381929,59246b5aad1789316b35d66c,9,casual,1


In [58]:
pd.crosstab(index=restaurant["ranking"],
            columns="count")

col_0,count
ranking,Unnamed: 1_level_1
1,14
2,14
3,14
4,14


American, Middle Eastern restaurant or bar are the restaurant has over 100 likes

### 3. Data Analysis: Regression Models

### Multiple Linear Regression Model

In [77]:
# create dummies for linear regression modelling

# one hot encoding
dataset = pd.get_dummies(restaurant[['categories_classified']], 
                               prefix="", 
                               prefix_sep="")

# add name, ranking, and likes columns back to dataframe
dataset['ranking'] = restaurant['ranking']
dataset['likes'] = restaurant['likes']
dataset['name'] = restaurant['name']
dataset['categories'] = restaurant['categories']
dataset['ranking'] = restaurant['ranking']

# move name column to the first column
columns = [dataset.columns[-1]] + list(dataset.columns[:-1])
dataset = dataset[columns]


dataset.head()

Unnamed: 0,categories,american,asia_pacific,bar,casual,drink_dessert,euro,latin,middle_eastern,ranking,likes,name
4,Breakfast Spot,0,0,0,1,0,0,0,0,4,212,Eggspectation Bell Trinity Square
7,Coffee Shop,0,0,0,0,1,0,0,0,2,32,M Square Coffee Co
8,Sushi Restaurant,0,1,0,0,0,0,0,0,3,161,Japango
10,Asian Restaurant,0,1,0,0,0,0,0,0,2,22,Noodle King
11,Restaurant,0,0,0,1,0,0,0,0,1,9,JOEY Eaton Centre


In [95]:
# Multiple Linear Regression
from sklearn import linear_model
msk = np.random.rand(len(dataset)) < 0.8
train = dataset[msk]
test = dataset[~msk]

regr = linear_model.LinearRegression()
x = np.asanyarray(train[['american', 'middle_eastern', 'bar']])
y = np.asanyarray(train[['likes']])
regr.fit (x, y)
# The coefficients
print ('Coefficients: ', regr.coef_)

Coefficients:  [[183.34210526 115.34210526 103.34210526]]


In [96]:
# Multiple Linear Regression Prediction Capabilities

y_hat= regr.predict(test[['american', 'middle_eastern', 'bar']])
x = np.asanyarray(test[['american', 'middle_eastern', 'bar']])
y = np.asanyarray(test[['likes']])
print("Residual sum of squares: %.2f"
      % np.mean((y_hat - y) ** 2))

# Explained variance score: 1 is perfect prediction
print('Variance score: %.2f' % regr.score(x, y))

Residual sum of squares: 6338.24
Variance score: -0.30


The model is not a good prediction of t

### Logistics Regression Model

In [86]:
# Multinomial Ordinal Logistic Regression

x_train = np.asanyarray(train[['american', 'middle_eastern', 'bar','euro']])
y_train = np.asanyarray(train['ranking'])

x_test = np.asanyarray(test[['american', 'middle_eastern', 'bar','euro']])
y_test = np.asanyarray(test['ranking'])


# LR = LogisticRegression(C=0.01, solver='liblinear').fit(x_train, y_train)
# LR

mul_ordinal = linear_model.LogisticRegression(multi_class='multinomial',
                                              solver='newton-cg',
                                              fit_intercept=True).fit(x_train,
                                                                      y_train)

mul_ordinal

coef = mul_ordinal.coef_[0]
print (coef)

[-0.59299287  0.         -0.63817479 -0.6871386 ]


In [89]:
yhat = mul_ordinal.predict(x_test)
yhat

yhat_prob = mul_ordinal.predict_proba(x_test)
yhat_prob


jaccard_similarity_score(y_test, yhat)

0.21428571428571427

In [90]:
log_loss(y_test, yhat_prob)

1.4108678029491422

In [92]:
# Exploration of Coefficient Magnitudes of Full Dataset

x_all = np.asanyarray(dataset[['american', 'middle_eastern', 'bar','euro']])
y_all = np.asanyarray(dataset['ranking'])



LR = linear_model.LogisticRegression(multi_class='multinomial',
                                            solver='newton-cg',
                                            fit_intercept=True).fit(x_all,
                                                                    y_all)

LR

coef = LR.coef_[0]
print (coef)

[-0.55781691 -0.21900015 -0.59904486 -0.63956926]


In [93]:
print (classification_report(y_test, yhat))

              precision    recall  f1-score   support

           1       0.21      1.00      0.35         3
           2       0.00      0.00      0.00         4
           3       0.00      0.00      0.00         4
           4       0.00      0.00      0.00         3

   micro avg       0.21      0.21      0.21        14
   macro avg       0.05      0.25      0.09        14
weighted avg       0.05      0.21      0.08        14



  'precision', 'predicted', average, warn_for)
