#### Importing necessary Libraries

In [1]:
import numpy as np
import pandas as pd
pd.set_option('display.max_columns', None)
pd.set_option('display.max_rows', None)
import json
from pandas.io.json import json_normalize
#!conda install -c conda-forge geopy --yes
from geopy.geocoders import Nominatim
import requests
import matplotlib.cm as cm
import matplotlib.colors as colors
from sklearn.cluster import KMeans
from bs4 import BeautifulSoup
import xml
#!conda install -c conda-forge folium=0.5.0 --yes
import folium # map rendering library
print('Libraries imported.')

Libraries imported.


## Data Collection
#### Exracting a Wikipedia page to a text file

In [2]:
url = 'https://en.wikipedia.org/wiki/List_of_United_States_cities_by_population'
page = requests.get(url) 
soup = BeautifulSoup(page.text)

#### Searching table with List of Cities of USA and Extracting Table into a DataFrame

In [3]:
table = soup.find_all('table')[4]
table_rows = table.find_all('tr')
res = []
for tr in table_rows:
    td = tr.find_all('td')
    row = [tr.text.strip() for tr in td if tr.text.strip()]
    if row:
        res.append(row)
df = pd.DataFrame(res, columns=["Rank", "City", "State", "del1", "del2", "del3", "Sq.Area", "del5", "population density in Sq Mi", "Population density in Km2", "Location"])
df.head()

Unnamed: 0,Rank,City,State,del1,del2,del3,Sq.Area,del5,population density in Sq Mi,Population density in Km2,Location
0,1,New York City[d],New York,8622698,8175133,+5.47%,301.5 sq mi,780.9 km2,"28,317/sq mi","10,933/km2",40°39′49″N 73°56′19″W﻿ / ﻿40.6635°N 73.9387°W﻿...
1,2,Los Angeles,California,3999759,3792621,+5.46%,468.7 sq mi,"1,213.9 km2","8,484/sq mi","3,276/km2",34°01′10″N 118°24′39″W﻿ / ﻿34.0194°N 118.4108°...
2,3,Chicago,Illinois,2716450,2695598,+0.77%,227.3 sq mi,588.7 km2,"11,900/sq mi","4,600/km2",41°50′15″N 87°40′54″W﻿ / ﻿41.8376°N 87.6818°W﻿...
3,4,Houston[3],Texas,2312717,2100263,+10.12%,637.5 sq mi,"1,651.1 km2","3,613/sq mi","1,395/km2",29°47′12″N 95°23′27″W﻿ / ﻿29.7866°N 95.3909°W﻿...
4,5,Phoenix,Arizona,1626078,1445632,+12.48%,517.6 sq mi,"1,340.6 km2","3,120/sq mi","1,200/km2",33°34′20″N 112°05′24″W﻿ / ﻿33.5722°N 112.0901°...


In [13]:
df.shape

(311, 6)

## Data Wrangling

#### Drop Unnecessary Columns

In [4]:
df.drop(columns = ["Rank", "del1", "del2", "del3", "del5", "population density in Sq Mi"], inplace = True)
df

Unnamed: 0,City,State,Sq.Area,Population density in Km2,Location
0,New York City[d],New York,301.5 sq mi,"10,933/km2",40°39′49″N 73°56′19″W﻿ / ﻿40.6635°N 73.9387°W﻿...
1,Los Angeles,California,468.7 sq mi,"3,276/km2",34°01′10″N 118°24′39″W﻿ / ﻿34.0194°N 118.4108°...
2,Chicago,Illinois,227.3 sq mi,"4,600/km2",41°50′15″N 87°40′54″W﻿ / ﻿41.8376°N 87.6818°W﻿...
3,Houston[3],Texas,637.5 sq mi,"1,395/km2",29°47′12″N 95°23′27″W﻿ / ﻿29.7866°N 95.3909°W﻿...
4,Phoenix,Arizona,517.6 sq mi,"1,200/km2",33°34′20″N 112°05′24″W﻿ / ﻿33.5722°N 112.0901°...
5,Philadelphia[e],Pennsylvania,134.2 sq mi,"4,511/km2",40°00′34″N 75°08′00″W﻿ / ﻿40.0094°N 75.1333°W﻿...
6,San Antonio,Texas,461.0 sq mi,"1,250/km2",29°28′21″N 98°31′30″W﻿ / ﻿29.4724°N 98.5251°W﻿...
7,San Diego,California,325.2 sq mi,"1,670/km2",32°48′55″N 117°08′06″W﻿ / ﻿32.8153°N 117.1350°...
8,Dallas,Texas,340.9 sq mi,"1,493/km2",32°47′36″N 96°45′59″W﻿ / ﻿32.7933°N 96.7665°W﻿...
9,San Jose,California,177.5 sq mi,"2,231/km2",37°17′48″N 121°49′08″W﻿ / ﻿37.2967°N 121.8189°...


#### Calculate Radius of Each City

In [5]:
new= df["Sq.Area"].str.split("s", n=1, expand = True)
new = new[0].str.replace(u'\xa0',u'')
df["Sq.Area"] = new.str.replace(',','')
df["Sq.Area"] = df["Sq.Area"].astype(float) #Changing datatype to Float
df["Radius"] = np.sqrt(df["Sq.Area"]) #Calculating Squareroot

df.drop(columns = ["Sq.Area"], inplace = True)
df

Unnamed: 0,City,State,Population density in Km2,Location,Radius
0,New York City[d],New York,"10,933/km2",40°39′49″N 73°56′19″W﻿ / ﻿40.6635°N 73.9387°W﻿...,17.363755
1,Los Angeles,California,"3,276/km2",34°01′10″N 118°24′39″W﻿ / ﻿34.0194°N 118.4108°...,21.64948
2,Chicago,Illinois,"4,600/km2",41°50′15″N 87°40′54″W﻿ / ﻿41.8376°N 87.6818°W﻿...,15.076472
3,Houston[3],Texas,"1,395/km2",29°47′12″N 95°23′27″W﻿ / ﻿29.7866°N 95.3909°W﻿...,25.248762
4,Phoenix,Arizona,"1,200/km2",33°34′20″N 112°05′24″W﻿ / ﻿33.5722°N 112.0901°...,22.750824
5,Philadelphia[e],Pennsylvania,"4,511/km2",40°00′34″N 75°08′00″W﻿ / ﻿40.0094°N 75.1333°W﻿...,11.584472
6,San Antonio,Texas,"1,250/km2",29°28′21″N 98°31′30″W﻿ / ﻿29.4724°N 98.5251°W﻿...,21.470911
7,San Diego,California,"1,670/km2",32°48′55″N 117°08′06″W﻿ / ﻿32.8153°N 117.1350°...,18.033303
8,Dallas,Texas,"1,493/km2",32°47′36″N 96°45′59″W﻿ / ﻿32.7933°N 96.7665°W﻿...,18.463477
9,San Jose,California,"2,231/km2",37°17′48″N 121°49′08″W﻿ / ﻿37.2967°N 121.8189°...,13.322913


#### Splitting Latitude and Logitude of Cities and making seperate Columns

In [6]:
df["Location"]= df["Location"].str.split("/", n = 2, expand = True)[1]
new = df["Location"].str.split(" ", n = 0, expand = False)
k = df.copy(deep = True)
Latitude = []
Longitude = []
for i in range(len(new)):
    Latitude.append(new[i][1][:-2])
    Longitude.append(new[i][2][:-3]) 

k["Latitude"] = Latitude
k["Longitude"] = Longitude
k["Latitude"] = k["Latitude"].str.replace(u'\ufeff',u'')
k.drop(columns = ["Location"], inplace = True)
k.head()
df = k.copy(deep = True)
df['Longitude'] = -df['Longitude'].astype(float)
df['Latitude'] = df['Latitude'].astype(float)
df['Radius'] = df['Radius']* 1000
df.head()

Unnamed: 0,City,State,Population density in Km2,Radius,Latitude,Longitude
0,New York City[d],New York,"10,933/km2",17363.755354,40.6635,-73.9387
1,Los Angeles,California,"3,276/km2",21649.480363,34.0194,-118.4108
2,Chicago,Illinois,"4,600/km2",15076.471736,41.8376,-87.6818
3,Houston[3],Texas,"1,395/km2",25248.762346,29.7866,-95.3909
4,Phoenix,Arizona,"1,200/km2",22750.824161,33.5722,-112.0901


In [15]:
df.shape

(311, 6)

### Statewise per capita income USA

In [7]:
link1 = 'https://en.wikipedia.org/wiki/List_of_United_States_counties_by_per_capita_income'
page1 = requests.get(link1) 
soup1 = BeautifulSoup(page1.text)
table = soup1.find_all('table')[1]
table_rows = table.find_all('tr')
res = []
for tr in table_rows:
    td = tr.find_all('td')
    row = [tr.text.strip() for tr in td if tr.text.strip()]
    if row:
        res.append(row)
df_state = pd.DataFrame(res, columns=["Rank", "Country-equivalent", "State", "Per capita income", "del2", "del3", "Population", "del5"])
df_state.head()

Unnamed: 0,Rank,Country-equivalent,State,Per capita income,del2,del3,Population,del5
0,1,New York County,New York,"$62,498","$69,659","$84,627",1605272,736192
1,2,Arlington,Virginia,"$62,018","$103,208","$139,244",214861,94454
2,3,Falls Church City,Virginia,"$59,088","$120,000","$152,857",12731,5020
3,4,Marin,California,"$56,791","$90,839","$117,357",254643,102912
4,5,Alexandria City,Virginia,"$54,608","$85,706","$107,511",143684,65369


In [16]:
df.shape

(311, 6)

#### Dropping unnecessary Columns

In [8]:
df_state.drop(columns = ['Rank','del2', 'del3', 'del5'], axis = 1, inplace = True)
df_state.head()

Unnamed: 0,Country-equivalent,State,Per capita income,Population
0,New York County,New York,"$62,498",1605272
1,Arlington,Virginia,"$62,018",214861
2,Falls Church City,Virginia,"$59,088",12731
3,Marin,California,"$56,791",254643
4,Alexandria City,Virginia,"$54,608",143684


## Visualization

In [9]:
# create map of USA cities that we have using latitude and longitude values
map_tohood = folium.Map(location=[37.0902,-95.7129], zoom_start=3)

# add markers to map
for lat, lng, state, city in zip(df['Latitude'], df['Longitude'], df['State'], df['City']):
    label = '{}, {}'.format(city, state)
    label = folium.Popup(label, parse_html=True)
    folium.CircleMarker(
        [lat, lng],
        radius=3,
        popup=label,
        color='blue',
        fill=True,
        fill_color='#3186cc',
        fill_opacity=0.3,
        parse_html=False).add_to(map_tohood)  
    
map_tohood

### Using Foursquare API

In [10]:
CLIENT_ID = 'CCZMEC4N5W03WF3N5I2LK0YKPVTY3PBQTDDC5KJY4GRXLY04' # your Foursquare ID
CLIENT_SECRET = 'O0GWFSPPH5B33T14LAGHYQGMZ5L3JNK1LO5AGTM22EWK2VFO' # your Foursquare Secret
VERSION = '20180604'
LIMIT = 20
print('Your credentails:')
print('CLIENT_ID: ' + CLIENT_ID)
print('CLIENT_SECRET:' + CLIENT_SECRET)

Your credentails:
CLIENT_ID: CCZMEC4N5W03WF3N5I2LK0YKPVTY3PBQTDDC5KJY4GRXLY04
CLIENT_SECRET:O0GWFSPPH5B33T14LAGHYQGMZ5L3JNK1LO5AGTM22EWK2VFO


In [17]:
def getNearbyVenues(names, latitudes, longitudes, radius):
    
    venues_list=[]
    for name, lat, lng,radius in zip(names, latitudes, longitudes,radius):
            
        # create the API request URL
        url = 'https://api.foursquare.com/v2/venues/explore?&client_id={}&client_secret={}&v={}&ll={},{}&radius={}&limit={}'.format(
            CLIENT_ID, 
            CLIENT_SECRET, 
            VERSION, 
            lat, 
            lng, 
            radius, 
            LIMIT)
            
        # make the GET request
        results = requests.get(url).json()["response"]['groups'][0]['items']
       # print(results)
        # return only relevant information for each nearby venue
        venues_list.append([(
            name, 
            lat, 
            lng, 
            v['venue']['name'], 
            v['venue']['location']['lat'], 
            v['venue']['location']['lng'],  
            v['venue']['categories'][0]['name']) for v in results])

    nearby_venues = pd.DataFrame([item for venue_list in venues_list for item in venue_list])
    nearby_venues.columns = ['City', 
                  'Latitude', 
                  'Longitude', 
                  'Venue', 
                  'Venue Latitude', 
                  'Venue Longitude', 
                  'Venue Category']
    
    return(nearby_venues)

In [18]:
df_venues = getNearbyVenues(names = df['City'], latitudes = df['Latitude'],longitudes = df['Longitude'], radius = df['Radius'])
df_venues.head()

Unnamed: 0,City,Latitude,Longitude,Venue,Venue Latitude,Venue Longitude,Venue Category
0,New York City[d],40.6635,-73.9387,Barboncino,40.672104,-73.957412,Pizza Place
1,New York City[d],40.6635,-73.9387,Brooklyn Botanic Garden,40.667622,-73.963191,Botanical Garden
2,New York City[d],40.6635,-73.9387,Covenhoven,40.675143,-73.960203,Beer Bar
3,New York City[d],40.6635,-73.9387,Prospect Park Boathouse & Audubon Center,40.660884,-73.964949,Building
4,New York City[d],40.6635,-73.9387,Brooklyn Museum,40.671521,-73.963677,Art Museum


In [19]:
df_venues.shape

(6202, 7)

### Setting benchmarks by assigning Weights to some Categories

In [20]:
k = df_venues.copy(deep = True)
weights_dict={'Movie Theater':3,'Beach':3,'Concert Hall':2.5,'Playground':3,'Coffee Shop':3.5,'Food Court':4,'Nightclub':4,'Toy / Game Store':4.5,'Theme Park Ride / Attraction':4,'Pub':4}
data = df_venues['Venue Category']
allVenues = list(data)

In [21]:
weights = []
for i in allVenues:
    if i in weights_dict.keys():
        weights.append(weights_dict[i])
    else :
        weights.append(0)
df_venues['weights'] = weights;
df_venues.head()

Unnamed: 0,City,Latitude,Longitude,Venue,Venue Latitude,Venue Longitude,Venue Category,weights
0,New York City[d],40.6635,-73.9387,Barboncino,40.672104,-73.957412,Pizza Place,0.0
1,New York City[d],40.6635,-73.9387,Brooklyn Botanic Garden,40.667622,-73.963191,Botanical Garden,0.0
2,New York City[d],40.6635,-73.9387,Covenhoven,40.675143,-73.960203,Beer Bar,0.0
3,New York City[d],40.6635,-73.9387,Prospect Park Boathouse & Audubon Center,40.660884,-73.964949,Building,0.0
4,New York City[d],40.6635,-73.9387,Brooklyn Museum,40.671521,-73.963677,Art Museum,0.0


### Cleaning Data by dropping Irrelevent Rows (i.e. Rows with Weights = 0)

In [22]:
df_venues.drop(df_venues[df_venues.weights < 1.0].index, inplace=True)
df_venues.head()

Unnamed: 0,City,Latitude,Longitude,Venue,Venue Latitude,Venue Longitude,Venue Category,weights
22,Los Angeles,34.0194,-118.4108,Blue Bottle Coffee,34.027115,-118.387637,Coffee Shop,3.5
58,Chicago,41.8376,-87.6818,Sawada Coffee,41.88373,-87.648726,Coffee Shop,3.5
77,Houston[3],29.7866,-95.3909,Boomtown Coffee,29.802849,-95.400855,Coffee Shop,3.5
78,Houston[3],29.7866,-95.3909,White Oak Music Hall,29.785994,-95.367173,Concert Hall,2.5
100,Philadelphia[e],40.0094,-75.1333,Amalgam Comics & Coffeehouse,39.98512,-75.124364,Coffee Shop,3.5


#### Calculating Mean and Grouping Venues by City

In [23]:
citywise_venues_weights = df_venues[['City','weights']].copy()
citywise_venues_weights_means = citywise_venues_weights.groupby(['City']).mean()
citywise_venues_weights_means = citywise_venues_weights_means.reset_index(drop=False)
citywise_venues_weights_means.head()

Unnamed: 0,City,weights
0,Abilene,3.5
1,Alexandria[m],3.5
2,Allen,2.5
3,Amarillo,3.5
4,Anaheim,3.5


#### Merging Weights and Wiki Tables

In [24]:
city_selection = pd.merge(df, citywise_venues_weights_means, on='City')
city_selection = city_selection[['City','Population density in Km2','weights']].copy()
city_selection.head()

Unnamed: 0,City,Population density in Km2,weights
0,Los Angeles,"3,276/km2",3.5
1,Chicago,"4,600/km2",3.5
2,Houston[3],"1,395/km2",3.0
3,Philadelphia[e],"4,511/km2",3.5
4,San Antonio,"1,250/km2",3.25


In [25]:
city_selection.shape

(243, 3)

#### Normalizing Data

In [26]:
k = city_selection.copy(deep = True)
k['Population density in Km2'] = k['Population density in Km2'].str.split("/", n = 0, expand = True)
k['Population density in Km2'] = k['Population density in Km2'].str.replace(',','')
k['Population density in Km2'] = k['Population density in Km2'].astype(float)
city_selection = k.copy(deep = True)
city_selection.head()

Unnamed: 0,City,Population density in Km2,weights
0,Los Angeles,3276.0,3.5
1,Chicago,4600.0,3.5
2,Houston[3],1395.0,3.0
3,Philadelphia[e],4511.0,3.5
4,San Antonio,1250.0,3.25


In [27]:
from sklearn import preprocessing
column_names_to_normalize = ['Population density in Km2', 'weights']
x = city_selection[column_names_to_normalize].values #returns a numpy array
min_max_scaler = preprocessing.MinMaxScaler()
x_scaled = min_max_scaler.fit_transform(x)
city_selection[column_names_to_normalize] = pd.DataFrame(x_scaled)
city_selection.head()

Unnamed: 0,City,Population density in Km2,weights
0,Los Angeles,0.470174,0.5
1,Chicago,0.664224,0.5
2,Houston[3],0.194489,0.25
3,Philadelphia[e],0.65118,0.5
4,San Antonio,0.173238,0.375


In [29]:
city_selection.shape

(243, 4)

#### Determing City by Calculating Sum of Normalized Data

In [28]:
city_selection['sum'] = city_selection['Population density in Km2'] + city_selection['weights']
row_num = city_selection['sum'].argmax()
city_name = city_selection['City'].iloc[row_num]
city_name

The current behaviour of 'Series.argmax' is deprecated, use 'idxmax'
instead.
The behavior of 'argmax' will be corrected to return the positional
maximum in the future. For now, use 'series.values.argmax' or
'np.argmax(np.array(values))' to get the position of the maximum
row.
  


'Jersey City'

#### Searching State in which city belongs

In [30]:
row = df.loc[df['City']== city_name].index[0]
state_name = df['State'].iloc[row]
state_name

'New Jersey'

### Benchmarking City by Per Capita income (min 50,000 USD)

In [31]:
p_row = df_state.loc[df_state['State'] == state_name].index[0]
per_capital_income = df_state['Per capita income'].iloc[p_row]
print("Per capita income of New Jersey is :", per_capital_income)

Per capita income of New Jersey is : $50,349


+ Since our Benchmarks are met so The best city to open a Casino/ Shopping Mall is **New Jersey**.  
Now lets find out the locality in New Jersey.

In [32]:
lat_newJersey = df['Latitude'].iloc[row]
long_newJersey = df['Longitude'].iloc[row]
print(lat_newJersey, long_newJersey)

40.7114 -74.0648


#### Using FourSquare API to get Venues in New Jersey

In [33]:
def getNearbyVenues1(name, latitudes, longitudes, radius):
    
    LIMIT = 150       
        # create the API request URL
    url = 'https://api.foursquare.com/v2/venues/explore?&client_id={}&client_secret={}&v={}&ll={},{}&radius={}&limit={}'.format(
            CLIENT_ID, 
            CLIENT_SECRET, 
            VERSION, 
            latitudes, 
            longitudes, 
            radius, 
            LIMIT)
            
        # make the GET request
    results = requests.get(url).json()["response"]['groups'][0]['items']
   # print(results)
    venues_list=[]
    venues_list.append([(name,lat,lng,v['venue']['name'],v['venue']['location']['lat'],v['venue']['location']['lng'],v['venue']['categories'][0]['name'])for v in results])
    nearby_venues = pd.DataFrame([item for venue_list in venues_list for item in venue_list])
    nearby_venues.columns = ['City', 'Latitude', 'Longitude', 'Venue', 'Venue Latitude', 'Venue Longitude','Venue Category']
    return(nearby_venues)


new_jersey_venues = getNearbyVenues1(name = 'Jersey City', latitudes = lat_newJersey ,longitudes = long_newJersey, radius = 2500)
new_jersey_venues.head()

Unnamed: 0,City,Latitude,Longitude,Venue,Venue Latitude,Venue Longitude,Venue Category
0,Jersey City,38.3539,-121.9728,The Grind Shop,40.71167,-74.062872,Coffee Shop
1,Jersey City,38.3539,-121.9728,Harry’s Daughter,40.710904,-74.062071,Caribbean Restaurant
2,Jersey City,38.3539,-121.9728,Corgi Spirits at The Jersey City Distillery,40.708304,-74.064803,Distillery
3,Jersey City,38.3539,-121.9728,Hooked JC,40.714709,-74.067009,Fish & Chips Shop
4,Jersey City,38.3539,-121.9728,The Oak On Pine,40.710312,-74.061406,Restaurant


In [34]:
new_jersey_venues.shape

(100, 7)

In [35]:
venues_in_newjersey = new_jersey_venues.copy(deep = True)
venues_in_newjersey.shape

(100, 7)

#### Setting Benchmarks for Locality for 100 Venues

In [36]:
k = new_jersey_venues.copy(deep = True)
new_weightage_dict= {'Coffee Shop' : 3, 
'Caribbean Restaurant':3,
'Distillery':2,
'Fish & Chips Shop':3,
'Science Museum':3,
'Latin American Restaurant':4,
'Restaurant':5,
'State / Provincial Park':1,
'Diner':1,
'Supermarket':1,
'Bar':1,
'Jazz Club':1,
'Golf Course':3,
'Park':2,
'Cajun / Creole Restaurant':2,
'Bakery':2,
'Go Kart Track':3,
'Taco Place':3,
'Hot Dog Joint':2,
'Food Truck':3,
'Beer Garden':3,
'Boutique':4,
'Café':5,
'Bagel Shop':1,
'Record Shop':1,
'Bakery':1,
'Pizza Place':1,
'Ramen Restaurant':1,
'Wine Bar':3,
'Middle Eastern Restaurant':2,
'French Restaurant':2,
'Theater':2,
'Lounge':3,
'Wine Shop':3,
'Cocktail Bar':2,
'New American Restaurant':3,
'Residential Building (Apartment / Condo)':3,
'Pool':4,
'Burger Joint':5,
'Cheese Shop':1,
'Coffee Shop':1,
'Bagel Shop':1,
'Vietnamese Restaurant':1,
'Portuguese Restaurant':1,
'Ice Cream Shop':3,
'Italian Restaurant':2,
'Gym':2,
'Farmers Market':2,
'Bar':3,
'Pizza Place':3,
'Bakery':2,
'Bookstore':3,
'Bar':3,
'Farmers Market':4,
'Asian Restaurant':5,
'Tea Room':1,
'Donut Shop':1,
'Historic Site':1,
'Gym / Fitness Center':1,
'Café':1,
'Mexican Restaurant':3,
'Plaza':2,
'Gay Bar':2,
'Bar':3,
'College Administrative Building':3,
'Mexican Restaurant':2,
'Bakery':3,
'American Restaurant':3,
'American Restaurant':4,
'American Restaurant':5,
'Café':1,
'New American Restaurant':1,
'Chocolate Shop':1,
'Gym':1,
'Grocery Store':1,
'Middle Eastern Restaurant':3,
'American Restaurant':2,
'Frozen Yogurt Shop':2,
'Japanese Restaurant':2,
'Bar':3,
'Liquor Store':3,
'Ice Cream Shop':2,
'Fish Market':3,
'Indie Movie Theater':3,
'Grocery Store':4,
'Modern European Restaurant':5,
'American Restaurant':1,
'Poke Place':1,
'Ramen Restaurant':1,
'Diner':1,
'Brewery':1,
'Burger Joint':3,
'Burger Joint':2,
'Café':2,
'Fried Chicken Joint':2,
'Beer Garden':3,
'Gym / Fitness Center':3,
'Vietnamese Restaurant':2,
'Italian Restaurant':3,
'Pet Store':3}

### Plotting Venes

In [37]:
import matplotlib.cm as cm
import matplotlib.colors as colors
import folium

# create map of the venues that we have using latitude and longitudes
venues_map = folium.Map(location=[lat_newJersey, long_newJersey], zoom_start=15) # generate map centred around Jersey city


# add Jersey City as a red circle mark
folium.features.CircleMarker(
    [lat_newJersey, long_newJersey],
    radius=10,
    popup='Jersey city',
    fill=True,
    color='red',
    fill_color='red',
    fill_opacity=0.6
    ).add_to(venues_map)

<folium.features.CircleMarker at 0x17cddbb0a58>

In [38]:
# add all the venuew of the Jersey city to the map as blue circle markers
for lat, lng, label in zip(venues_in_newjersey['Venue Latitude'], venues_in_newjersey['Venue Longitude'], venues_in_newjersey['Venue']):
    label=folium.Popup(label,parse_html=True)
    folium.features.CircleMarker(
        [lat, lng],
        radius=5,
        popup=label,
        color='blue',
        fill=True,
        fill_color='#3186cc',
        fill_opacity=0.6,
        parse_html = False).add_to(venues_map)
venues_map

#### Assigning Weights to Categories

In [39]:
allVenuesinCity1 = k['Venue Category']

f_weights1 = []
for i in allVenuesinCity1:
    if i in new_weightage_dict.keys():
        f_weights1.append(new_weightage_dict[i])
    else :
        f_weights1.append(0)
k['weights'] = f_weights1;
k.head()

Unnamed: 0,City,Latitude,Longitude,Venue,Venue Latitude,Venue Longitude,Venue Category,weights
0,Jersey City,38.3539,-121.9728,The Grind Shop,40.71167,-74.062872,Coffee Shop,1
1,Jersey City,38.3539,-121.9728,Harry’s Daughter,40.710904,-74.062071,Caribbean Restaurant,3
2,Jersey City,38.3539,-121.9728,Corgi Spirits at The Jersey City Distillery,40.708304,-74.064803,Distillery,2
3,Jersey City,38.3539,-121.9728,Hooked JC,40.714709,-74.067009,Fish & Chips Shop,3
4,Jersey City,38.3539,-121.9728,The Oak On Pine,40.710312,-74.061406,Restaurant,5


In [40]:
newframe = k[['City','Venue Category','weights']].copy()
newframe = k.groupby(['Venue Category']).mean()
newframe.drop(columns = ["Latitude", "Longitude"], inplace = True)
newframe 

Unnamed: 0_level_0,Venue Latitude,Venue Longitude,weights
Venue Category,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
American Restaurant,40.714692,-74.041587,1
Asian Restaurant,40.721099,-74.044339,5
Australian Restaurant,40.717187,-74.044216,0
Bagel Shop,40.72299,-74.058068,1
Bakery,40.721297,-74.048836,3
Bar,40.718437,-74.058701,3
Beer Garden,40.718166,-74.043789,3
Bookstore,40.719984,-74.043205,3
Boutique,40.717606,-74.044299,4
Brewery,40.72066,-74.040287,1


In [41]:
newframe.shape

(69, 3)

### Applying Kmeans Algorithm to Cluster Venues

In [42]:
from scipy import stats
from sklearn.cluster import KMeans
import matplotlib.pyplot as plt
import seaborn as sns
#Standardize
clmns = ['weights','Venue Latitude', 'Venue Longitude']
df_tr_std = stats.zscore(newframe[clmns])
#Cluster the data
kmeans = KMeans(n_clusters=3, random_state=0).fit(df_tr_std)
labels = kmeans.labels_
newframe['clusters'] = labels
#Add the column into our list
clmns.extend(['clusters'])
#Lets analyze the clusters
kframe = newframe[clmns].groupby(['Venue Category']).mean()
kframe = kframe.reset_index(drop = False)
kframe.head()

Unnamed: 0,Venue Category,weights,Venue Latitude,Venue Longitude,clusters
0,American Restaurant,1,40.714692,-74.041587,1
1,Asian Restaurant,5,40.721099,-74.044339,0
2,Australian Restaurant,0,40.717187,-74.044216,1
3,Bagel Shop,1,40.72299,-74.058068,1
4,Bakery,3,40.721297,-74.048836,0


In [43]:
#new group by clusters and add weights of each cluster 
finalWeight = kframe.groupby(['clusters']).mean()
finalWeight

Unnamed: 0_level_0,weights,Venue Latitude,Venue Longitude
clusters,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
0,3.375,40.719921,-74.047287
1,1.382353,40.720276,-74.04816
2,2.545455,40.708189,-74.06583


In [45]:
# Coordinates with maximum weight
lat1 = 40.719921
long1 = -74.047287

### Plotting the Exact location in 100 mtr area

In [46]:
# create map of the venues that we have using latitude and longitudes
final_map = folium.Map(location=[lat1, long1], zoom_start=15) # generate map centred around Jersey city


# add prefered location in the City as a green circle mark
folium.features.CircleMarker(
    [lat1, long1],
    radius=100,
    popup='Shopping Mall/Casino can be opened within this circle',
    fill=True,
    color='green',
    fill_color='green',
    fill_opacity=0.6
    ).add_to(final_map)
final_map

## CONCLUSION
+ Best City in USA for opening Shopping Mall/Casino - **New Jearsey**
+ Best Locality in New Jearsey - on **Jearsey Avenue**

## Drawbacks and Refinements of above approach - 
+ Number of Categories for Benchmarking of City can be increased by assigning more weights to categories.
+ Radius search can be refined but as Foursquare API's have a limit in free trial mode we had to limit the radius.
+ Based on above two points our Final cluster can move to a different and a better city.