In [12]:
# Lebanese restaurants in Los Angeles

# Import required libraries
import io
import os
import time

import folium
import matplotlib.cm as cm
import matplotlib.colors as colors
import numpy as np
import pandas as pd
import requests
from bs4 import BeautifulSoup
from geopy.exc import GeocoderServiceError
from geopy.geocoders import Nominatim
from sklearn.cluster import KMeans

pd.set_option('display.max_columns', None)
pd.set_option('display.max_rows', None)
%matplotlib inline

In [13]:
# DATA ACQUISITION

# List of districts and neighborhoods of Los Angeles.
# The names are scraped from the Wikipedia page below and extracted with BeautifulSoup.
LA_NEIGHBORHOODS_URL = 'https://en.wikipedia.org/wiki/List_of_districts_and_neighborhoods_of_Los_Angeles'
response = requests.get(LA_NEIGHBORHOODS_URL, timeout=30)
response.raise_for_status()  # stop on an HTTP error instead of parsing an error page
website_url = response.text  # name kept from the original cell; it holds the page HTML

# Name the parser explicitly so the parse tree does not depend on which
# optional parsers happen to be installed.
soup = BeautifulSoup(website_url, 'html.parser')
neighbourhoods_list = soup.find_all('ul')  # in this result set, the useful lists are the 3rd and 4th (table in 2 parts)

# Extracting neighborhoods' list: the text of the first link of every list item
neighborhoods = []
for ul_index in (2, 3):  # 3rd and 4th <ul> elements
    for item in neighbourhoods_list[ul_index].find_all('li'):
        if item.a is None:
            # A list item without a link has nothing to read a name from; skip it
            continue
        neighborhoods.append(item.a.decode_contents())

print('List of Los Angeles neighborhoods:\n' + "\n".join(neighborhoods))

List of Los Angeles neighborhoods:
Angelino Heights
Arleta
Arlington Heights
Arts District
Atwater Village
Baldwin Hills
Baldwin Hills/Crenshaw
Baldwin Village
Baldwin Vista
Beachwood Canyon
Bel Air, Bel-Air or Bel Air Estates
Benedict Canyon
Beverly Crest
Beverly Glen
Beverly Grove
Beverly Hills Post Office
Beverly Park
Beverlywood
Boyle Heights
Brentwood
Brentwood Circle
Brentwood Glen
Broadway-Manchester
Brookside
Bunker Hill
Cahuenga Pass
Canoga Park
Canterbury Knolls
Carthay
Castle Heights
Central-Alameda
Central City
Century City
Chatsworth
Chesterfield Square
Cheviot Hills
Chinatown
Civic Center
Crenshaw
Crestwood Hills
Cypress Park
Del Rey
Downtown
Eagle Rock
East Gate Bel Air
East Hollywood
Echo Park
Edendale
El Sereno
Elysian Heights
Elysian Park
Elysian Valley
Encino
Exposition Park
Faircrest Heights
Fairfax
Fashion District
Filipinotown, Historic
Financial District
Florence
Flower District
Franklin Hills
Gallery Row
Garvanza
Glassell Park
Gramercy Park
Granada Hills
Green M

In [14]:
# Correcting specific neighborhoods names
# Scraped entry -> name to use instead.
# NOTE(review): '[33]' looks like a footnote marker picked up by the scrape - confirm.
name_fixes = {
    '[33]': 'Pico Robertson',
    'Bel Air, Bel-Air or Bel Air Estates': 'Bel Air',
}

# Apply the fixes first, then turn every hyphen into a space
neighborhoods = [name_fixes.get(name, name).replace('-', ' ') for name in neighborhoods]

print('List of Los Angeles neighborhoods:\n' + "\n".join(neighborhoods))

List of Los Angeles neighborhoods:
Angelino Heights
Arleta
Arlington Heights
Arts District
Atwater Village
Baldwin Hills
Baldwin Hills/Crenshaw
Baldwin Village
Baldwin Vista
Beachwood Canyon
Bel Air
Benedict Canyon
Beverly Crest
Beverly Glen
Beverly Grove
Beverly Hills Post Office
Beverly Park
Beverlywood
Boyle Heights
Brentwood
Brentwood Circle
Brentwood Glen
Broadway Manchester
Brookside
Bunker Hill
Cahuenga Pass
Canoga Park
Canterbury Knolls
Carthay
Castle Heights
Central Alameda
Central City
Century City
Chatsworth
Chesterfield Square
Cheviot Hills
Chinatown
Civic Center
Crenshaw
Crestwood Hills
Cypress Park
Del Rey
Downtown
Eagle Rock
East Gate Bel Air
East Hollywood
Echo Park
Edendale
El Sereno
Elysian Heights
Elysian Park
Elysian Valley
Encino
Exposition Park
Faircrest Heights
Fairfax
Fashion District
Filipinotown, Historic
Financial District
Florence
Flower District
Franklin Hills
Gallery Row
Garvanza
Glassell Park
Gramercy Park
Granada Hills
Green Meadows
Griffith Park
Hancock

In [15]:
# Forming the neighborhoods dataframe: a single 'Neighborhood' column, one row per name
columns = ['Neighborhood']
neighborhoods_df = pd.DataFrame({columns[0]: neighborhoods}, columns=columns)
print('There are {} neighborhoods in the dataframe.'.format(len(neighborhoods_df)))
neighborhoods_df.head(10)

There are 195 neighborhoods in the dataframe.


Unnamed: 0,Neighborhood
0,Angelino Heights
1,Arleta
2,Arlington Heights
3,Arts District
4,Atwater Village
5,Baldwin Hills
6,Baldwin Hills/Crenshaw
7,Baldwin Village
8,Baldwin Vista
9,Beachwood Canyon


In [16]:
# Getting the geographical coordinates of each neighborhood
MAX_GEOCODE_ATTEMPTS = 10   # same number of tries per neighborhood as before
GEOCODE_RETRY_DELAY = 1.0   # seconds to wait before retrying after a service error

latitudes = []
longitudes = []

# One geocoder client is enough; it does not need to be rebuilt for every request
geolocator = Nominatim(user_agent="LA_explorer")

for neigh in neighborhoods_df['Neighborhood']:
    address = '{}, Los Angeles, US'.format(neigh)
    location = None
    for attempt in range(MAX_GEOCODE_ATTEMPTS):
        try:
            location = geolocator.geocode(address)
            # A None result means "address not found"; asking again will not change
            # that, so leave the retry loop either way.
            break
        except GeocoderServiceError:
            # Service-side failure (timeout, unavailable, ...): wait, then retry.
            # Any other exception is a real error and is allowed to propagate.
            time.sleep(GEOCODE_RETRY_DELAY)

    if location is None:
        # 0 is the "no coordinates" marker; a later cell drops rows whose coordinates are 0
        latitudes.append(0)
        longitudes.append(0)
        continue

    latitude = location.latitude
    longitude = location.longitude
    latitudes.append(latitude)
    longitudes.append(longitude)
    print('The location coordinates of {} are {} and {}'.format(neigh, latitude, longitude))
print('Done!')

The location coordinates of Angelino Heights are 34.0702889 and -118.2547965
The location coordinates of Arleta are 34.2413266 and -118.4322047
The location coordinates of Arlington Heights are 34.0434937 and -118.3213735
The location coordinates of Arts District are 34.0412389 and -118.2344503
The location coordinates of Atwater Village are 34.1186975 and -118.2623924
The location coordinates of Baldwin Hills/Crenshaw are 34.0075684 and -118.3505956
The location coordinates of Beachwood Canyon are 34.1222919 and -118.3213845
The location coordinates of Bel Air are 34.0988833 and -118.4598811
The location coordinates of Benedict Canyon are 34.0447702 and -118.4000223
The location coordinates of Beverly Crest are 34.1167701 and -118.4322607
The location coordinates of Beverly Glen are 34.1077855 and -118.4456361
The location coordinates of Beverly Grove are 34.0760338 and -118.3699723
The location coordinates of Beverly Park are 34.06376935 and -118.26468959898818
The location coordinat

The location coordinates of Solano Canyon are 34.0818871 and -118.234983
The location coordinates of South Park are 33.9959019 and -118.2697722
The location coordinates of Studio City are 34.1483989 and -118.3961877
The location coordinates of Sunland are 34.2669466 and -118.3023
The location coordinates of Sunset Junction are 34.0926493 and -118.2807954
The location coordinates of Sun Valley are 34.2204227 and -118.3878945
The location coordinates of Sylmar are 34.3076252 and -118.4492148
The location coordinates of Tarzana are 34.1714436 and -118.5429789
The location coordinates of Terminal Island are 33.7451061 and -118.26302201608382
The location coordinates of Thai Town are 34.1018533 and -118.3048616
The location coordinates of Toluca Lake are 34.1521688 and -118.3571417
The location coordinates of Toy District are 34.0470483 and -118.24551021662734
The location coordinates of Tujunga are 34.252225 and -118.2884105
The location coordinates of University Hills are 34.0666487 and -

In [17]:
# Attach the geocoded coordinates to the dataframe
neighborhoods_df['Latitude'] = latitudes
neighborhoods_df['Longitude'] = longitudes

# Keep only the neighborhoods that were located: a 0 in either coordinate
# marks a neighborhood for which no coordinates were found
has_latitude = neighborhoods_df['Latitude'] != 0
has_longitude = neighborhoods_df['Longitude'] != 0
neighborhoods_df = neighborhoods_df.loc[has_latitude & has_longitude].reset_index(drop=True)
print('There are {} neighborhoods in the dataframe.'.format(len(neighborhoods_df)))
neighborhoods_df.head(10)

There are 155 neighborhoods in the dataframe.


Unnamed: 0,Neighborhood,Latitude,Longitude
0,Angelino Heights,34.070289,-118.254796
1,Arleta,34.241327,-118.432205
2,Arlington Heights,34.043494,-118.321374
3,Arts District,34.041239,-118.23445
4,Atwater Village,34.118698,-118.262392
5,Baldwin Hills/Crenshaw,34.007568,-118.350596
6,Beachwood Canyon,34.122292,-118.321384
7,Bel Air,34.098883,-118.459881
8,Benedict Canyon,34.04477,-118.400022
9,Beverly Crest,34.11677,-118.432261


In [41]:
# Now we will get the data of area and population of each neighborhood

# We start by adding the new columns to the dataframe.
# They are created as floats (0.0 = "no value yet") because the values written into
# them later are floats; starting from integer columns would make pandas change the
# column dtype on assignment.
neighborhoods_df['Population'] = 0.0
neighborhoods_df['Area'] = 0.0

# The values come from the table on this page:
# http://wiki.stat.ucla.edu/socr/index.php/SOCR_Data_LA_Neighborhoods_Data
# The same table also lists the population; note that those figures are a bit old (2010).
SOCR_LA_URL = 'http://wiki.stat.ucla.edu/socr/index.php/SOCR_Data_LA_Neighborhoods_Data'
response = requests.get(SOCR_LA_URL, timeout=30)
response.raise_for_status()  # stop on an HTTP error instead of parsing an error page
website_url = response.text  # name kept from the original cell; it holds the page HTML

# Name the parser explicitly so the parse tree does not depend on which
# optional parsers happen to be installed.
soup = BeautifulSoup(website_url, 'html.parser')

# Rows of the second 'wikitable' on the page, without its first row
# (presumably the header row - confirm against the page)
area_table = soup.find_all('table', {'class': 'wikitable'})[1].find_all('tr')[1:]

In [42]:
# Copy population and area from the scraped table into the matching dataframe rows
POPULATION_CELL = 11  # index of the <td> read as the population
AREA_CELL = 12        # index of the <td> read as the area

for row in area_table:
    cells = row.find_all('td')  # looked up once per row
    # Skip rows that do not have the expected layout (too few cells, or no link
    # in the first cell to read the name from) instead of crashing on them
    if len(cells) <= AREA_CELL or cells[0].a is None:
        continue

    # Underscores and hyphens become spaces so the name can match the dataframe's names
    neighbor_name = cells[0].a.decode_contents().strip().replace("_", " ").replace("-", " ")
    # A non-numeric cell still raises ValueError: that signals a changed table format
    population = float(cells[POPULATION_CELL].get_text())
    area = float(cells[AREA_CELL].get_text())

    is_match = neighborhoods_df['Neighborhood'] == neighbor_name  # computed once, reused below
    if is_match.any():
        neighborhoods_df.loc[is_match, 'Population'] = population
        neighborhoods_df.loc[is_match, 'Area'] = area

In [43]:
# Keep only the neighborhoods that have both an area and a population value
# (a 0 in either column means no value was filled in for that neighborhood)
has_area = neighborhoods_df['Area'] != 0
has_population = neighborhoods_df['Population'] != 0
neighborhoods_df = neighborhoods_df.loc[has_area & has_population].reset_index(drop=True)
print('There are {} neighborhoods in the dataframe.'.format(len(neighborhoods_df)))
neighborhoods_df.head(10)

There are 108 neighborhoods in the dataframe.


Unnamed: 0,Neighborhood,Latitude,Longitude,Population,Area
0,Arleta,34.241327,-118.432205,31068.0,3.1
1,Arlington Heights,34.043494,-118.321374,22106.0,1.0
2,Atwater Village,34.118698,-118.262392,14888.0,1.8
3,Baldwin Hills/Crenshaw,34.007568,-118.350596,30123.0,3.0
4,Bel Air,34.098883,-118.459881,7928.0,6.6
5,Beverly Crest,34.11677,-118.432261,10610.0,7.9
6,Beverly Grove,34.076034,-118.369972,21417.0,1.7
7,Beverlywood,34.046633,-118.395038,6080.0,0.8
8,Boyle Heights,34.043689,-118.209768,92785.0,6.5
9,Brentwood,34.05214,-118.47407,31344.0,15.2


In [57]:
# Population density: population divided by area, rounded to a whole number
neighborhoods_df['Population density'] = (
    neighborhoods_df['Population']
    .div(neighborhoods_df['Area'])
    .round()
)
neighborhoods_df.head(10)

Unnamed: 0,Neighborhood,Latitude,Longitude,Population,Area,Population density
0,Arleta,34.241327,-118.432205,31068.0,3.1,10022.0
1,Arlington Heights,34.043494,-118.321374,22106.0,1.0,22106.0
2,Atwater Village,34.118698,-118.262392,14888.0,1.8,8271.0
3,Baldwin Hills/Crenshaw,34.007568,-118.350596,30123.0,3.0,10041.0
4,Bel Air,34.098883,-118.459881,7928.0,6.6,1201.0
5,Beverly Crest,34.11677,-118.432261,10610.0,7.9,1343.0
6,Beverly Grove,34.076034,-118.369972,21417.0,1.7,12598.0
7,Beverlywood,34.046633,-118.395038,6080.0,0.8,7600.0
8,Boyle Heights,34.043689,-118.209768,92785.0,6.5,14275.0
9,Brentwood,34.05214,-118.47407,31344.0,15.2,2062.0


In [58]:
# MAP VISUALISATION

# Look up the coordinates of Los Angeles itself; the map below is centred on them
address = 'Los Angeles, US'
geolocator = Nominatim(user_agent="LA_explorer")
location = geolocator.geocode(address)
latitude, longitude = location.latitude, location.longitude
print('The geograpical coordinate of Los Angeles are {}, {}.'.format(latitude, longitude))

The geograpical coordinate of Los Angeles are 34.0536909, -118.2427666.


In [60]:
# Build the Los Angeles map, centred on the city coordinates found above
map_LA = folium.Map(location=[latitude, longitude], zoom_start=10)

# Draw one circle per neighborhood; the popup shows its name and population density,
# and the circle radius grows with the density (5 px per 10000 density units)
marker_rows = zip(
    neighborhoods_df['Latitude'],
    neighborhoods_df['Longitude'],
    neighborhoods_df['Neighborhood'],
    neighborhoods_df['Population density'],
)
for lat, lng, neighborhood, pop in marker_rows:
    popup_text = '{}, density {}/km'.format(neighborhood, pop)
    marker = folium.CircleMarker(
        [lat, lng],
        radius=5 * (pop / 10000),
        popup=folium.Popup(popup_text, parse_html=True),
        color='blue',
        fill=True,
        fill_color='#3186cc',
        fill_opacity=0.7,
        parse_html=False,
    )
    marker.add_to(map_LA)

# Last expression of the cell: renders the map
map_LA

In [None]:
# Define foursquare credentials
# The ID and secret are read from the environment so that they are never stored in
# the notebook. Set FOURSQUARE_CLIENT_ID and FOURSQUARE_CLIENT_SECRET before
# starting the kernel.
# NOTE(review): the values that used to be hardcoded here should be treated as
# exposed and rotated.
CLIENT_ID = os.environ.get('FOURSQUARE_CLIENT_ID', '') # Foursquare ID
CLIENT_SECRET = os.environ.get('FOURSQUARE_CLIENT_SECRET', '') # Foursquare Secret
if not (CLIENT_ID and CLIENT_SECRET):
    print('Warning: FOURSQUARE_CLIENT_ID and/or FOURSQUARE_CLIENT_SECRET are not set.')
VERSION = '20200416' # API version
LIMIT = 100
radius = 500