First of all, let's import the necessary libraries.

In [1]:
!pip install bs4
!conda install -c conda-forge folium=0.5.0 --yes
!pip install geopy

Collecting package metadata (current_repodata.json): done
Solving environment: done

# All requested packages already installed.



In [2]:
import json
import os

import folium
import matplotlib.cm as cm
import matplotlib.colors as colors
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import requests
from bs4 import BeautifulSoup
from geopy.geocoders import Nominatim

We will use the Los Angeles neighborhoods dataset published on the UCLA SOCR wiki.

In [3]:
# Page that hosts the SOCR "LA Neighborhoods" dataset.
url1 = 'http://wiki.stat.ucla.edu/socr/index.php/SOCR_Data_LA_Neighborhoods_Data'

# read_html parses every <table> on the page into a DataFrame; the one at index 2 is used.
socr_tables = pd.read_html(url1, flavor='bs4')
df1 = socr_tables[2]
df1.head()

Unnamed: 0,LA_Nbhd,Income,Schools,Diversity,Age,Homes,Vets,Asian,Black,Latino,White,Population,Area,Longitude,Latitude
0,Adams_Normandie,29606,691,0.6,26,0.26,0.05,0.05,0.25,0.62,0.06,31068,0.8,-118.30027,34.03097
1,Arleta,65649,719,0.4,29,0.29,0.07,0.11,0.02,0.72,0.13,31068,3.1,-118.430015,34.240603
2,Arlington_Heights,31423,687,0.8,31,0.31,0.05,0.13,0.25,0.57,0.05,22106,1.0,-118.320109,34.043611
3,Atwater_Village,53872,762,0.9,34,0.34,0.06,0.2,0.01,0.51,0.22,14888,1.8,-118.265808,34.124908
4,Baldwin_Hills/Crenshaw,37948,656,0.4,36,0.36,0.1,0.05,0.71,0.17,0.03,30123,3.0,-118.3667,34.01909


We will clean the table: keep only the columns we need, rename the neighborhood column, and tidy up the neighborhood names.

In [4]:
# Keep only the columns used later and rename the name column.
# Selecting the columns directly (rather than stacking Series into a new frame and
# transposing it) preserves each column's dtype; the transpose round-trip turns
# every column, including the numeric ones, into dtype `object`.
# Names are made readable: "Baldwin_Hills/Crenshaw" -> "Baldwin Hills / Crenshaw".
# regex=False makes '_' and '/' literal text regardless of the pandas version.
neighborhoods = (
    df1[['LA_Nbhd', 'Income', 'Longitude', 'Latitude']]
    .rename(columns={'LA_Nbhd': 'Neighborhood'})
    .assign(
        Neighborhood=lambda frame: frame['Neighborhood']
        .str.replace('_', ' ', regex=False)
        .str.replace('/', ' / ', regex=False)
    )
)
print('There are ', neighborhoods['Neighborhood'].count(), ' neighborhoods in LA:')
neighborhoods.head()

There are  110  neighborhoods in LA:


Unnamed: 0,Neighborhood,Income,Longitude,Latitude
0,Adams Normandie,29606,-118.3,34.031
1,Arleta,65649,-118.43,34.2406
2,Arlington Heights,31423,-118.32,34.0436
3,Atwater Village,53872,-118.266,34.1249
4,Baldwin Hills / Crenshaw,37948,-118.367,34.0191


Next, let's get property-crime data for the Los Angeles neighborhoods from the Los Angeles Times.

In [5]:
# Los Angeles Times listing of property crime by neighborhood.
url2 = 'http://maps.latimes.com/neighborhoods/property-crime/neighborhood/list/'

# read_html parses every <table> on the page into a DataFrame; the one at index 3 is used.
crime_tables = pd.read_html(url2, flavor='bs4')
df2 = crime_tables[3]
df2.head()

Unnamed: 0,Rank,Neighborhood,Per Capita,Total
0,1,Elysian Park,263.2,70
1,2,Fairfax,251.5,336
2,3,Beverly Grove,243.3,556
3,4,Playa Vista,231.3,139
4,5,Rancho Park,199.7,91


We will use the geopy library to get the latitude and longitude of Los Angeles.

In [6]:
# Look up the coordinates of the city with the Nominatim geocoder.
# "CA" is the state abbreviation for California ("LA" is the abbreviation for Louisiana),
# so the query is unambiguous.
address = 'Los Angeles, CA'
geolocator = Nominatim(user_agent="LA_explorer")
location = geolocator.geocode(address)

# geocode() returns None when nothing matches; fail with a clear message
# instead of an AttributeError on the attribute access below.
if location is None:
    raise ValueError('Could not geocode address: {!r}'.format(address))

latitude = location.latitude
longitude = location.longitude
print('The geograpical coordinate of Los Angeles are {}, {}.'.format(latitude, longitude))

The geograpical coordinate of Los Angeles are 34.0536909, -118.242766.


Folium is a great visualization library. We can zoom into the map below and click on each circle marker to reveal the name of the neighborhood.

In [7]:
# Base map centred on the geocoded city coordinates.
map_LA = folium.Map(location=[latitude, longitude], zoom_start=9)

# One clickable circle per neighborhood; the popup carries the neighborhood name.
marker_rows = zip(
    neighborhoods['Latitude'],
    neighborhoods['Longitude'],
    neighborhoods['Neighborhood'],
)
for lat, lng, neighborhood in marker_rows:
    label = folium.Popup('{}'.format(neighborhood), parse_html=True)
    marker = folium.CircleMarker(
        [lat, lng],
        radius=5,
        popup=label,
        color='blue',
        fill=True,
        fill_color='#3186cc',
        fill_opacity=0.7,
        parse_html=False,
    )
    marker.add_to(map_LA)

# Announce and display the finished map.
print('This is a map of Los Angeles neighburhoods')
map_LA

This is a map of Los Angeles neighburhoods


We define the Foursquare credentials and API version below (sensitive code cell).

In [8]:
# Foursquare credentials are read from the environment so that no secret is stored
# in the notebook. Set FOURSQUARE_CLIENT_ID and FOURSQUARE_CLIENT_SECRET before
# starting the kernel. Any key pair that was ever committed in a notebook should be
# treated as leaked and regenerated.
CLIENT_ID = os.environ.get('FOURSQUARE_CLIENT_ID', '')
CLIENT_SECRET = os.environ.get('FOURSQUARE_CLIENT_SECRET', '')
if not (CLIENT_ID and CLIENT_SECRET):
    print('WARNING: FOURSQUARE_CLIENT_ID / FOURSQUARE_CLIENT_SECRET are not set; '
          'Foursquare requests will fail.')
VERSION = '20210313'  # Foursquare API version
LIMIT = 100  # maximum number of venues requested per Foursquare call

Let's create a function that returns the Foursquare venues within 500 m of each neighborhood in Los Angeles.

In [9]:
def getNearbyVenues(names, latitudes, longitudes, radius=500, verbose=True, timeout=30):
    """Query the Foursquare "explore" endpoint around each location.

    Parameters
    ----------
    names, latitudes, longitudes : iterables of equal length
        Label and coordinates of every location to query (iterated together with zip).
    radius : int, default 500
        Value sent as the ``radius`` query parameter.
    verbose : bool, default True
        Print each location name before it is queried.
    timeout : float, default 30
        Seconds to wait for each HTTP request.

    Returns
    -------
    pandas.DataFrame
        One row per returned venue with the columns 'Neighborhood',
        'Neighborhood Latitude', 'Neighborhood Longitude', 'Venue',
        'Venue Latitude', 'Venue Longitude' and 'Venue Category'.
        The frame is empty (but has these columns) when nothing is returned.

    Raises
    ------
    requests.HTTPError
        If Foursquare answers with an HTTP error status.
    """
    columns = ['Neighborhood',
               'Neighborhood Latitude',
               'Neighborhood Longitude',
               'Venue',
               'Venue Latitude',
               'Venue Longitude',
               'Venue Category']

    rows = []
    for name, lat, lng in zip(names, latitudes, longitudes):
        if verbose:
            print(name)

        # Let requests build and URL-encode the query string.
        params = {
            'client_id': CLIENT_ID,
            'client_secret': CLIENT_SECRET,
            'v': VERSION,
            'll': '{},{}'.format(lat, lng),
            'radius': radius,
            'limit': LIMIT,
        }
        response = requests.get('https://api.foursquare.com/v2/venues/explore',
                                params=params, timeout=timeout)
        # Surface API failures as an HTTP error rather than a KeyError on the payload.
        response.raise_for_status()

        groups = response.json().get('response', {}).get('groups') or []
        items = groups[0].get('items', []) if groups else []

        # Keep only the relevant information for each nearby venue.
        for item in items:
            venue = item['venue']
            categories = venue.get('categories') or []
            rows.append((
                name,
                lat,
                lng,
                venue['name'],
                venue['location']['lat'],
                venue['location']['lng'],
                # a venue may come without any category
                categories[0]['name'] if categories else None,
            ))

    # Passing `columns` at construction keeps the schema even when `rows` is empty.
    return pd.DataFrame(rows, columns=columns)

Let's create a dataframe of Los Angeles venues and see how many of them were returned:

In [11]:
# Query Foursquare once per neighborhood (default radius) and collect every venue.
LA_venues = getNearbyVenues(
    neighborhoods['Neighborhood'],
    neighborhoods['Latitude'],
    neighborhoods['Longitude'],
)
# Number of venues returned for each neighborhood.
LA_venues.groupby('Neighborhood').count()

Adams Normandie
Arleta
Arlington Heights
Atwater Village
Baldwin Hills / Crenshaw
Bel-Air
Beverly Crest
Beverly Grove
Beverlywood
Boyle Heights
Brentwood
Broadway Manchester
Canoga Park
Carthay
Central Alameda
Century City
Chatsworth
Chesterfield Square
Cheviot Hills
Chinatown
Cypress Park
Del Rey
Downtown
Eagle Rock
East Hollywood
Echo Park
El Sereno
Elysian Park
Elysian Valley
Encino
Exposition Park
Fairfax
Florence
Glassell Park
Gramercy Park
Granada Hills
Green Meadows
Hancock Park
Harbor City
Harbor Gateway
Harvard Heights
Harvard Park
Highland Park
Historic South Central
Hollywood
Hollywood Hills
Hollywood Hills West
Hyde Park
Jefferson Park
Koreatown
Lake Balboa
Lake View Terrace
Larchmont
Leimert Park
Lincoln Heights
Los Feliz
Manchester Square
Mar Vista
Mid City
Mid Wilshire
Mission Hills
Montecito Heights
Mount Washington
North Hills
North Hollywood
Northridge
Pacific Palisades
Pacoima
Palms
Panorama City
Pico Robertson
Pico Union
Playa del Rey
Playa Vista
Porter Ranch
Rancho

Unnamed: 0_level_0,Neighborhood Latitude,Neighborhood Longitude,Venue,Venue Latitude,Venue Longitude,Venue Category
Neighborhood,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
Adams Normandie,9,9,9,9,9,9
Arleta,5,5,5,5,5,5
Arlington Heights,8,8,8,8,8,8
Atwater Village,40,40,40,40,40,40
Baldwin Hills / Crenshaw,2,2,2,2,2,2
...,...,...,...,...,...,...
Westchester,6,6,6,6,6,6
Westlake,53,53,53,53,53,53
Wilmington,14,14,14,14,14,14
Winnetka,11,11,11,11,11,11


Let's find out how many unique categories can be curated from all the returned venues.

In [12]:
# Number of distinct venue categories; dropna=False counts a missing value too,
# exactly as len(Series.unique()) does.
n_categories = LA_venues['Venue Category'].nunique(dropna=False)
print('There are {} uniques categories.'.format(n_categories))

There are 267 uniques categories.


In [13]:
# one hot encoding
LA_onehot = pd.get_dummies(LA_venues[['Venue Category']], prefix="", prefix_sep="")

# add neighbourhood column back to dataframe
LA_onehot['Neighborhood'] = LA_venues['Neighborhood'] 

# move neighbourhood column to the first column
fixed_columns = [LA_onehot.columns[-1]] + list(LA_onehot.columns[:-1])
LA_onehot = LA_onehot[fixed_columns]

LA_onehot.head()

Unnamed: 0,Yoga Studio,ATM,Accessories Store,Adult Boutique,American Restaurant,Arcade,Art Gallery,Art Museum,Arts & Crafts Store,Asian Restaurant,...,Vietnamese Restaurant,Volleyball Court,Warehouse Store,Watch Shop,Weight Loss Center,Whisky Bar,Wine Bar,Wine Shop,Wings Joint,Women's Store
0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


Next, let's group the rows by neighborhood, taking the mean of the frequency of occurrence of each category.

In [14]:
# Average the one-hot rows per neighborhood: each value becomes the share of the
# neighborhood's venues that fall in that category.
LA_grouped = (
    LA_onehot
    .groupby('Neighborhood')
    .mean()
    .reset_index()
)
LA_grouped.head()

Unnamed: 0,Neighborhood,Yoga Studio,ATM,Accessories Store,Adult Boutique,American Restaurant,Arcade,Art Gallery,Art Museum,Arts & Crafts Store,...,Vietnamese Restaurant,Volleyball Court,Warehouse Store,Watch Shop,Weight Loss Center,Whisky Bar,Wine Bar,Wine Shop,Wings Joint,Women's Store
0,Adams Normandie,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,Arleta,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,Arlington Heights,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,Atwater Village,0.0,0.0,0.0,0.0,0.025,0.0,0.0,0.0,0.0,...,0.0,0.0,0.025,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,Baldwin Hills / Crenshaw,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


Let's write a function that sorts the venue categories of a neighborhood in descending order of frequency:


In [15]:
def return_most_common_venues(row, num_top_venues):
    """Return the labels of the highest-valued entries of a row.

    The first element of ``row`` is skipped (in this notebook it holds the
    neighborhood name); the remaining entries are sorted by value in descending
    order and the index labels of the first ``num_top_venues`` are returned
    as a NumPy array.
    """
    ranked = row.iloc[1:].sort_values(ascending=False)
    top_labels = ranked.index.values
    return top_labels[0:num_top_venues]

Let's create a dataframe with the top 10 venues for each neighborhood.

In [16]:
num_top_venues = 10

# ordinal suffixes for 1st, 2nd and 3rd; every later rank uses 'th'
indicators = ['st', 'nd', 'rd']

# Column labels: 'Neighborhood' followed by "1st Most Common Venue", "2nd ...", ...
# An explicit bounds check replaces the former bare `except:`, which would have
# hidden any unrelated error raised while building the label.
columns = ['Neighborhood']
for ind in range(num_top_venues):
    suffix = indicators[ind] if ind < len(indicators) else 'th'
    columns.append('{}{} Most Common Venue'.format(ind + 1, suffix))

# create a new dataframe keyed by neighborhood
neighborhoods_venues_sorted = pd.DataFrame(columns=columns)
neighborhoods_venues_sorted['Neighborhood'] = LA_grouped['Neighborhood']

# fill each row with that neighborhood's top categories
for ind in range(LA_grouped.shape[0]):
    neighborhoods_venues_sorted.iloc[ind, 1:] = return_most_common_venues(LA_grouped.iloc[ind, :], num_top_venues)

neighborhoods_venues_sorted.head()

Unnamed: 0,Neighborhood,1st Most Common Venue,2nd Most Common Venue,3rd Most Common Venue,4th Most Common Venue,5th Most Common Venue,6th Most Common Venue,7th Most Common Venue,8th Most Common Venue,9th Most Common Venue,10th Most Common Venue
0,Adams Normandie,Sushi Restaurant,Gas Station,Park,Grocery Store,Playground,Taco Place,Latin American Restaurant,Women's Store,Ethiopian Restaurant,Event Service
1,Arleta,Bakery,Flower Shop,Convenience Store,Historic Site,Video Store,Farm,Escape Room,Ethiopian Restaurant,Event Service,Fabric Shop
2,Arlington Heights,Seafood Restaurant,Shop & Service,Grocery Store,Rental Car Location,Restaurant,Donut Shop,Café,Escape Room,Women's Store,Event Service
3,Atwater Village,Food Truck,Mobile Phone Shop,Chinese Restaurant,Fast Food Restaurant,Ice Cream Shop,Diner,Steakhouse,Spa,Shoe Store,Coffee Shop
4,Baldwin Hills / Crenshaw,Flower Shop,Clothing Store,Women's Store,Escape Room,Food Stand,Food Service,Food Court,Food,Filipino Restaurant,Fast Food Restaurant
