In [1]:
import pandas as pd
import numpy as np
import pickle
#https://pypi.org/project/geopy/
from geopy.geocoders import Nominatim

import seaborn as sns
import matplotlib.pyplot as plt
%matplotlib inline

# built in api 
import foursquare as fs

import requests
from bs4 import BeautifulSoup
from pandas import json_normalize

In [2]:
from sklearn.preprocessing import StandardScaler
from sklearn.cluster import KMeans

In [3]:
from IPython.display import Markdown, display
def printmd(string):
    """Render ``string`` as Markdown in the cell output."""
    rendered = Markdown(string)
    display(rendered)

from IPython.core.interactiveshell import InteractiveShell
InteractiveShell.ast_node_interactivity = "all"

---
# Foursquare Data Pull

In [4]:
inputLocation = 'Chicago, IL'

def get_coords(inputLocation):
    """Geocode a free-text place name with Nominatim.

    Parameters
    ----------
    inputLocation : str
        Place name or address, e.g. ``'Chicago, IL'``.

    Returns
    -------
    tuple
        ``(ll, latitude, longitude)`` where ``ll`` is the ``'lat,lng'``
        string form of the two coordinates.

    Raises
    ------
    ValueError
        If the geocoder returns no match for ``inputLocation``.
    """
    geolocator = Nominatim(user_agent="my_user_agent")
    location = geolocator.geocode(inputLocation)
    # geocode() returns None when nothing matches; fail with a clear message
    # instead of an AttributeError on `None.latitude`.
    if location is None:
        raise ValueError(f'Could not geocode location: {inputLocation!r}')
    latitude = location.latitude
    longitude = location.longitude
    ll = f'{latitude},{longitude}'
    return ll, latitude, longitude

get_coords(inputLocation)

('41.8755616,-87.6244212', 41.8755616, -87.6244212)

In [5]:
geolocator = Nominatim(user_agent="my_user_agent")
location = geolocator.geocode(inputLocation)
print(f'Location Address\n{location.address}\n')
print(f'Latitude & Longitude\n{(location.latitude, location.longitude)}\n')
# print(location.raw)

Location Address
Chicago, Cook County, Illinois, United States

Latitude & Longitude
(41.8755616, -87.6244212)



In [6]:
def foursquare_api():
    """Build a Foursquare client from credentials stored in ``foursquare_keys.txt``.

    The file is read from the current working directory and must contain the
    client id and the client secret, in that order, one per line. Blank lines
    (for example a trailing newline at the end of the file) are ignored.

    Returns
    -------
    foursquare.Foursquare
        Client configured with the credentials and the pinned API version.

    Raises
    ------
    FileNotFoundError
        If ``foursquare_keys.txt`` does not exist.
    ValueError
        If the file does not contain exactly two non-blank lines.
    """
    with open('foursquare_keys.txt', 'r') as f:
        credentials = [line.strip() for line in f if line.strip()]
    if len(credentials) != 2:
        raise ValueError(
            'foursquare_keys.txt must contain exactly two non-blank lines '
            f'(client id, client secret); found {len(credentials)}'
        )
    CLIENT_ID, CLIENT_SECRET = credentials

    VERSION = '20201112' # FOURSQUARE API VERSION

    # Construct the client object
    client = fs.Foursquare(client_id=CLIENT_ID,
                           client_secret=CLIENT_SECRET,
                           version=VERSION)
    return client

In [7]:
#https://developer.foursquare.com/docs/api-reference/venues/search/#parameters
def explore_venues(client, inputLocation, limit=100, radius=250):
    '''Return venues from Foursquare's ``venues.explore`` call around a place.

    Arguments:
        client: Foursquare client, e.g. the value returned by foursquare_api().
        inputLocation: place name such as 'Chicago, IL'; geocoded with get_coords().
        limit: maximum number of venues requested (defaults to 100).
        radius: search radius sent to the API (defaults to 250; pass a larger
            value for a city-wide search).

    Returns:
        pandas DataFrame with columns name, city, categories, address, lat, lng.
        ``categories`` holds the name of the first category of each venue, or
        None when the venue has no category. Fields missing from the response
        are returned as NaN columns; an empty response yields an empty frame
        with the same columns.
    '''
    ll = get_coords(inputLocation)[0]
    params = {'ll': ll,
              'limit': limit,
              'intent': 'browse',
              'radius': radius,
              }
    response = client.venues.explore(params)
    groups = response.get('groups') or []
    items = groups[0]['items'] if groups else []

    filtered_cols = ['venue.name',
                     'venue.location.city',
                     'venue.categories',
                     'venue.location.address',
                     'venue.location.lat',
                     'venue.location.lng']
    # reindex instead of .loc: a field absent from every venue becomes a NaN
    # column rather than raising KeyError.
    venues = json_normalize(items).reindex(columns=filtered_cols)
    # keep only the first category name; tolerate venues without categories
    venues['venue.categories'] = [
        cats[0]['name'] if isinstance(cats, list) and cats else None
        for cats in venues['venue.categories']
    ]
    venues.columns = [col.split(".")[-1] for col in venues.columns]

    return venues

In [8]:
explore_venues(foursquare_api(), 'Chicago, IL', radius=100000)

FileNotFoundError: [Errno 2] No such file or directory: 'foursquare_keys.txt'

In [None]:
explore_venues(foursquare_api(), 'Brook Park, OH', radius=100000)

---
# City Data Web Scraping
+ http://www.city-data.com/

In [None]:
# function to pull data from hgraph 
def pull_value(hg, item_pos):
    '''Return the text of the second cell in the first table row of ``hg[item_pos]``.'''
    table = hg[item_pos].find('table')
    first_row = table.find_all('tr')[0]
    cells = first_row.find_all('td')
    return cells[1].text


In [None]:
# Labels assigned, in order, to the values extracted by pull_value().
keys = [
    'med household income',
    'med rent',
    'males',
    'med age males',
    'med age females',
    'avg household size',
    'pct family household',
    'pct married couple',
    'pct families with children',
    'pct single mother',
    'pct never married males > 15',
    'pct never married females > 15',
    'pct not speak English well',
    'pct born in state',
    'pct born in another us state',
    'pct native residents born outside us',
    'pct foreign born residents',
    'avg number of cars houses',
    'avg number of cars apts',
    'pct units mortgage',
]
len(keys)

In [None]:
# Example page: http://www.city-data.com/neighborhood/Albany-Park-Chicago-IL.html
def pull_neighborhood_data(url):
    '''Extract all data for a neighborhood from its page URL.

    Returns a dict holding every "<b>label</b> value" pair found in the
    page's ``content-item`` block, plus one entry per name in the module-level
    ``keys`` list, filled in order from the page's ``hgraph`` blocks (the
    first ``hgraph`` is skipped).

    Raises ValueError when the page lacks the expected ``row`` /
    ``content-item`` structure, and IndexError when it has fewer ``hgraph``
    blocks than there are ``keys``.
    '''
    page = requests.get(url, timeout=30)
    soup = BeautifulSoup(page.text, 'html.parser')
    row_data = soup.find(class_='row')
    item = row_data.find(class_='content-item') if row_data is not None else None
    if item is None:
        raise ValueError(f'Unexpected page layout, no content-item found: {url}')

    # Collect the "<b>label</b> value" pairs.
    dictionary = {}
    for bold in item.find_all('b'):
        try:
            dictionary[bold.text.strip()] = bold.next_sibling.strip()
        except (AttributeError, TypeError):
            # the <b> tag is not followed by plain text; skip it (best effort)
            continue

    hg = row_data.find_all(class_='hgraph')[1:]
    # Only as many values as there are keys are used, so do not index past
    # that: pages with fewer blocks than a hard-coded count would fail.
    values = [pull_value(hg, pos) for pos in range(len(keys))]
    # join both datasets: (labelled pairs) + (hgraph values)
    dictionary.update(zip(keys, values))
    return dictionary

In [None]:
url = 'http://www.city-data.com/neighborhood/Albany-Park-Chicago-IL.html'
pull_neighborhood_data(url)

In [None]:
pull_neighborhood_data('http://www.city-data.com/neighborhood/Brook-Park-Brook-Park-OH.html')

---
# Scrape List of Neighborhoods in Brook Park, OH

In [None]:
url = "http://www.city-data.com/neighborhood/Brook-Park-Brook-Park-OH.html"
page = requests.get(url)
soup = BeautifulSoup(page.text, 'html.parser')

In [None]:
row_data = soup.find(class_='row')
row_data

In [None]:
item = row_data.find(class_='content-item')
item

In [None]:
# extract area and population
d = {}
for i in item.find_all('b'):
    try:
        key = i.text.strip()
        value = i.next_sibling.strip()
        d[key] = value
    except:
        pass
d

In [None]:
# function to pull data from hgraph 
def pull_value(hg, item_pos):
    '''Return the text of the second <td> in the first <tr> of the table inside ``hg[item_pos]``.'''
    block = hg[item_pos]
    rows = block.find('table').find_all('tr')
    second_cell = rows[0].find_all('td')[1]
    return second_cell.text

# test function above on another hgraph
hg = row_data.find_all(class_ = 'hgraph')[1:]
pull_value(hg,2)

In [None]:
# pull all data from the page
keys = ['med household income'
        ,'med rent'
        ,'males'
        ,'med age males'
        ,'med age females'
        ,'avg household size'
        ,'pct family household'
        ,'pct married couple'
        ,'pct families with children'
        ,'pct single mother'
        ,'pct never married males > 15'
        ,'pct never married females > 15'
        ,'pct not speak English well'
        ,'pct born in state'
        ,'pct born in another us state'
       ,'pct native residents born outside us'
       ,'pct foreign born residents'
       ,'avg number of cars houses'
       ,'avg number of cars apts'
       ,'pct units mortgage']
values = [pull_value(hg, pos) for pos in range(0,20)]
dict(zip(keys,values))

In [None]:
# join both datasets: (area,population)+(rest of the data)
z = d.copy()
z.update(dict(zip(keys,values)))
z
# area in sq. miles

In [None]:
# Function to extract all data for a neighborhood from page URL

keys = ['med household income'
        ,'med rent'
        ,'males'
        ,'med age males'
        ,'med age females'
        ,'avg household size'
        ,'pct family household'
        ,'pct married couple'
        ,'pct families with children'
        ,'pct single mother'
        ,'pct never married males > 15'
        ,'pct never married females > 15'
        ,'pct not speak English well'
        ,'pct born in state'
        ,'pct born in another us state'
       ,'pct native residents born outside us'
       ,'pct foreign born residents'
       ,'avg number of cars houses'
       ,'avg number of cars apts'
       ,'pct units mortgage']

def pull_neigh_data(url):
    '''Extract all data for a neighborhood from its page URL.

    Returns a dict with every "<b>label</b> value" pair found in the page's
    ``content-item`` block plus one entry per name in the module-level
    ``keys`` list, taken in order from the page's ``hgraph`` blocks (the
    first ``hgraph`` is skipped).

    Raises ValueError when the page lacks the expected ``row`` /
    ``content-item`` structure.
    '''
    page = requests.get(url, timeout=30)
    soup = BeautifulSoup(page.text, 'html.parser')
    row_data = soup.find(class_='row')
    item = row_data.find(class_='content-item') if row_data is not None else None
    if item is None:
        raise ValueError(f'Unexpected page layout, no content-item found: {url}')

    d = {}
    for bold in item.find_all('b'):
        try:
            d[bold.text.strip()] = bold.next_sibling.strip()
        except (AttributeError, TypeError):
            # the <b> tag is not followed by plain text; skip it (best effort)
            continue

    hg = row_data.find_all(class_ = 'hgraph')[1:]
    values = [pull_value(hg, pos) for pos in range(len(keys))]
    d.update(zip(keys, values))
    return d

# test run on the Brook Park, OH neighborhood page
url = "http://www.city-data.com/neighborhood/Brook-Park-Brook-Park-OH.html"

pull_neigh_data(url)

---
# Scrape List of Neighborhoods in Illinois

In [None]:
# The list of neighborhoods is available at the following url
page_no = 1 # use only the first page for now
url = "http://www.city-data.com/indexes/neighborhoods/IL/%d/"%page_no

page = requests.get(url)
soup = BeautifulSoup(page.text, 'html.parser')

In [None]:
li = soup.find_all('li')
li[100] # show one of the neighborhoods

In [None]:
url_prefix = 'http://www.city-data.com'

# function returns neighborhood name and url for a list item
def get_neigh_url(li_item):
    """Return ``(name, url)`` for one ``<li>`` element of the neighborhood index.

    ``name`` is the list item's text. ``url`` is the href of its first link,
    resolved against ``url_prefix`` so that root-relative, relative and
    absolute hrefs all yield a valid URL.

    Raises
    ------
    ValueError
        If the list item contains no link with an href.
    """
    # local import keeps this cell runnable without editing the imports cell
    from urllib.parse import urljoin

    anchor = li_item.find('a')
    href = anchor.get('href') if anchor is not None else None
    if not href:
        raise ValueError(f'List item has no link: {li_item.text!r}')
    value = urljoin(url_prefix + '/', href)
    key = li_item.text
    return (key,value)
    
get_neigh_url(li[50])

In [None]:
def scrape_page(page_no, state='IL', city='Chicago, IL'):
    '''Scrape neighborhood names and URLs from one page of a state's neighborhood index.

    Arguments:
        page_no: index page number (1-based in the site's URLs).
        state: two-letter state code used in the index URL (defaults to 'IL').
        city: only list items whose text contains this string are kept
            (defaults to 'Chicago, IL').

    Returns a list of (name, url) tuples as produced by get_neigh_url().
    '''
    url = "http://www.city-data.com/indexes/neighborhoods/%s/%d/" % (state, page_no)
    page = requests.get(url, timeout=30)
    soup = BeautifulSoup(page.text, 'html.parser')
    # Use only neighborhoods that mention the requested city in their name
    return [get_neigh_url(li_item)
            for li_item in soup.find_all('li')
            if city in li_item.text]

# Example -- scrape page 2 from the website
# urls_dict = scrape_page(2)

In [None]:
def scrape_page2(page_no):
    '''Collect (name, url) pairs for Brook Park, OH neighborhoods listed on index page ``page_no``.'''
    url = "http://www.city-data.com/indexes/neighborhoods/OH/%d/"%page_no
    soup = BeautifulSoup(requests.get(url).text, 'html.parser')
    # keep only the list items that mention the city
    return [get_neigh_url(entry)
            for entry in soup.find_all('li')
            if 'Brook Park, OH' in entry.text]

# Example -- scrape page 2 from the website
# urls_dict = scrape_page(2)

In [None]:
def scrape_all_pages(n_pages=7, out_path='./data/urls_dict.pickle'):
    '''Scrape index pages 1..n_pages with scrape_page() and pickle the combined result.

    Arguments:
        n_pages: number of index pages to scrape (defaults to 7).
        out_path: where the list of (name, url) tuples is pickled. The default
            matches the path scrape_venues() reads from.

    Returns the combined list of (name, url) tuples.
    '''
    import os  # local import: only needed to create the output directory

    urls_dict = []
    for page_no in range(1, n_pages + 1):
        urls_dict += scrape_page(page_no)
        print(f'Processed page {page_no} of {n_pages}\n')

    # save neighborhood names and urls for later usage
    os.makedirs(os.path.dirname(out_path) or '.', exist_ok=True)
    with open(out_path, 'wb') as f:
        pickle.dump(urls_dict, f)
    return urls_dict

---
# Scrape Socioeconomic Data (production)

+ Now that we have the full list of urls and neighborhood names, we can scrape socioeconomic data for each neighborhood in the list

In [None]:
def scrape_neighborhood(urls, delay=0.0):
    '''Scrape socioeconomic data for every (name, url) pair in ``urls``.

    Arguments:
        urls: iterable of (name, url) tuples.
        delay: seconds to sleep after each successful request (defaults to 0,
            i.e. no waiting).

    Returns a DataFrame with one row per neighborhood that could be scraped;
    the neighborhood name is stored in the 'neighborhood' column.
    Neighborhoods whose page cannot be processed are reported and skipped.
    '''
    import time  # local import: only needed for the optional delay

    records = []
    for name, url in urls:
        try:
            record = pull_neighborhood_data(url)
        except Exception as exc:
            # report why the page failed; Exception (not a bare except) so
            # that Ctrl-C still interrupts a long scrape
            print(f'{name} unable to pull data ({exc!r})')
            continue
        record['neighborhood'] = name
        records.append(record)
        print(f'{name} processed {len(records)}, now waiting...')
        if delay > 0:
            time.sleep(delay)
    df_neighborhoods = pd.DataFrame(records)
    return df_neighborhoods

In [None]:
df_neighborhoods = scrape_neighborhood(scrape_all_pages())

In [None]:
df_neighborhoods.to_csv('./data/neighborhoods.csv')

In [None]:
df_copy = df_neighborhoods.copy()
df_copy.tail()

In [None]:
# add the Brook Park, OH neighborhood to df
brook_park_name = 'Brook Park, Brook Park, OH'
row = pull_neighborhood_data('http://www.city-data.com/neighborhood/Brook-Park-Brook-Park-OH.html')
# the scraped dict has no name; label it so the check below (and later
# name-based processing) can find the row
row['neighborhood'] = brook_park_name
# if the neighborhood has not been added, add it to the df
if not (df_copy['neighborhood'] == brook_park_name).any():
    # pd.concat replaces DataFrame.append, which was removed in pandas 2.0
    df_copy = pd.concat([df_copy, pd.DataFrame([row])], ignore_index=True)

In [None]:
df_copy

In [None]:
df_copy[df_copy['neighborhood'] == 'Brook Park, Brook Park, OH']

In [None]:
df_copy.to_csv('./data/Brook_Park_neighborhoods.csv')

In [None]:
df = pd.read_csv('./data/Brook_Park_neighborhoods.csv',index_col = 0)
df = df.rename(columns = {'neighborhood':'Neighborhood'})
df['Neighborhood'] = df['Neighborhood'].apply(lambda x:x.replace(' neighborhood in',','))
df = df.set_index('Neighborhood')
df.head()

In [None]:
df

---
# Scrape Venues

In [None]:
def scrape_venues():
    '''Geocode every scraped neighborhood name and pickle the coordinates.

    Reads the (name, url) list from './data/urls_dict.pickle', turns each name
    into an address-like string, geocodes it with get_coords(), and writes the
    de-duplicated list of (name, latitude, longitude) tuples to
    './data/coords.pickle' (the path get_nearby_venues() reads from).

    Names that cannot be geocoded are reported and skipped.
    Returns the list of (name, latitude, longitude) tuples.
    '''
    # NOTE: pickle.load executes arbitrary code from the file; only load
    # pickles produced by this notebook.
    with open('./data/urls_dict.pickle','rb') as f:
        urls_dict = pickle.load(f)

    unique_coords = set()  # a set removes duplicates
    for u in urls_dict:
        try:
            name = u[0].replace(' neighborhood in',',') # make name look like an address
            # one geocoding request per name
            _, lat, lng = get_coords(name)
        except Exception as exc:
            print(f'{u[0]!r} could not be geocoded ({exc!r})')
            continue
        unique_coords.add((name, lat, lng))

    coords = list(unique_coords)
    # save coords for later use
    with open('./data/coords.pickle','wb') as f:
        pickle.dump(coords,f)
    return coords

---
# Pull Venues Data from Coords

In [None]:
def get_nearby_venues(scrape_venues, client, radius, limit):
    '''Query Foursquare ``venues.explore`` around each neighborhood.

    Arguments:
        scrape_venues: the (name, latitude, longitude) tuples to query, e.g.
            the return value of scrape_venues(). A callable is called to obtain
            them; None loads them from './data/coords.pickle'.
        client: Foursquare client, e.g. foursquare_api().
        radius: search radius passed to the API for every neighborhood.
        limit: maximum number of venues requested per neighborhood.

    Returns a DataFrame with one row per venue and the columns Neighborhood,
    Neighborhood Latitude, Neighborhood Longitude, Venue, Venue Latitude,
    Venue Longitude, Venue Category. The frame is empty (with the same
    columns) when nothing could be retrieved. Neighborhoods whose request
    fails are reported and skipped.
    '''
    columns = ['Neighborhood',
               'Neighborhood Latitude',
               'Neighborhood Longitude',
               'Venue',
               'Venue Latitude',
               'Venue Longitude',
               'Venue Category']

    if callable(scrape_venues):
        coords = scrape_venues()
    elif scrape_venues is None:
        # NOTE: pickle.load executes arbitrary code from the file; only load
        # pickles produced by this notebook.
        with open('./data/coords.pickle','rb') as f:
            coords = pickle.load(f)
    else:
        coords = scrape_venues

    rows = []
    for name, lat, lng in coords:
        print(name)
        try:
            response = client.venues.explore(params={
                'll': f'{lat},{lng}',
                'limit': limit,      # honor the caller's limit
                'intent': 'browse',
                'radius': radius,    # honor the caller's radius
            })
            items = response['groups'][0]['items']
            # keep only the relevant information for each nearby venue; build
            # the full list first so a malformed item drops the whole
            # neighborhood rather than leaving a partial result
            neighborhood_rows = [(
                name,
                lat,
                lng,
                item['venue']['name'],
                item['venue']['location']['lat'],
                item['venue']['location']['lng'],
                item['venue']['categories'][0]['name'] if item['venue']['categories'] else None,
            ) for item in items]
        except Exception as exc:
            print(f'{name}: unable to pull venues ({exc!r})')
            continue
        rows.extend(neighborhood_rows)

    # Build the frame once, after the loop; passing `columns` keeps the
    # schema even when `rows` is empty.
    nearby_venues = pd.DataFrame(rows, columns=columns)
    return nearby_venues

In [None]:
venues = get_nearby_venues(scrape_venues(), foursquare_api(), radius=500, limit=100)

In [None]:
venues.to_csv('./data/venues.csv')

---
# Data Exploration

In [None]:
venues.groupby('Neighborhood').count().head()

In [None]:
# one-hot encode the venue category (one indicator column per category)
venues_onehot = pd.get_dummies(venues[['Venue Category']], prefix="", prefix_sep="")
# add neighborhood column back to dataframe
venues_onehot['Neighborhood'] = venues['Neighborhood']

# reorder columns: 'Neighborhood' first, the category columns after it in
# their existing order
category_columns = [col for col in venues_onehot.columns if col != 'Neighborhood']
venues_onehot = venues_onehot[['Neighborhood'] + category_columns]


venues_onehot.head()

In [None]:
# average the one-hot columns per neighborhood, i.e. the share of a
# neighborhood's venues that fall in each category; the group key becomes
# the 'Neighborhood' index
venues_grouped = venues_onehot.groupby('Neighborhood').mean()
venues_grouped.head()

---
# Venues Data

In [None]:
venues_grouped.head()

---
# Socioeconomic Data

In [None]:
df_neighborhoods.head()

In [None]:
# cols = ['avg household size',
#         'avg number of cars apts',
#         'avg number of cars houses',
#         'males',
#         'med age females',
#         'med age males',
#         'med household income',
#         'med rent',
#         'pct born in another us state',
#         'pct born in state',
#         'pct families with children',
#         'pct family household',
#         'pct foreign born residents',
#         'pct married couple',
#         'pct native residents born outside us',
#         'pct never married females > 15',
#         'pct never married males > 15',
#         'pct not speak English well',
#         'pct single mother',
#         'pct units mortgage']
# df_neighborhoods = df_neighborhoods[cols]
# df_neighborhoods

In [None]:
# join both datasets on the neighborhood name
# The scraper returns df_neighborhoods with a default integer index and the
# raw name ("X neighborhood in City, ST") in the 'neighborhood' column, while
# venues_grouped is indexed by the address-like name ("X, City, ST").
# Joining them as-is matches nothing, so normalize the name and index by it.
socio = df_neighborhoods
if 'neighborhood' in socio.columns:
    socio = (
        socio
        .assign(Neighborhood=socio['neighborhood'].str.replace(' neighborhood in', ',', regex=False))
        .drop(columns='neighborhood')
        .set_index('Neighborhood')
    )
df_neighs_venues = socio.join(venues_grouped, how = 'inner')
df_neighs_venues = df_neighs_venues.rename(columns = {'Area:':'area','Population:':'population'})
df_neighs_venues.to_csv('./data/neighs_venues.csv')
df_neighs_venues.head()
df_neighs_venues.shape

---
# Methodology
---
## Clean & Prepare Data

In [None]:
import pandas as pd
df = pd.read_csv('./data/neighs_venues.csv',index_col = 0)
df = df.rename(columns = {'Area:':'area','Population:':'population'})
df.shape
df.head()

In [None]:
# Remove any neighborhood that may have NaN's
df.dropna(inplace = True, axis = 0)

In [None]:
# Remove ' people' from household field and convert to float
def household_process(item):
    """Convert a household-size value such as ``'2.5 people'`` to a float.

    Numbers are returned as floats unchanged in value. Strings are parsed
    from their first whitespace-separated token, so the result does not
    depend on the exact length of the unit suffix. Anything that cannot be
    parsed (empty string, None, non-numeric text) yields 0.
    """
    if isinstance(item, (int, float)):
        return float(item)
    try:
        return float(str(item).split()[0])
    except (IndexError, ValueError):
        # IndexError: empty/blank string; ValueError: non-numeric text
        return 0

df['avg household size'] = df['avg household size'].apply(household_process).astype(float)
df.loc[df['avg household size'] == 0,'avg household size'] = 1 # make household size = 1

In [None]:
# Remove ' years' from fields
def remove_years(item):
    """Convert an age value such as ``'35.2 years'`` to a float.

    Numbers are returned as floats unchanged in value. Strings are parsed
    from their first whitespace-separated token, so the result does not
    depend on the exact length of the unit suffix. Anything that cannot be
    parsed (empty string, None, non-numeric text) yields 0.
    """
    if isinstance(item, (int, float)):
        return float(item)
    try:
        return float(str(item).split()[0])
    except (IndexError, ValueError):
        # IndexError: empty/blank string; ValueError: non-numeric text
        return 0

for f in ['med age males','med age females']:
    df[f] = df[f].apply(remove_years).astype(float)

In [None]:
# Remove $ sign
def remove_dollar(item):
    """Convert a money value such as ``'$50,000'`` to a float.

    Numbers are returned as floats unchanged in value. For strings, a leading
    ``$`` and thousands separators are removed before parsing; a value
    without the ``$`` (e.g. ``'50,000'``) is parsed as well instead of losing
    its first digit. Anything that cannot be parsed yields 0.
    """
    if isinstance(item, (int, float)):
        return float(item)
    try:
        cleaned = str(item).strip().lstrip('$').replace(',', '')
        return float(cleaned)
    except ValueError:
        return 0
for f in ['med household income','med rent']:
    df[f] = df[f].apply(remove_dollar)

In [None]:
# Remove % symbols from fields
def remove_percent(item):
    """Convert a percentage value such as ``'12.5%'`` to a float.

    Numbers are returned as floats unchanged in value. For strings,
    surrounding whitespace and a trailing ``%`` are removed before parsing;
    only the percent sign is stripped, never an arbitrary last character.
    Anything that cannot be parsed yields 0.
    """
    if isinstance(item, (int, float)):
        return float(item)
    try:
        return float(str(item).strip().rstrip('%'))
    except ValueError:
        return 0
    
# every column that holds a percentage, each listed once
percent_fields = [
    'pct born in another us state',
    'pct born in state',
    'pct families with children',
    'pct family household',
    'pct foreign born residents',
    'pct married couple',
    'pct native residents born outside us',
    'pct never married females > 15',
    'pct never married males > 15',
    'pct not speak English well',
    'pct units mortgage',
    'pct single mother',
]
for f in percent_fields:
    df[f] = df[f].apply(remove_percent)

In [None]:
# Remove ',' from population field and convert it to int.
# Going through str first makes the cell safe to re-run once the column is
# already numeric (the previous form called .split on ints and failed).
df['population'] = df['population'].astype(str).str.replace(',', '', regex=False).astype(int)

In [None]:
# Try to process avg number of cars
def try_to_convert(item):
    """Return ``float(item)``, or None when ``item`` cannot be converted."""
    try:
        return float(item)
    except (TypeError, ValueError, OverflowError):
        # only conversion failures are treated as "missing"; other errors
        # (e.g. KeyboardInterrupt) propagate
        return None
df['avg number of cars apts'] = df['avg number of cars apts'].apply(try_to_convert)
df['avg number of cars houses'] = df['avg number of cars houses'].apply(try_to_convert)

In [None]:
# Try to convert males field to a number
df['males'] = df['males'].apply(try_to_convert)

In [None]:
# Take another look at the data
df.head()

---
### Fill NaNs with means

Several fields ended up having NaN's, where data could not be converted to numeric format:
* Avg number of cars -- fill with avg across the dataset
* Males -- fill with population/2

In [None]:
field = 'avg number of cars apts'
# assign the result back: an in-place fillna on the selected column may act
# on a temporary copy and leave df unchanged
df[field] = df[field].fillna(df[field].mean())

In [None]:
field = 'avg number of cars houses'
# assign the result back: an in-place fillna on the selected column may act
# on a temporary copy and leave df unchanged
df[field] = df[field].fillna(df[field].mean())

In [None]:
# fill only the rows where 'males' itself is missing; a mask on "any null in
# the row" would also overwrite valid male counts
missing_males = df['males'].isnull()
df.loc[missing_males, 'males'] = df.loc[missing_males, 'population'] / 2

In [None]:
# write to ./data/ so that the next section, which reads
# './data/neighborhoods_clean.csv', picks up this file
df.to_csv('./data/neighborhoods_clean.csv')

---
### Remove columns with sparse data

In [None]:
df = pd.read_csv('./data/neighborhoods_clean.csv',index_col = 0)
df.head()

In [None]:
# Keep the socioeconomic columns plus the ten most common venue categories.
# NOTE(review): assumes the first 22 columns are the socioeconomic fields and
# everything after them is a venue category — confirm against the CSV.
socioeconomic_cols = list(df.columns[:22])
# sort_values takes keyword-only arguments in pandas >= 2.0, so no positional axis
venues_cols = list(df.iloc[:,22:].sum().sort_values(ascending = False).index[:10]) # ten most common venues
cols_to_keep = socioeconomic_cols + venues_cols
cols_to_keep

In [None]:
df_dropped = df[cols_to_keep]
df_dropped.to_csv('./data/chicago_neighborhoods_top_10')
df_dropped.head()

## Neighborhood Area

In [None]:
df['area'].sort_values(ascending=False)

In [None]:
df.area.describe()

In [None]:
df.area.plot(kind='hist', bins = 100, figsize = (5,3))
plt.title('Neighborhood Areas')
plt.xlabel('Area, sq. miles')
plt.ylabel('Frequency')

In [None]:
# zoom in on areas between 0 and 1; same data and unit as the previous plot
df.area.plot(kind='hist', bins = 300, figsize = (5,3))
plt.title('Neighborhood Areas')
plt.xlabel('Area, sq. miles')
plt.ylabel('Frequency')
plt.xlim([0,1])

+ It appears that area field has outliers. Let's explore it further via boxplot

In [None]:
df.area.plot(kind='box',figsize = (10,3), vert = False)

+ There is something going on here. Assuming that one house takes approximately 0.5 acres, let's discard neighborhoods smaller than the area of 4 houses (2 acres = 2/640 sq. miles):

In [None]:
df_areas = df[df.area >= 2.0/640]

+ Also, the neighborhood with the largest area is O'Hare in Chicago:

In [None]:
df_areas[df_areas.area > 10]

In [None]:
df_areas.area.plot(kind='box', figsize = (10,3), vert = False)
# plt.xlim([0,0.5])

In [None]:
df_areas[df_areas.area > 0.1].area.plot(kind='box', vert = False)

+ The O'Hare neighborhood is the top outlier, and the majority of neighborhoods in Chicago have a much smaller area. One possible reason is that neighborhoods in the dataset are 'split', i.e. multiple rows represent the same neighborhood. Area will be excluded from further analysis.

---
### Average household size

In [None]:
df['avg household size'].plot(kind='hist', bins = 50, figsize = (10,5))
plt.xlabel('Average household size')
plt.title('Average household size')

In [None]:
df[df['avg household size'] > 10]

+ Neighborhoods with more than 10 people on average per household do not appear legit. Let's clean them

In [None]:
df_household = df[(df['avg household size'] < 10)]
df_household.shape
df_household

In [None]:
df_household['avg household size'].plot(kind='hist', bins = 50)

---
### Average number of cars in apartments and houses

In [None]:
# average of the apartment and house figures (the apartment column was
# previously added to itself); assign() returns a new frame, which avoids
# writing into a filtered slice of df
df_household = df_household.assign(**{
    'avg number of cars': (df_household['avg number of cars apts']
                           + df_household['avg number of cars houses']) / 2.0
})

In [None]:
df_household['avg number of cars'].plot(kind='hist', bins = 50)

In [None]:
df_cars = df_household[df_household['avg number of cars'] < 5]
df_cars['avg number of cars'].plot(kind='hist', bins = 30)

---
### Median Age

In [None]:
# df_cars is a filtered slice; assign() builds a new frame instead of writing
# into the slice (which triggers SettingWithCopyWarning)
df_cars = df_cars.assign(**{
    'med age': (df_cars['med age females'] + df_cars['med age males']) / 2.0
})

In [None]:
df_cars['med age'].plot(kind='hist', bins = 20)

+ A few neighborhoods appear to have suspiciously young residents. Perhaps in some neighborhoods, the number of children is higher than the number of adults?

---
### Median Household income and rent

In [None]:
df_cars['med household income'].plot(kind='hist', bins = 20)
plt.xlabel('Income')
plt.title('Median Household Income')

In [None]:
df_cars['med rent'].plot(kind='hist', bins = 20)
plt.xlabel('Rent')
plt.title('Median Rent')

---
### Families with children, family household, married couples

In [None]:
df_cars['pct families with children'].plot(kind='hist', bins = 10)
plt.xlabel('percent')
plt.title('Percent families with children')

In [None]:
df_cars['pct family household'].plot(kind='hist', bins = 20)
plt.xlabel('percent')
plt.title('Percent family household')

In [None]:
df_cars['pct married couple'].plot(kind='hist', bins = 10)
plt.xlabel('percent')
plt.title('Percent married couple')

+ Percents should not exceed 100

In [None]:
df_cars['pct never married females > 15'].plot(kind='hist', bins = 15)
plt.xlabel('percent')
plt.title('Percent never married females older than 15')

In [None]:
df_cars['pct never married males > 15'].plot(kind='hist', bins = 15)
plt.xlabel('percent')
plt.title('Percent never married males older than 15')

---
### Percent residents born in other states or outside the U.S.

In [None]:
df_cars['pct born in another us state'].plot(kind='hist', bins = 15)
plt.xlabel('percent')
plt.title('Percent born in another U.S. state')

In [None]:
df_cars['pct born in state'].plot(kind='hist', bins = 15)
plt.xlabel('percent')
plt.title('Percent born in IL')

In [None]:
df_cars['pct foreign born residents'].plot(kind='hist', bins = 10)
plt.xlabel('percent')
plt.title('Percent foreign born residents')

In [None]:
df_cars['pct native residents born outside us'].plot(kind='hist', bins = 10)
plt.xlabel('percent')
plt.title('Percent native residents born outside of the U.S.')

In [None]:
df_cars['pct not speak English well'].plot(kind='hist', bins = 15)
plt.xlabel('percent')
plt.title('Percent residents who do not speak English well')

In [None]:
df_cars['pct single mother'].plot(kind='hist', bins = 15)
plt.xlabel('percent')
plt.title('Percent single mother households')

In [None]:
df_cars['pct units mortgage'].plot(kind='hist', bins = 15)
plt.xlabel('percent')
plt.title('Percent units with mortgage')

In [None]:
df_clean = df_cars[df_cars['pct units mortgage'] < 100]
df_clean.shape

In [None]:
# Save for further use
df_clean.to_csv('./data/chicago_neighborhoods_clean.csv')

---
# Clustering

In [None]:
df = pd.read_csv('./data/chicago_neighborhoods_clean.csv', index_col = 0)
df.head()