# Coursera Capstone Project

## Comparing Chicago and New York City

In [1]:
import numpy as np # library to handle data in a vectorized manner

import pandas as pd # library for data analsysis
pd.set_option('display.max_columns', None)
pd.set_option('display.max_rows', None)

import json # library to handle JSON files

#!conda install -c conda-forge geopy --yes # uncomment this line if you haven't completed the Foursquare API lab
from geopy.geocoders import Nominatim # convert an address into latitude and longitude values

import requests # library to handle requests
from pandas.io.json import json_normalize # tranform JSON file into a pandas dataframe

# Matplotlib and associated plotting modules
import matplotlib.cm as cm
import matplotlib.colors as colors

# import k-means from clustering stage
from sklearn.cluster import KMeans

#!conda install -c conda-forge folium=0.5.0 --yes # uncomment this line if you haven't completed the Foursquare API lab
import folium # map rendering library

import lxml.html as lf

from bs4 import BeautifulSoup

print('Libraries imported.')

Libraries imported.


Get the data for New York and put it into a dataframe.

In [2]:
!wget -q -O 'newyork_data.json' https://cocl.us/new_york_dataset
print('Data downloaded!')

Data downloaded!


In [5]:
#Load the json data
with open('newyork_data.json') as json_data:
    newyork_data = json.load(json_data)

#put it in an array    
ny_neighborhoods_data = newyork_data['features']

# define the dataframe columns
column_names = ['Borough', 'Neighborhood', 'Latitude', 'Longitude'] 

# instantiate the dataframe
ny_neighborhoods = pd.DataFrame(columns=column_names)

#Loop through the array
for data in neighborhoods_data:
    borough = neighborhood_name = data['properties']['borough'] 
    neighborhood_name = data['properties']['name']
        
    neighborhood_latlon = data['geometry']['coordinates']
    neighborhood_lat = neighborhood_latlon[1]
    neighborhood_lon = neighborhood_latlon[0]
    
    ny_neighborhoods = neighborhoods.append({'Borough': borough,
                                          'Neighborhood': neighborhood_name,
                                          'Latitude': neighborhood_lat,
                                          'Longitude': neighborhood_lon}, ignore_index=True)

#Let's see the dataframe
ny_neighborhoods.head()

Unnamed: 0,Borough,Neighborhood,Latitude,Longitude
0,Bronx,Wakefield,40.894705,-73.847201
1,Bronx,Co-op City,40.874294,-73.829939
2,Bronx,Eastchester,40.887556,-73.827806
3,Bronx,Fieldston,40.895437,-73.905643
4,Bronx,Riverdale,40.890834,-73.912585


Make sure the dataframe is the right size. It should have 5 boroughs and 306 neighborhoods.

In [6]:
print('The dataframe has {} boroughs and {} neighborhoods.'.format(
        len(neighborhoods['Borough'].unique()),
        neighborhoods.shape[0]
    )
)

The dataframe has 5 boroughs and 306 neighborhoods.


Visualize the New York neighborhoods.

In [8]:
address = 'New York City, NY'

geolocator = Nominatim(user_agent="ny_explorer")
location = geolocator.geocode(address)
ny_latitude = location.latitude
ny_longitude = location.longitude

# create map of New York using latitude and longitude values
map_newyork = folium.Map(location=[ny_latitude, ny_longitude], zoom_start=10)

# add markers to map
for lat, lng, borough, neighborhood in zip(neighborhoods['Latitude'], neighborhoods['Longitude'], neighborhoods['Borough'], neighborhoods['Neighborhood']):
    label = '{}, {}'.format(neighborhood, borough)
    label = folium.Popup(label, parse_html=True)
    folium.CircleMarker(
        [lat, lng],
        radius=5,
        popup=label,
        color='blue',
        fill=True,
        fill_color='#3186cc',
        fill_opacity=0.7,
        parse_html=False).add_to(map_newyork)  
    
map_newyork

## Now scrape the Chicago data.

In [34]:
#The website we are scraping
url=requests.get('https://en.wikipedia.org/wiki/List_of_neighborhoods_in_Chicago').text

#Load the website
soup = BeautifulSoup(url,'lxml')

#Find the table
My_table = soup.find('table',{'class':'wikitable sortable'})

#Get it into a CSV
table1=""
for tr in My_table.find_all('tr'):
    row1=""
    tds = tr.find_next('td').get_text(strip=True)
    row1=row1+","+tds
    tds = tr.find_next('td').get_text(strip=True)
    row1=row1+","+tds
    row1=row1+'\n'
    table1=table1+row1[1:]

#Load it as a CSV
file=open("chicago.csv","wb")
#file.write(bytes(headers,encoding="ascii",errors="ignore"))
file.write(bytes(table1,encoding="ascii",errors="ignore"))

print("File Written!")

File Written!


In [36]:
#Put it into a dataframe
chicago_df = pd.read_csv('chicago.csv', header=None)
chicago_df.columns=["Neighborhood","Community area"]
chicago_df.drop_duplicates(subset=None, keep='first', inplace=True)
chicago_df.reset_index(inplace=True, drop=True)
chicago_df

Unnamed: 0,Neighborhood,Community area
0,Albany Park,Albany Park
1,Altgeld Gardens,Altgeld Gardens
2,Andersonville,Andersonville
3,Archer Heights,Archer Heights
4,Armour Square,Armour Square
5,Ashburn,Ashburn
6,Ashburn Estates,Ashburn Estates
7,Auburn Gresham,Auburn Gresham
8,Avalon Park,Avalon Park
9,Avondale,Avondale


In [39]:
chicago_df.shape

(246, 2)

I found a dataset that contains the longitude and latitude of the neighborhoods in Chicago. The dataset needs a little bit of cleaning because it contains duplications.

In [62]:
chicago_lat_lon = pd.read_csv('community_to_gps.txt', sep='\t', header = 0)
chicago_lat_lon.head()

Unnamed: 0,Community Area,Community Name,Latitude,Longitude
0,1,Rogers Park,42.003801,-87.657651
1,1,Rogers Park,42.002439,-87.657809
2,1,Rogers Park,41.99839,-87.657676
3,1,Rogers Park,42.009069,-87.661341
4,1,Rogers Park,42.00568,-87.660129


Clean up all the duplications.

In [65]:
chicago_lat_lon.drop_duplicates('Community Name', inplace=True)
chicago_lat_lon.drop(columns = 'Community Area', inplace=True)
chicago_lat_lon.reset_index(inplace=True, drop=True)
chicago_lat_lon.head()

Unnamed: 0,Community Name,Latitude,Longitude
0,Rogers Park,42.003801,-87.657651
1,West Ridge,41.992071,-87.675487
2,Uptown,41.961609,-87.65537
3,Lincoln Square,41.968913,-87.674871
4,North Center,41.954342,-87.674439
