# PART 2 : New York Neighborhood and Population Data Web Scrapping

In [94]:
import numpy as np # library to handle data in a vectorized manner
import pandas as pd
import requests
import json
import requests
from bs4 import BeautifulSoup
from pandas.io.json import json_normalize # tranform JSON file into a pandas dataframe

print('Libraries imported.')

Libraries imported.


### This notebook will fetch New York neighborhood data and population data of each neighborhood from Wikepedia using BeautifulSoup.

## NYC Neighborhood Data

The json file 'newyork_data.json'  is dowloaded from [New York City Neighborhoods Names](https://geo.nyu.edu/catalog/nyu_2451_34572).    
It contains the information we need for this project: Borough, Neighborhood, Latitude and Longitude.

In [6]:
with open('newyork_data.json') as json_data:
    newyork_data = json.load(json_data)

In [11]:
newyork_data

{'type': 'FeatureCollection',
 'totalFeatures': 306,
 'features': [{'type': 'Feature',
   'id': 'nyu_2451_34572.1',
   'geometry': {'type': 'Point',
    'coordinates': [-73.84720052054902, 40.89470517661]},
   'geometry_name': 'geom',
   'properties': {'name': 'Wakefield',
    'stacked': 1,
    'annoline1': 'Wakefield',
    'annoline2': None,
    'annoline3': None,
    'annoangle': 0.0,
    'borough': 'Bronx',
    'bbox': [-73.84720052054902,
     40.89470517661,
     -73.84720052054902,
     40.89470517661]}},
  {'type': 'Feature',
   'id': 'nyu_2451_34572.2',
   'geometry': {'type': 'Point',
    'coordinates': [-73.82993910812398, 40.87429419303012]},
   'geometry_name': 'geom',
   'properties': {'name': 'Co-op City',
    'stacked': 2,
    'annoline1': 'Co-op',
    'annoline2': 'City',
    'annoline3': None,
    'annoangle': 0.0,
    'borough': 'Bronx',
    'bbox': [-73.82993910812398,
     40.87429419303012,
     -73.82993910812398,
     40.87429419303012]}},
  {'type': 'Feature',
 

In [14]:
neighborhoods_data = newyork_data['features']

In [15]:
neighborhoods_data[0]

{'type': 'Feature',
 'id': 'nyu_2451_34572.1',
 'geometry': {'type': 'Point',
  'coordinates': [-73.84720052054902, 40.89470517661]},
 'geometry_name': 'geom',
 'properties': {'name': 'Wakefield',
  'stacked': 1,
  'annoline1': 'Wakefield',
  'annoline2': None,
  'annoline3': None,
  'annoangle': 0.0,
  'borough': 'Bronx',
  'bbox': [-73.84720052054902,
   40.89470517661,
   -73.84720052054902,
   40.89470517661]}}

### Transform json into DataFrame

In [13]:
 # instantiate a dataframe
neighborhoods = pd.DataFrame(columns = ['Borough', 'Neighborhood', 'Latitude', 'Longitude'])

In [16]:
for data in neighborhoods_data:
    borough = data['properties']['borough'] 
    neighborhood_name = data['properties']['name']
        
    neighborhood_lat = data['geometry']['coordinates'][1]
    neighborhood_lon = data['geometry']['coordinates'][0]
    
    neighborhoods = neighborhoods.append({'Borough': borough,
                                          'Neighborhood': neighborhood_name,
                                          'Latitude': neighborhood_lat,
                                          'Longitude': neighborhood_lon}, ignore_index=True)

In [223]:
neighborhoods

Unnamed: 0,Borough,Neighborhood,Latitude,Longitude
0,Bronx,Wakefield,40.894705,-73.847201
1,Bronx,Co-op City,40.874294,-73.829939
2,Bronx,Eastchester,40.887556,-73.827806
3,Bronx,Fieldston,40.895437,-73.905643
4,Bronx,Riverdale,40.890834,-73.912585
5,Bronx,Kingsbridge,40.881687,-73.902818
6,Manhattan,Marble Hill,40.876551,-73.910660
7,Bronx,Woodlawn,40.898273,-73.867315
8,Bronx,Norwood,40.877224,-73.879391
9,Bronx,Williamsbridge,40.881039,-73.857446


In [18]:
print('The dataframe has {} boroughs and {} neighborhoods.'.format(
        len(neighborhoods['Borough'].unique()),
        neighborhoods.shape[0]
    )
)

The dataframe has 5 boroughs and 306 neighborhoods.


## Scrape Population Data per Neighborhood

### Use BeautifulSoup to fetech New York population per neighborhood

In [132]:
wiki_link = "https://en.wikipedia.org/wiki/Neighborhoods_in_New_York_City"
page = requests.get(wiki_link)
soup = BeautifulSoup(page.text, 'html.parser')

In [236]:
def get_wiki_neighborhood_df():
    neighborhood_df = pd.DataFrame(columns = ['Neighborhood','Anchor'])
    
    neighbor_list = []
    neighbor_anchor = []
    for i in range(1,60):
        for element in soup.select("table.wikitable tr")[i].findAll('a')[1:]:
            neighbor_list.append(element.text)
            neighbor_anchor.append(element['href'])
            
    neighborhood_df['Neighborhood'] = neighbor_list
    neighborhood_df['Anchor'] = neighbor_anchor
    neighborhood_df['Population'] = np.zeros(len(neighbor_list))
    
    return neighborhood_df

In [237]:
neighborhood_df = get_wiki_neighborhood_df()
neighborhood_df

Unnamed: 0,Neighborhood,Anchor,Population
0,Melrose,"/wiki/Melrose,_Bronx",0.0
1,Mott Haven,"/wiki/Mott_Haven,_Bronx",0.0
2,Port Morris,"/wiki/Port_Morris,_Bronx",0.0
3,Hunts Point,"/wiki/Hunts_Point,_Bronx",0.0
4,Longwood,"/wiki/Longwood,_Bronx",0.0
5,Claremont,"/wiki/Claremont,_Bronx",0.0
6,Concourse Village,"/wiki/Concourse_Village,_Bronx",0.0
7,Crotona Park,"/wiki/Crotona_Park,_Bronx",0.0
8,Morrisania,"/wiki/Morrisania,_Bronx",0.0
9,Concourse,"/wiki/Concourse,_Bronx",0.0


### Continue to fetch each neighborhood's population from its WIKI page

**Try to extract the population of 'Melrose' in Bronx from its wiki page**

In [200]:
neighborhood_page = requests.get('https://en.wikipedia.org//wiki/Melrose,_Bronx')
soup2 = BeautifulSoup(neighborhood_page.text, 'html.parser')
table = soup2.select("table.infobox tr")

for i in range(len(table)):
    try:
        if 'Population' in table[i].find('th').text:
            print ("Melrose's population:")
            print (table[i+1].find('td').text)
        else:
            None
    except:
        continue    

Melrose's population:
24,913


In [240]:
def get_neighborhood_pop(neighborhood_df):
    """
    Function that replace the population value of each neighborhood with values scrapped from Wikipedia
    Input: A dataframe with 3 columns: neighborhood name, anchor tag in wikipedia and population
    Output: A dataframe with
    """
    wiki_search = "https://en.wikipedia.org"

    for i, anchor in enumerate(neighborhood_df['Anchor']):
        neighborhood_page = requests.get(wiki_search+anchor)
        soup2 = BeautifulSoup(neighborhood_page.text, 'html.parser')
        table = soup2.select("table.infobox tr")
    
        for j in range(len(table)):
            try:
                if 'Population' in table[j].find('th').text: 
                    neighborhood_df['Population'][i] = int(table[j+1].find('td').text.replace(',', ''))
            except:
                pass 
    
    neighborhood_df = neighborhood_df[neighborhood_df['Population'] != 0]
    
    return neighborhood_df

In [241]:
neighborhood_pop = get_neighborhood_pop(neighborhood_df)
neighborhood_pop

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy


Unnamed: 0,Neighborhood,Anchor,Population
0,Melrose,"/wiki/Melrose,_Bronx",24913.0
1,Mott Haven,"/wiki/Mott_Haven,_Bronx",52413.0
2,Port Morris,"/wiki/Port_Morris,_Bronx",3523.0
3,Hunts Point,"/wiki/Hunts_Point,_Bronx",12281.0
4,Longwood,"/wiki/Longwood,_Bronx",26196.0
5,Claremont,"/wiki/Claremont,_Bronx",16863.0
8,Morrisania,"/wiki/Morrisania,_Bronx",16863.0
10,Highbridge,"/wiki/Highbridge,_Bronx",37727.0
11,Fordham,"/wiki/Fordham,_Bronx",43394.0
12,Morris Heights,"/wiki/Morris_Heights,_Bronx",36779.0


## Combine Borough, Neighborhoods and Popolation

In [None]:
new_york_data = neighborhoods.merge(neighborhood_pop, how = 'inner', on = 'Neighborhood').drop('Anchor', axis = 1)

In [247]:
new_york_data.head()

Unnamed: 0,Borough,Neighborhood,Latitude,Longitude,Population
0,Bronx,Wakefield,40.894705,-73.847201,29158.0
1,Bronx,Co-op City,40.874294,-73.829939,43752.0
2,Bronx,Fieldston,40.895437,-73.905643,3292.0
3,Bronx,Riverdale,40.890834,-73.912585,48049.0
4,Bronx,Kingsbridge,40.881687,-73.902818,10669.0


In [246]:
new_york_data.to_csv('data_output/new_york_data.csv', index = False )