# This project clusters neighborhoods of San Francisco and Houston. Offer a initial comparsion between two cities.

In [1]:
import pandas as pd
import numpy as np
from bs4 import BeautifulSoup
import requests
print("Libraries imported")

Libraries imported


### Test geogy.
### Cities are choosen in the United States since geogy can't extract latitude and longitude outside U.S.

In [2]:
from geopy.geocoders import Nominatim
from geopy.exc import GeocoderTimedOut

In [3]:
geolocator = Nominatim(user_agent = "Lat_Lng_Neighbors")
def getLat_Lng(neighborhood, city):
    try:
        location = geolocator.geocode("{}, {}".format(neighborhood, city))
        return location
    except:
        getLat_Lng(neighborhood, city)


In [4]:
print(getLat_Lng("Alamo Square","San Francisco"))

Alamo Square, Western Addition, SF, California, USA


In [5]:
print(getLat_Lng("Willowbrook","Houston"))

Willowbrook, Houston, Harris County, Texas, USA


## 1. Data Collection

### Extract texts contains neighborhoods of two cities from websites.

Extract data for San Francisco.

In [6]:
data_sf = requests.get("https://en.wikipedia.org/wiki/List_of_neighborhoods_in_San_Francisco").text
soup_sf = BeautifulSoup(data_sf, 'lxml')

Extract data for Houston.

In [7]:
data_hou = requests.get("https://en.wikipedia.org/wiki/List_of_Houston_neighborhoods").text
soup_hou = BeautifulSoup(data_hou, 'lxml')

## 2.Data Preprocess

### 2.1 Data Cleaning

Store only neighborhoods of San Francisco.

In [8]:
sf_n = []
for content in soup_sf.find_all('span', class_ = "mw-headline"):
    sf_n.append(content.text)
print(len(sf_n))
print(sf_n)

123
['Alamo Square', 'Anza Vista', 'Ashbury Heights', 'Balboa Park', 'Balboa Terrace', 'Bayview', 'Belden Place', 'Bernal Heights', 'Buena Vista', 'Butchertown (Old and New)', 'Castro', 'Cathedral Hill', 'Cayuga Terrace', 'China Basin', 'Chinatown', 'Civic Center', 'Clarendon Heights', 'Cole Valley', 'Corona Heights', 'Cow Hollow', 'Crocker-Amazon', 'Design District', 'Diamond Heights', 'Dogpatch', 'Dolores Heights', 'Duboce Triangle', 'Embarcadero', 'Eureka Valley', 'Excelsior', 'Fillmore', 'Financial District', 'Financial District South', "Fisherman's Wharf", 'Forest Hill', 'Forest Knolls', 'Glen Park', 'Golden Gate Heights', 'Haight-Ashbury', 'Hayes Valley', 'Hunters Point', 'India Basin', 'Ingleside', 'Ingleside Terraces', 'Inner Sunset', 'Irish Hill', 'Islais Creek', 'Jackson Square', 'Japantown', 'Jordan Park', 'Laguna Honda', 'Lake Street', 'Lakeside', 'Lakeshore', 'Laurel Heights', 'Lincoln Manor', 'Little Hollywood', 'Little Russia', 'Little Saigon', 'Lone Mountain', 'Lower Ha

Store only neighborhoods of Houston.

In [9]:
hou_n = []
for content in soup_hou.find('table', class_ = "wikitable").find_all('td'):
    hou_n.append(content.text.split('\n')[0])
index = np.linspace(1,349,88,dtype = int)
hou_new = []
for index in index:
    hou_new.append(hou_n[index].split('/')[0])  
print(len(hou_new))
print(hou_new)

88
['Willowbrook', 'Greater Greenspoint', 'Carverdale', 'Fairbanks ', 'Greater Inwood', 'Acres Home', 'Hidden Valley', 'Westbranch', 'Addicks ', 'Spring Branch West', 'Langwood', 'Central Northwest (formerly Near Northwest)', 'Independence Heights', 'Lazybrook ', 'Greater Heights', 'Memorial', 'Eldridge ', 'Briar Forest', 'Westchase', 'Mid-West (formerly Woodlake', 'Greater Uptown', 'Washington Avenue Coalition ', 'Afton Oaks ', 'Neartown ', 'Alief', 'Sharpstown', 'Gulfton', 'University Place', 'Westwood', 'Braeburn', 'Meyerland', 'Braeswood', 'Medical Center', 'Astrodome Area', 'South Main', 'Brays Oaks (formerly Greater Fondren S.W.)', 'Westbury', 'Willow Meadows ', 'Fondren Gardens', 'Central Southwest', 'Fort Bend ', 'IAH Airport', 'Kingwood', 'Lake Houston', 'Northside ', 'Eastex - Jensen', 'East Little York ', 'Trinity ', 'East Houston', 'Settegast', 'Northside Village', 'Kashmere Gardens', 'El Dorado ', 'Hunterwood', 'Greater Fifth Ward', 'Denver Harbor ', 'Pleasantville Area', 

## There are total 211 neighborhoods of both cities.

Add latitude and longitude to each neighborhood.

In [10]:
df_sf = pd.DataFrame(columns = ["Neighborhood", "Latitude", "Longitude"])
for index, neighborhood in enumerate(sf_n):
    location = getLat_Lng(neighborhood, "San Francisco")
    if not isinstance(location, type(None)):
        df_sf = df_sf.append({"Neighborhood": neighborhood,
                             "Latitude": location.latitude,
                             "Longitude": location.longitude}, ignore_index = True)

In [11]:
print(df_sf.shape)
df_sf.head()

(83, 3)


Unnamed: 0,Neighborhood,Latitude,Longitude
0,Anza Vista,37.780836,-122.443149
1,Balboa Park,37.724949,-122.444805
2,Balboa Terrace,-38.730438,-62.233556
3,Bayview,37.728889,-122.3925
4,Belden Place,37.791744,-122.403886


In [12]:
df_sf

Unnamed: 0,Neighborhood,Latitude,Longitude
0,Anza Vista,37.780836,-122.443149
1,Balboa Park,37.724949,-122.444805
2,Balboa Terrace,-38.730438,-62.233556
3,Bayview,37.728889,-122.392500
4,Belden Place,37.791744,-122.403886
5,Bernal Heights,37.741001,-122.414214
6,Buena Vista,20.524568,-100.402780
7,Castro,9.690412,-84.113754
8,China Basin,37.776330,-122.391839
9,Chinatown,52.375207,4.900939


In [13]:
df_sf_new = df_sf[(df_sf['Latitude'] < 38) & (df_sf['Latitude'] > 37) &
              (df_sf['Longitude'] > -123) &(df_sf['Longitude'] < -122)]

### Remove incorrect latitude and longitude of neighborhoods.

In [14]:
print(df_sf_new.shape)
df_sf_new

(70, 3)


Unnamed: 0,Neighborhood,Latitude,Longitude
0,Anza Vista,37.780836,-122.443149
1,Balboa Park,37.724949,-122.444805
3,Bayview,37.728889,-122.392500
4,Belden Place,37.791744,-122.403886
5,Bernal Heights,37.741001,-122.414214
8,China Basin,37.776330,-122.391839
10,Civic Center,37.779594,-122.416794
11,Cole Valley,37.765813,-122.449962
12,Corona Heights,37.764886,-122.439368
13,Cow Hollow,37.797262,-122.436248


In [15]:
df_hou = pd.DataFrame(columns = ["Neighborhood", "Latitude", "Longitude"])
for index, neighborhood in enumerate(hou_new):
    location = getLat_Lng(neighborhood, "Houston")
    if not isinstance(location, type(None)):
        df_hou = df_hou.append({"Neighborhood": neighborhood,
                             "Latitude": location.latitude,
                             "Longitude": location.longitude}, ignore_index = True)

In [16]:
print(df_hou.shape)
df_hou.head()

(68, 3)


Unnamed: 0,Neighborhood,Latitude,Longitude
0,Willowbrook,29.660254,-95.456096
1,Greater Greenspoint,29.944719,-95.416074
2,Carverdale,29.848687,-95.53945
3,Fairbanks,29.852726,-95.524386
4,Acres Home,32.636256,-83.692962


In [17]:
df_hou

Unnamed: 0,Neighborhood,Latitude,Longitude
0,Willowbrook,29.660254,-95.456096
1,Greater Greenspoint,29.944719,-95.416074
2,Carverdale,29.848687,-95.539450
3,Fairbanks,29.852726,-95.524386
4,Acres Home,32.636256,-83.692962
5,Hidden Valley,39.489543,-119.752991
6,Westbranch,29.839541,-95.551522
7,Addicks,29.782451,-95.642446
8,Spring Branch West,29.801410,-95.547879
9,Langwood,29.826282,-95.482795


### Remove incorrect latitude and longitude of neighborhoods.

In [20]:
df_hou_new = df_hou[(df_hou['Latitude'] < 30.5) & (df_hou['Latitude'] > 29) &
              (df_hou['Longitude'] < -95) & (df_hou['Longitude'] > -96)]

In [21]:
df_hou_new

Unnamed: 0,Neighborhood,Latitude,Longitude
0,Willowbrook,29.660254,-95.456096
1,Greater Greenspoint,29.944719,-95.416074
2,Carverdale,29.848687,-95.539450
3,Fairbanks,29.852726,-95.524386
6,Westbranch,29.839541,-95.551522
7,Addicks,29.782451,-95.642446
8,Spring Branch West,29.801410,-95.547879
9,Langwood,29.826282,-95.482795
10,Independence Heights,29.828348,-95.388004
11,Lazybrook,29.803899,-95.439195


Concatenate neighborhoods data.

In [22]:
df_total = pd.concat([df_sf_new, df_hou_new])

In [23]:
print(df_total.shape)
df_total.head

(132, 3)


<bound method NDFrame.head of                 Neighborhood   Latitude   Longitude
0                 Anza Vista  37.780836 -122.443149
1                Balboa Park  37.724949 -122.444805
3                    Bayview  37.728889 -122.392500
4               Belden Place  37.791744 -122.403886
5             Bernal Heights  37.741001 -122.414214
8                China Basin  37.776330 -122.391839
10              Civic Center  37.779594 -122.416794
11               Cole Valley  37.765813 -122.449962
12            Corona Heights  37.764886 -122.439368
13                Cow Hollow  37.797262 -122.436248
14                  Dogpatch  37.760698 -122.389202
15           Dolores Heights  37.745435 -122.424480
16           Duboce Triangle  37.767138 -122.432230
17               Embarcadero  37.792864 -122.396912
18             Eureka Valley  37.760956 -122.435509
19                  Fillmore  37.789879 -122.434287
20        Financial District  37.793647 -122.398938
21  Financial District South  37.7

In [24]:
df_total.to_csv("NeighborhoodsOfTwoCities.csv", index = False)

## Because the limitation of geogy library, not all neighborhoods can extract latitude and longitude. Therefore, the total size is smaller.