# Fix problematic addresses that won't be geocoded 

## Setup

In [4]:

# Print all outputs from commnads in a cell
from IPython.core.interactiveshell import InteractiveShell
InteractiveShell.ast_node_interactivity = "all"

#from urllib.request import Request, urlopen
from bs4 import BeautifulSoup 
import pandas as pd
import numpy as np
import re
import geocoder


In [8]:
# Import files 
df = pd.read_csv("/home/jae/analyzing-asian-american-latino-civic-infrastructure/processed_data/org_lat_logs.csv")

## Identify problematic addresses 

In [9]:
df.head()

Unnamed: 0,Address,F.year,Name,Type,latitude,longtitude,geo_method,category
0,"101 8th Street, Suite 101 Oakland, CA 94607",1974,Asian Health Services,Hybrid,,,osm,Asian
1,"1016 West Argyle Street Chicago, IL 60640",1981,Chinese Mutual Aid Association,CBO,41.973394,-87.655784,osm,Asian
2,"1031 25th St, San Diego",1974,Union of Pan Asian Communities,CBO,32.71637,-117.140274,census,Asian
3,"1038 Post Street San Francisco, CA 94109",1970,Chinatown Youth Center,CBO,37.787195,-122.419089,osm,Asian
4,"1055 Wilshire Blvd., Suite 1475 Los Angeles, C...",1976,Pacific Asian Consortium in Employment,CBO,,,osm,Asian


In [10]:
# Find NAs

df['latitude'].isnull().head()

0     True
1    False
2    False
3    False
4     True
Name: latitude, dtype: bool

In [12]:
# The count of the problematic addresses in each group

df.loc[df['latitude'].isnull() & (df['category'] == "Asian")].shape[0] #27
df.loc[df['latitude'].isnull() & (df['category'] != "Asian")].shape[0] #31

27

31

In [13]:
# Index by NAs
pr_addrs = df['Address'].loc[df['latitude'].isnull()]

pr_addrs.head()

0           101 8th Street, Suite 101 Oakland, CA 94607
4     1055 Wilshire Blvd., Suite 1475 Los Angeles, C...
9        1250 Chambers Road Room 2403, Aurora, CO 80011
13    1511 Third Avenue Suite 914, Seattle, Washingt...
15    16161 Ventura Boulevard Suite 388, Encino, CA ...
Name: Address, dtype: object

In [14]:
# This index is useful to recode values later 
pr_index = np.array(pr_addrs.index)

pr_index 


array([  0,   4,   9,  13,  15,  19,  23,  25,  26,  27,  31,  33,  35,
        36,  37,  39,  40,  42,  52,  54,  55,  56,  58,  62,  69,  70,
        71,  72,  80,  81,  85,  86,  87,  90,  93, 100, 101, 103, 105,
       107, 125, 127, 135, 140, 142, 143, 145, 148, 149, 152, 153, 154,
       158, 160, 161, 162, 171, 175])

In [21]:
# Remove row index (this is important for iterations)
pr_addrs = pr_addrs.reset_index(drop = True)

pr_addrs.head()


0          101 8th Street, Suite 101 Oakland, CA 94607
1    1055 Wilshire Blvd., Suite 1475 Los Angeles, C...
2       1250 Chambers Road Room 2403, Aurora, CO 80011
3    1511 Third Avenue Suite 914, Seattle, Washingt...
4    16161 Ventura Boulevard Suite 388, Encino, CA ...
Name: Address, dtype: object

## Geocode using bing API

In [22]:
# test 

geocoder.bing(pr_addrs[0], key = "bing_api_key")

<[OK] Bing - Geocode [101 8th St, Oakland, CA 94607]>

In [23]:

latlngs = []

for i in range(len(pr_addrs)):
    latlngs.append(geocoder.bing(pr_addrs[i], key = "bing_api_key"))

In [26]:
lat = []
long = []

for i in range(len(latlngs)):
    lat.append(latlngs[i].latlng[0])
    long.append(latlngs[i].latlng[1])

In [27]:
# Check whether the for loop worked well 
None in lat
None in long 

False

False

In [28]:
# Update the data 

df['latitude'].loc[pr_index,] = np.array(lat)
df['latitude'].loc[pr_index,][0:5]

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  self._setitem_with_indexer(indexer, value)


0     37.796717
4     34.021877
9     39.734480
13    47.609899
15    34.157100
Name: latitude, dtype: float64

In [29]:
df['longtitude'].loc[pr_index,] = np.array(long)
df['longtitude'].loc[pr_index,][0:5]

0    -122.266167
4    -118.499597
9    -104.808290
13   -122.338306
15   -118.485475
Name: longtitude, dtype: float64

In [30]:
df['geo_method'].loc[pr_index,] = "Bing"
df['geo_method'].loc[pr_index,][0:5]

0     Bing
4     Bing
9     Bing
13    Bing
15    Bing
Name: geo_method, dtype: object

## Export the result 

In [32]:
# Export the result as a csv  
df.to_csv("/home/jae/analyzing-asian-american-latino-civic-infrastructure/processed_data/org_lat_logs_fixed.csv")