# Fix problematic addresses that won't be geocoded 

In [3]:

# Print all outputs from commnads in a cell
from IPython.core.interactiveshell import InteractiveShell
InteractiveShell.ast_node_interactivity = "all"

from urllib.request import Request, urlopen
from bs4 import BeautifulSoup 
import pandas as pd
import re 


In [4]:
# Import files 
df = pd.read_csv("/home/jae/analyzing-asian-american-latino-civic-infrastructure/processed_data/org_lat_logs.csv")

## Inspect the file and create an index variable 

In [5]:
df.head()

Unnamed: 0,Address,F.year,latitude,longtitude,geo_method,group
0,"101 8th Street, Suite 101 Oakland, CA 94607",1974,,,osm,Asian
1,"1016 West Argyle Street Chicago, IL 60640",1981,41.970223,-87.83731,osm,Asian
2,"1031 25th St, San Diego",1974,32.71637,-117.140274,census,Asian
3,"1038 Post Street San Francisco, CA 94109",1970,37.78696,-122.41893,census,Asian
4,"1055 Wilshire Blvd., Suite 1475 Los Angeles, C...",1976,,,osm,Asian


In [6]:
# Find NAs

df['latitude'].isnull().head()

0     True
1    False
2    False
3    False
4     True
Name: latitude, dtype: bool

In [7]:
# The count of the problematic addresses in each group

df.loc[df['latitude'].isnull() & (df['group'] == "Asian")].shape[0] #19
df.loc[df['latitude'].isnull() & (df['group'] != "Asian")].shape[0] #12

19

12

In [78]:
# Index by NAs
pr_addrs = df['Address'].loc[df['latitude'].isnull()]

pr_addrs.head()

0           101 8th Street, Suite 101 Oakland, CA 94607
4     1055 Wilshire Blvd., Suite 1475 Los Angeles, C...
9        1250 Chambers Road Room 2403, Aurora, CO 80011
13    1511 Third Avenue Suite 914, Seattle, Washingt...
15    16161 Ventura Boulevard Suite 388, Encino, CA ...
Name: Address, dtype: object

In [81]:
# Remove row index (this is important for iterations)

pr_addrs = pr_addrs.reset_index(drop = True)

pr_addrs.head()

0          101 8th Street, Suite 101 Oakland, CA 94607
1    1055 Wilshire Blvd., Suite 1475 Los Angeles, C...
2       1250 Chambers Road Room 2403, Aurora, CO 80011
3    1511 Third Avenue Suite 914, Seattle, Washingt...
4    16161 Ventura Boulevard Suite 388, Encino, CA ...
Name: Address, dtype: object

## Replace the list of the addresses with the ones working in Google maps 

### Test version 

I will use the first element in the `pr_list` for the test. 

In [82]:
pr_addrs

0           101 8th Street, Suite 101 Oakland, CA 94607
1     1055 Wilshire Blvd., Suite 1475 Los Angeles, C...
2        1250 Chambers Road Room 2403, Aurora, CO 80011
3     1511 Third Avenue Suite 914, Seattle, Washingt...
4     16161 Ventura Boulevard Suite 388, Encino, CA ...
5           1825 San Pablo Suite 200, Oakland, CA 94612
6           2166 Hayes St #206, San Francisco, CA 94117
7     231 E. Third St., Suite G-106, Los Angeles, CA...
8       268 Canal Street, 6th Floor New York, NY, 10013
9     3639 Martin Luther King Jr Way S, Seattle, WA ...
10    720 8th Avenue South, 2nd Floor Seattle, WA 98104
11         731 Sansome St #100, San Francisco, CA 94111
12         75 Kneeland St., Suite 204, Boston, MA 02111
13             827 Broadway St. San Francisco, CA 94133
14    8616 La Tijera Blvd Ste 200 Los Angeles, CA 90...
15     900 Kearny St Suite 600, San Francisco, CA 94133
16    North East Medical Services 1520 Stockton Stre...
17    South Street Clinic 145 South St. Boston, 

In [83]:
pr_list = []

for i in range(len(pr_addrs)):    
    pr_list.append(re.compile(r',').sub('', str(pr_addrs[i])))


In [87]:
pr_list[0:5]

['101 8th Street Suite 101 Oakland CA 94607',
 '1055 Wilshire Blvd. Suite 1475 Los Angeles CA 90177',
 '1250 Chambers Road Room 2403 Aurora CO 80011',
 '1511 Third Avenue Suite 914 Seattle Washington 98101-1626',
 '16161 Ventura Boulevard Suite 388 Encino CA 91436 USA']

In [99]:
query = pr_list[0]
query = query.replace(' ', '+')

url = f"https://maps.google.com/search?q={query}&amp;aqs=chrome.0.69i59.416j0j9&amp;sourceid=chrome&amp;ie=UTF-8"
url 

'https://maps.google.com/search?q=101+8th+Street+Suite+101+Oakland+CA+94607&amp;aqs=chrome.0.69i59.416j0j9&amp;sourceid=chrome&amp;ie=UTF-8'

Get the webpage and turn it into a BS object 

In [100]:

# headers is needed to avoid HTTP 403 Error 
resp = Request(url, 
              # desktop user-agent
              headers={'User-Agent': 'Mozilla/5.0'})


In [101]:
# Get the webpage 
webpage = urlopen(resp)

# Turn it into a BS object
bs = BeautifulSoup(webpage.read(), 'html.parser')

Find part of the web page that you want to scrape. I find using the `prettify()` function is easier than using a web browser selector (like Google Chrome developer mode).

In [108]:
# Print bs

# print(bs.prettify())

# Interested parts 

# <span>
#        <div class="BNeawe deIvCb AP7Wnd">
#         101 8th St #101
#        </div>
#       </span>

We're leveraging `class = "BNeawe deIvCb AP7Wnd"` to find the text we want. 

In [109]:
# New address 
new_address = bs.findAll("", {"class": "BNeawe deIvCb AP7Wnd"})[0:1]

new_address

[<div class="BNeawe deIvCb AP7Wnd">101 8th St #101</div>]

Using regular expression, we can easily remove the part (tags) we don't want. 

In [110]:
re.compile(r'<[^>]+>').sub('', str(new_address))

'[101 8th St #101]'

### Iteration 

In [111]:
def parsing_addr(old_address):
            
    # Select address  
    
    query = old_address.replace(' ', '+')

    url = f"https://www.google.com/search?q={query}&amp;aqs=chrome.0.69i59.416j0j9&amp;sourceid=chrome&amp;ie=UTF-8"

    # headers is needed to avoid HTTP 403 Error 
    resp = Request(url, 
                  # desktop user-agent
                  headers={'User-Agent': 'Mozilla/5.0'})

    # Get the webpage 
    webpage = urlopen(resp)

    # Turn it into a BS object
    bs = BeautifulSoup(webpage.read(), 'html.parser')

    # New address 
    new_address = bs.findAll("", {"class": "BNeawe deIvCb AP7Wnd"})[2]

    # Remove tags 
    new_address = re.compile(r'<[^>]+>').sub('', str(new_address))
    
    return(new_address)

In [112]:
parsing_addr(pr_list[2])

'Related searches'

In [113]:
addresses = []

for i in range(len(pr_list)):
    addresses.append(parsing_addr(pr_list[i]))

IndexError: list index out of range