## Get Street Names
---

#### Import Libraries

In [117]:
#! pip install usaddress

In [118]:
import spacy
import re
import pandas as pd
from spacy import displacy
from spacy.attrs import LOWER 
from collections import Counter
from spacy.matcher import Matcher
import numpy as np
import usaddress
import requests
nlp = spacy.load('en')

In [119]:
pd.options.display.max_colwidth = 1000

#### Refresh Stored Variables from Previous Notebooks

In [120]:
%store -r

#### Open Saved File and Print First Row

In [121]:
df = pd.read_csv('../data/transcripts.csv')
df.head(1)

Unnamed: 0,transcripts,confidence,tokens
0,we going to call Joe at the corner of BP's and Taraval,0.807508,"['we', 'going', 'to', 'call', 'joe', 'at', 'the', 'corner', 'of', ""bp's"", 'and', 'taraval']"


#### Check Shape - How Many Observations

In [122]:
df.shape

(79, 3)

#### Find Possible Streets from Transcripts

In [123]:
# Building the Matcher entity and instantiating
matcher = Matcher(nlp.vocab)

# specifies what spacy does when it finds a match in the document. Here we just want to return the matches
def on_match(matcher, doc, id, matches):
    return matches

# building patterns for every road name, the condition being that the lowercase entity in the doc should match 
# the lowercase verion of the road name, so that capitalization wouldn't affect the model
def build_pattern(road_name):
    list_words = road_name.split(' ')

    pattern = [{'LOWER': word.lower()} for word in list_words]
    return pattern

# Get a pattern of every road
for road in streets_list:
    matcher.add(road, on_match, build_pattern(road))
    
# This function takes a string as input and returns it with every word capitalized
def capitalize_string(string_in):
    words = string_in.split(' ')
    string_out = ''
    for i in words:
        string_out += i.capitalize() + ' '
    string_out = string_out[:-1]
    return string_out   
    
# Look for locations in the transcript, then extract them
def location_extraction_context(string_in):
    doc = nlp(string_in)
    string_out = ''
    list_words = string_in.split(' ')
    matches = matcher(doc)
    if len(matches) == 0:
        return None

    # loop through the matches and add them to the string to return
    # matches consist of an id and the indeces of the first and last word that constitute the pattern in the document
    # we use the ids to extrat the locations from the rules in the Matcher instance and not from the text itself,
    # to make sure they all follow the same format
    for match in matches:
        list_pattern = matcher.get(match[0])[1][0]
        for token in list_pattern:
            string_out += token['LOWER'] + ' '
        string_out += ', '
    string_out = string_out[:-3]
    string_out = capitalize_string(string_out)
    return string_out

# Add a column with the extracted locations
df['streets'] = df['transcripts'].map(location_extraction_context)

**Code Adapted from:** [Mitchell Bohman, Nour Zahlan, and Masiur Abik](https://github.com/mchbmn/radio-to-location) and [Joseph Hopkins, Carol, Chiu, Anthony Chapman, Kwamae Delva](https://github.com/delvakwa/police_radio_to_mapping)

In [124]:
df.head()

Unnamed: 0,transcripts,confidence,tokens,streets
0,we going to call Joe at the corner of BP's and Taraval,0.807508,"['we', 'going', 'to', 'call', 'joe', 'at', 'the', 'corner', 'of', ""bp's"", 'and', 'taraval']",Taraval
1,Flora ways for a 14 year old male conscious breathing laceration to the inside of his,0.832525,"['flora', 'ways', 'for', 'a', '14', 'year', 'old', 'male', 'conscious', 'breathing', 'laceration', 'to', 'the', 'inside', 'of', 'his']","Flora , Old"
2,40th the Safeway Taraval 730 Taraval top of seventies and eighties is possibly related and that because of EMA,0.79367,"['40th', 'the', 'safeway', 'taraval', '730', 'taraval', 'top', 'of', 'seventies', 'and', 'eighties', 'is', 'possibly', 'related', 'and', 'that', 'because', 'of', 'ema']","40th , Taraval , Taraval"
3,for 390 building across the hide in marking the language,0.766427,"['for', '390', 'building', 'across', 'the', 'hide', 'in', 'marking', 'the', 'language']",
4,negative about we split up Partners it's my music Concourse I am at a company G work,0.827363,"['negative', 'about', 'we', 'split', 'up', 'partners', ""it's"", 'my', 'music', 'concourse', 'i', 'am', 'at', 'a', 'company', 'g', 'work']",Music


#### Create Columns with Possible Addresses' Numbers

In [125]:
# Creat list to house data from all addresses
addresses = []

# Loop Through all DataFrame's rows
for row in df['transcripts']:
    # Create dictionary to house data for each row of the DataFrame
    d = {}
    
    # Parse through rows and house results in a list
    list_tuples = usaddress.parse(row)
    
    # Create variable to house list of possible numbers
    numbers = []
    
    # Loop through each value in the list created
    for i, n in enumerate(list_tuples):
        
        # Get addresses' numbers
        if list_tuples[i][1] == 'AddressNumber':
            
            # Append numbers to list
            numbers.append(n[0])
    
    # Include keys and values into d
    d['numbers'] = numbers
    
    # Append d to addresses
    addresses.append(d)

#### Create DataFrame with Address Numbers and Concatenate with Original DataFrame

In [126]:
df = pd.concat([df, pd.DataFrame(addresses)], axis=1)

#### Drop NaN and Reset Index

In [127]:
df.dropna(inplace=True)
df.reset_index(drop=True, inplace=True)

In [128]:
df.head(1)

Unnamed: 0,transcripts,confidence,tokens,streets,numbers
0,we going to call Joe at the corner of BP's and Taraval,0.807508,"['we', 'going', 'to', 'call', 'joe', 'at', 'the', 'corner', 'of', ""bp's"", 'and', 'taraval']",Taraval,[]


#### Create List of All Possible Addresses for each Row

In [129]:
# Creat list to house data for possible addresses
possibilities = []

# Loop Through all DataFrame's rows
for i in range(0, df.shape[0]):
    
    # Create variables to temporarily house information
    final_poss = []
    d = {}
    number_poss = []
    
    # Loop through values in each row / numbers
    for row in df[i:i+1]['numbers']:
        for a_number in row:
            number_poss.append(a_number)
    
    # Loop through values in each row / streets
    street_poss = []
    for row2 in [x.split(',') for x in df[i:(i+1)]['streets']][0]:
        for j in row2.split(','):
            street_poss.append(j.strip())

    # Concatenate numbers and streets
    for i in number_poss:
        for j in street_poss:
            final_poss.append(i + ' ' + j)

    # Append all possibilities to list
    d['possibilities'] = list(set(final_poss))
    possibilities.append(d)

# Concatenate dataframes
df = pd.concat([df, pd.DataFrame(possibilities)], axis=1)

#### Drop Empty Lists

In [130]:
df['possibilities'] = df['possibilities'].map(lambda x: np.nan if len(x) == 0 else x)
df.dropna(inplace=True)
df.reset_index(drop=True, inplace=True)

In [131]:
df.head(1)

Unnamed: 0,transcripts,confidence,tokens,streets,numbers,possibilities
0,I have 12 of them with a 585 at any in Leavenworth well the silver 12 out of the next person more than says welcome 13s Perkins over 30 never 12 over to 14 hours 70 for the night,0.715435,"['i', 'have', '12', 'of', 'them', 'with', 'a', '585', 'at', 'any', 'in', 'leavenworth', 'well', 'the', 'silver', '12', 'out', 'of', 'the', 'next', 'person', 'more', 'than', 'says', 'welcome', '13s', 'perkins', 'over', '30', 'never', '12', 'over', 'to', '14', 'hours', '70', 'for', 'the', 'night']","Leavenworth , Silver",[70],"[70 Silver, 70 Leavenworth]"


#### Check DataFrame's Shape

In [132]:
df.shape

(11, 6)

#### Get Latitude And Longitude - Google Maps Geocoding API

In [133]:
# Input API Key
api_key = '<YOUR-API-KEY>'

In [134]:
def get_google_latlong(address, api_key=None, return_full_response=False):
    
    # Set up Google Maps Geocoding url
    geocode_url = "https://maps.googleapis.com/maps/api/geocode/json?address={}".format(address)
    
    # Proceed if api_key is given
    if api_key is not None:
        geocode_url = geocode_url + "&key={}".format(api_key)
        
    # Ping google for the reuslts:
    results = requests.get(geocode_url)
    
    # Results will be in JSON format - convert to dict using requests functionality
    results = results.json()
    
    # if there's no results or an error, return empty results.
    if len(results['results']) == 0:
        output = {
            "latitude": None,
            "longitude": None
        }
    else:    
        answer = results['results'][0]
        output = {
            "latitude": answer.get('geometry').get('location').get('lat'),
            "longitude": answer.get('geometry').get('location').get('lng')
        }
        
    # Append some other details:    
#     output['input_string'] = address
#     output['number_of_results'] = len(results['results'])
#     output['status'] = results.get('status')
    if return_full_response is True:
        output['response'] = results
    
    return output

#### Adapted from https://www.shanelynn.ie/batch-geocoding-in-python-with-google-geocoding-api/

#### Loop Through Every Possible Address and Get Lat/Long Results

In [135]:
# Creat list to house data from all requests
lat_long = []

# Loop Through all DataFrame's rows
for row in df['possibilities']:
    
    # Create dictionary to house data for each row of the DataFrame
    d_poss = {}
    
    # Loop through each value in each row
    for i, n in enumerate(row): 
        
        # Create dictionary to house data for each request on the API
        d_address = {}
        
        # Print addresses being analyzed
        print(n + ', San Francisco, CA')
        
        # Include keys and values into d_poss
        d_address[i] = get_google_latlong(n + ', San Francisco, CA', api_key, return_full_response=False)
        d_poss['response' + str(i)] = d_address

    # Append to lat_long list    
    lat_long.append(d_poss)
    
# Concatenate DataFrames
df = pd.concat([df, pd.DataFrame(lat_long)], axis=1)

70 Silver, San Francisco, CA
70 Leavenworth, San Francisco, CA
4419 White, San Francisco, CA
4419 State, San Francisco, CA
4419 California, San Francisco, CA
4419 Octavia, San Francisco, CA
4419 Oak, San Francisco, CA
400 High, San Francisco, CA
400 Turk, San Francisco, CA
400 Drummond, San Francisco, CA
1871 Portal, San Francisco, CA
3 Ivy, San Francisco, CA
3 Mary, San Francisco, CA
3 Center, San Francisco, CA
1404 15th, San Francisco, CA
1404 Wright, San Francisco, CA
1404 Treat, San Francisco, CA
1404 California, San Francisco, CA
1404 Robert, San Francisco, CA
1404 Black, San Francisco, CA
12 Dorado, San Francisco, CA
12 Gennessee, San Francisco, CA
919 Mcrae, San Francisco, CA
1934 19th, San Francisco, CA
70 19th, San Francisco, CA
70 Old, San Francisco, CA
1934 Felton, San Francisco, CA
1934 Camp, San Francisco, CA
70 Camp, San Francisco, CA
70 Felton, San Francisco, CA
70 High, San Francisco, CA
1934 Old, San Francisco, CA
1934 20th, San Francisco, CA
1934 High, San Francisco, 

#### Print First 5 Rows

In [136]:
df.head(1)

Unnamed: 0,transcripts,confidence,tokens,streets,numbers,possibilities,response0,response1,response2,response3,response4,response5,response6,response7,response8,response9,response10,response11
0,I have 12 of them with a 585 at any in Leavenworth well the silver 12 out of the next person more than says welcome 13s Perkins over 30 never 12 over to 14 hours 70 for the night,0.715435,"['i', 'have', '12', 'of', 'them', 'with', 'a', '585', 'at', 'any', 'in', 'leavenworth', 'well', 'the', 'silver', '12', 'out', 'of', 'the', 'next', 'person', 'more', 'than', 'says', 'welcome', '13s', 'perkins', 'over', '30', 'never', '12', 'over', 'to', '14', 'hours', '70', 'for', 'the', 'night']","Leavenworth , Silver",[70],"[70 Silver, 70 Leavenworth]","{0: {'latitude': None, 'longitude': None}}","{1: {'latitude': None, 'longitude': None}}",,,,,,,,,,


In [137]:
df.fillna('No Value', inplace=True)

In [138]:
df.head(1)

Unnamed: 0,transcripts,confidence,tokens,streets,numbers,possibilities,response0,response1,response2,response3,response4,response5,response6,response7,response8,response9,response10,response11
0,I have 12 of them with a 585 at any in Leavenworth well the silver 12 out of the next person more than says welcome 13s Perkins over 30 never 12 over to 14 hours 70 for the night,0.715435,"['i', 'have', '12', 'of', 'them', 'with', 'a', '585', 'at', 'any', 'in', 'leavenworth', 'well', 'the', 'silver', '12', 'out', 'of', 'the', 'next', 'person', 'more', 'than', 'says', 'welcome', '13s', 'perkins', 'over', '30', 'never', '12', 'over', 'to', '14', 'hours', '70', 'for', 'the', 'night']","Leavenworth , Silver",[70],"[70 Silver, 70 Leavenworth]","{0: {'latitude': None, 'longitude': None}}","{1: {'latitude': None, 'longitude': None}}",No Value,No Value,No Value,No Value,No Value,No Value,No Value,No Value,No Value,No Value


#### Create Response Columns with Latitude and Logitude

In [139]:
# df['response0_lat'] = df['response0'].map(lambda x: x[0]['latitude'])
# df['response0_long'] = df['response0'].map(lambda x: x[0]['longitude'])

In [140]:
df

Unnamed: 0,transcripts,confidence,tokens,streets,numbers,possibilities,response0,response1,response2,response3,response4,response5,response6,response7,response8,response9,response10,response11
0,I have 12 of them with a 585 at any in Leavenworth well the silver 12 out of the next person more than says welcome 13s Perkins over 30 never 12 over to 14 hours 70 for the night,0.715435,"['i', 'have', '12', 'of', 'them', 'with', 'a', '585', 'at', 'any', 'in', 'leavenworth', 'well', 'the', 'silver', '12', 'out', 'of', 'the', 'next', 'person', 'more', 'than', 'says', 'welcome', '13s', 'perkins', 'over', '30', 'never', '12', 'over', 'to', '14', 'hours', '70', 'for', 'the', 'night']","Leavenworth , Silver",[70],"[70 Silver, 70 Leavenworth]","{0: {'latitude': None, 'longitude': None}}","{1: {'latitude': None, 'longitude': None}}",No Value,No Value,No Value,No Value,No Value,No Value,No Value,No Value,No Value,No Value
1,Octavia an oak California Plate State atom Tom David 902 one aboard it's a white Honda Accord for door we're sufficient got on didn't like me Harvey 4419 hide,0.884444,"['octavia', 'an', 'oak', 'california', 'plate', 'state', 'atom', 'tom', 'david', '902', 'one', 'aboard', ""it's"", 'a', 'white', 'honda', 'accord', 'for', 'door', ""we're"", 'sufficient', 'got', 'on', ""didn't"", 'like', 'me', 'harvey', '4419', 'hide']","Octavia , Oak , Oak , Oak , California , California , California , State , State , State , White",[4419],"[4419 White, 4419 State, 4419 California, 4419 Octavia, 4419 Oak]","{0: {'latitude': None, 'longitude': None}}","{1: {'latitude': None, 'longitude': None}}","{2: {'latitude': None, 'longitude': None}}","{3: {'latitude': None, 'longitude': None}}","{4: {'latitude': None, 'longitude': None}}",No Value,No Value,No Value,No Value,No Value,No Value,No Value
2,High tax rate 5 drunk and it's your time to shine Drummond of the 400 block of Turk,0.699905,"['high', 'tax', 'rate', '5', 'drunk', 'and', ""it's"", 'your', 'time', 'to', 'shine', 'drummond', 'of', 'the', '400', 'block', 'of', 'turk']","High , Drummond , Turk , Turk , Turk",[400],"[400 High, 400 Turk, 400 Drummond]","{0: {'latitude': None, 'longitude': None}}","{1: {'latitude': None, 'longitude': None}}","{2: {'latitude': None, 'longitude': None}}",No Value,No Value,No Value,No Value,No Value,No Value,No Value,No Value,No Value
3,slide down and 250 West Portal some kind of for 1871 flag down to 51 some kind of 44,0.72139,"['slide', 'down', 'and', '250', 'west', 'portal', 'some', 'kind', 'of', 'for', '1871', 'flag', 'down', 'to', '51', 'some', 'kind', 'of', '44']",Portal,[1871],[1871 Portal],"{0: {'latitude': None, 'longitude': None}}",No Value,No Value,No Value,No Value,No Value,No Value,No Value,No Value,No Value,No Value,No Value
4,my father's last clear no Mary's house with fucking banking on that final four the 3 out of 4 3 + 3 + 1 5 x 0.5 and this is one of the following four separate turn down turn Southbound on our from Center Ivy settings they lost visuals as subject possibly winning the,0.653916,"['my', ""father's"", 'last', 'clear', 'no', ""mary's"", 'house', 'with', 'fucking', 'banking', 'on', 'that', 'final', 'four', 'the', '3', 'out', 'of', '4', '3', '+', '3', '+', '1', '5', 'x', '0.5', 'and', 'this', 'is', 'one', 'of', 'the', 'following', 'four', 'separate', 'turn', 'down', 'turn', 'southbound', 'on', 'our', 'from', 'center', 'ivy', 'settings', 'they', 'lost', 'visuals', 'as', 'subject', 'possibly', 'winning', 'the']","Mary , Mary , Center , Ivy",[3],"[3 Ivy, 3 Mary, 3 Center]","{0: {'latitude': None, 'longitude': None}}","{1: {'latitude': None, 'longitude': None}}","{2: {'latitude': None, 'longitude': None}}",No Value,No Value,No Value,No Value,No Value,No Value,No Value,No Value,No Value
5,David once again for the night 1910 we 1404 15th Street treat everyone sits out of can I just want to play for yeah but go back to the temporary place and the Seven Roberts Roberts zebra 211 should come back to black Infinity first like I was like a regular California place it's a paper plate but I have several Robert Roberts it was fabulous party Wright place we try it was he that does go back to a 2004 is that the entity that correct yeah okay,0.818094,"['david', 'once', 'again', 'for', 'the', 'night', '1910', 'we', '1404', '15th', 'street', 'treat', 'everyone', 'sits', 'out', 'of', 'can', 'i', 'just', 'want', 'to', 'play', 'for', 'yeah', 'but', 'go', 'back', 'to', 'the', 'temporary', 'place', 'and', 'the', 'seven', 'roberts', 'roberts', 'zebra', '211', 'should', 'come', 'back', 'to', 'black', 'infinity', 'first', 'like', 'i', 'was', 'like', 'a', 'regular', 'california', 'place', ""it's"", 'a', 'paper', 'plate', 'but', 'i', 'have', 'several', 'robert', 'roberts', 'it', 'was', 'fabulous', 'party', 'wright', 'place', 'we', 'try', 'it', 'was', 'he', 'that', 'does', 'go', 'back', 'to', 'a', '2004', 'is', 'that', 'the', 'entity', 'that', 'correct', 'yeah', 'okay']","15th , 15th , Treat , Treat , Black , California , California , California , Robert , Robert , Wright , Wright , Wright",[1404],"[1404 15th, 1404 Wright, 1404 Treat, 1404 California, 1404 Robert, 1404 Black]","{0: {'latitude': None, 'longitude': None}}","{1: {'latitude': None, 'longitude': None}}","{2: {'latitude': None, 'longitude': None}}","{3: {'latitude': None, 'longitude': None}}","{4: {'latitude': None, 'longitude': None}}","{5: {'latitude': None, 'longitude': None}}",No Value,No Value,No Value,No Value,No Value,No Value
6,Dorado lemon Adam code for handled can gennessee thank you Adam for the night 12 at 20 weeks well after this nice,0.826101,"['dorado', 'lemon', 'adam', 'code', 'for', 'handled', 'can', 'gennessee', 'thank', 'you', 'adam', 'for', 'the', 'night', '12', 'at', '20', 'weeks', 'well', 'after', 'this', 'nice']","Dorado , Gennessee",[12],"[12 Dorado, 12 Gennessee]","{0: {'latitude': None, 'longitude': None}}","{1: {'latitude': None, 'longitude': None}}",No Value,No Value,No Value,No Value,No Value,No Value,No Value,No Value,No Value,No Value
7,McRae 2600 coffee 77 919 she was devastated,0.733898,"['mcrae', '2600', 'coffee', '77', '919', 'she', 'was', 'devastated']","Mcrae , Mcrae",[919],[919 Mcrae],"{0: {'latitude': None, 'longitude': None}}",No Value,No Value,No Value,No Value,No Value,No Value,No Value,No Value,No Value,No Value,No Value
8,thank you Lola Lola 1826 three the city crappy thank you very much mr. high-camp further David once again take this 800 prior to 1934 this is between 19th and 20th of yelling at us know and they 45 years old Felton and Adams 70 of them,0.794633,"['thank', 'you', 'lola', 'lola', '1826', 'three', 'the', 'city', 'crappy', 'thank', 'you', 'very', 'much', 'mr.', 'high-camp', 'further', 'david', 'once', 'again', 'take', 'this', '800', 'prior', 'to', '1934', 'this', 'is', 'between', '19th', 'and', '20th', 'of', 'yelling', 'at', 'us', 'know', 'and', 'they', '45', 'years', 'old', 'felton', 'and', 'adams', '70', 'of', 'them']","High , Camp , 19th , 19th , 20th , 20th , Old , Felton","[1934, 70]","[1934 19th, 70 19th, 70 Old, 1934 Felton, 1934 Camp, 70 Camp, 70 Felton, 70 High, 1934 Old, 1934 20th, 1934 High, 70 20th]","{0: {'latitude': None, 'longitude': None}}","{1: {'latitude': None, 'longitude': None}}","{2: {'latitude': None, 'longitude': None}}","{3: {'latitude': None, 'longitude': None}}","{4: {'latitude': None, 'longitude': None}}","{5: {'latitude': None, 'longitude': None}}","{6: {'latitude': None, 'longitude': None}}","{7: {'latitude': None, 'longitude': None}}","{8: {'latitude': None, 'longitude': None}}","{9: {'latitude': None, 'longitude': None}}","{10: {'latitude': None, 'longitude': None}}","{11: {'latitude': None, 'longitude': None}}"
9,14100 equipment khakis interpreter for the day 4519 pulses and surgery if you have IV also have a a 3586 and nice and full support for one day that it's 98 dozen a dozen for car in 4586 - Folsom,0.767303,"['14100', 'equipment', 'khakis', 'interpreter', 'for', 'the', 'day', '4519', 'pulses', 'and', 'surgery', 'if', 'you', 'have', 'iv', 'also', 'have', 'a', 'a', '3586', 'and', 'nice', 'and', 'full', 'support', 'for', 'one', 'day', 'that', ""it's"", '98', 'dozen', 'a', 'dozen', 'for', 'car', 'in', '4586', '-', 'folsom']","Day , Day , Folsom",[4586],"[4586 Folsom, 4586 Day]","{0: {'latitude': None, 'longitude': None}}","{1: {'latitude': None, 'longitude': None}}",No Value,No Value,No Value,No Value,No Value,No Value,No Value,No Value,No Value,No Value


#### Save Addresses DataFrame to .csv

In [141]:
df.to_csv('../data/adresses.csv', index_label=False)