## Get Street Names
---

#### Import Libraries

In [85]:
import spacy
import re
import pandas as pd
from spacy import displacy
from spacy.attrs import LOWER 
from collections import Counter
from spacy.matcher import Matcher
import numpy as np
import usaddress
import requests
nlp = spacy.load('en')

In [86]:
pd.options.display.max_colwidth = 1000

#### Refresh Stored Variables from Previous Notebooks

In [87]:
%store -r

#### Open Saved File and Print First Row

In [88]:
df = pd.read_csv('./data/transcripts.csv')
df.head(1)

Unnamed: 0,transcripts,confidence,tokens
0,we going to call Joe at the corner of BP's and Taraval,0.807508,"['we', 'going', 'to', 'call', 'joe', 'at', 'the', 'corner', 'of', ""bp's"", 'and', 'taraval']"


#### Check Shape - How Many Observations

In [89]:
df.shape

(79, 3)

#### Find Possible Streets from Transcripts

In [90]:
# Building the Matcher entity and instantiating
matcher = Matcher(nlp.vocab)

# specifies what spacy does when it finds a match in the document. Here we just want to return the matches
def on_match(matcher, doc, id, matches):
    return matches

# building patterns for every road name, the condition being that the lowercase entity in the doc should match 
# the lowercase verion of the road name, so that capitalization wouldn't affect the model
def build_pattern(road_name):
    list_words = road_name.split(' ')
    
    # general words that appear a lot in the list. 
    # The reason why we do this is to still get a match if they are not present
    roads_general = ['ln','alley','rd', 'ct', 'dr', 'ave', 'way','blvd', 'path'
                     'st', 'cir','pl', 'terrace', 'area', 'bridge', 'highway', 'trail']
    if list_words[-1].lower() in roads_general:
        pattern = [{'LOWER': word.lower()} for word in list_words[:-1]]
        pattern.append({'op': '*', 'LOWER' : list_words[-1].lower()})
    else:
        pattern = [{'LOWER': word.lower()} for word in list_words]
    return pattern

# Get a pattern of every road
for road in streets_list:
    matcher.add(road, on_match, build_pattern(road))
    
# This function takes a string as input and returns it with every word capitalized
def capitalize_string(string_in):
    words = string_in.split(' ')
    string_out = ''
    for i in words:
        string_out += i.capitalize() + ' '
    string_out = string_out[:-1]
    return string_out   
    
# Look for locations in the transcript, then extract them
def location_extraction_context(string_in):
    doc = nlp(string_in)
    string_out = ''
    list_words = string_in.split(' ')
    matches = matcher(doc)
    if len(matches) == 0:
        return None
    indeces_to_pop = []
    
    # loop through the matches and delete those that are a subset of another. 
    # this was done because some road names have words in commond and we were getting 2 matches for some locations
    # here we eliminate the shorter one since the longest is clearly the one intended
    for a in range(len(matches)):
        for b in range(a+1, len(matches)):
            if (matches[a][2] == matches[b][2]):
                if (matches[a][1] < matches[b][1]):
                    indeces_to_pop.append(b)
                else:
                    indeces_to_pop.append(a)
    matches_final = [tup for index, tup in enumerate(matches) if index not in indeces_to_pop]
    
    # loop through the matches and add them to the string to return
    # matches consist of an id and the indeces of the first and last word that constitute the pattern in the document
    # we use the ids to extrat the locations from the rules in the Matcher instance and not from the text itself,
    # to make sure they all follow the same format
    for match in matches_final:
        list_pattern = matcher.get(match[0])[1][0]
        for token in list_pattern:
            string_out += token['LOWER'] + ' '
        string_out += ', '
    string_out = string_out[:-3]
    string_out = capitalize_string(string_out)
    return string_out

# Add a column with the extracted locations
df['streets'] = df['transcripts'].map(location_extraction_context)

**Code Adapted from:** [Mitchell Bohman, Nour Zahlan, and Masiur Abik](https://github.com/mchbmn/radio-to-location) and [Joseph Hopkins, Carol, Chiu, Anthony Chapman, Kwamae Delva](https://github.com/delvakwa/police_radio_to_mapping)

In [91]:
df.head(1)

Unnamed: 0,transcripts,confidence,tokens,streets
0,we going to call Joe at the corner of BP's and Taraval,0.807508,"['we', 'going', 'to', 'call', 'joe', 'at', 'the', 'corner', 'of', ""bp's"", 'and', 'taraval']",


#### Split All List Elements for Possible Streets into Single Strings

In [92]:
streets_list[:5]

['10th Ave', '10th St', '11th Ave', '11th St', '12th Ave']

In [93]:
streets_list = [i.lower().split(' ') for i in streets_list]

complete_list = []
for i in streets_list:
    for j in i:
        complete_list.append(j)
complete_list = list(set(complete_list))

In [94]:
streets_list[:5]

[['10th', 'ave'],
 ['10th', 'st'],
 ['11th', 'ave'],
 ['11th', 'st'],
 ['12th', 'ave']]

#### Create Columns with Possible Addresses' Numbers

In [95]:
# Creat list to house data from all addresses
addresses = []

# Loop Through all DataFrame's rows
for row in df['transcripts']:
    # Create dictionary to house data for each row of the DataFrame
    d = {}
    
    # Parse through rows and house results in a list
    list_tuples = usaddress.parse(row)
    
    # Create variable to house list of possible numbers
    numbers = []
    
    # Loop through each value in the list created
    for i, n in enumerate(list_tuples):
        
        # Get addresses' numbers
        if list_tuples[i][1] == 'AddressNumber':
            
            # Append numbers to list
            numbers.append(n[0])
    
    # Include keys and values into d
    d['numbers'] = numbers
    
    # Append d to addresses
    addresses.append(d)

#### Create DataFrame with Address Numbers and Concatenate with Original DataFrame

In [96]:
df = pd.concat([df, pd.DataFrame(addresses)], axis=1)

#### Drop NaN and Reset Index

In [97]:
df.dropna(inplace=True)
df.reset_index(drop=True, inplace=True)

In [98]:
df.head(1)

Unnamed: 0,transcripts,confidence,tokens,streets,numbers
0,40th the Safeway Taraval 730 Taraval top of seventies and eighties is possibly related and that because of EMA,0.79367,"['40th', 'the', 'safeway', 'taraval', '730', 'taraval', 'top', 'of', 'seventies', 'and', 'eighties', 'is', 'possibly', 'related', 'and', 'that', 'because', 'of', 'ema']",40th Ave,[]


#### Create List of All Possible Addresses for each Row

In [99]:
# Creat list to house data for possible addresses
possibilities = []

# Loop Through all DataFrame's rows
for i in range(0, df.shape[0]):
    
    # Create variables to temporarily house information
    final_poss = []
    d = {}
    number_poss = []
    
    # Loop through values in each row / numbers
    for row in df[i:i+1]['numbers']:
        for a_number in row:
            number_poss.append(a_number)
    
    # Loop through values in each row / streets
    street_poss = []
    for row2 in [x.split(',') for x in df[i:(i+1)]['streets']][0]:
        for j in row2.split(','):
            street_poss.append(j.strip())

    # Concatenate numbers and streets
    for i in number_poss:
        for j in street_poss:
            final_poss.append(i + ' ' + j)

    # Append all possibilities to list
    d['possibilities'] = list(set(final_poss))
    possibilities.append(d)

# Concatenate dataframes
df = pd.concat([df, pd.DataFrame(possibilities)], axis=1)

#### Drop Empty Lists

In [100]:
df['possibilities'] = df['possibilities'].map(lambda x: np.nan if len(x) == 0 else x)
df.dropna(inplace=True)
df.reset_index(drop=True, inplace=True)

In [101]:
df.head(1)

Unnamed: 0,transcripts,confidence,tokens,streets,numbers,possibilities
0,I have 12 of them with a 585 at any in Leavenworth well the silver 12 out of the next person more than says welcome 13s Perkins over 30 never 12 over to 14 hours 70 for the night,0.715435,"['i', 'have', '12', 'of', 'them', 'with', 'a', '585', 'at', 'any', 'in', 'leavenworth', 'well', 'the', 'silver', '12', 'out', 'of', 'the', 'next', 'person', 'more', 'than', 'says', 'welcome', '13s', 'perkins', 'over', '30', 'never', '12', 'over', 'to', '14', 'hours', '70', 'for', 'the', 'night']",Silver Ave,[70],[70 Silver Ave]


#### Check DataFrame's Shape

In [102]:
df.shape

(6, 6)

#### Get Latitude And Longitude - Google Maps Geocoding API

In [103]:
# Input API Key
api_key = '<YOUR-API-KEY>'

In [104]:
def get_google_latlong(address, api_key=None, return_full_response=False):
    
    # Set up Google Maps Geocoding url
    geocode_url = "https://maps.googleapis.com/maps/api/geocode/json?address={}".format(address)
    
    # Proceed if api_key is given
    if api_key is not None:
        geocode_url = geocode_url + "&key={}".format(api_key)
        
    # Ping google for the reuslts:
    results = requests.get(geocode_url)
    
    # Results will be in JSON format - convert to dict using requests functionality
    results = results.json()
    
    # if there's no results or an error, return empty results.
    if len(results['results']) == 0:
        output = {
            "latitude": None,
            "longitude": None
        }
    else:    
        answer = results['results'][0]
        output = {
            "latitude": answer.get('geometry').get('location').get('lat'),
            "longitude": answer.get('geometry').get('location').get('lng')
        }
        
    # Append some other details:    
#     output['input_string'] = address
#     output['number_of_results'] = len(results['results'])
#     output['status'] = results.get('status')
    if return_full_response is True:
        output['response'] = results
    
    return output

#### Adapted from https://www.shanelynn.ie/batch-geocoding-in-python-with-google-geocoding-api/

#### Loop Through Every Possible Address and Get Lat/Long Results

In [105]:
# Creat list to house data from all requests
lat_long = []

# Loop Through all DataFrame's rows
for row in df['possibilities']:
    
    # Create dictionary to house data for each row of the DataFrame
    d_poss = {}
    
    # Loop through each value in each row
    for i, n in enumerate(row): 
        
        # Create dictionary to house data for each request on the API
        d_address = {}
        
        # Print addresses being analyzed
        print(n + ', San Francisco, CA')
        
        # Include keys and values into d_poss
        d_address[i] = get_google_latlong(n + ', San Francisco, CA', api_key, return_full_response=False)
        d_poss['response' + str(i)] = d_address

    # Append to lat_long list    
    lat_long.append(d_poss)
    
# Concatenate DataFrames
df = pd.concat([df, pd.DataFrame(lat_long)], axis=1)

70 Silver Ave, San Francisco, CA
4419 State Dr, San Francisco, CA
4419 California Ave, San Francisco, CA
400 Turk Blvd, San Francisco, CA
400 Drummond Alley, San Francisco, CA
1404 California Ave, San Francisco, CA
1404 15th Ave, San Francisco, CA
1404 Treat Ave, San Francisco, CA
1404 Black Pl, San Francisco, CA
12 Dorado Terrace, San Francisco, CA
1934 19th Ave, San Francisco, CA
70 20th Ave, San Francisco, CA
1934 20th Ave, San Francisco, CA
70 19th Ave, San Francisco, CA


#### Print First 5 Rows

In [106]:
df.head(1)

Unnamed: 0,transcripts,confidence,tokens,streets,numbers,possibilities,response0,response1,response2,response3
0,I have 12 of them with a 585 at any in Leavenworth well the silver 12 out of the next person more than says welcome 13s Perkins over 30 never 12 over to 14 hours 70 for the night,0.715435,"['i', 'have', '12', 'of', 'them', 'with', 'a', '585', 'at', 'any', 'in', 'leavenworth', 'well', 'the', 'silver', '12', 'out', 'of', 'the', 'next', 'person', 'more', 'than', 'says', 'welcome', '13s', 'perkins', 'over', '30', 'never', '12', 'over', 'to', '14', 'hours', '70', 'for', 'the', 'night']",Silver Ave,[70],[70 Silver Ave],"{0: {'latitude': 37.7299073, 'longitude': -122.4133028}}",,,


In [107]:
df.fillna('No Value', inplace=True)

In [108]:
df.head(1)

Unnamed: 0,transcripts,confidence,tokens,streets,numbers,possibilities,response0,response1,response2,response3
0,I have 12 of them with a 585 at any in Leavenworth well the silver 12 out of the next person more than says welcome 13s Perkins over 30 never 12 over to 14 hours 70 for the night,0.715435,"['i', 'have', '12', 'of', 'them', 'with', 'a', '585', 'at', 'any', 'in', 'leavenworth', 'well', 'the', 'silver', '12', 'out', 'of', 'the', 'next', 'person', 'more', 'than', 'says', 'welcome', '13s', 'perkins', 'over', '30', 'never', '12', 'over', 'to', '14', 'hours', '70', 'for', 'the', 'night']",Silver Ave,[70],[70 Silver Ave],"{0: {'latitude': 37.7299073, 'longitude': -122.4133028}}",No Value,No Value,No Value


#### Create Response Columns with Latitude and Logitude

In [109]:
# df['response0_lat'] = df['response0'].map(lambda x: x[0]['latitude'])
# df['response0_long'] = df['response0'].map(lambda x: x[0]['longitude'])

In [111]:
df

Unnamed: 0,transcripts,confidence,tokens,streets,numbers,possibilities,response0,response1,response2,response3
0,I have 12 of them with a 585 at any in Leavenworth well the silver 12 out of the next person more than says welcome 13s Perkins over 30 never 12 over to 14 hours 70 for the night,0.715435,"['i', 'have', '12', 'of', 'them', 'with', 'a', '585', 'at', 'any', 'in', 'leavenworth', 'well', 'the', 'silver', '12', 'out', 'of', 'the', 'next', 'person', 'more', 'than', 'says', 'welcome', '13s', 'perkins', 'over', '30', 'never', '12', 'over', 'to', '14', 'hours', '70', 'for', 'the', 'night']",Silver Ave,[70],[70 Silver Ave],"{0: {'latitude': 37.7299073, 'longitude': -122.4133028}}",No Value,No Value,No Value
1,Octavia an oak California Plate State atom Tom David 902 one aboard it's a white Honda Accord for door we're sufficient got on didn't like me Harvey 4419 hide,0.884444,"['octavia', 'an', 'oak', 'california', 'plate', 'state', 'atom', 'tom', 'david', '902', 'one', 'aboard', ""it's"", 'a', 'white', 'honda', 'accord', 'for', 'door', ""we're"", 'sufficient', 'got', 'on', ""didn't"", 'like', 'me', 'harvey', '4419', 'hide']","California Ave , State Dr",[4419],"[4419 State Dr, 4419 California Ave]","{0: {'latitude': 37.72457199999999, 'longitude': -122.4832877}}","{1: {'latitude': 37.7845947, 'longitude': -122.4648947}}",No Value,No Value
2,High tax rate 5 drunk and it's your time to shine Drummond of the 400 block of Turk,0.699905,"['high', 'tax', 'rate', '5', 'drunk', 'and', ""it's"", 'your', 'time', 'to', 'shine', 'drummond', 'of', 'the', '400', 'block', 'of', 'turk']","Drummond Alley , Turk Blvd",[400],"[400 Turk Blvd, 400 Drummond Alley]","{0: {'latitude': 37.7825871, 'longitude': -122.4157746}}","{1: {'latitude': 37.7371707, 'longitude': -122.3957104}}",No Value,No Value
3,David once again for the night 1910 we 1404 15th Street treat everyone sits out of can I just want to play for yeah but go back to the temporary place and the Seven Roberts Roberts zebra 211 should come back to black Infinity first like I was like a regular California place it's a paper plate but I have several Robert Roberts it was fabulous party Wright place we try it was he that does go back to a 2004 is that the entity that correct yeah okay,0.818094,"['david', 'once', 'again', 'for', 'the', 'night', '1910', 'we', '1404', '15th', 'street', 'treat', 'everyone', 'sits', 'out', 'of', 'can', 'i', 'just', 'want', 'to', 'play', 'for', 'yeah', 'but', 'go', 'back', 'to', 'the', 'temporary', 'place', 'and', 'the', 'seven', 'roberts', 'roberts', 'zebra', '211', 'should', 'come', 'back', 'to', 'black', 'infinity', 'first', 'like', 'i', 'was', 'like', 'a', 'regular', 'california', 'place', ""it's"", 'a', 'paper', 'plate', 'but', 'i', 'have', 'several', 'robert', 'roberts', 'it', 'was', 'fabulous', 'party', 'wright', 'place', 'we', 'try', 'it', 'was', 'he', 'that', 'does', 'go', 'back', 'to', 'a', '2004', 'is', 'that', 'the', 'entity', 'that', 'correct', 'yeah', 'okay']","15th Ave , Treat Ave , Black Pl , California Ave",[1404],"[1404 California Ave, 1404 15th Ave, 1404 Treat Ave, 1404 Black Pl]","{0: {'latitude': 37.82086899999999, 'longitude': -122.3640404}}","{1: {'latitude': 37.761754, 'longitude': -122.4726672}}","{2: {'latitude': 37.7487631, 'longitude': -122.4127913}}","{3: {'latitude': 37.7997105, 'longitude': -122.4165822}}"
4,Dorado lemon Adam code for handled can gennessee thank you Adam for the night 12 at 20 weeks well after this nice,0.826101,"['dorado', 'lemon', 'adam', 'code', 'for', 'handled', 'can', 'gennessee', 'thank', 'you', 'adam', 'for', 'the', 'night', '12', 'at', '20', 'weeks', 'well', 'after', 'this', 'nice']",Dorado Terrace,[12],[12 Dorado Terrace],"{0: {'latitude': 37.7256755, 'longitude': -122.4609817}}",No Value,No Value,No Value
5,thank you Lola Lola 1826 three the city crappy thank you very much mr. high-camp further David once again take this 800 prior to 1934 this is between 19th and 20th of yelling at us know and they 45 years old Felton and Adams 70 of them,0.794633,"['thank', 'you', 'lola', 'lola', '1826', 'three', 'the', 'city', 'crappy', 'thank', 'you', 'very', 'much', 'mr.', 'high-camp', 'further', 'david', 'once', 'again', 'take', 'this', '800', 'prior', 'to', '1934', 'this', 'is', 'between', '19th', 'and', '20th', 'of', 'yelling', 'at', 'us', 'know', 'and', 'they', '45', 'years', 'old', 'felton', 'and', 'adams', '70', 'of', 'them']","19th Ave , 20th Ave","[1934, 70]","[1934 19th Ave, 70 20th Ave, 1934 20th Ave, 70 19th Ave]","{0: {'latitude': 37.7516449, 'longitude': -122.475991}}","{1: {'latitude': 37.78629, 'longitude': -122.4795663}}","{2: {'latitude': 37.7516416, 'longitude': -122.4771277}}","{3: {'latitude': 37.7862671, 'longitude': -122.478463}}"


#### Save Addresses DataFrame to .csv

In [90]:
df.to_csv('./data/adresses.csv', index_label=False)