# Imports

In [1]:
# !pip install spacy
# ! pip install en_core_web_sm
# !python -m spacy download en

In [2]:
import spacy
import re
import pandas as pd
import pickle
from spacy import displacy
from spacy.attrs import LOWER 
from collections import Counter
from spacy.matcher import Matcher
import numpy as np
# Load the English spaCy pipeline once; every extraction function below reuses this object.
# NOTE(review): the 'en' shortcut link is a spaCy v2 convention (set up by
# `python -m spacy download en`); spaCy v3 removed shortcut links and expects the full
# package name such as 'en_core_web_sm' — confirm which spaCy version this notebook pins.
nlp = spacy.load('en')

In [3]:
# Get dataframes from csv
# NOTE(review): judging only by the file names, `df` holds the plain transcriptions and
# `df_context` the transcriptions produced with street context — confirm.
# NOTE(review): the first directory name contains a space ("GeoCoding _transcribe_radio");
# verify that this matches the folder on disk.
df = pd.read_csv('../data/Data/GeoCoding _transcribe_radio/transcribed_radio.csv')
df_context = pd.read_csv('../data/Data/GeoCoding_transcribed_radio_with_street_context/transcribed_radio_with_street_context.csv')
# Load the pickled collection of street names.
# SECURITY: pickle.load can execute arbitrary code, so only load pickles from a trusted source.
with open("../data/Pickles/streets.pkl", "rb") as fp:
    streets = pickle.load(fp)
# Materialise the street names as a list; one Matcher rule is registered per entry further down.
list_of_roads = list(streets)

In [4]:
# Check first five rows of df_context (before any location column has been added)
df_context.head()

Unnamed: 0.1,Unnamed: 0,transcripts,confidence
0,0,powers of,0.703373
1,1,Brevard Africa for 1636 thank you,0.78988
2,2,is going to be,0.625621
3,3,Ester Drive robson's plane set the driver knoc...,0.785905
4,4,Marcus 11:30 Highway Street 1 1 3 0 hi,0.805028


# Named Entity Recognition

## Baseline

We’re using spaCy, a powerful Natural Language Processing library, to extract true locations from our Speech to Text transcriptions. As a baseline, the cell below relies only on spaCy's pre-trained named-entity labels and keeps the entities tagged `FAC` or `GPE`. SpaCy also has a matcher that allows us to create a custom dictionary of things to look for in text. Words related to streets are passed in (street, place, trail, etc.), and spaCy looks for those words in our transcripts. When a word matches one of those in our dictionary, spaCy then searches the words around it for context, and returns what it believes to be the address. We then compile the extracted locations in a dataframe.

*** Code adapted from radio to location repository

In [5]:
# function to extract locations using spaCy pre trained labels
def location_extraction(string_in):
    """Return the facility / geopolitical entities spaCy recognises in a transcript.

    Parameters
    ----------
    string_in : str
        One transcript. Any non-string value (for example the float NaN that pandas
        uses for an empty CSV cell) is treated as "no transcript".

    Returns
    -------
    list of str or None
        Text of every entity labelled ``FAC`` or ``GPE``, in document order,
        or ``None`` when there is no transcript or no such entity.
    """
    # spaCy only accepts text; without this guard a single missing transcript
    # raises inside nlp() and aborts the whole Series.map call.
    if not isinstance(string_in, str):
        return None
    doc = nlp(string_in)
    # keep only the entity types that can denote a place
    locations = [ent.text for ent in doc.ents if ent.label_ in ('FAC', 'GPE')]
    # an empty result is reported as None so the column can be filtered with dropna/isna
    return locations or None

# Add a column with the locations found by the pre-trained NER baseline
# (a list of entity strings per transcript, or None when nothing was recognised)
df['location_extraction'] = df['transcripts'].map(location_extraction)

##  spaCy Matcher

Here we use the spaCy Matcher to generate our own set of rules to look for in the text. Every rule corresponds to a pattern, which consists of a set of words, conditions and operators: each word has to be found in the document under a specific condition, and the operator determines how many times that word may be observed.
Here we are looking for entities that correspond to a road name in Butte County. Since we already have a complete list of road names, we can set one rule for each road, where the pattern specifies that all words have to match exactly once, except when the name ends with a generic word like "street" or "road", in which case that word may match 0 or more times.

In [6]:
# Building the Matcher entity

# Instantiate the rule-based matcher on the vocabulary of the loaded pipeline
matcher = Matcher(nlp.vocab)

# Callback registered together with every road pattern. It performs no extra work and
# simply hands back the list of matches it receives.
def on_match(matcher, doc, id, matches):
    """Return ``matches`` unchanged; the other arguments are accepted but not used.

    NOTE(review): per the spaCy Matcher documentation the callback is invoked as
    (matcher, doc, i, matches) where the third argument is the index of the current
    match, and the callback's return value is ignored by the matcher — confirm.
    The parameter name ``id`` shadows the Python builtin of the same name.
    """
    return matches

# building patterns for every road name, the condition being that the lowercase entity in the doc should match
# the lowercase version of the road name, so that capitalization wouldn't affect the model
def build_pattern(road_name):
    """Build the token pattern that recognises ``road_name`` in a transcript.

    Parameters
    ----------
    road_name : str
        Road name as it appears in the street list, e.g. ``"Peridot Place"``.

    Returns
    -------
    list of dict
        One ``{'LOWER': word}`` entry per word of the name. When a multi-word name
        ends with a generic road word (street, road, ...), that last entry also
        carries ``'op': '*'`` so the name still matches if the speaker leaves the
        generic word out. Every entry keeps a ``'LOWER'`` key, so the full road
        name can be rebuilt from the pattern.
    """
    # general words that appear a lot in the list.
    # The reason why we do this is to still get a match if they are not present
    roads_general = ('lane', 'road',
                     'court', 'drive',
                     'avenue', 'way',
                     'street', 'circle',
                     'place', 'highway', 'trail')
    # split() without an argument ignores leading, trailing and repeated whitespace;
    # split(' ') would turn those into empty-string tokens that no word can ever equal,
    # silently producing a rule that never matches.
    list_words = road_name.lower().split()
    if not list_words:
        # Blank name: keep handing back a single token that can never match, so callers
        # that register the result unconditionally do not receive an empty pattern.
        return [{'LOWER': ''}]
    pattern = [{'LOWER': word} for word in list_words]
    # Only a multi-word name may drop its generic suffix; making the sole word of a
    # name optional would leave a pattern with no required token at all.
    if len(list_words) > 1 and list_words[-1] in roads_general:
        pattern[-1] = {'op': '*', 'LOWER': list_words[-1]}
    return pattern

# Get a pattern of every road: register one rule per road, keyed by the road name, so that a
# match id can later be mapped back to the pattern (and therefore the name) of that road.
# NOTE(review): the positional (key, on_match, pattern) call is the spaCy v2 signature; spaCy v3
# expects matcher.add(key, [pattern], on_match=...) — confirm the pinned spaCy version.
for road in list_of_roads:
    matcher.add(road, on_match, build_pattern(road))
    
# Capitalize every space-separated word of a string
def capitalize_string(string_in):
    """Return ``string_in`` with each space-delimited word passed through ``str.capitalize``.

    The text is cut on single spaces, so runs of spaces are preserved as-is, and
    ``str.capitalize`` lower-cases everything after a word's first character.
    """
    return ' '.join(word.capitalize() for word in string_in.split(' '))
    
def _drop_contained_matches(matches):
    """Keep only the matches whose token span is not covered by another match.

    ``matches`` is a list of ``(match_id, start, end)`` tuples. Of several matches
    with an identical span only the last one is kept. The relative order of the
    surviving matches is preserved.
    """
    kept = []
    for i, (_, start, end) in enumerate(matches):
        covered = False
        for j, (_, other_start, other_end) in enumerate(matches):
            if i == j:
                continue
            if other_start <= start and end <= other_end:
                same_span = (other_start == start) and (other_end == end)
                # a strictly larger span always wins; among equal spans the later match wins
                if not same_span or j > i:
                    covered = True
                    break
        if not covered:
            kept.append(matches[i])
    return kept


# Look for locations in the transcript, then extract them
def location_extraction_context(string_in):
    """Return the known road names mentioned in a transcript.

    Parameters
    ----------
    string_in : str
        One transcript. Any non-string value (for example the float NaN that pandas
        uses for an empty CSV cell) is treated as "no transcript".

    Returns
    -------
    str or None
        The matched road names, each word capitalized, separated by ``" , "``
        (e.g. ``"6th Street , Julia Street"``), or ``None`` when there is no
        transcript or no road was matched.
    """
    # spaCy only accepts text; without this guard a single missing transcript
    # raises inside nlp() and aborts the whole Series.map call.
    if not isinstance(string_in, str):
        return None
    doc = nlp(string_in)
    matches = matcher(doc)
    if len(matches) == 0:
        return None
    # Some road names have words in common, and a pattern with an optional generic suffix can
    # match both with and without that suffix, so one mention may yield several overlapping
    # matches. Keep only the longest span of each overlap: comparing end positions alone would
    # miss the pair that starts on the same token but ends on different ones.
    matches_final = _drop_contained_matches(matches)
    # matches consist of an id and the indices of the first and last word that constitute the
    # pattern in the document. We use the ids to rebuild the locations from the rules stored in
    # the Matcher instance and not from the text itself, to make sure they all follow the same format
    road_names = []
    for match in matches_final:
        # matcher.get(key) is indexed as (callback, patterns); each road has exactly one pattern
        list_pattern = matcher.get(match[0])[1][0]
        road_names.append(' '.join(token['LOWER'] for token in list_pattern))
    return capitalize_string(' , '.join(road_names))

# Add a column with the extracted locations
df_context['location_extraction'] = df_context['transcripts'].map(location_extraction_context)

# Since we don't care about transcripts where we didn't find any locations, drop the rows whose
# extraction is missing. `subset` restricts the check to that column, so a row with a location
# is not discarded merely because some unrelated column happens to be empty.
df_context.dropna(subset=['location_extraction'], inplace=True)

In [7]:
# Preview the baseline (pre-trained NER) extraction for the first five transcripts
df['location_extraction'].head()

0    None
1    None
2    None
3    None
4    None
Name: location_extraction, dtype: object

In [8]:
# Derive a string "filename" key for each row from its confidence score rounded to 6 decimals.
# NOTE(review): rows with equal rounded confidence get the same key — confirm this is the
# intended identifier.
df['filename'] = df['confidence'].round(6).astype(str)
df_context['filename'] = df_context['confidence'].round(6).astype(str)

In [9]:
# Preview the matcher-based results (rows without an extracted location were dropped above)
df_context.head()

Unnamed: 0.1,Unnamed: 0,transcripts,confidence,location_extraction,filename
6,6,Peridot we close races here,0.825849,Peridot Place,0.825849
7,7,Library 612 to myself Street between 6th Avenu...,0.788517,"6th Street , Julia Street",0.788517


In [10]:
# Preview the baseline results, including the new location_extraction and filename columns
df.head()

Unnamed: 0.1,Unnamed: 0,transcripts,confidence,location_extraction,filename
0,0,I was just,0.502997,,0.502997
1,1,our brothers are critical for 1636 thank you,0.842904,,0.842904
2,2,is going to be,0.652573,,0.652573
3,3,to drive robson's plane set the driver knocked...,0.700766,,0.700766
4,4,Market 11:30 high was 31130 hi,0.803496,,0.803496


In [39]:
# Save them as csv
# NOTE(review): each f-string below interpolates the entire `filename` Series (its printed,
# multi-line representation), not a single value, so every call writes ONE csv whose name
# contains that text. Confirm the intended file name (a fixed name, or one file per row) and
# fix the path accordingly.
# NOTE(review): the first directory name ends with a space ("Long_and_Lat_result /"), and
# df_context is written to the "..._w_o_context" folder while df goes to the other one —
# this looks swapped; verify against the consumers of these files.
df.to_csv(f"../data/Data/Long_and_Lat_result /{df['filename']}.csv")
df_context.to_csv(f"../data/Data/Long_and_Lat_result_w_o_context/{df_context['filename']}.csv")

In [40]:
# Persist both result frames as pickles, one file per frame.
# NOTE(review): these paths use a file-name prefix "Pickle." directly under ../data, whereas the
# street list is read from the ../data/Pickles/ directory — confirm the intended location.
for frame, pickle_path in (
    (df, '../data/Pickle.lat_long.pkl'),
    (df_context, '../data/Pickle.lat_long_context.pkl'),
):
    with open(pickle_path, 'wb') as pickle_out:
        pickle.dump(frame, pickle_out)