# Named entity recognition

In [None]:
import spacy
import glob
import pandas as pd
import os
import matplotlib.pyplot as plt
import datetime
import re
from textblob import TextBlob
from spacy import displacy
nlp = spacy.load('en_core_web_sm')

In [None]:
# Collect the paths of all cleaned megafire tweet CSVs, ordered by path.
fire_csv_list = glob.glob('../../tweets/megafires/clean_csvs/*.csv')
fire_csv_list.sort()

In [None]:
def read_df_timestamps(csv_path):
    """Load a tweet CSV and parse its ``Timestamp`` column into datetimes.

    Rows whose ``Timestamp`` value does not end in ``"UTC"`` (missing
    values included) are dropped before parsing.

    Parameters
    ----------
    csv_path : str
        Path handed to ``pd.read_csv``. The file must contain a
        ``Timestamp`` column with values such as
        ``2013-06-30 12:00:00 UTC``.

    Returns
    -------
    pandas.DataFrame
        The rows that passed the filter, keeping their original index
        labels, with ``Timestamp`` converted by ``pd.to_datetime``.

    Notes
    -----
    A value that ends in ``"UTC"`` but does not match the expected format
    is not filtered out; ``pd.to_datetime`` is called with its default
    error handling for such values.
    """
    df = pd.read_csv(csv_path)
    # Drop invalid timestamps
    valid_ts = df.Timestamp.str[-3:] == "UTC"
    # Take an explicit copy: assigning a column on the filtered slice itself
    # is what triggers pandas' SettingWithCopyWarning.
    df = df.loc[valid_ts].copy()
    df['Timestamp'] = pd.to_datetime(df.Timestamp, format='%Y-%m-%d %H:%M:%S UTC')
    return df

# Matches either a single character that is not an ASCII letter, digit,
# space or tab, or a whole ``scheme://...`` token. Written as a raw string so
# the regex escapes are not parsed as (invalid) Python string escapes.
_URL_AND_SYMBOL_RE = re.compile(r"([^0-9A-Za-z \t])|(\w+:\/\/\S+)")


def remove_url(txt):
    """Strip URLs and non-alphanumeric characters from a text string.

    Besides removing ``scheme://...`` URLs, this deletes every character
    that is not an ASCII letter, a digit, a space or a tab (so punctuation,
    ``#``, ``@``, newlines and non-ASCII characters are removed as well).
    Remaining runs of whitespace are collapsed to single spaces and the
    result is trimmed.

    Parameters
    ----------
    txt : string
        A text string that you want to parse and remove urls from.

    Returns
    -------
    string
        The cleaned text, with words separated by single spaces.
    """
    without_noise = _URL_AND_SYMBOL_RE.sub("", txt)
    # split() with no argument drops leading/trailing whitespace and
    # collapses the gaps left behind by removed tokens.
    return " ".join(without_noise.split())

def remove_special_characters(df):
    """Remove retweet markers and ``#``/``@`` symbols from ``clean_text``.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame with a string column named ``clean_text``. The column is
        overwritten on this frame.

    Returns
    -------
    pandas.DataFrame
        The same ``df`` object that was passed in.
    """
    # regex=True is passed explicitly: since pandas 2.0 str.replace treats the
    # pattern as a literal string by default, which would leave the text as is.
    # Remove standalone "RT" in any letter case
    df['clean_text'] = df['clean_text'].str.replace(r'\b[Rr][Tt]\b', '', regex=True)
    # Remove the '#' and '@' symbols (the tag/handle text itself is kept)
    df['clean_text'] = df['clean_text'].str.replace(r'[#@]', '', regex=True)
    return df


def clean_tweet_text(df, lower_case=True):
    """Build a ``clean_text`` column from the ``Text`` column of ``df``.

    The text is lowercased first when ``lower_case`` is true, then passed
    through ``remove_url`` row by row and finally through
    ``remove_special_characters``. The new column is written onto ``df``.
    """
    source_text = df['Text'].str.lower() if lower_case else df['Text']
    df['clean_text'] = source_text.apply(remove_url)
    return remove_special_characters(df)

# Basic NER testing with spaCy

In [None]:
# Load one fire's tweets and clean them with the original letter case kept.
test_df = clean_tweet_text(
    read_df_timestamps('/home/tweets/megafires/clean_csvs/2013-yarnell-hill-az.csv'),
    lower_case=False,
)

In [None]:
# Run the spaCy pipeline over a 20-tweet sample and render the entity
# highlights; tweets with no detected entities are skipped.
for t in test_df.clean_text.values[1000:1020]:
    doc = nlp(t)
    if not doc.ents:
        continue
    svg = displacy.render(doc, style='ent', jupyter=True)