# Cleaning and processing speeches
I'll be doing some text cleaning following the guidelines in [this](https://github.com/adashofdata/nlp-in-python-tutorial/blob/master/1-Data-Cleaning.ipynb) notebook. Then spaCy will be used to tokenize the text, find lemmatizations, etc.

In [2]:
from pymongo import MongoClient

import os

import re
import string

from pprint import pprint

from collections import defaultdict

import pickle as pkl

import numpy as np

import pandas as pd

import spacy

### Get data from mongo
Just load the data if it is already saved

In [3]:
# Set to True to read the previously pickled frame from speeches_df.pkl.
load_speeches_df = False
if load_speeches_df:
    with open("speeches_df.pkl", "rb") as pkl_file:
        speeches_df = pkl.load(pkl_file)

In [4]:
# Connection settings are read from the environment so that no secret is stored
# in the notebook. Host, user and auth database fall back to the values this
# notebook was written against; the password deliberately has no fallback.
config = {
    'host': os.environ.get('MONGO_HOST', '13.56.124.215:27017'),
    'username': os.environ.get('MONGO_USERNAME', 'fisher'),
    'password': os.environ['MONGO_PASSWORD'],  # raises KeyError if the variable is unset
    'authSource': os.environ.get('MONGO_AUTH_SOURCE', 'speeches'),
}

client = MongoClient(**config)
db = client.speeches

db.list_collection_names() # check the connection

['speeches']

In [5]:
# db.speeches.find_one({"speaker": "trump"})["content"]

In [6]:
# Pull every speech document and collect the three fields we need column-wise.
cursor = db.speeches.find()
speeches_dict = defaultdict(list)
for record in cursor:
    for field in ("speaker", "date", "content"):
        speeches_dict[field].append(record[field])

In [7]:
# Build the frame from the collected columns and order it by date, oldest first.
speeches_df = pd.DataFrame(speeches_dict).sort_values(by="date", ascending=True)

In [8]:
# Set to True to pickle the raw speeches frame to speeches_df.pkl.
save_speeches_df = False
if save_speeches_df:
    with open("speeches_df.pkl", "wb") as pkl_file:
        pkl.dump(speeches_df, pkl_file)

### Cleaning and processing with spaCy
Just load the dataframe if it has already been created

In [9]:
# Set to True to read the processed frame from spacy_speeches_df.pkl.
# The file is large, so loading takes a while.
load_spacy_speeches_df = False
if load_spacy_speeches_df:
    with open("spacy_speeches_df.pkl", "rb") as pkl_file:
        speeches_df = pkl.load(pkl_file)

In [10]:
def clean_text(text):
    '''
    Normalise a raw speech into lowercase text without punctuation.

    Steps, applied in order: lowercase; drop anything inside < > or [ ];
    drop ASCII punctuation; drop words that contain a digit; drop curly
    quotes and ellipsis characters; turn newlines into spaces; collapse
    runs of two or more whitespace characters into a single space.
    '''
    substitutions = (
        (r'<.*?>', ''),  # text within < >
        (r'\[.*?\]', ''),  # text within [ ]
        (r'[%s]' % re.escape(string.punctuation), ''),  # punctuation
        (r'\w*\d\w*', ''),  # words with numbers
        (r'[‘’“”…]', ''),  # typographic quotes and ellipsis
        (r'\n', ' '),
        (r'\s{2,}', ' '),
    )
    cleaned = text.lower()
    for pattern, replacement in substitutions:
        cleaned = re.sub(pattern, replacement, cleaned)
    return cleaned

In [11]:
# Replace the "content" column with its cleaned version; the raw text is
# overwritten in the frame.
speeches_df["content"] = speeches_df["content"].apply(clean_text)

In [12]:
# load a language model for spaCy to use
# (the small English pipeline; presumably it must already be installed in this
# environment -- TODO confirm setup instructions)
nlp = spacy.load('en_core_web_sm')

In [13]:
# Run spaCy over the speeches in slices: calling nlp.pipe on all of the
# documents at once would not finish, but doing it in chunks does.
N_CHUNKS = 29  # number of slices to process

# Select the text column by name rather than by position.
speech_texts = speeches_df["content"].tolist()

# Slice boundaries follow the actual number of speeches instead of a fixed row
# count, so every document is processed whatever the size of the collection.
chunk_bounds = np.linspace(0, len(speech_texts), N_CHUNKS + 1).astype(int).tolist()

spacy_docs = []
for last_idx, idx_int in zip(chunk_bounds[:-1], chunk_bounds[1:]):
    if last_idx == idx_int:
        continue  # fewer documents than chunks: nothing to do for this slice
    print("chunking docs {} to {}".format(last_idx, idx_int))
    spacy_docs.extend(nlp.pipe(speech_texts[last_idx:idx_int],
                               disable=["tagger", "parser", "textcat"]))

speeches_df["spacy_doc"] = spacy_docs

chunking docs 0 to 0
chunking docs 0 to 36
chunking docs 36 to 72
chunking docs 72 to 108
chunking docs 108 to 144
chunking docs 144 to 180
chunking docs 180 to 216
chunking docs 216 to 252
chunking docs 252 to 288
chunking docs 288 to 324
chunking docs 324 to 360
chunking docs 360 to 396
chunking docs 396 to 432
chunking docs 432 to 468
chunking docs 468 to 504
chunking docs 504 to 540
chunking docs 540 to 576
chunking docs 576 to 612
chunking docs 612 to 648
chunking docs 648 to 684
chunking docs 684 to 720
chunking docs 720 to 756
chunking docs 756 to 792
chunking docs 792 to 828
chunking docs 828 to 864
chunking docs 864 to 900
chunking docs 900 to 936
chunking docs 936 to 972
chunking docs 972 to 1008
chunking docs 1008 to 1044


spaCy has already computed the lemma of each token; we only need to build new text from those lemmas.

We will also remove all named entities from the text at this point.

In [14]:
def spacy_lemmatize_remove_ents(doc):
    '''
    Return the lemmas of a spaCy document as one space-separated string,
    leaving out every token that is part of a named entity.

    Entity membership is read from each token's own ``ent_type_`` (empty when
    the token is outside any entity). Comparing token text against the text of
    whole entities would miss multi-token entities such as "united states" and
    would also drop ordinary words that happen to share their text with a
    single-token entity elsewhere in the document.

    doc: an iterable of spaCy tokens (normally a ``Doc`` produced with the
         NER component enabled).
    Returns: str of lemmas joined by single spaces ("" for an empty document).
    '''
    return " ".join(token.lemma_ for token in doc if not token.ent_type_)

In [15]:
# Lemmatize every parsed speech and strip its named entities, then store the
# resulting strings as a new column.
lemmatized_no_ents = [
    spacy_lemmatize_remove_ents(parsed_speech)
    for parsed_speech in speeches_df["spacy_doc"].values
]
speeches_df["lemmatized_no_ents"] = lemmatized_no_ents

In [16]:
# one of President Arthur's speeches was in the wrong century, so its date is
# set by hand here.
# NOTE(review): 765 is a row *position* in the date-sorted frame, so it only
# points at that speech while the collection contents are unchanged -- confirm
# before re-running on new data. The frame is not re-sorted after the fix.
ARTHUR_SPEECH_POS = 765
date_col_pos = speeches_df.columns.get_loc("date")  # look the column up by name, not by position
speeches_df.iloc[ARTHUR_SPEECH_POS, date_col_pos] = np.datetime64("1881-12-06")

In [17]:
# Persist the processed frame to spacy_speeches_df.pkl.
save_spacy_speeches_df = True
if save_spacy_speeches_df:
    with open("spacy_speeches_df.pkl", "wb") as pkl_file:
        # don't save the spacy docs column
        frame_without_docs = speeches_df.drop(columns="spacy_doc")
        pkl.dump(frame_without_docs, pkl_file)