In [None]:
"""
MVP approach of text data cleaning (basic)

- all lower case
- remove punctuation
- remove numerical values (?) #isalpha()
- remove common non-sensical text
- tokenize text
- remove stop words

"""

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt


import re
import string

from sklearn.feature_extraction import text 
from sklearn.feature_extraction.text import CountVectorizer

### 1. Load data

In [2]:
speech_df = pd.read_pickle('../dump/speech_df')

In [3]:
speech_df

Unnamed: 0,speaker,year,transcript,length
0,SIDDHARTHA MUKHERJEE,2018,I wish someone had told me at my own commenc...,14487
1,ABBY WAMBACK,2018,"Failure is not something to be ashamed of, ...",15866
2,JON B. FISHER,2018,\r\n Commencement Speech Transcript ...,8544
3,MINDY KALING,2018,"Good morning to the Class of 2018, the facul...",15391
4,JESMYN WARD,2018,Persist. Be patient. Be well. Good morning....,14063
...,...,...,...,...
436,CARRIE CHAPMAN,1936,I bring a message to Sweet Briar College and e...,22942
437,FRANKLIN D ROOSEVELT,1932,"For me, as for you, this is a day of honorable...",16017
438,OPRAH WINFREY,1918,Thank you Wallis Annenberg and a special thank...,15301
439,RALPH WALDO,1838,"In this refulgent summer, it has been a luxury...",40403


In [4]:
# Some transcripts are only excerpts
# But I'm going to let them go for now
# speech_df[speech_df.transcript.str.contains('full transcript')==True]

### 2. First round of text cleaning

In [5]:
def clean_text_1(text):
    text = text.lower()
    text = re.sub('commencement speech transcript','',text)
    text = re.sub('\[.*?\]','',text)
    text = re.sub('[%s]' % re.escape(string.punctuation),'',text)
    text = re.sub('\w*\d\w*','',text)
    text = re.sub('[-]','',text)
    text = re.sub('[–’“”…]','',text)
    text = re.sub('\xa0','',text)
    text = re.sub('\n','',text)
    text = re.sub('\r','',text)
    return text

In [6]:
speech_clean = speech_df.copy()
speech_clean['transcript'] = speech_clean['transcript'].apply(lambda x: clean_text_1(x))

In [7]:
speech_clean

Unnamed: 0,speaker,year,transcript,length
0,SIDDHARTHA MUKHERJEE,2018,i wish someone had told me at my own commenc...,14487
1,ABBY WAMBACK,2018,failure is not something to be ashamed of it...,15866
2,JON B. FISHER,2018,thank you very much my fa...,8544
3,MINDY KALING,2018,good morning to the class of the faculty the...,15391
4,JESMYN WARD,2018,persist be patient be well good morning it i...,14063
...,...,...,...,...
436,CARRIE CHAPMAN,1936,i bring a message to sweet briar college and e...,22942
437,FRANKLIN D ROOSEVELT,1932,for me as for you this is a day of honorable a...,16017
438,OPRAH WINFREY,1918,thank you wallis annenberg and a special thank...,15301
439,RALPH WALDO,1838,in this refulgent summer it has been a luxury ...,40403


In [8]:
# Pickle
speech_clean.to_pickle('../dump/speech_clean')

In [9]:
text = speech_clean.transcript.loc[5]
text

'whats up dragons alright lets startwith how grateful i ami met john frye at an eagles game beforewe were the super bowl champs what athing trachsel as a community i want tocheck thank the trustees and i and iunderstand theres like  to thousand people here i want to thank allof you for coming here didnt let me beyour speaker today its quite an honoractually came here guys to ask you aquestion how many of you gonna changethe world now ideally all of you wouldhave stood up and would have yelled andscreamed and cheered and chilled thisdamn stadium for those of you dont knowme im a filmmaker i was born in indiaand i spent my whole life here inphiladelphia and im gonna tell you twoversions of my life so far both equallytrue this is an attempt to be real for asecond with each other this is gonna bea really unusual like commencementspeech well he was the first versionwent to nyu film school and twoscholarships graduated in a little overthree years made my first film at making me one of the yo

In [10]:
# text

### 3. Document-Term Matrix

In [11]:
from sklearn.feature_extraction.text import CountVectorizer

cv = CountVectorizer(stop_words='english')
data_cv = cv.fit_transform(speech_clean.transcript)

In [12]:
data_dtm = pd.DataFrame(data_cv.toarray(), columns=cv.get_feature_names())
data_dtm.index = speech_clean.index
data_dtm = data_dtm.iloc[:,:-1]
data_dtm

Unnamed: 0,aa,aahhhh,aaron,aback,abalthus,abandon,abandoned,abandoning,abandonment,abandons,...,ôi,ômay,ôsobriety,ôtell,ôthe,ôwe,ôwhat,ôyou,ôyouõre,über
0,0,0,0,0,0,1,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
436,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
437,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
438,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
439,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [13]:
# Pickle
data_dtm.to_pickle("../dump/data_dtm.pkl")