In [None]:
"""
MVP approach of text data cleaning

- all lower case
- remove punctuation
- remove numerical values (?)
- remove common non-sensical text
- tokenize text
- remove stop words


After tokenization:

- stemming/lemmatization
- parts of speech tagging
- create bi-grams or tri-grams
- handle typos

"""

In [1]:
import pandas as pd
import numpy as np

import re
import string

from sklearn.feature_extraction.text import CountVectorizer

### 1. Load data

In [18]:
speech_df = pd.read_pickle('../dump/speech_df')

In [19]:
speech_df

Unnamed: 0,speaker,year,transcript
0,SIDDHARTHA MUKHERJEE,2018,I wish someone had told me at my own commenc...
1,JESMYN WARD,2018,Persist. Be patient. Be well. Good morning....
2,DAVID SEDARIS,2018,"Thank you so much for having me, and for prese..."
3,OPRAH WINFREY,2018,Thank you Wallis Annenberg and a special thank...
4,NIGHT SHYAMALAN,2018,\nwhat's up dragons alright let's start\n\nwit...
...,...,...,...
466,WILLIAM ALLEN,1936,About all that a commencement orator can do fo...
467,CARRIE CHAPMAN,1936,I bring a message to Sweet Briar College and e...
468,FRANKLIN D ROOSEVELT,1932,"For me, as for you, this is a day of honorable..."
469,RALPH WALDO,1838,"In this refulgent summer, it has been a luxury..."


In [20]:
# Some transcripts are only excerpts
# But I'm going to let them go for now
# speech_df[speech_df.transcript.str.contains('full transcript')==True]

### 2. First round of text cleaning

In [21]:
def clean_text_1(text):
    text = text.lower()
    text = re.sub('commencement speech transcript','',text)
    text = re.sub('\[.*?\]','',text)
    text = re.sub('[%s]' % re.escape(string.punctuation),'',text)
    text = re.sub('\w*\d\w*','',text)
    text = re.sub('[-]','',text)
    text = re.sub('[–’“”…]','',text)
    text = re.sub('\xa0','',text)
    text = re.sub('\n','',text)
    text = re.sub('\r','',text)
    return text

In [22]:
speech_clean = speech_df.copy()
speech_clean['transcript'] = speech_clean['transcript'].apply(lambda x: clean_text_1(x))

In [23]:
speech_clean

Unnamed: 0,speaker,year,transcript
0,SIDDHARTHA MUKHERJEE,2018,i wish someone had told me at my own commenc...
1,JESMYN WARD,2018,persist be patient be well good morning it i...
2,DAVID SEDARIS,2018,thank you so much for having me and for presen...
3,OPRAH WINFREY,2018,thank you wallis annenberg and a special thank...
4,NIGHT SHYAMALAN,2018,whats up dragons alright lets startwith how gr...
...,...,...,...
466,WILLIAM ALLEN,1936,about all that a commencement orator can do fo...
467,CARRIE CHAPMAN,1936,i bring a message to sweet briar college and e...
468,FRANKLIN D ROOSEVELT,1932,for me as for you this is a day of honorable a...
469,RALPH WALDO,1838,in this refulgent summer it has been a luxury ...


In [38]:
# Pickle
speech_clean.to_pickle('../dump/speech_clean')

In [34]:
text = speech_clean.transcript.loc[5]
text



In [25]:
# text

### 3. Document-Term Matrix

In [35]:
from sklearn.feature_extraction.text import CountVectorizer

cv = CountVectorizer(stop_words='english')
data_cv = cv.fit_transform(speech_clean.transcript)

In [36]:
data_dtm = pd.DataFrame(data_cv.toarray(), columns=cv.get_feature_names())
data_dtm.index = speech_clean.index
data_dtm = data_dtm.iloc[:,:-1]
data_dtm

Unnamed: 0,aa,aahhhh,aaron,aback,abalthus,abandon,abandoned,abandoning,abandonment,abandons,...,ôi,ômay,ôsobriety,ôtell,ôthe,ôwe,ôwhat,ôyou,ôyouõre,über
0,0,0,0,0,0,1,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,0,0,0,0,1,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
466,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
467,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
468,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
469,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [37]:
# Pickle
data_dtm.to_pickle("../dump/data_dtm.pkl")

In [13]:
data_dtm[data_dtm['über']==1]

Unnamed: 0,aa,aahhhh,aaron,aback,abalthus,abandon,abandoned,abandoning,abandonment,abandons,...,ôi,ômay,ôsobriety,ôtell,ôthe,ôwe,ôwhat,ôyou,ôyouõre,über
198,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,1


In [14]:
speech_df.transcript.loc[198]

"TRANSCRIPT\n\nThank you President Schlesinger. Your leadership and vision has kept Babson at the very top of its game and I\x92m honored to be here today. Congratulations graduates! You have earned something special and your journey is just beginning.\n \n    When I was a little kid, I told my mom that when I grew up I was going to Babson to become a businessman. Now, quite unbelievably, I stand here with you today. The road I took to get here was not Forest Street. It was my own unique path.\n \n    Today, I\x92d like to share four stories about opportunity, creativity, failure, and empathy. These four stories have companion insights that have colored the way I view business, happiness, and my own definition of success.\n \nMy First Story Is About Opportunity \n \n    From age six to ten, my mom had placed me in a program called Boy Rangers. It was a predecessor to Boy Scouts\x97we were modeled after Native American Tribes. We made feathered headdresses, we paid wampum. In order to a

In [17]:
speech_df.loc[5:6]

Unnamed: 0,speaker,year,transcript
5,NIGHT SHYAMALAN,2018,\n\n A person who concentrates on what they ...
6,OPRAH WINFREY,2018,Thank you Wallis Annenberg and a special thank...


### 4. Second round of text cleaning

#### A. Stemming

#### B. n-grams