In [None]:
"""
MVP approach of text data cleaning (basic)

- all lower case
- remove punctuation
- remove numerical values (?) #isalpha()
- remove common non-sensical text
- tokenize text
- remove stop words

"""

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt


import re
import string

from sklearn.feature_extraction import text 
from sklearn.feature_extraction.text import CountVectorizer

### 1. Load data

In [2]:
speech_df = pd.read_pickle('../dump/speech_df')

In [3]:
speech_df

Unnamed: 0,speaker,year,school,transcript,length
0,SIDDHARTHA MUKHERJEE,2018,Univ. of Southern California,I wish someone had told me at my own commenc...,14487
1,JESMYN WARD,2018,Tulane University,Persist. Be patient. Be well. Good morning....,14063
2,DAVID SEDARIS,2018,OBERLIN COLLEGE,"Thank you so much for having me, and for prese...",10587
3,NIGHT SHYAMALAN,2018,DREXEL UNIVERSITY,\nwhat's up dragons alright let's start\n\nwit...,26406
4,TERRY TEACHOUT,2018,HAMILTON HOLT SCHOOL,"I’m supposed to keep it short, and I approve o...",6288
...,...,...,...,...,...
436,WILLIAM ALLEN,1936,WHITE NORTHWESTERN UNIVERSITY,About all that a commencement orator can do fo...,14953
437,FRANKLIN D ROOSEVELT,1932,OGLETHORPE UNIVERSITY,"For me, as for you, this is a day of honorable...",16017
438,OPRAH WINFREY,1918,USC,Thank you Wallis Annenberg and a special thank...,15301
439,RALPH WALDO,1838,EMERSON HARVARD UNIVERSITY,"In this refulgent summer, it has been a luxury...",40403


In [4]:
# Some transcripts are only excerpts
# But I'm going to let them go for now
# speech_df[speech_df.transcript.str.contains('full transcript')==True]

### 2. First round of text cleaning

In [4]:
def clean_text_1(text):
    text = text.lower()
    text = re.sub('commencement speech transcript','',text)
    text = re.sub('\[.*?\]','',text)
    text = re.sub('[%s]' % re.escape(string.punctuation),'',text)
    text = re.sub('\w*\d\w*','',text)
    text = re.sub('[-]','',text)
    text = re.sub('[–’“”…]','',text)
    text = re.sub('\xa0','',text)
    text = re.sub('\n','',text)
    text = re.sub('\r','',text)
    return text

In [5]:
speech_clean = speech_df.copy()
speech_clean['transcript'] = speech_clean['transcript'].apply(lambda x: clean_text_1(x))

In [7]:
speech_clean.head()

Unnamed: 0,speaker,year,school,transcript,length
0,SIDDHARTHA MUKHERJEE,2018,Univ. of Southern California,i wish someone had told me at my own commenc...,14487
1,JESMYN WARD,2018,Tulane University,persist be patient be well good morning it i...,14063
2,DAVID SEDARIS,2018,OBERLIN COLLEGE,thank you so much for having me and for presen...,10587
3,NIGHT SHYAMALAN,2018,DREXEL UNIVERSITY,whats up dragons alright lets startwith how gr...,26406
4,TERRY TEACHOUT,2018,HAMILTON HOLT SCHOOL,im supposed to keep it short and i approve of ...,6288


In [8]:
# Pickle
speech_clean.to_pickle('../dump/speech_clean')

In [8]:
text = speech_clean.transcript.loc[5]

In [10]:
# text

### 3. Document-Term Matrix

In [11]:
from sklearn.feature_extraction.text import CountVectorizer

cv = CountVectorizer(stop_words='english')
data_cv = cv.fit_transform(speech_clean.transcript)

In [12]:
data_dtm = pd.DataFrame(data_cv.toarray(), columns=cv.get_feature_names())
data_dtm.index = speech_clean.index
data_dtm = data_dtm.iloc[:,:-1]
data_dtm

Unnamed: 0,aa,aahhhh,aaron,aback,abalthus,abandon,abandoned,abandoning,abandonment,abandons,...,ôi,ômay,ôsobriety,ôtell,ôthe,ôwe,ôwhat,ôyou,ôyouõre,über
0,0,0,0,0,0,1,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
436,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
437,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
438,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
439,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [13]:
# Pickle
data_dtm.to_pickle("../dump/data_dtm.pkl")