# Importing Libraries

In [145]:
import pandas as pd
import re
import string
import unicodedata
import nltk
import spacy
from nltk.tokenize.toktok import ToktokTokenizer

# Load spaCy's small English pipeline by its full package name. The 'en'
# shortcut link was removed in spaCy v3, and spacy.load() does not accept the
# old parse/tag/entity keyword arguments there. The package name also works on
# spaCy v2 once the model is installed:
#     python -m spacy download en_core_web_sm
nlp = spacy.load('en_core_web_sm')
tokenizer = ToktokTokenizer()

# NLTK's English stopword list (needs the 'stopwords' corpus to be downloaded).
# 'no' and 'not' are taken out of the list so they survive stopword removal --
# presumably to keep negation in the cleaned text.
stopword_list = nltk.corpus.stopwords.words('english')
stopword_list.remove('no')
stopword_list.remove('not')

# Show up to 100 characters per cell when a transcript column is displayed.
# The fully qualified option name avoids relying on pandas' partial matching.
pd.set_option('display.max_colwidth', 100)

import warnings
warnings.filterwarnings('ignore')

ModuleNotFoundError: No module named 'importlib_metadata'

# Importing Datasets

In [66]:
# Load the transcripts of the first and second 2020 presidential debates.
# Both paths are relative, so the "datasets" folder must sit in the working directory.
first_debate = pd.read_csv("datasets/us_election_2020_1st_presidential_debate.csv")
second_debate = pd.read_csv("datasets/us_election_2020_2nd_presidential_debate.csv")

In [67]:
first_debate.head()

Unnamed: 0,speaker,minute,text
0,Chris Wallace,01:20,Good evening from the Health Education Campus of Case Western Reserve University and the Clevela...
1,Chris Wallace,02:10,This debate is being conducted under health and safety protocols designed by the Cleveland Clini...
2,Vice President Joe Biden,02:49,"How you doing, man?"
3,President Donald J. Trump,02:51,How are you doing?
4,Vice President Joe Biden,02:51,I’m well.


In [68]:
first_debate.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 789 entries, 0 to 788
Data columns (total 3 columns):
 #   Column   Non-Null Count  Dtype 
---  ------   --------------  ----- 
 0   speaker  789 non-null    object
 1   minute   788 non-null    object
 2   text     789 non-null    object
dtypes: object(3)
memory usage: 18.6+ KB


In [69]:
second_debate.head()

Unnamed: 0,speaker,minute,text
0,Kristen Welker,00:18,"Good evening, everyone. Good evening. Thank you so much for being here. It is such an honor for ..."
1,Donald Trump,07:37,How are you doing? How are you?
2,Kristen Welker,07:58,And I do want to say a very good evening to both of you. This debate will cover six major topics...
3,Kristen Welker,08:27,The goal is for you to hear each other and for the American people to hear every word of what yo...
4,Kristen Welker,09:03,… during this next stage of the coronavirus crisis. Two minutes uninterrupted.


In [70]:
second_debate.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 512 entries, 0 to 511
Data columns (total 3 columns):
 #   Column   Non-Null Count  Dtype 
---  ------   --------------  ----- 
 0   speaker  512 non-null    object
 1   minute   512 non-null    object
 2   text     512 non-null    object
dtypes: object(3)
memory usage: 12.1+ KB


### Speaker dataframe

In [71]:
#Let's check unique speakers in first debate
first_debate["speaker"].unique()

array(['Chris Wallace', 'Vice President Joe Biden',
       'President Donald J. Trump', 'Chris Wallace:'], dtype=object)

As the list of unique speakers shows, the moderator appears under two different names in the dataframe. Both refer to the same person, but one of them is mistyped (it has a trailing colon), so I will rename it to the correct spelling.

In [72]:
# Fold the mistyped label "Chris Wallace:" into "Chris Wallace" so the
# moderator is listed under a single name.
first_debate["speaker"] = first_debate["speaker"].replace(
    to_replace="Chris Wallace:", value="Chris Wallace"
)

In [73]:
# Build the first-debate corpus: every utterance of a speaker is joined with
# ", " into one transcript string, giving a one-row dataframe per speaker.
dict_dt = {
    'transcript': ', '.join(
        first_debate.loc[first_debate["speaker"] == "President Donald J. Trump", "text"]
    )
}
dt_df = pd.DataFrame(dict_dt, index=["Donald Trump"])

dict_cw = {
    'transcript': ', '.join(
        first_debate.loc[first_debate["speaker"] == "Chris Wallace", "text"]
    )
}
cw_df = pd.DataFrame(dict_cw, index=["Chris Wallace"])

dict_jb = {
    'transcript': ', '.join(
        first_debate.loc[first_debate["speaker"] == "Vice President Joe Biden", "text"]
    )
}
jb_df = pd.DataFrame(dict_jb, index=["Joe Biden"])

# Stack the three single-row frames; the index holds the speaker name.
first_data = pd.concat([dt_df, cw_df, jb_df])
first_data.head()

Unnamed: 0,transcript
Donald Trump,"How are you doing?, Thank you very much, Chris. I will tell you very simply. We won the election..."
Chris Wallace,Good evening from the Health Education Campus of Case Western Reserve University and the Clevela...
Joe Biden,"How you doing, man?, I’m well., Well, first of all, thank you for doing this and looking forward..."


In [74]:
# Alternatively, keep the utterances row by row in a separate dataframe for
# each speaker of the first debate.
first_CW = first_debate.loc[first_debate["speaker"] == "Chris Wallace"]
first_DT = first_debate.loc[first_debate["speaker"] == "President Donald J. Trump"]
first_JB = first_debate.loc[first_debate["speaker"] == "Vice President Joe Biden"]

In [75]:
#Let's check unique speakers in second debate
second_debate["speaker"].unique()

array(['Kristen Welker', 'Donald Trump', 'Joe Biden'], dtype=object)

In [76]:
# Build the second-debate corpus: every utterance of a speaker is joined with
# ", " into one transcript string, giving a one-row dataframe per speaker.
dt_dict = {
    'transcript': ', '.join(
        second_debate.loc[second_debate["speaker"] == "Donald Trump", "text"]
    )
}
df_dt = pd.DataFrame(dt_dict, index=["Donald Trump"])

kw_dict = {
    'transcript': ', '.join(
        second_debate.loc[second_debate["speaker"] == "Kristen Welker", "text"]
    )
}
df_kw = pd.DataFrame(kw_dict, index=["Kristen Welker"])

jb_dict = {
    'transcript': ', '.join(
        second_debate.loc[second_debate["speaker"] == "Joe Biden", "text"]
    )
}
df_jb = pd.DataFrame(jb_dict, index=["Joe Biden"])

# Stack the three single-row frames; the index holds the speaker name.
second_data = pd.concat([df_dt, df_kw, df_jb])
second_data.head()

Unnamed: 0,transcript
Donald Trump,"How are you doing? How are you?, So as you know, 2.2 million people modeled out, were expected t..."
Kristen Welker,"Good evening, everyone. Good evening. Thank you so much for being here. It is such an honor for ..."
Joe Biden,"220,000 Americans dead. You hear nothing else I say tonight, hear this. Anyone who is responsibl..."


# Cleaning the Data

### 1- Expanding Contractions

In [90]:
# Contraction -> expansion, keyed with a straight apostrophe. The compiled
# pattern below accepts either apostrophe style, so "I'm" and "I’m" both match.
_CONTRACTIONS = {
    "that's": "that is",
    "there's": "there is",
    "here's": "here is",
    "what's": "what is",
    "where's": "where is",
    "who's": "who is",
    "i'm": "i am",
    "it's": "it is",
    "she's": "she is",
    "he's": "he is",
    "they're": "they are",
    "we're": "we are",
    "you're": "you are",
    "who're": "who are",
    "i'll": "i will",
    "you'll": "you will",
    "we'll": "we will",
    "didn't": "did not",
    "doesn't": "does not",
    "aren't": "are not",
    "don't": "do not",
    "i've": "i have",
    "you've": "you have",
    "we've": "we have",
    "they've": "they have",
    "ain't": "am not",
    "wouldn't": "would not",
    "shouldn't": "should not",
    "can't": "can not",
    "couldn't": "could not",
    "won't": "will not",
}

# A single alternation anchored on word boundaries: a contraction is only
# expanded when it is a whole word, so a possessive such as "benefit’s" is not
# rewritten to "benefit is" just because it ends in "it’s".
_CONTRACTION_RE = re.compile(
    r"\b(?:%s)\b" % "|".join(
        key.replace("'", "['’]")
        for key in sorted(_CONTRACTIONS, key=len, reverse=True)
    )
)


def expand_contractions(text):
    """Lowercase ``text`` and expand common English contractions.

    Contractions written with a straight (') or a typographic (’) apostrophe
    are both recognised. Only whole-word matches are expanded; anything that
    is not in the contraction table is left untouched.

    Parameters
    ----------
    text : str
        Raw transcript text.

    Returns
    -------
    str
        The lowercased text with known contractions replaced by their
        expanded form (e.g. "I’m" -> "i am", "won't" -> "will not").
    """
    text = text.lower()

    def _expand(match):
        # Normalise the apostrophe so the lookup key matches the table.
        return _CONTRACTIONS[match.group(0).replace("’", "'")]

    return _CONTRACTION_RE.sub(_expand, text)

### 2- Removing Special Characters

In [95]:
def text_cleaner(text):
    """Normalise ``text`` to lowercase ASCII words separated by single spaces.

    Steps, in order:

    1. lowercase;
    2. drop anything inside square brackets (e.g. "[crosstalk]");
    3. drop typographic quotes and ellipses;
    4. fold accented characters to ASCII (NFKD), discarding what cannot be
       represented;
    5. drop ASCII punctuation;
    6. drop every token that contains a digit;
    7. drop stand-alone single letters;
    8. collapse runs of whitespace and trim both ends.

    Parameters
    ----------
    text : str
        Text to clean.

    Returns
    -------
    str
        The cleaned text, with no leading or trailing whitespace.
    """
    text = text.lower()

    # Bracketed annotations go first, while the brackets are still present.
    text = re.sub(r'\[.*?\]', '', text)

    # Typographic punctuation is not part of string.punctuation.
    text = re.sub('[‘’“”…]', '', text)

    # Fold to ASCII before the remaining rules so they all see plain ASCII:
    # NFKD can itself produce digits or punctuation (e.g. "½" -> "1⁄2").
    text = unicodedata.normalize('NFKD', text).encode('ascii', 'ignore').decode('utf-8', 'ignore')

    text = re.sub('[%s]' % re.escape(string.punctuation), '', text)
    text = re.sub(r'\w*\d\w*', '', text)

    # Lookarounds instead of consuming the surrounding whitespace, so adjacent
    # single letters ("a b c") and a letter at the very end are all removed.
    text = re.sub(r'(?<!\S)[a-z](?!\S)', '', text)

    # Whitespace is normalised last, after every removal that can leave gaps.
    text = re.sub(r'\s+', ' ', text).strip()

    return text

### 3- Stemming

In [106]:
def text_stemmer(text):
    """Return ``text`` with each whitespace-separated word replaced by its Porter stem.

    The words are re-joined with single spaces.
    """
    stemmer = nltk.porter.PorterStemmer()
    stems = []
    for token in text.split():
        stems.append(stemmer.stem(token))
    return ' '.join(stems)

### 4- Lemmatization

In [126]:
def text_lemmatizer(text):
    """Return ``text`` with each spaCy token replaced by its lemma.

    Tokens whose lemma is the '-PRON-' placeholder keep their original text.
    The resulting tokens are joined with single spaces.
    """
    doc = nlp(text)
    lemmas = []
    for token in doc:
        if token.lemma_ == '-PRON-':
            lemmas.append(token.text)
        else:
            lemmas.append(token.lemma_)
    return ' '.join(lemmas)

### 5- Removing Stopwords

In [127]:
def stopwords_remover(text):
    """Return ``text`` without the tokens listed in ``stopword_list``.

    The text is split with the module-level tokenizer, each token is stripped
    of surrounding whitespace, and a token is dropped when its lowercase form
    is in ``stopword_list``. The survivors are joined with single spaces.
    """
    kept_tokens = []
    for raw_token in tokenizer.tokenize(text):
        token = raw_token.strip()
        if token.lower() in stopword_list:
            continue
        kept_tokens.append(token)
    return ' '.join(kept_tokens)

### Building The Corpus

In [128]:
# Cleaning stages, applied to every transcript in this order.
# NOTE(review): stemming runs before lemmatization, so the lemmatizer receives
# stemmed tokens instead of dictionary words -- confirm both steps are wanted.
cleaning_steps = (
    expand_contractions,
    text_cleaner,
    text_stemmer,
    text_lemmatizer,
    stopwords_remover,
)

# The same pipeline is run on both debates. Each step maps a Series of strings
# to a Series of strings, so the result is assigned directly as a column
# without wrapping it in a DataFrame.
for debate_data in (first_data, second_data):
    cleaned = debate_data["transcript"]
    for step in cleaning_steps:
        cleaned = cleaned.apply(step)
    debate_data["clean_text"] = cleaned

NameError: name 'nlp' is not defined

In [None]:
first_data

In [None]:
second_data