# Data Cleaning and Processing

In [1]:
import pandas as pd
import numpy as np
import os
import re
import unicodedata
from nltk import word_tokenize
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from sklearn import preprocessing
import gender_guesser.detector as gender

## Load datasets
Load all 6 datasets (`true`, `mostly-true`, `half-true`, `barely-true`, `false`, `pants-fire`) and combine them into a single dataframe.

In [2]:
# Read in the per-label CSV files and combine them into a single dataframe
data_path = 'data/'
label_files = ['true.csv', 'mostly-true.csv', 'half-true.csv', 'barely-true.csv', 'false.csv', 'pants-fire.csv']

# os.listdir() returns entries in arbitrary order, so sort them to get a reproducible row order.
# Label files that are not present in data_path are skipped.
frames = [
    pd.read_csv(data_path + filename, dtype={'label': str}, na_values='unspecified')
    for filename in sorted(os.listdir(data_path))
    if filename in label_files
]

# DataFrame.append was removed in pandas 2.0; a single concat also avoids re-copying the
# accumulated frame once per file. Fall back to an empty frame when no label file was found.
df_quotes = pd.concat(frames, ignore_index=True) if frames else pd.DataFrame()

print(f'There are {df_quotes.shape[0]} rows and {df_quotes.shape[1]} columns.')
df_quotes.head()

There are 18052 rows and 8 columns.


Unnamed: 0,label,quote,context,author_id,author_name,date,categories,staff
0,barely-true,“Pennsylvania just banned alcohol sales.”,a Facebook post,viral-image,Viral image,"November 24, 2020","Facebook Fact-checks, Coronavirus",Ciara O'Rourke
1,barely-true,"“666,000 teachers have been laid off already s...",a virtual roundtable,joe-biden,Joe Biden,"November 18, 2020","Education, Coronavirus",Bill McCarthy
2,barely-true,“David Perdue says he'll do everything in his ...,an ad,jon-ossoff,Jon Ossoff,"November 17, 2020","Georgia, Negative Campaigning",Tom Kertscher
3,barely-true,Says “47 additional counties used the same sof...,a Facebook post,ted-nugent,Ted Nugent,"November 17, 2020","Elections, Facebook Fact-checks",Samantha Putterman
4,barely-true,"""Voter FRAUD exposed in Georgia. Over 2600 vot...",in a Live video,facebook-posts,Facebook posts,"November 16, 2020","Georgia, Elections, Facebook Fact-checks",Daniel Funke


## Standardize Variables from Quotes Dataset

### Deduplicate Entries
Ensure that there are no duplicated entries. For some reason, there are duplicated entries for certain quotes.

In [3]:
# Rows that repeat an earlier row in every column are dropped.
# dup_quotes keeps the dropped rows around for manual inspection.
is_exact_repeat = df_quotes.duplicated()
dup_quotes = df_quotes.loc[is_exact_repeat]

rows_before = df_quotes.shape[0]
print(f'Before: {rows_before} rows')
df_quotes = df_quotes.drop_duplicates()
rows_after = df_quotes.shape[0]
print(f'After: {rows_after} rows')

Before: 18052 rows
After: 18046 rows


### Create Binary True False Target Variable
Although it is great to have a more detailed classification of each quote by having as many as 6 different levels of *realness*, it would nevertheless be interesting to simply have a binary output of whether a quote is **true or false**.

In [4]:
# 'true' and 'mostly-true' count as true; the four remaining ratings count as false
truthful_labels = ['true', 'mostly-true']
is_truthful = df_quotes['label'].isin(truthful_labels)
df_quotes['label_binary'] = np.where(is_truthful, 'true', 'false')

### Label Target Variable Encoding

In [5]:
# Label encode the target variables.
# Each column gets its own encoder: re-fitting one shared encoder on `label_binary` discards the
# classes learned for `label`, so the six-level codes could no longer be decoded with
# inverse_transform(). The integer codes written to the dataframe are unchanged.
label_encoder = preprocessing.LabelEncoder()
df_quotes['label'] = label_encoder.fit_transform(df_quotes['label'])

label_binary_encoder = preprocessing.LabelEncoder()
df_quotes['label_binary'] = label_binary_encoder.fit_transform(df_quotes['label_binary'])

# Backward-compatible alias: `encoder` previously ended this cell fitted on `label_binary`
encoder = label_binary_encoder

### Clean and Normalize Context
There are more than **5000** unique strings in the `context` variable, which makes it difficult to use as features. As such, we attempt to clean up the entries by reducing the number of possible categories. For instance, contexts that contain the word *email* or *e-mail* would be classified as simply *email*. 

In [6]:
# Create new variable context_clean
df_quotes['context_clean'] = df_quotes['context']

# Triage of contexts based on words contained (ie. the television episode about climate change ---> TV).
# Maps a regex to the category that replaces a matching context; insertion order is kept as authored.
# The mapping never changes between passes, so it is built once instead of on every loop iteration.
# Two keys that were listed twice with the same value ('.*(^| )show.*' and '.*(^| )segment.*') now
# appear once, at their first position, which yields the same dict.
context_dict = {
    'commercial$': 'ad', 'advertisement': 'ad',
    '.*e-mail.*': 'email', '.*(^| )email.*': 'email',
    '.*op-ed.*': 'op-ed', '.*(^| )oped.*': 'op-ed',
    '.*facebook.*': 'facebook',
    'television': 'tv', '.*(^| )tv( |$).*': 'tv',
    '.*(^| )video( |$).*': 'video',
    '.*image.*': 'image',
    '.*twitter.*': 'tweet', '.*tweet.*': 'tweet',
    '.*(^| )reddit.*': 'reddit',
    '.*meet the press.*': 'mtp',
    '.*(^| )press( |$).*': 'press',
    '.*blog.*': 'blog',
    '.*radio.*': 'radio',
    '.*instagram.*': 'instagram',
    '.*nbc.*': 'msnbc',
    '.*fox news sunday.*': 'fns',
    '.*cnn.*': 'cnn',
    '.*abc.*': 'abc', '.*this week.*': 'abc',
    '.*fox.*': 'fox news',
    '.*cbs.*': 'cbs', '.*face the nation.*': 'cbs',
    '.*cpac.*': 'cpac',
    '.*the daily show.*': 'the daily show',
    '.*hbo.*': 'hbo',
    '.*senate floor speech.*': 'senate floor speech',
    '.*newspaper.*': 'newspaper',
    '.*senate floor.*': 'senate floor',
    '.*(^| )letter.*': 'letter',
    '.*newsletter.*': 'newsletter',
    '.*(^| )debate.*': 'debate',
    '.*(^| )flier.*': 'flier', '.*flyer.*': 'flier',
    '.*hearing.*': 'hearing',
    '.*mailer.*': 'mailer',
    '.*social media.*': 'social media',
    '.*petition.*': 'petition',
    '.*(^| )book.*': 'book',
    '.*(^| )ad.*': 'ad',
    '.*(^| )speech.*': 'speech',
    '.*interview.*': 'interview',
    '.*(^| )web.*': 'web',
    '.*article.*': 'article',
    '.*(^| )comment.*': 'comment',
    '.*(^| )story.*': 'story',
    '.*(^| )remarks.*': 'remarks',
    '.*conversation.*': 'conversation',
    '.*medium.*': 'medium',
    '.*statement.*': 'statement',
    '.*editorial.*': 'editorial',
    '.*meeting.*': 'meeting',
    '.*campaign.*': 'campaign',
    '.*(^| )news( |$).*': 'news',
    '.*column.*': 'column',
    '.*reporters.*': 'reporters',
    '.*online.*': 'online',
    '.*survey.*': 'survey',
    '.*study.*': 'study',
    '.*briefing.*': 'briefing',
    '.*questionnaire.*': 'survey',
    '.*internet.*': 'online',
    '.*rally.*': 'rally',
    '.*town hall.*': 'town hall',
    '.*brochure.*': 'brochure',
    '.*monologue.*': 'speech',
    '.*senate.*': 'senate',
    '.*presentation.*': 'presentation',
    '.*forum.*': 'forum',
    '.*opinion.*': 'opinion',
    '.*call.*': 'call',
    '.*congress.*': 'congress',
    '.*youtube.*': 'video',
    '.*telev.*': 'tv',
    '.*(^| )show.*': 'show',
    '.*testimony.*': 'testimony',
    '.*discussion.*': 'discussion',
    '.*(^| )text.*': 'text',
    '.*(^| )event.*': 'event',
    '.*(^| )episode.*': 'episode',
    '.*(^| )resolution.*': 'resolution',
    '.*(^| )pamphlet.*': 'brochure',
    '.*(^| )question.*': 'question',
    '.*(^| )document.*': 'document',
    '.*(^| )report.*': 'report',
    '.*(^| )appearance.*': 'appearance',
    '.*(^| )response.*': 'response',
    '.*(^| )protest.*': 'protest',
    '.*(^| )broadcast.*': 'broadcast',
    '.*(^| )proposal.*': 'proposal',
    '.*(^| )essay.*': 'essay',
    '.*(^| )sign.*': 'sign',
    '.*(^| )summit.*': 'summit',
    '.*(^| )conference.*': 'conference',
    '.*(^| )trial.*': 'trial',
    '.*(^| )session.*': 'session',
    '.*(^| )snapchat.*': 'snapchat',
    '.*(^| )paper.*': 'paper',
    '.*(^| )graphic.*': 'graphic',
    '.*(^| )magazine.*': 'magazine',
    '.*(^| )convention.*': 'convention',
    '.*(^| )memo.*': 'memo',
    '.*(^| )roundtable.*': 'roundtable',
    '.*(^| )chat.*': 'conversation',
    '.*(^| )rebuttal.*': 'rebuttal',
    '.*(^| )message.*': 'message',
    '.*(^| )guide.*': 'guide',
    '.*(^| )meme.*': 'meme',
    '.*(^| )plan.*': 'plan',
    '.*(^| )fair.*': 'conference',
    '.*(^| )sketch.*': 'sketch',
    '.*(^| )screenshot.*': 'screenshot',
    '.*(^| )attachment.*': 'attachment',
    '.*(^| )infographic.*': 'graphic',
    '.*(^| )widget.*': 'widget',
    '.*(^| )bill.*': 'bill',
    '.*(^| )townhall.*': 'town hall',
    '.*(^| )comic.*': 'comic',
    '.*(^| )poll.*': 'poll',
    '.*(^| )talk.*': 'presentation',
    '.*(^| )ordinance.*': 'ordinance',
    '.*(^| )decision.*': 'decision',
    '.*(^| )meetup.*': 'meetup',
    '.*(^| )direct-mail.*': 'mail',
    '.*(^| )teleconference.*': 'conference',
    '.*(^| )gathering.*': 'gathering',
    '.*(^| )mailing.*': 'mail',
    '.*(^| )fundraiser.*': 'event',
    '.*(^| )post.*': 'post',
    '.*(^| )segment.*': 'segment',
    '.*(^| )cartoon.*': 'comic',
    '.*(^| )brief.*': 'briefing',
    '.*(^| )announcement.*': 'announcement',
    '.*(^| )luncheon.*': 'gathering',
    '.*(^| )panel.*': 'panel',
    '.*(^| )emai.*': 'email',
    '.*(^| )speaking.*': 'presentation',
    '.*(^| )argument.*': 'argument',
    '.*(^| )visit.*': 'visit',
    '.*(^| )media.*': 'media',
    '.*(^| )site.*': 'web',
    '.*(^| )remaks.*': 'remarks',
    '.*(^| )announcing.*': 'announcement',
    '.*(^| )mail.*': 'mail',
    '.*(^| )stories.*': 'story',
    '.*(^| )photo.*': 'image',
    '.*(^| )leaflet.*': 'brochure',
    '.*(^| )concert.*': 'concert',
    '.*(^| )ceremony.*': 'ceremony',
    '.*(^| )reception.*': 'reception',
    '.*quote.*': 'quote',
    '.*(^| )encyclopedia.*': 'encyclopedia',
    '.*(^| )journal.*': 'journal',
    '.*(^| )orientation.*': 'orientation',
    '.*(^| )fundraising.*': 'event',
    '.*(^| )pitch.*': 'pitch',
    '.*(^| )lecture.*': 'presentation',
    '.*(^| )confrontation.*': 'argument',
    '.*(^| )ruling.*': 'ruling',
    '.*(^| )retreat.*': 'gathering',
    '.*(^| )symposium.*': 'conference',
}

# While there are still sub-strings that can be cleaned, continue.
# A pass that leaves the number of distinct contexts unchanged ends the loop.
n_rows = 0
while df_quotes['context_clean'].nunique() != n_rows:

    n_rows = df_quotes['context_clean'].nunique()

    # Set to lower
    df_quotes['context_clean'] = df_quotes['context_clean'].str.lower()
    # Remove leading and trailing . and spaces.
    # str.strip() takes a set of characters, not a regex: the former '\. ' was an invalid
    # escape sequence and additionally stripped backslashes.
    df_quotes['context_clean'] = df_quotes['context_clean'].str.strip('. ')
    # Remove leading words (in, on, his, her, the, our, their)
    df_quotes['context_clean'] = df_quotes['context_clean'].str.replace('^(in |on |his |her |the |our |their )', '', regex=True)
    # Remove leading words (a, an) Note: Done after previous line because of cases like "in a"
    df_quotes['context_clean'] = df_quotes['context_clean'].str.replace('^(a |an )', '', regex=True)
    # Collapse each context into its category
    df_quotes['context_clean'] = df_quotes['context_clean'].replace(context_dict, regex=True)

    # n_rows is the count measured at the start of this pass
    print(f'{n_rows} unique contexts. Cleaning...')

print(f'Number of unique contexts after cleaning: {n_rows}.')


5984 unique contexts. Cleaning...
284 unique contexts. Cleaning...
279 unique contexts. Cleaning...
Number of unique contexts after cleaning: 279.


Even after cleaning and grouping many of the contexts, there are still a lot of categories with very few occurrences. In an attempt to further simplify the classification, any context class with fewer than 10 occurrences will be grouped together as *other*.

In [7]:
# Contexts that occur on fewer than 10 rows are folded into the category 'other'
context_sizes = df_quotes.groupby('context_clean')['context_clean'].transform('count')
is_rare_context = context_sizes < 10
df_quotes.loc[is_rare_context, 'context_clean'] = 'other'

n_contexts = df_quotes['context_clean'].nunique()
print(f"Final number of unique contexts: {n_contexts}")

Final number of unique contexts: 76


In [8]:
# Data verification checks
#df_quotes.loc[df_quotes['context_clean'].str.contains('other')]
#df_quotes['context_clean'].value_counts().loc[df_quotes['context_clean'].value_counts().index.str.contains('tiktok')][:50]

### Clean and Normalize Authors

In [9]:
# Data verification checks: how many quotes each author_id has
author_counts = df_quotes['author_id'].value_counts()
author_counts[:50]  # evaluated but not rendered: only a cell's last expression is displayed
# Authors that appear exactly once
author_counts.loc[author_counts < 2]

# In the event we want to replace authors with fewer than X occurrences
#df_quotes['author_clean'] = df_quotes['author_id']
#df_quotes.loc[df_quotes.groupby('author_clean')['author_clean'].transform('count').lt(2), 'author_clean'] = 'other'
#df_quotes['author_clean'].value_counts()[:50]

markel-hutchins          1
ideas-illinois           1
politico-news            1
thomas-norment-jr        1
doug-muder               1
                        ..
marco-rubios-heckler     1
martha-robertson         1
jack-seiler              1
pro-conservative-news    1
southwest-farm-press     1
Name: author_id, Length: 2616, dtype: int64

## Load metadata
Load additional information on authors and merge to main dataset.

In [10]:
# Load the author metadata (one row per personality)
metadata_path = 'metadata/'
personalities_file = metadata_path + 'personalities.csv'
df_personalities = pd.read_csv(personalities_file)
df_personalities.head()

Unnamed: 0,author_id,author_name,affiliation,description,link
0,13th-district-gop-slate,13th District GOP slate,Republican,The 13th District GOP slate includes state Sen...,
1,18-percent-american-public,18% of the American public,,,
2,60-plus-association,60 Plus Association,,The 60 Plus Association is a conservative advo...,http://www.60plus.org/
3,AARP,AARP,,"AARP is a nonprofit, nonpartisan organization ...",http://www.aarp.org/
4,greg-abbott,Greg Abbott,Republican,Greg Abbott won election as governor of Texas ...,http://gregabbott.com/


### Deduplicate Entries
Ensure that there are no duplicated entries. For some reason, there are duplicated entries for certain personalities.

In [11]:
# author_id values that appear on more than one row; df_dup holds all of their rows for inspection
has_repeated_id = df_personalities['author_id'].duplicated()
dup_personalities = df_personalities.loc[has_repeated_id, 'author_id'].unique()
df_dup = df_personalities.loc[df_personalities['author_id'].isin(dup_personalities)]
exceptions_to_drop = ['Billboard at Spaghetti Junction'] # Maps to another page...

rows_before = df_personalities.shape[0]
print(f'Before: {rows_before} rows')
# Drop rows identical in every column, then the hand-picked exceptions
df_personalities = df_personalities.drop_duplicates()
is_exception = df_personalities['author_name'].isin(exceptions_to_drop)
df_personalities = df_personalities.loc[~is_exception]
rows_after = df_personalities.shape[0]
print(f'After: {rows_after} rows')

# Check to see if there are still duplicated author_id after
remaining_ids = df_personalities.loc[df_personalities['author_id'].isin(dup_personalities), 'author_id']
if remaining_ids.duplicated().any():
    print('Warning! There are still duplicated author_id record(s).')

Before: 4643 rows
After: 4610 rows


### Derive Author's Gender
Using `gender-guesser`, we attempt to derive additional information about the author's gender based on their **first** name. Each author is classified as `male`, `female`, `mostly_male`, `mostly_female` or `andy` (androgynous), whereas entries that are not names (i.e. organizations, entities, etc.) are assigned `unknown`.

In [12]:
# Guess a gender from the first word of author_name, then force known non-person entries to 'unknown'
gd = gender.Detector()

# First words known to be misclassified
unk_word_list = ['The', 'Young', 'My', 'In', 'Ban', 'Free']
# Full author names that must not be treated as a person's name
unk_list = [
    'Al Jazeera America', 'Austin Board of Realtors PAC', 'Austin Fund for Quality Healthcare',
    'Austin Independent School District', 'Austin Water Utility', 'Austin for a Better Future',
    'Brady Campaign to Prevent Gun Violence', 'Christian Broadcasting Network',
    'Clayton County Government', 'Clayton County Schools', 'Dane County Republican Party',
    'Dustin Inman Society', 'Elle', 'Fair Districts Florida', 'Forbes blog',
    'Georgia Association of Homes and Services for Children',
    'Georgia Association of Latino Elected Officials', 'Georgia Craft Brewers Guild',
    'Georgia Democrats', 'Georgia Department of Economic Development',
    'Georgia Department of Public Health', 'Georgia Department of Transportation',
    'Georgia Family Council', 'Georgia Farm Bureau', 'Georgia Green Party', 'Georgia Gun Owners',
    'Georgia House Democratic Caucus on behalf of Elena Parent', 'Georgia Lottery',
    'Georgia Restaurant Association', 'Georgia State Road and Tollway Authority', 'Georgia Voice',
    'Georgia politicians', 'Georgia state senators', 'Gun Free UT Gun Free UT',
    'Gun Owners of America', 'Hal Turner Radio Show', 'Save America\'s Postal Service',
    'Save Flexible Spending Plans', 'Save My Care', 'Save Our City, Milwaukeeans Can\'t Wait',
    'Save Our Springs Alliance', 'Sierra Club', 'Travis County Republican Party',
    'Urban Intellectuals', 'Virginia Center for Public Safety', 'Virginia Education Association',
    'Virginia First Foundation', 'Virginia House Democratic Caucus',
    'Virginia Interfaith Center for Public Policy', 'Virginia Lottery',
    'Virginia Senate Democratic Caucus', 'Virginia Senate Democrats',
    'Virginia Society for Human Life', 'Virginia Tea Party Patriots',
]

# Raw guess for every author, based on the first whitespace-separated word of the name
df_personalities['gender'] = [gd.get_gender(name.split()[0]) for name in df_personalities['author_name']]

# Override the guess when the first word or the full name is a known non-person
first_words = df_personalities['author_name'].str.split().str[0]
is_not_a_person = first_words.isin(unk_word_list) | df_personalities['author_name'].isin(unk_list)
df_personalities['gender'] = np.where(is_not_a_person, 'unknown', df_personalities['gender'])
df_personalities['gender'].value_counts()

male             2293
unknown          1312
female            706
mostly_male       174
mostly_female      96
andy               29
Name: gender, dtype: int64

### Correct Misclassified Genders
We leverage the description of each author to correct some of the misclassified genders by checking to see if there are contradictions between the assigned gender and the description. For instance, if an author is assigned the gender `male`, but the description only contains `she` and `her`, there are high chances that it is a misclassification. 

In [13]:
# List of known males and females (hardcoded due to exceptions in following rules)
male_list = ['Mike Martinez', 'Matt Tighe']
female_list = ['Marco Rubio\'s heckler', 'Sharron Angle', 'Ann Marie Buerkle', 'Kaya Jones', 'Jane O’Meara Sanders', 'Tiffany Trump', 'Lauren Kane', 'María Teresa Kumar', 'Rick Scott\'s Starbucks heckler']


def _mentions_any(description, pronouns):
    """Return NaN for a missing description, else whether any pronoun is one of its lowercased whitespace-split tokens."""
    if pd.isnull(description):
        return np.nan
    tokens = description.lower().split()
    return any(pronoun in tokens for pronoun in pronouns)


def _contradicted_authors(assigned_gender, described_as_male):
    """List authors whose assigned gender is contradicted by the pronouns in their description.

    With described_as_male=True the description must contain he/his and no she/her (and the
    hardcoded females are skipped); with False it is the other way around (hardcoded males skipped).
    """
    contradicted = (
        (tmp['gender'] == assigned_gender)
        & (tmp['male_check'] == described_as_male)
        & (tmp['female_check'] == (not described_as_male))
    )
    protected = female_list if described_as_male else male_list
    return [name for name in tmp.loc[contradicted, 'author_name'] if name not in protected]


tmp = df_personalities.copy()

# Boolean variables to indicate presence of he/his or she/her in description
tmp['male_check'] = tmp['description'].apply(lambda text: _mentions_any(text, ['he', 'his']))
tmp['female_check'] = tmp['description'].apply(lambda text: _mentions_any(text, ['she', 'her']))

# Correct mis-classified records by generating lists of authors who should actually be male or female
# Logic: ie. if classified as male, but description only contains she/her, high probability is female
m2f = _contradicted_authors('male', described_as_male=False)
f2m = _contradicted_authors('female', described_as_male=True)
mm2f = _contradicted_authors('mostly_male', described_as_male=False)
mm2m = _contradicted_authors('mostly_male', described_as_male=True)
mf2f = _contradicted_authors('mostly_female', described_as_male=False)
mf2m = _contradicted_authors('mostly_female', described_as_male=True)
a2m = _contradicted_authors('andy', described_as_male=True)
a2f = _contradicted_authors('andy', described_as_male=False)

# Correct mis-classified genders (the male list takes precedence when a name is in both)
becomes_male = df_personalities['author_name'].isin(male_list + f2m + mm2m + mf2m + a2m)
becomes_female = df_personalities['author_name'].isin(female_list + m2f + mm2f + mf2f + a2f)
df_personalities['gender'] = np.where(becomes_male, 'male',
                                      np.where(becomes_female, 'female',
                                               df_personalities['gender']))

df_personalities['gender'].value_counts()

male             2387
unknown          1312
female            750
mostly_male        92
mostly_female      52
andy               17
Name: gender, dtype: int64

### Merge metadata

In [14]:
# Attach the author metadata to every quote; quotes without a matching author keep NaN metadata
merge_keys = ['author_id', 'author_name']
df_full = pd.merge(df_quotes, df_personalities, how='left', on=merge_keys)
# A left merge only changes the row count when a quote matches several metadata rows
if len(df_full) != len(df_quotes):
    print('Warning! There are more rows than before!')

## Derive Additional Features

In [15]:
# Preview the merged quotes + author metadata before deriving features
df_full.head()

Unnamed: 0,label,quote,context,author_id,author_name,date,categories,staff,label_binary,context_clean,affiliation,description,link,gender
0,0,“Pennsylvania just banned alcohol sales.”,a Facebook post,viral-image,Viral image,"November 24, 2020","Facebook Fact-checks, Coronavirus",Ciara O'Rourke,0,facebook,,"Graphics, pictures and charts shared on social...",,unknown
1,0,"“666,000 teachers have been laid off already s...",a virtual roundtable,joe-biden,Joe Biden,"November 18, 2020","Education, Coronavirus",Bill McCarthy,0,roundtable,Democrat,Joe Biden is President-elect of the United Sta...,https://www.joebiden.com/,male
2,0,“David Perdue says he'll do everything in his ...,an ad,jon-ossoff,Jon Ossoff,"November 17, 2020","Georgia, Negative Campaigning",Tom Kertscher,0,ad,Democrat,Jon Ossoff is a Democrat running to succeed fo...,https://electjon.com/,male
3,0,Says “47 additional counties used the same sof...,a Facebook post,ted-nugent,Ted Nugent,"November 17, 2020","Elections, Facebook Fact-checks",Samantha Putterman,0,facebook,Republican,"Ted Nugent, who lives near Waco, performed aft...",http://www.tednugent.com/,male
4,0,"""Voter FRAUD exposed in Georgia. Over 2600 vot...",in a Live video,facebook-posts,Facebook posts,"November 16, 2020","Georgia, Elections, Facebook Fact-checks",Daniel Funke,0,video,,Posters on Facebook and other social media net...,https://www.facebook.com/,unknown


In [16]:
# Date related features
# TODO: Fix unspecified dates...
is_unspecified = df_full['date'].eq('unspecified')
df_full.loc[is_unspecified, 'date'] = np.nan
# Dates are expected in the form "November 24, 2020"
df_full['date_formatted'] = pd.to_datetime(df_full['date'], format='%B %d, %Y')
# Nullable Int64 keeps year/month/day integral even where the date is missing
for part in ('year', 'month', 'day'):
    df_full[f'date_{part}'] = getattr(df_full['date_formatted'].dt, part).astype('Int64')

In [17]:
# Quote related features (computed on the raw quote text, split on whitespace)
def _avg_word_len(quote):
    """Mean length of the whitespace-separated words, rounded to 1 decimal; NaN when there are no words."""
    words = quote.split()
    if not words:
        # An empty or whitespace-only quote previously raised ZeroDivisionError
        return np.nan
    return round(sum(len(word) for word in words) / len(words), 1)


# Load the stopword list once and use a set for membership tests; the previous version called
# stopwords.words('english') again for every single word of every quote.
english_stopwords = set(stopwords.words('english'))

df_full['num_words'] = df_full['quote'].str.split().str.len()
df_full['num_chars'] = df_full['quote'].str.len()
df_full['avg_word_len'] = df_full['quote'].apply(_avg_word_len)
df_full['num_stopwords'] = df_full['quote'].apply(lambda x: len([w for w in x.split() if w.lower() in english_stopwords]))

In [18]:
# Normalize numerical variables.
# The scaler is fitted once on all four columns. MinMaxScaler scales each column independently,
# so the values match the former per-column loop, but `scaler` now keeps the parameters of every
# feature instead of only the last one fitted.
scaler = preprocessing.MinMaxScaler()
features = ['num_words', 'num_chars', 'avg_word_len', 'num_stopwords']
df_full[features] = scaler.fit_transform(df_full[features])

In [19]:
# One-Hot encode author id: one 'author_<id>' indicator column per lowercased author_id
author_dummies = pd.get_dummies(df_full['author_id'].str.lower(), prefix='author')
df_full = pd.merge(df_full, author_dummies, left_index=True, right_index=True)

In [20]:
# One-Hot encode context: one 'context_<category>' indicator column per cleaned context
context_dummies = pd.get_dummies(df_full['context_clean'].str.lower(), prefix='context')
df_full = pd.merge(df_full, context_dummies, left_index=True, right_index=True)

In [21]:
# Clean up affiliation variables
# Any category with fewer than 10 occurrences is set to "Other"
affiliation_sizes = df_full.groupby('affiliation')['affiliation'].transform('count')
is_rare_affiliation = affiliation_sizes < 10
df_full.loc[is_rare_affiliation, 'affiliation'] = 'Other'

# One-Hot encode affiliation
affiliation_dummies = pd.get_dummies(df_full['affiliation'].str.lower(), prefix='affiliation')
df_full = pd.merge(df_full, affiliation_dummies, left_index=True, right_index=True)

In [22]:
# One-Hot encode gender: one 'gender_<class>' indicator column per gender class
gender_dummies = pd.get_dummies(df_full['gender'].str.lower(), prefix='gender')
df_full = pd.merge(df_full, gender_dummies, left_index=True, right_index=True)


In [23]:
# English stopwords, loaded on first use and then shared by every remove_stopwords() call
_STOPWORD_CACHE = {}


def remove_stopwords(text):
    """Return `text` with English stopwords removed.

    The text is tokenized with `word_tokenize`; tokens that appear verbatim in the English
    stopword list are dropped (the comparison is case-sensitive, no lowercasing is done here)
    and the remaining tokens are joined with single spaces.

    Parameters
    ----------
    text : str
        Text to filter.

    Returns
    -------
    str
        The surviving tokens separated by single spaces.
    """
    # The stopword list was previously re-loaded and scanned linearly on every call; it is
    # now loaded once and kept as a frozenset for constant-time membership tests.
    if 'english' not in _STOPWORD_CACHE:
        _STOPWORD_CACHE['english'] = frozenset(stopwords.words('english'))
    stop_words = _STOPWORD_CACHE['english']
    return ' '.join(word for word in word_tokenize(text) if word not in stop_words)

def remove_non_ascii(words):
    """Fold the text to plain ASCII.

    Every element obtained by iterating over `words` is NFKD-normalized, encoded to ASCII with
    unencodable characters dropped, and the results are concatenated without a separator.
    Iterating over a str yields its characters, so passing a single string returns that
    string with accents stripped and non-ASCII characters removed.
    """
    return ''.join(
        unicodedata.normalize('NFKD', piece).encode('ascii', 'ignore').decode('utf-8', 'ignore')
        for piece in words
    )

def lemmatize_words(text):
    """Tokenize `text` with `word_tokenize`, lemmatize each token and re-join them with single spaces.

    A new WordNetLemmatizer is created on every call and `lemmatize` is called with the token
    only (no part-of-speech argument).
    """
    wordnet = WordNetLemmatizer()
    return ' '.join(str(wordnet.lemmatize(token)) for token in word_tokenize(text))

def preprocess(df, t):
    """Clean the text column `t` of `df` in place and return the cleaned column.

    Each value goes through, in this order: lowercasing, removal of every character that is
    neither a word character nor whitespace, ASCII folding, stopword removal and lemmatization.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame holding the text column; the column is overwritten.
    t : str
        Name of the text column.

    Returns
    -------
    pandas.Series
        The cleaned column, i.e. `df[t]` after processing.
    """
    def _clean(raw_text):
        lowered = raw_text.lower()
        # Punctuation is removed before stopword removal, so contractions lose their apostrophe first
        unpunctuated = re.sub(r'[^\w\s]', '', lowered)
        ascii_only = remove_non_ascii(unpunctuated)
        without_stopwords = remove_stopwords(ascii_only)
        return lemmatize_words(without_stopwords)

    df[t] = df[t].apply(_clean)
    return df[t]

In [24]:
# Clean the quote text. preprocess() already overwrites df_full['quote'] in place and returns
# that same column, so the assignment simply stores it again.
df_full['quote'] = preprocess(df_full,'quote')

In [25]:
# Preview the frame after feature derivation and quote preprocessing
df_full.head()

Unnamed: 0,label,quote,context,author_id,author_name,date,categories,staff,label_binary,context_clean,...,affiliation_other,affiliation_republican,affiliation_state official,affiliation_talk show host,gender_andy,gender_female,gender_male,gender_mostly_female,gender_mostly_male,gender_unknown
0,0,pennsylvania banned alcohol sale,a Facebook post,viral-image,Viral image,"November 24, 2020","Facebook Fact-checks, Coronavirus",Ciara O'Rourke,0,facebook,...,0,0,0,0,0,0,0,0,0,1
1,0,666000 teacher laid already since march,a virtual roundtable,joe-biden,Joe Biden,"November 18, 2020","Education, Coronavirus",Bill McCarthy,0,roundtable,...,0,0,0,0,0,0,1,0,0,0
2,0,david perdue say hell everything power make su...,an ad,jon-ossoff,Jon Ossoff,"November 17, 2020","Georgia, Negative Campaigning",Tom Kertscher,0,ad,...,0,0,0,0,0,0,1,0,0,0
3,0,say 47 additional county used software caused ...,a Facebook post,ted-nugent,Ted Nugent,"November 17, 2020","Elections, Facebook Fact-checks",Samantha Putterman,0,facebook,...,0,1,0,0,0,0,1,0,0,0
4,0,voter fraud exposed georgia 2600 vote found,in a Live video,facebook-posts,Facebook posts,"November 16, 2020","Georgia, Elections, Facebook Fact-checks",Daniel Funke,0,video,...,0,0,0,0,0,0,0,0,0,1


## Discard Rows with Missing Data

In [26]:
# Some rows don't have dates (bug in scraping?), so we will just discard them for now...
rows_before = df_full.shape[0]
print(f'Before: {rows_before} rows.')
has_date = df_full['date_formatted'].notnull()
df_full = df_full.loc[has_date]
rows_after = df_full.shape[0]
print(f'After: {rows_after} rows.')

Before: 18046 rows.
After: 17906 rows.


## Export Dataset

In [27]:
# Write the cleaned dataset (without the dataframe index) next to the raw label files
df_full.to_csv('data/df_quotes_clean.csv', index=False)