# Data Cleaning and Processing

In [1]:
import pandas as pd
import numpy as np
import os
from nltk import word_tokenize
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from sklearn.preprocessing import MinMaxScaler
import gender_guesser.detector as gender

## Load datasets
Load all 6 datasets (true, mostly-true, half-true, barely-true, false, pants-fire) and combine them into a single one.

In [2]:
# Read every per-label file in `data_path` and stack them into one dataframe.
# `DataFrame.append` was deprecated in pandas 1.4 and removed in 2.0, and calling it
# inside a loop re-copies the accumulated rows on every iteration, so the frames are
# collected first and concatenated once.
data_path = 'data/'
frames = [
    pd.read_csv(os.path.join(data_path, filename), dtype={'label': str}, na_values='Unspecified')
    for filename in sorted(os.listdir(data_path))  # sorted: os.listdir order is arbitrary
]
# pd.concat raises on an empty list; fall back to an empty frame as the original loop did
df_quotes = pd.concat(frames, ignore_index=True) if frames else pd.DataFrame()

df_quotes.head()

Unnamed: 0,label,quote,context,author_id,author_name,date,categories,staff
0,barely-true,“Pennsylvania just banned alcohol sales.”,a Facebook post,viral-image,Viral image,"November 24, 2020","Facebook Fact-checks, Coronavirus",Ciara O'Rourke
1,barely-true,"“666,000 teachers have been laid off already s...",a virtual roundtable,joe-biden,Joe Biden,"November 18, 2020","Education, Coronavirus",Bill McCarthy
2,barely-true,“David Perdue says he'll do everything in his ...,an ad,jon-ossoff,Jon Ossoff,"November 17, 2020","Georgia, Negative Campaigning",Tom Kertscher
3,barely-true,Says “47 additional counties used the same sof...,a Facebook post,ted-nugent,Ted Nugent,"November 17, 2020","Elections, Facebook Fact-checks",Samantha Putterman
4,barely-true,"""Voter FRAUD exposed in Georgia. Over 2600 vot...",in a Live video,facebook-posts,Facebook posts,"November 16, 2020","Georgia, Elections, Facebook Fact-checks",Daniel Funke


## Load metadata
Load additional information on authors and merge to main dataset.

In [3]:
# Author metadata is stored as a single CSV inside `metadata_path`
metadata_path = 'metadata/'
df_personalities = pd.read_csv(os.path.join(metadata_path, 'personalities.csv'))
df_personalities.head()

Unnamed: 0,author_id,author_name,affiliation,description,link
0,13th-district-gop-slate,13th District GOP slate,Republican,The 13th District GOP slate includes state Sen...,
1,18-percent-american-public,18% of the American public,,,
2,60-plus-association,60 Plus Association,,The 60 Plus Association is a conservative advo...,http://www.60plus.org/
3,AARP,AARP,,"AARP is a nonprofit, nonpartisan organization ...",http://www.aarp.org/
4,greg-abbott,Greg Abbott,Republican,Greg Abbott won election as governor of Texas ...,http://gregabbott.com/


### Deduplicate Entries
Ensure that there are no duplicated entries.

In [4]:
# author_id values that appear on more than one row, and every row carrying one of them
id_is_repeat = df_personalities['author_id'].duplicated()
dup_personalities = df_personalities.loc[id_is_repeat, 'author_id'].unique()
df_dup = df_personalities[df_personalities['author_id'].isin(dup_personalities)]
exceptions_to_drop = ['Billboard at Spaghetti Junction'] # Maps to another page...

print(f'Before: {df_personalities.shape[0]} rows')
# Remove fully identical rows first, then the hand-picked author names listed above
df_personalities = df_personalities.drop_duplicates()
keep_row = ~df_personalities['author_name'].isin(exceptions_to_drop)
df_personalities = df_personalities.loc[keep_row]
print(f'After: {df_personalities.shape[0]} rows')

# Among the ids that were duplicated before the clean-up, warn if any still repeats
remaining_ids = df_personalities.loc[df_personalities['author_id'].isin(dup_personalities), 'author_id']
if remaining_ids.duplicated().any():
    print('Warning! There are still duplicated author_id record(s).')

Before: 4643 rows
After: 4610 rows


### Derive Author's Gender
Using `gender-guesser`, we attempt to derive each author's gender from their **first** name. Each author is classified as `male`, `female`, `mostly_male`, `mostly_female` or `andy` (androgynous), whereas entries that are not names (i.e. organizations, entities, etc.) are assigned `unknown`.

In [5]:
# Derive gender from first name whenever possible
gd = gender.Detector()
unk_word_list = ['The', 'Young', 'My', 'In', 'Ban', 'Free'] # List of first words known to be misclassified
# Full author names whose first word is looked up as a person's name but which are not people
unk_list = ['Al Jazeera America', 'Austin Board of Realtors PAC', 'Austin Fund for Quality Healthcare', 'Austin Independent School District', 'Austin Water Utility', 'Austin for a Better Future', 'Brady Campaign to Prevent Gun Violence', 'Christian Broadcasting Network', 'Clayton County Government', 'Clayton County Schools', 'Dane County Republican Party', 'Dustin Inman Society', 'Elle', 'Fair Districts Florida', 'Forbes blog', 'Georgia Association of Homes and Services for Children', 'Georgia Association of Latino Elected Officials', 'Georgia Craft Brewers Guild', 'Georgia Democrats', 'Georgia Department of Economic Development', 'Georgia Department of Public Health', 'Georgia Department of Transportation', 'Georgia Family Council', 'Georgia Farm Bureau', 'Georgia Green Party', 'Georgia Gun Owners', 'Georgia House Democratic Caucus on behalf of Elena Parent', 'Georgia Lottery', 'Georgia Restaurant Association', 'Georgia State Road and Tollway Authority', 'Georgia Voice', 'Georgia politicians', 'Georgia state senators', 'Gun Free UT Gun Free UT', 'Gun Owners of America', 'Hal Turner Radio Show', 'Save America\'s Postal Service', 'Save Flexible Spending Plans', 'Save My Care', 'Save Our City, Milwaukeeans Can\'t Wait', 'Save Our Springs Alliance', 'Sierra Club', 'Travis County Republican Party', 'Urban Intellectuals', 'Virginia Center for Public Safety', 'Virginia Education Association', 'Virginia First Foundation', 'Virginia House Democratic Caucus' ,'Virginia Interfaith Center for Public Policy', 'Virginia Lottery', 'Virginia Senate Democratic Caucus', 'Virginia Senate Democrats', 'Virginia Society for Human Life', 'Virginia Tea Party Patriots']


def _first_name_gender(name):
    """Return the detector's label for the first whitespace-separated token of `name`.

    A missing (non-string) or blank name has no first token to look up, so it is
    labelled 'unknown' instead of raising on `.split()[0]`.
    """
    tokens = name.split() if isinstance(name, str) else []
    return gd.get_gender(tokens[0]) if tokens else 'unknown'


author_names = df_personalities['author_name']
df_personalities['gender'] = author_names.apply(_first_name_gender)

# Override the detector where the first word, or the whole name, is known not to be a person
first_word = author_names.str.split().str[0]
known_non_person = first_word.isin(unk_word_list) | author_names.isin(unk_list)
df_personalities['gender'] = np.where(known_non_person, 'unknown', df_personalities['gender'])
df_personalities['gender'].value_counts()

male             2293
unknown          1312
female            706
mostly_male       174
mostly_female      96
andy               29
Name: gender, dtype: int64

### Correct Misclassified Genders
We use each author's description to correct some of the misclassified genders by checking for contradictions between the assigned gender and the pronouns in the description. For instance, if an author is assigned the gender `male` but the description only contains `she` and `her`, the assignment is most likely a misclassification.

In [6]:
# List of known males and females (hardcoded due to exceptions in following rules)
male_list = ['Mike Martinez', 'Matt Tighe']
female_list = ['Marco Rubio\'s heckler', 'Sharron Angle', 'Ann Marie Buerkle', 'Kaya Jones', 'Jane O’Meara Sanders', 'Tiffany Trump', 'Lauren Kane', 'María Teresa Kumar', 'Rick Scott\'s Starbucks heckler']

tmp = df_personalities.copy()


def _mentions_any(description, pronouns):
    """Whether any of `pronouns` is a whitespace-delimited token of the lowercased description.

    Returns NaN when the description is missing.
    """
    if pd.isnull(description):
        return np.nan
    tokens = description.lower().split()
    return any(pronoun in tokens for pronoun in pronouns)


def _flagged_names(current_gender, pronouns_say_male, protected_names):
    """Author names labelled `current_gender` whose description uses pronouns of one gender only.

    pronouns_say_male=True keeps descriptions with he/his and no she/her; False keeps the
    opposite. Names listed in `protected_names` are left out of the result.
    """
    selected = (
        (tmp['gender'] == current_gender)
        & (tmp['male_check'] == pronouns_say_male)
        & (tmp['female_check'] == (not pronouns_say_male))
    )
    return [name for name in tmp.loc[selected, 'author_name'] if name not in protected_names]


# Boolean variables to indicate presence of he/his or she/her in description
tmp['male_check'] = tmp['description'].apply(lambda text: _mentions_any(text, ['he', 'his']))
tmp['female_check'] = tmp['description'].apply(lambda text: _mentions_any(text, ['she', 'her']))

# Authors whose label contradicts (or is only weakly supported by) the pronouns in their description.
# Naming: <current label>2<new label>, with m/f = male/female, mm/mf = mostly_male/mostly_female, a = andy
m2f = _flagged_names('male', False, male_list)
f2m = _flagged_names('female', True, female_list)
mm2f = _flagged_names('mostly_male', False, male_list)
mm2m = _flagged_names('mostly_male', True, female_list)
mf2f = _flagged_names('mostly_female', False, male_list)
mf2m = _flagged_names('mostly_female', True, female_list)
a2m = _flagged_names('andy', True, female_list)
a2f = _flagged_names('andy', False, male_list)

# Apply the corrections; the male assignment is evaluated first, as in the nested np.where
should_be_male = df_personalities['author_name'].isin(male_list + f2m + mm2m + mf2m + a2m)
should_be_female = df_personalities['author_name'].isin(female_list + m2f + mm2f + mf2f + a2f)
df_personalities['gender'] = np.where(
    should_be_male,
    'male',
    np.where(should_be_female, 'female', df_personalities['gender']),
)

df_personalities['gender'].value_counts()

male             2387
unknown          1312
female            750
mostly_male        92
mostly_female      52
andy               17
Name: gender, dtype: int64

### Merge metadata

In [7]:
# Merge meta data
# Left join on the author keys, so every row of df_quotes is kept
df_full = pd.merge(df_quotes, df_personalities, how='left', on=['author_id', 'author_name'])
# A row-count change means some author key matched more than one metadata row
if len(df_full) != len(df_quotes):
    print('Warning! There are more rows than before!')

## Derive Additional Features

In [8]:
# Preview the first rows of the merged frame
df_full.head(n=5)

Unnamed: 0,label,quote,context,author_id,author_name,date,categories,staff,affiliation,description,link,gender
0,barely-true,“Pennsylvania just banned alcohol sales.”,a Facebook post,viral-image,Viral image,"November 24, 2020","Facebook Fact-checks, Coronavirus",Ciara O'Rourke,,"Graphics, pictures and charts shared on social...",,unknown
1,barely-true,"“666,000 teachers have been laid off already s...",a virtual roundtable,joe-biden,Joe Biden,"November 18, 2020","Education, Coronavirus",Bill McCarthy,Democrat,Joe Biden is President-elect of the United Sta...,https://www.joebiden.com/,male
2,barely-true,“David Perdue says he'll do everything in his ...,an ad,jon-ossoff,Jon Ossoff,"November 17, 2020","Georgia, Negative Campaigning",Tom Kertscher,Democrat,Jon Ossoff is a Democrat running to succeed fo...,https://electjon.com/,male
3,barely-true,Says “47 additional counties used the same sof...,a Facebook post,ted-nugent,Ted Nugent,"November 17, 2020","Elections, Facebook Fact-checks",Samantha Putterman,Republican,"Ted Nugent, who lives near Waco, performed aft...",http://www.tednugent.com/,male
4,barely-true,"""Voter FRAUD exposed in Georgia. Over 2600 vot...",in a Live video,facebook-posts,Facebook posts,"November 16, 2020","Georgia, Elections, Facebook Fact-checks",Daniel Funke,,Posters on Facebook and other social media net...,https://www.facebook.com/,unknown


In [9]:
# Date related features
# TODO: Fix unspecified dates...
# The loader maps 'Unspecified' to NaN while this cell originally matched only the lowercase
# spelling. Match the placeholder regardless of case or surrounding whitespace so that no
# variant reaches the strict '%B %d, %Y' parser below (which would raise on it).
is_unspecified = df_full['date'].map(
    lambda value: isinstance(value, str) and value.strip().lower() == 'unspecified'
).astype(bool)
df_full.loc[is_unspecified, 'date'] = np.nan

df_full['date_formatted'] = pd.to_datetime(df_full['date'], format='%B %d, %Y')
# Nullable Int64 keeps year/month/day as integers while still allowing missing dates
df_full['year'] = df_full['date_formatted'].dt.year.astype('Int64')
df_full['month'] = df_full['date_formatted'].dt.month.astype('Int64')
df_full['day'] = df_full['date_formatted'].dt.day.astype('Int64')

In [10]:
# Quote related features
# Build the stopword lookup once. The original called `stopwords.words('english')` for every
# token of every quote and tested membership against the returned list; a set built up front
# gives the same counts with constant-time lookups.
english_stopwords = set(stopwords.words('english'))


def _avg_word_len(text):
    """Mean length of the whitespace-separated tokens of `text`, rounded to 1 decimal.

    Returns NaN for a missing (non-string) quote and 0.0 for a quote with no tokens,
    instead of raising AttributeError / ZeroDivisionError.
    """
    if not isinstance(text, str):
        return np.nan
    words = text.split()
    if not words:
        return 0.0
    return round(sum(len(word) for word in words) / len(words), 1)


def _num_stopwords(text):
    """Number of tokens of `text` whose lowercase form is an English stopword (NaN if missing)."""
    if not isinstance(text, str):
        return np.nan
    return sum(word.lower() in english_stopwords for word in text.split())


df_full['num_words'] = df_full['quote'].str.split().str.len()
df_full['num_chars'] = df_full['quote'].str.len()
df_full['avg_word_len'] = df_full['quote'].apply(_avg_word_len)
df_full['num_stopwords'] = df_full['quote'].apply(_num_stopwords)

In [11]:
# Normalize numerical variables
# MinMaxScaler rescales each column independently, so fitting all features in one call gives
# the same values as fitting them one at a time. Unlike the per-feature loop, which refit the
# same scaler on each pass and kept only the last feature's parameters, this leaves `scaler`
# fitted on every feature so it can be reused to transform or inverse-transform later data.
scaler = MinMaxScaler()
features = ['num_words', 'num_chars', 'avg_word_len', 'num_stopwords']
df_full[features] = scaler.fit_transform(df_full[features])

In [12]:
# Preview the first rows, now including the derived and scaled feature columns
df_full.head(n=5)

Unnamed: 0,label,quote,context,author_id,author_name,date,categories,staff,affiliation,description,link,gender,date_formatted,year,month,day,num_words,num_chars,avg_word_len,num_stopwords
0,barely-true,“Pennsylvania just banned alcohol sales.”,a Facebook post,viral-image,Viral image,"November 24, 2020","Facebook Fact-checks, Coronavirus",Ciara O'Rourke,,"Graphics, pictures and charts shared on social...",,unknown,2020-11-24,2020,11,24,0.026316,0.053738,0.611111,0.022727
1,barely-true,"“666,000 teachers have been laid off already s...",a virtual roundtable,joe-biden,Joe Biden,"November 18, 2020","Education, Coronavirus",Bill McCarthy,Democrat,Joe Biden is President-elect of the United Sta...,https://www.joebiden.com/,male,2020-11-18,2020,11,18,0.078947,0.093458,0.361111,0.068182
2,barely-true,“David Perdue says he'll do everything in his ...,an ad,jon-ossoff,Jon Ossoff,"November 17, 2020","Georgia, Negative Campaigning",Tom Kertscher,Democrat,Jon Ossoff is a Democrat running to succeed fo...,https://electjon.com/,male,2020-11-17,2020,11,17,0.157895,0.149533,0.208333,0.090909
3,barely-true,Says “47 additional counties used the same sof...,a Facebook post,ted-nugent,Ted Nugent,"November 17, 2020","Elections, Facebook Fact-checks",Samantha Putterman,Republican,"Ted Nugent, who lives near Waco, performed aft...",http://www.tednugent.com/,male,2020-11-17,2020,11,17,0.407895,0.443925,0.291667,0.295455
4,barely-true,"""Voter FRAUD exposed in Georgia. Over 2600 vot...",in a Live video,facebook-posts,Facebook posts,"November 16, 2020","Georgia, Elections, Facebook Fact-checks",Daniel Funke,,Posters on Facebook and other social media net...,https://www.facebook.com/,unknown,2020-11-16,2020,11,16,0.078947,0.093458,0.361111,0.045455
