# Data Cleaning and Processing

In [35]:
import pandas as pd
import numpy as np
import os
from nltk import word_tokenize
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from sklearn.preprocessing import MinMaxScaler

In [2]:
# Read every file in the data directory and stack them into a single frame.
# The frames are collected in a list and concatenated once: DataFrame.append
# was removed in pandas 2.0 and re-copied the accumulated frame on every
# iteration.
data_path = 'data/'
frames = [
    pd.read_csv(os.path.join(data_path, filename), dtype={'label': str}, na_values='Unspecified')
    # sorted() makes the row order reproducible; os.listdir order is arbitrary
    for filename in sorted(os.listdir(data_path))
]
# pd.concat raises on an empty list, so fall back to an empty frame
df = pd.concat(frames, ignore_index=True) if frames else pd.DataFrame()

df.head()

Unnamed: 0,label,quote,context,author,date,categories,staff
0,barely-true,“Pennsylvania just banned alcohol sales.”,a Facebook post,Viral image,"November 24, 2020","Facebook Fact-checks, Coronavirus",Ciara O'Rourke
1,barely-true,"“666,000 teachers have been laid off already s...",a virtual roundtable,Joe Biden,"November 18, 2020","Education, Coronavirus",Bill McCarthy
2,barely-true,“David Perdue says he'll do everything in his ...,an ad,Jon Ossoff,"November 17, 2020","Georgia, Negative Campaigning",Tom Kertscher
3,barely-true,Says “47 additional counties used the same sof...,a Facebook post,Ted Nugent,"November 17, 2020","Elections, Facebook Fact-checks",Samantha Putterman
4,barely-true,"""Voter FRAUD exposed in Georgia. Over 2600 vot...",in a Live video,Facebook posts,"November 16, 2020","Georgia, Elections, Facebook Fact-checks",Daniel Funke


In [3]:
# Load the personalities metadata file
metadata_path = 'metadata/'
personalities_file = f'{metadata_path}personalities.csv'
df_personalities = pd.read_csv(personalities_file)
df_personalities.head()

Unnamed: 0,personality,affiliation,description,link
0,13th District GOP slate,Republican,The 13th District GOP slate includes state Sen...,
1,18% of the American public,,,
2,60 Plus Association,,The 60 Plus Association is a conservative advo...,http://www.60plus.org/
3,AARP,,"AARP is a nonprofit, nonpartisan organization ...",http://www.aarp.org/
4,Greg Abbott,Republican,Greg Abbott won election as governor of Texas ...,http://gregabbott.com/


In [4]:
# Attach personality metadata to each quote, matching on the author name.
# A left join keeps every quote even when the author has no metadata row.
df = pd.merge(
    df,
    df_personalities,
    how='left',
    left_on='author',
    right_on='personality',
)
df.head()

Unnamed: 0,label,quote,context,author,date,categories,staff,personality,affiliation,description,link
0,barely-true,“Pennsylvania just banned alcohol sales.”,a Facebook post,Viral image,"November 24, 2020","Facebook Fact-checks, Coronavirus",Ciara O'Rourke,Viral image,,"Graphics, pictures and charts shared on social...",
1,barely-true,"“666,000 teachers have been laid off already s...",a virtual roundtable,Joe Biden,"November 18, 2020","Education, Coronavirus",Bill McCarthy,Joe Biden,Democrat,Joe Biden is President-elect of the United Sta...,https://www.joebiden.com/
2,barely-true,“David Perdue says he'll do everything in his ...,an ad,Jon Ossoff,"November 17, 2020","Georgia, Negative Campaigning",Tom Kertscher,Jon Ossoff,Democrat,Jon Ossoff is a Democrat running to succeed fo...,https://electjon.com/
3,barely-true,Says “47 additional counties used the same sof...,a Facebook post,Ted Nugent,"November 17, 2020","Elections, Facebook Fact-checks",Samantha Putterman,Ted Nugent,Republican,"Ted Nugent, who lives near Waco, performed aft...",http://www.tednugent.com/
4,barely-true,"""Voter FRAUD exposed in Georgia. Over 2600 vot...",in a Live video,Facebook posts,"November 16, 2020","Georgia, Elections, Facebook Fact-checks",Daniel Funke,Facebook posts,,Posters on Facebook and other social media net...,https://www.facebook.com/


## Derive Additional Features

In [9]:
# Date related features: parse the textual date, then split out its parts
parsed_dates = pd.to_datetime(df['date'], format='%B %d, %Y')
df['date_formatted'] = parsed_dates
for part in ('year', 'month', 'day'):
    # Nullable Int64 keeps the column integer-typed even if a date is missing
    df[part] = getattr(parsed_dates.dt, part).astype('Int64')

In [33]:
# Quote related features
# Build the stopword set once: the original lookup called
# stopwords.words('english') for every word of every quote, and tested
# membership against a list instead of a set.
english_stopwords = set(stopwords.words('english'))


def _avg_word_len(quote):
    """Return the mean word length of ``quote`` rounded to one decimal.

    Words are the whitespace-separated tokens of the quote. Missing
    (non-string) or whitespace-only quotes yield NaN instead of raising on
    ``.split()`` or dividing by zero.
    """
    if not isinstance(quote, str):
        return np.nan
    words = quote.split()
    if not words:
        return np.nan
    return round(sum(len(word) for word in words) / len(words), 1)


def _count_stopwords(quote):
    """Return how many whitespace-separated words of ``quote`` are English stopwords.

    Matching is case-insensitive. Missing (non-string) quotes yield NaN.
    """
    if not isinstance(quote, str):
        return np.nan
    return sum(word.lower() in english_stopwords for word in quote.split())


df['num_words'] = df['quote'].str.split().str.len()
df['num_chars'] = df['quote'].str.len()
df['avg_word_len'] = df['quote'].apply(_avg_word_len)
df['num_stopwords'] = df['quote'].apply(_count_stopwords)

In [36]:
# Normalize numerical variables to the [0, 1] range.
# MinMaxScaler rescales each column independently, so a single fit over all
# feature columns gives the same values as fitting column by column -- and
# leaves `scaler` holding the min/max of every feature (the per-column loop
# re-fit it on each pass, so it only remembered the last one).
scaler = MinMaxScaler()
features = ['num_words', 'num_chars', 'avg_word_len', 'num_stopwords']
df[features] = scaler.fit_transform(df[features])

In [37]:
# Preview the first rows of the frame, including the feature columns added above
df.head()

Unnamed: 0,label,quote,context,author,date,categories,staff,personality,affiliation,description,link,date_formatted,year,month,day,num_words,num_chars,avg_word_len,num_stopwords
0,barely-true,“Pennsylvania just banned alcohol sales.”,a Facebook post,Viral image,"November 24, 2020","Facebook Fact-checks, Coronavirus",Ciara O'Rourke,Viral image,,"Graphics, pictures and charts shared on social...",,2020-11-24,2020,11,24,0.038961,0.060606,0.657143,0.022727
1,barely-true,"“666,000 teachers have been laid off already s...",a virtual roundtable,Joe Biden,"November 18, 2020","Education, Coronavirus",Bill McCarthy,Joe Biden,Democrat,Joe Biden is President-elect of the United Sta...,https://www.joebiden.com/,2020-11-18,2020,11,18,0.090909,0.100233,0.4,0.068182
2,barely-true,“David Perdue says he'll do everything in his ...,an ad,Jon Ossoff,"November 17, 2020","Georgia, Negative Campaigning",Tom Kertscher,Jon Ossoff,Democrat,Jon Ossoff is a Democrat running to succeed fo...,https://electjon.com/,2020-11-17,2020,11,17,0.168831,0.156177,0.242857,0.090909
3,barely-true,Says “47 additional counties used the same sof...,a Facebook post,Ted Nugent,"November 17, 2020","Elections, Facebook Fact-checks",Samantha Putterman,Ted Nugent,Republican,"Ted Nugent, who lives near Waco, performed aft...",http://www.tednugent.com/,2020-11-17,2020,11,17,0.415584,0.449883,0.328571,0.295455
4,barely-true,"""Voter FRAUD exposed in Georgia. Over 2600 vot...",in a Live video,Facebook posts,"November 16, 2020","Georgia, Elections, Facebook Fact-checks",Daniel Funke,Facebook posts,,Posters on Facebook and other social media net...,https://www.facebook.com/,2020-11-16,2020,11,16,0.090909,0.100233,0.4,0.045455


## Data Cleaning and Processing

In [4]:
# Parse the textual date column and derive the calendar year from it
parsed = pd.to_datetime(df['date'], format='%B %d, %Y')
df['date_formatted'] = parsed
df['year'] = parsed.dt.year.astype('Int64')

## Data Cleaning and Processing

the        13947
of          8048
in          7919
to          7799
a           6171
and         4960
Says        4522
for         3415
that        3041
is          3001
on          2152
have        2061
The         2044
are         1975
has         1823
than        1764
was         1543
by          1396
with        1386
from        1385
percent     1346
more        1308
as          1159
not         1056
people       975
at           952
it           939
be           936
our          932
tax          919
dtype: int64