In [139]:
import pandas as pd
import os
from nltk import word_tokenize
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
import re, unicodedata

from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from sklearn import model_selection, preprocessing

In [140]:
# Load every file in the datasets directory into one DataFrame.
DATA_DIR = 'datasets'

# Sorted so the combined row order is the same on every run; os.listdir makes
# no ordering guarantee.
dataset_paths = [
    os.path.join(DATA_DIR, filename)
    for filename in sorted(os.listdir(DATA_DIR))
]

# Read each file once and concatenate a single time. DataFrame.append was
# removed in pandas 2.0, and appending inside a loop re-copies all previous rows.
# Directories (e.g. .ipynb_checkpoints) are skipped because read_csv cannot open them.
frames = [
    pd.read_csv(path, dtype={'label': str})  # keep labels as text
    for path in dataset_paths
    if os.path.isfile(path)
]

# pd.concat raises on an empty list, so fall back to an empty frame.
df = pd.concat(frames, ignore_index=True) if frames else pd.DataFrame()

df.shape

(18008, 8)

In [141]:
# Preview the first rows of the combined data before any text cleaning.
df.head()

Unnamed: 0,index,label,quote,context,author,date,categories,staff
0,0,barely-true,“When large numbers of people in Butler (Pa.) ...,a Fox News broadcast,Tucker Carlson,"November 2, 2020","National, Drugs, Pundits, PunditFact",Jon Greenberg
1,1,barely-true,"Broken voting machines in Russellville, Ark., ...",a post on Facebook,Viral image,"October 21, 2020","Elections, Facebook Fact-checks",Bill McCarthy
2,2,barely-true,Pennsylvania officials are “attempting to sile...,a Facebook post,Facebook posts,"November 2, 2020","Elections, Facebook Fact-checks, Coronavirus",Ciara O'Rourke
3,3,barely-true,"In Massachusetts, ""anybody can go vote for any...",a video posted on TikTok,TikTok posts,"September 1, 2020","Elections, Voter ID Laws",Miriam Valverde
4,4,barely-true,“Joe Biden and Kamala Harris’ government-run h...,a TV ad,Donald Trump,"October 28, 2020","National, Health Care",Amy Sherman


In [142]:


def remove_stopwords(text):
    """Return ``text`` with English stopwords removed.

    The text is tokenized with NLTK's ``word_tokenize``; every token that is
    not in NLTK's English stopword list is kept, in its original order.

    Parameters
    ----------
    text : str
        Text to filter.

    Returns
    -------
    str
        The kept tokens joined by single spaces.
    """
    # A set gives constant-time membership tests over the same entries.
    stop_words = set(stopwords.words('english'))
    # Call the imported `word_tokenize` directly: the import cell brings in the
    # function by name and never binds the `nltk` module itself.
    return ' '.join(word for word in word_tokenize(text) if word not in stop_words)

def remove_non_ascii(words):
    """Reduce the items of ``words`` to ASCII and concatenate them.

    Each item produced by iterating ``words`` (the characters of a string, or
    the elements of a list of strings) is NFKD-normalized, then every character
    that cannot be encoded as ASCII is dropped. The resulting pieces are joined
    with no separator.

    Returns
    -------
    str
        The concatenated ASCII-only text.
    """
    def _to_ascii(piece):
        # NFKD splits accented letters into base letter + combining mark, so
        # the base letter survives the ASCII encode below.
        decomposed = unicodedata.normalize('NFKD', piece)
        return decomposed.encode('ascii', 'ignore').decode('utf-8', 'ignore')

    return ''.join(map(_to_ascii, words))

def lemmatize_words(text):
    """Lemmatize every token of ``text`` with NLTK's WordNet lemmatizer.

    Parameters
    ----------
    text : str
        Text to lemmatize; it is tokenized with NLTK's ``word_tokenize``.

    Returns
    -------
    str
        The lemmatized tokens joined by single spaces.
    """
    lemmatizer = WordNetLemmatizer()
    # Call the imported `word_tokenize` directly: the import cell brings in the
    # function by name and never binds the `nltk` module itself.
    return ' '.join(lemmatizer.lemmatize(word) for word in word_tokenize(text))

def preprocess(df, t):
    """Clean the text column ``t`` of ``df`` and return the cleaned column.

    Every value in the column is, in order: lower-cased, stripped of all
    characters that are neither word characters nor whitespace, filtered with
    ``remove_stopwords`` and lemmatized with ``lemmatize_words``.

    The cleaned values are written back to ``df[t]``, so the caller's frame
    is modified as well.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame holding the text column.
    t : str
        Name of the column to clean.

    Returns
    -------
    pandas.Series
        ``df[t]`` after cleaning.
    """
    punctuation = re.compile(r'[^\w\s]')

    def _clean(raw):
        lowered = raw.lower()                       # lower-case everything
        unpunctuated = punctuation.sub('', lowered)  # remove punctuation
        # unpunctuated = remove_non_ascii(unpunctuated)  # non-ASCII removal (disabled)
        without_stopwords = remove_stopwords(unpunctuated)
        return lemmatize_words(without_stopwords)

    df[t] = df[t].apply(_clean)
    return df[t]



In [143]:
# Clean the 'quote' text. preprocess() also writes the cleaned text into df itself,
# so re-running this cell re-processes text that has already been cleaned.
df['quote'] = preprocess(df,'quote')

In [144]:
# Preview the first rows again to check the cleaned 'quote' column.
df.head()

Unnamed: 0,index,label,quote,context,author,date,categories,staff
0,0,barely-true,large number people butler pa started killing ...,a Fox News broadcast,Tucker Carlson,"November 2, 2020","National, Drugs, Pundits, PunditFact",Jon Greenberg
1,1,barely-true,broken voting machine russellville ark tried a...,a post on Facebook,Viral image,"October 21, 2020","Elections, Facebook Fact-checks",Bill McCarthy
2,2,barely-true,pennsylvania official attempting silence voter...,a Facebook post,Facebook posts,"November 2, 2020","Elections, Facebook Fact-checks, Coronavirus",Ciara O'Rourke
3,3,barely-true,massachusetts anybody go vote anybody long inf...,a video posted on TikTok,TikTok posts,"September 1, 2020","Elections, Voter ID Laws",Miriam Valverde
4,4,barely-true,joe biden kamala harris governmentrun health c...,a TV ad,Donald Trump,"October 28, 2020","National, Health Care",Amy Sherman


In [15]:
# Pull the text and the target labels out of df as NumPy arrays.
# NOTE(review): the output saved under this cell is a quote that still has capitals
# and punctuation, so it looks stale — re-run from a fresh kernel to confirm these
# arrays hold the cleaned text.
quote = df['quote'].to_numpy()
label = df['label'].to_numpy()

“When large numbers of people in Butler (Pa.) started killing themselves with narcotics, no one in Washington or New York or Los Angeles said a word about it.”


In [18]:
# Split the dataset into training and validation sets.
# A fixed seed keeps the split, and everything computed from it, reproducible
# across kernel restarts.
RANDOM_STATE = 42
train_x, valid_x, train_y, valid_y = model_selection.train_test_split(
    quote, label, random_state=RANDOM_STATE
)

# Label-encode the target variable.
# The encoder is fitted exactly once so both splits share a single
# label -> integer mapping. Refitting it on the validation labels would
# renumber the classes whenever a label is missing from that split.
encoder = preprocessing.LabelEncoder()
encoder.fit(label)
train_y = encoder.transform(train_y)
valid_y = encoder.transform(valid_y)

In [24]:
# Show the validation texts (the printed array is abbreviated with '...').
# NOTE(review): the output saved under this cell still has capitals and punctuation,
# i.e. text that has not been through preprocess() — it looks stale; re-run the
# notebook from a fresh kernel before relying on it.
print(valid_x)


['As the usage [of synthetic marijuana] has dramatically increased, instances of violence, bodily harm and even death have risen with it.'
 'The Nevada Gaming Commission did not find Trump ‘trustworthy’ enough for a gaming license.'
 'Says former Ukraine President Petro Poroshenko "actively worked for Secretary Clinton.'
 ... 'U.S. taxpayers paid $71,500 per job created by the stimulus bill.'
 'Sen. Obama has declared, and repeatedly reaffirmed his intention to meet the president of Iran without any preconditions.'
 'Says Melania Trump’s dress at Mount Rushmore speech was designed from “drawings of several young victims of sex trafficking.']


In [30]:
# Word-level TF-IDF features.
# The vectorizer is fitted on the training split only, so its vocabulary and
# IDF weights carry no information from the validation quotes. The validation
# split is transformed only, using what was learned from training.
tfidf_vect = TfidfVectorizer(analyzer='word', stop_words='english', max_df=0.7)
xtrain_tfidf = tfidf_vect.fit_transform(train_x)
xvalid_tfidf = tfidf_vect.transform(valid_x)