In [1]:
import pandas as pd
import os
from nltk import word_tokenize
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
import re, unicodedata

from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from sklearn import model_selection, preprocessing

In [2]:
df = pd.DataFrame()
for filename in os.listdir('datasets'):
     df = df.append(pd.read_csv('datasets/' + filename, dtype={'label':str}), ignore_index=True)

df.shape

(18099, 7)

In [3]:
df.head()

Unnamed: 0,label,quote,context,author,date,categories,staff
0,barely-true,"“666,000 teachers have been laid off already s...",a virtual roundtable,Joe Biden,"November 18, 2020","Education, Coronavirus",Bill McCarthy
1,barely-true,“David Perdue says he'll do everything in his ...,an ad,Jon Ossoff,"November 17, 2020","Georgia, Negative Campaigning",Tom Kertscher
2,barely-true,Says “47 additional counties used the same sof...,a Facebook post,Ted Nugent,"November 17, 2020","Elections, Facebook Fact-checks",Samantha Putterman
3,barely-true,"""Voter FRAUD exposed in Georgia. Over 2600 vot...",in a Live video,Facebook posts,"November 16, 2020","Georgia, Elections, Facebook Fact-checks",Daniel Funke
4,barely-true,"Says Raphael Warnock ""ran over his wife"" and w...",a tweet,Erick Erickson,"November 13, 2020","Georgia, Candidate Biography, Crime, PunditFact",Tom Kertscher


In [4]:
def remove_stopwords(text):
    new_words = []
    stop_words = stopwords.words('english')
    words = word_tokenize(text)
    for word in words:
        if word not in stop_words:
            new_words.append(word)
    return ' '.join(new_words)

def remove_non_ascii(words):
    new_words = []
    for word in words:
        new_word = unicodedata.normalize('NFKD', word).encode('ascii', 'ignore').decode('utf-8', 'ignore')
        new_words.append(new_word)
    return ''.join(new_words)

def lemmatize_words(text):
    lemmatizer = WordNetLemmatizer()   
    new_words = []
    words = word_tokenize(text)
    for word in words:
        new_words.append(str(lemmatizer.lemmatize(word)))
    return ' '.join(new_words)

def preprocess(df, t):
    df[t] = df[t].apply(lambda x : x.lower()) #Lower case everything
    df[t] = df[t].apply(lambda x : re.sub(r'[^\w\s]', '', x)) #Remove punctuation 
    #df[t] = df[t].apply(lambda x : remove_non_ascii(x))  #Removing Non ASCII Words
    df[t] = df[t].apply(lambda x : remove_stopwords(x))#Removing Stopwords
    #df[t] = df[t].apply(lambda x : lemmatize_words(x))#Lemmatize words
    return df[t]



In [5]:
df['quote'] = preprocess(df,'quote')

In [6]:
df.head()

Unnamed: 0,label,quote,context,author,date,categories,staff
0,barely-true,666000 teachers laid already since march,a virtual roundtable,Joe Biden,"November 18, 2020","Education, Coronavirus",Bill McCarthy
1,barely-true,david perdue says hell everything power make s...,an ad,Jon Ossoff,"November 17, 2020","Georgia, Negative Campaigning",Tom Kertscher
2,barely-true,says 47 additional counties used software caus...,a Facebook post,Ted Nugent,"November 17, 2020","Elections, Facebook Fact-checks",Samantha Putterman
3,barely-true,voter fraud exposed georgia 2600 votes found,in a Live video,Facebook posts,"November 16, 2020","Georgia, Elections, Facebook Fact-checks",Daniel Funke
4,barely-true,says raphael warnock ran wife arrested obstruc...,a tweet,Erick Erickson,"November 13, 2020","Georgia, Candidate Biography, Crime, PunditFact",Tom Kertscher


In [7]:
quote = df['quote'].to_numpy()
label = df['label'].to_numpy()

In [8]:
# split the dataset into training and validation datasets 
train_x, valid_x, train_y, valid_y = model_selection.train_test_split(quote, label)

# label encode the target variable 
encoder = preprocessing.LabelEncoder()
train_y = encoder.fit_transform(train_y)
valid_y = encoder.fit_transform(valid_y)

In [9]:
print(valid_x)

['donald trump called military served disaster'
 'photo shows time magazine cover barack obama says treason'
 'says conor lamb ran campaign said nice things' ...
 'n95 masks block covid19 particles due size'
 'america owns 3 percent worlds oil consumes 25 percent global reserves'
 'personally prochoice always made qualms elected governor']


In [10]:
# word level tf-idf
tfidf_vect = TfidfVectorizer(analyzer='word', stop_words='english', max_df=0.7)
tfidf_vect.fit(quote)
xtrain_tfidf =  tfidf_vect.transform(train_x).toarray()
xvalid_tfidf =  tfidf_vect.transform(valid_x).toarray()

In [11]:
from sklearn.naive_bayes import GaussianNB
from time import time
t0 = time()
model = GaussianNB()
model.fit(xtrain_tfidf, train_y)
print(f"\nTraining time: {round(time()-t0, 3)}s")
t0 = time()
score_train = model.score(xtrain_tfidf, train_y)
print(f'Prediction time (train): {round(time()-t0, 3)}s')
t0 = time()
score_test = model.score(xvalid_tfidf, valid_y)
print(f'Prediction time (test): {round(time()-t0, 3)}s')
print('\nTrain set score:', score_train)
print('Test set score:', score_test)


Training time: 7.539s
Prediction time (train): 35.424s
Prediction time (test): 9.954s

Train set score: 0.7003094150581995
Test set score: 0.19646408839779006
