In [268]:
#Import all the necessary libraries

import pandas as pd
import numpy as np

from sklearn.pipeline import make_pipeline
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import MultinomialNB
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer

import nltk
import string
from nltk import word_tokenize
from nltk.corpus import stopwords

from sklearn.metrics import classification_report

In [269]:
#Load the four JSON files

politifact_hf = pd.read_json('../raw_data/politifact_hf.json', orient='index')
politifact_hr = pd.read_json('../raw_data/politifact_hr.json', orient='index')

gossipcop_hf = pd.read_json('../raw_data/gossipcop_hf.json', orient='index')
gossipcop_hr = pd.read_json('../raw_data/gossipcop_hr.json', orient='index')

In [270]:
#Add results column

politifact_hf[['fake']] = 1
politifact_hr[['fake']] = 0

gossipcop_hf[['fake']] = 1
gossipcop_hr[['fake']] = 0

In [271]:
#Create dataframe

files = [politifact_hf, politifact_hr, gossipcop_hf, gossipcop_hr]

data = pd.concat(files, ignore_index=True)
data

Unnamed: 0,id,text,title,description,fake
0,politifact11773,Republican attacks on transgendered Americans ...,Virginia Republican Wants Schools To Check Chi...,Republican attacks on transgendered Americans ...,1
1,politifact13827,Whoopi Goldberg is in hot water after comments...,Whoopi Goldberg: Navy SEAL Widow was “Looking ...,Whoopi Goldberg is in hot water after comments...,1
2,politifact13570,"Washington, DC — A former Secret Service agent...",Secret Service Agent Says Obama Is Muslim & Ga...,"Washington, DC — A former Secret Service agent...",1
3,politifact14947,Bill Clinton’s hitman has confessed to more th...,Bill Clinton’s Hitman Confesses On His Deathbe...,Bill Clinton’s hitman has confessed to more th...,1
4,politifact14517,About Trendolizer™\n\nTrendolizer™ (patent pen...,UPDATE: Florida Governor Rick Scott Now Listed...,Scott&#8217;s prognosis isn&#8217;t good. (via...,1
...,...,...,...,...,...
12538,gossipcop-875489,For free real time breaking news alerts sent s...,The top interior design trends for millennials,From hand-baked clay tiles to LED lights that ...,0
12539,gossipcop-844263,Gilmore Girls: A Year in the Life made its Net...,"Gilmore Girls Video: Lauren Graham, Alexis Ble...",Gilmore Girls: A Year in the Life made its Net...,0
12540,gossipcop-917467,Why Is It Airing Now?\n\nAccording to the exec...,"The O.J. Simpson Interview on Fox: Gripping, G...",On Sunday Fox aired “O.J. Simpson: The Lost Co...,0
12541,gossipcop-924877,Just when you thought this season of Vanderpum...,Kristen Doute and James Kennedy Hooked Up Rumo...,Just when you thought this season of Vanderpum...,0


In [272]:
def remove_duplicates_errors(data: pd.DataFrame) -> pd.DataFrame:
    """
    Clean raw data by
    - removing duplicates within fake-category (keep 1)
    - removing duplicates across fake-categories (delete both)
    - deleting texts that are shorter than their title (error messages, headers etc.)
    """
    # Remove duplicates within fake-category
    data =data.drop_duplicates(subset=("text", "fake"), keep='first', ignore_index=True)

    # Remove duplicates across fake-category
    data=data.drop_duplicates(subset=("text"), keep=False, ignore_index=True)

    # Delete false texts
    data["text_len"] = data['text'].str.len()
    data["title_len"]=data['title'].str.len()
    data=data[data["text_len"]>=data["title_len"]]

    return data

In [273]:
data = remove_duplicates_errors(data)
data.shape

(11233, 7)

In [274]:
true = data[data['fake'] == 0].sample(n=3500)
true.shape

(3500, 7)

In [275]:
false = data[data['fake'] == 1].sample(n=3500)
false.shape

(3500, 7)

In [276]:
files = [true, false]

data = pd.concat(files, ignore_index=True)
data.shape

(7000, 7)

In [277]:
#nltk.download('stopwords')
#nltk.download('punkt')
#nltk.download('wordnet')
#nltk.download('omw-1.4')
#nltk.download('punkt_tab')

In [278]:
def preprocessing(text):

    # Removing whitespaces
    text = text.strip()
    # Lowercasing
    text = text.lower()
    # Removing numbers
    text = ''.join(char for char in text if not char.isdigit())
    # Removing punctuation
    for punctuation in string.punctuation:
        text = text.replace(punctuation, '')
    # Tokenizing
    tokenized = word_tokenize(text)
    
    # Removing stopwords
    stop_words = set(stopwords.words('english')) 
    without_stopwords = [word for word in tokenized if not word in stop_words]
    
    cleaned_sentence = " ".join(without_stopwords)
    
    return cleaned_sentence

In [279]:
#X = data.text.apply(preprocessing)

In [280]:
# Split the dataframe into features and target

X = data['text']
y = data['fake']

In [281]:
# Check the balance of the dataset

round(y.value_counts(normalize = True), 2)

fake
0    0.5
1    0.5
Name: proportion, dtype: float64

In [282]:
# Create a train/test split

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.20, random_state=0)

In [283]:
# Create the model pipeline

pipe = make_pipeline(CountVectorizer(), MultinomialNB())
pipe

In [284]:
# Train the pipeline and score

pipe.fit(X_train, y_train)

#pipe.score(X_test, y_test)

In [285]:
y_pred = pipe.predict(X_test)

classification_report(y_test, y_pred, output_dict=True)

{'0': {'precision': 0.7847113884555382,
  'recall': 0.7321688500727802,
  'f1-score': 0.7575301204819277,
  'support': 687.0},
 '1': {'precision': 0.7575757575757576,
  'recall': 0.8064516129032258,
  'f1-score': 0.78125,
  'support': 713.0},
 'accuracy': 0.77,
 'macro avg': {'precision': 0.771143573015648,
  'recall': 0.7693102314880029,
  'f1-score': 0.7693900602409638,
  'support': 1400.0},
 'weighted avg': {'precision': 0.7708915993003357,
  'recall': 0.77,
  'f1-score': 0.7696103162650603,
  'support': 1400.0}}