# Importing libraries

In [1]:
from google.colab import drive
drive.mount("/content/drive", force_remount=True)

import re
import string
import unicodedata
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt 

### Tensorflow dependencies ###
import tensorflow as tf
from tensorflow.keras.layers import *
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.models import Model, Sequential 
from tensorflow.keras import backend as K

### Scikit-learn dependencies ###
from sklearn.svm import LinearSVC
from sklearn.naive_bayes import MultinomialNB
from sklearn.linear_model import LogisticRegression
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, f1_score, confusion_matrix, classification_report

### nltk dependencies ###
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem.wordnet import WordNetLemmatizer

### Download nltk data ###
!python3 -m nltk.downloader punkt
!python3 -m nltk.downloader stopwords
!python3 -m nltk.downloader wordnet

### Some constants ###
true_path = "/content/drive/My Drive/True.csv"
fake_path = "/content/drive/My Drive/Fake.csv"

Mounted at /content/drive
[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


# Loading and preprocessing text data

### Loading data

In [2]:
df_true = pd.read_csv(true_path, header=0).sample(n=10000, random_state=np.random.randint(0,40000))
df_fake = pd.read_csv(fake_path, header=0).sample(n=10000, random_state=np.random.randint(0,40000))

df_true['label'] = 1
df_fake['label'] = 0
df = pd.concat([df_true, df_fake])

df.head(10)

Unnamed: 0,title,text,subject,date,label
12150,Protesters injured in Honduras clashes as elec...,TEGUCIGALPA (Reuters) - Over two dozen people ...,worldnews,"December 15, 2017",1
15510,Casualties in religious attacks in Afghanistan...,KABUL (Reuters) - Civilian casualties in relig...,worldnews,"November 7, 2017",1
4400,"U.S. attack on Syrian base killed five, Homs g...",BEIRUT (Reuters) - A U.S. missile strike on an...,politicsNews,"April 7, 2017",1
4050,U.S. says strategy on North Korea centers on s...,WASHINGTON (Reuters) - The Trump administratio...,politicsNews,"April 26, 2017",1
951,White House chief of staff calls for special c...,WASHINGTON (Reuters) - White House Chief of St...,politicsNews,"October 31, 2017",1
19706,China's highest-profile fugitive assailed by b...,BEIJING (Reuters) - China s highest profile fu...,worldnews,"September 19, 2017",1
8118,Mexico president following California marijuan...,(This Sept 15 story corrects “asked” in first...,politicsNews,"September 15, 2016",1
19864,"Venezuela's Maduro upbeat on talks, opposition...",CARACAS (Reuters) - President Nicolas Maduro h...,worldnews,"September 16, 2017",1
6046,Any UK-U.S. trade deal will put Britain first:...,LONDON (Reuters) - British Prime Minister Ther...,politicsNews,"January 25, 2017",1
2635,McCain illness deprives Senate of crucial vote...,WASHINGTON (Reuters) - If John McCain’s illnes...,politicsNews,"July 20, 2017",1


### Cleaning text data

In [3]:
def cleanhtml(raw_html):
  cleanr = re.compile('<.*?>|&([a-z0-9]+|#[0-9]{1,6}|#x[0-9a-f]{1,6});')
  cleantext = re.sub(cleanr, '', raw_html)
  return cleantext

def remove_accents(input_str):
    nfkd_form = unicodedata.normalize('NFKD', input_str)
    only_ascii = nfkd_form.encode('ASCII', 'ignore')
    return only_ascii.decode()

# Dictionary of English Contractions
contractions_dict = { "ain't": "are not","'s":" is","aren't": "are not",
                     "can't": "cannot","can't've": "cannot have",
                     "'cause": "because","could've": "could have","couldn't": "could not",
                     "couldn't've": "could not have", "didn't": "did not","doesn't": "does not",
                     "don't": "do not","hadn't": "had not","hadn't've": "had not have",
                     "hasn't": "has not","haven't": "have not","he'd": "he would",
                     "he'd've": "he would have","he'll": "he will", "he'll've": "he will have",
                     "how'd": "how did","how'd'y": "how do you","how'll": "how will",
                     "I'd": "I would", "I'd've": "I would have","I'll": "I will",
                     "I'll've": "I will have","I'm": "I am","I've": "I have", "isn't": "is not",
                     "it'd": "it would","it'd've": "it would have","it'll": "it will",
                     "it'll've": "it will have", "let's": "let us","ma'am": "madam",
                     "mayn't": "may not","might've": "might have","mightn't": "might not", 
                     "mightn't've": "might not have","must've": "must have","mustn't": "must not",
                     "mustn't've": "must not have", "needn't": "need not",
                     "needn't've": "need not have","o'clock": "of the clock","oughtn't": "ought not",
                     "oughtn't've": "ought not have","shan't": "shall not","sha'n't": "shall not",
                     "shan't've": "shall not have","she'd": "she would","she'd've": "she would have",
                     "she'll": "she will", "she'll've": "she will have","should've": "should have",
                     "shouldn't": "should not", "shouldn't've": "should not have","so've": "so have",
                     "that'd": "that would","that'd've": "that would have", "there'd": "there would",
                     "there'd've": "there would have", "they'd": "they would",
                     "they'd've": "they would have","they'll": "they will",
                     "they'll've": "they will have", "they're": "they are","they've": "they have",
                     "to've": "to have","wasn't": "was not","we'd": "we would",
                     "we'd've": "we would have","we'll": "we will","we'll've": "we will have",
                     "we're": "we are","we've": "we have", "weren't": "were not","what'll": "what will",
                     "what'll've": "what will have","what're": "what are", "what've": "what have",
                     "when've": "when have","where'd": "where did", "where've": "where have",
                     "who'll": "who will","who'll've": "who will have","who've": "who have",
                     "why've": "why have","will've": "will have","won't": "will not",
                     "won't've": "will not have", "would've": "would have","wouldn't": "would not",
                     "wouldn't've": "would not have","y'all": "you all", "y'all'd": "you all would",
                     "y'all'd've": "you all would have","y'all're": "you all are",
                     "y'all've": "you all have", "you'd": "you would","you'd've": "you would have",
                     "you'll": "you will","you'll've": "you will have", "you're": "you are",
                     "you've": "you have"}

# Regular expression for finding contractions
contractions_re=re.compile('(%s)' % '|'.join(contractions_dict.keys()))

# Function for expanding contractions
def expand_contractions(text,contractions_dict=contractions_dict):
    def replace(match):
        return contractions_dict[match.group(0)]
    return contractions_re.sub(replace, text)

stop_words = set(stopwords.words('english')) # Assuming all nltk data is installed
lemmatizer = WordNetLemmatizer()

def filter_stopwords(text):
    word_tokens = word_tokenize(text)
    filtered = [lemmatizer.lemmatize(w) for w in word_tokens if not w in stop_words]
    filtered_sentence = ' '.join(filtered)

    ### Removing all special characters ###
    special_chars = ['…', '–', '’', '‘', '”', '“']

    for char_ in special_chars:
      filtered_sentence = filtered_sentence.replace(char_, "")

    # Removal of extra spaces
    filtered_sentence = re.sub(' +',' ',filtered_sentence)
    return filtered_sentence

def preprocessing_text(text):
  ### 1. Removing tags and accented data ###
  text = cleanhtml(text)
  text = remove_accents(text)

  ### 2. Expanding contractions ###
  text = expand_contractions(text)

  ### 3. Removing punctuations + lowercasing ###
  text = text.lower()
  text = re.sub('[%s]' % re.escape(string.punctuation), '', text)

  ### 4. Removing special characters ### 
  #text = re.sub('[^A-Za-z0-9]+', ' ', text) # Remove non-text chars
  #text = re.sub(' +',' ',text) # Remove extra spaces
  #text = re.sub('\w*\d\w*','', text) # Remove digits
  
  ### 5. Lemmatization and stopwords removal ###
  text = filter_stopwords(text)

  return text

In [4]:
df['title'] = df['title'].apply(lambda x : preprocessing_text(x))
df['text']  = df['text'].apply(lambda x : preprocessing_text(x))
df['title'].head(10)

12150    protester injured honduras clash electoral cri...
15510    casualty religious attack afghanistan rise ste...
4400     u attack syrian base killed five homs governor...
4050     u say strategy north korea center sanction ope...
951      white house chief staff call special counsel p...
19706    china highestprofile fugitive assailed busines...
8118     mexico president following california marijuan...
19864    venezuela maduro upbeat talk opposition fear i...
6046              ukus trade deal put britain first pm may
2635     mccain illness deprives senate crucial vote tr...
Name: title, dtype: object

# Feature engineering methods comparision

In [None]:
count_vec = CountVectorizer(min_df=5)
bi_count_vec = CountVectorizer(min_df=3, ngram_range=(1,2))
tf_idf_vec = TfidfVectorizer(min_df=5)

features_text = df['text'].values
targets = df['label'].values

### Using Bag-of-words ###
model = LogisticRegression()
features = count_vec.fit_transform(features_text).toarray()
x_train, x_val, y_train, y_val = train_test_split(features, targets, test_size=0.33333)
model.fit(x_train, y_train)
predictions = model.predict(x_val)
accuracy_bow = accuracy_score(y_val, predictions)
print(f'[INFO] Accuracy of logistic regression when using BOW : {accuracy_bow}')

### Using N-gram Bow ###
model = LogisticRegression()
features = bi_count_vec.fit_transform(features_text).toarray()
x_train, x_val, y_train, y_val = train_test_split(features, targets, test_size=0.33333)
model.fit(x_train, y_train)
predictions = model.predict(x_val)
accuracy_ngram = accuracy_score(y_val, predictions)
print(f'[INFO] Accuracy of logistic regression when using Ngram-BOW : {accuracy_ngram}')

### Using Tf-Idf ###
model = LogisticRegression()
features = tf_idf_vec.fit_transform(features_text).toarray()
x_train, x_val, y_train, y_val = train_test_split(features, targets, test_size=0.33333)
model.fit(x_train, y_train)
predictions = model.predict(x_val)
accuracy_tf_idf = accuracy_score(y_val, predictions)
print(f'[INFO] Accuracy of logistic regression when using BOW : {accuracy_tf_idf}')

[INFO] Accuracy of logistic regression when using BOW : 0.9946002699865006
