# Importing libraries

In [1]:
### Mount to drive ###
from google.colab import drive
drive.mount('/content/drive', force_remount=True)

import os
import re
import tqdm
import string
import unicodedata
import numpy as np
import pandas as pd
import tensorflow as tf
import matplotlib.pyplot as plt

### Tensorflow dependencies ###
from tensorflow.keras.layers import *
from tensorflow.keras.models import Model, Sequential
from tensorflow.keras.optimizers import Adam
from tensorflow.keras import backend as K
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.utils import Sequence # For custom data generator

### nltk dependencies ###
!python3 -m nltk.downloader stopwords
!python3 -m nltk.downloader wordnet 
!python3 -m nltk.downloader punkt
from nltk.corpus import stopwords  
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer 

### Some constants ###
true_path = '/content/drive/My Drive/Fake.csv'
fake_path  = '/content/drive/My Drive/True.csv'

Mounted at /content/drive
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Unzipping corpora/wordnet.zip.
[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.


# I. Loading and cleaning data

In [2]:
# Sample data with each class taking 10000 instances
df_fake = pd.read_csv(fake_path, header=0).sample(n=10000, random_state=np.random.randint(0,40000))
df_true = pd.read_csv(true_path, header=0).sample(n=10000, random_state=np.random.randint(0,40000))

In [3]:
# Concatenate the two data frames 
df_fake['label'] = 0
df_true['label'] = 1

df = pd.concat([df_fake, df_true])
df.index = list(range(len(df.index))) # re-index the dataframe
df # print out the data frame

Unnamed: 0,title,text,subject,date,label
0,Problems pile up for unlucky village near epic...,"HUAUTLA, Mexico (Reuters) - Yesenia Vergara gr...",worldnews,"September 24, 2017",0
1,Immigrants in Central Florida nervous about se...,"APOPKA, Fla. (Reuters) - With Hurricane Irma b...",worldnews,"September 9, 2017",0
2,U.S. appeals court will not put Trump travel b...,SAN FRANCISCO (Reuters) - A federal appeals co...,politicsNews,"February 27, 2017",0
3,Clinton will hold election night rally in New ...,WASHINGTON (Reuters) - U.S. Democratic preside...,politicsNews,"October 26, 2016",0
4,Senate Republican leader starts clock ticking ...,WASHINGTON (Reuters) - Senate Republican leade...,politicsNews,"April 4, 2017",0
...,...,...,...,...,...
19995,GERMANY’S DEFENSE MINISTER Refuses To Wear Hij...,Germany s defense minister refused to wear a t...,politics,"Dec 14, 2016",1
19996,HERE WE GO: GEORGIA POLITICIAN Calls for Remov...,The politicians are using our monuments to his...,politics,"Aug 15, 2017",1
19997,Conservative Conspiracy Theorist ARRESTED Aft...,"Lucy Richards, a Tampa woman who believes luna...",News,"December 8, 2016",1
19998,‘Might Have Been Faked By Liberals’: Top Advi...,"I mean, that headline, right? Did someone actu...",News,"August 8, 2017",1


## 1. Removing tags and accented chars

In [4]:
def cleanhtml(raw_html):
  cleanr = re.compile('<.*?>|&([a-z0-9]+|#[0-9]{1,6}|#x[0-9a-f]{1,6});')
  cleantext = re.sub(cleanr, '', raw_html)
  return cleantext

print('[INFO] Clearing html tags ...')
df['title'] = df['title'].apply(lambda x : cleanhtml(x))
df['text'] = df['text'].apply(lambda x : cleanhtml(x))
print('Done!')

[INFO] Clearing html tags ...
Done!


In [5]:
def remove_accents(input_str):
    nfkd_form = unicodedata.normalize('NFKD', input_str)
    only_ascii = nfkd_form.encode('ASCII', 'ignore')
    return only_ascii.decode()

print("[INFO] Removing accented characters ... ")
df['title'] = df['title'].apply(lambda x : remove_accents(x))
df['text'] = df['text'].apply(lambda x : remove_accents(x))
print("Done!")

[INFO] Removing accented characters ... 
Done!


## 2. Removing contractions

In [6]:
# Dictionary of English Contractions
contractions_dict = { "ain't": "are not","'s":" is","aren't": "are not",
                     "can't": "cannot","can't've": "cannot have",
                     "'cause": "because","could've": "could have","couldn't": "could not",
                     "couldn't've": "could not have", "didn't": "did not","doesn't": "does not",
                     "don't": "do not","hadn't": "had not","hadn't've": "had not have",
                     "hasn't": "has not","haven't": "have not","he'd": "he would",
                     "he'd've": "he would have","he'll": "he will", "he'll've": "he will have",
                     "how'd": "how did","how'd'y": "how do you","how'll": "how will",
                     "I'd": "I would", "I'd've": "I would have","I'll": "I will",
                     "I'll've": "I will have","I'm": "I am","I've": "I have", "isn't": "is not",
                     "it'd": "it would","it'd've": "it would have","it'll": "it will",
                     "it'll've": "it will have", "let's": "let us","ma'am": "madam",
                     "mayn't": "may not","might've": "might have","mightn't": "might not", 
                     "mightn't've": "might not have","must've": "must have","mustn't": "must not",
                     "mustn't've": "must not have", "needn't": "need not",
                     "needn't've": "need not have","o'clock": "of the clock","oughtn't": "ought not",
                     "oughtn't've": "ought not have","shan't": "shall not","sha'n't": "shall not",
                     "shan't've": "shall not have","she'd": "she would","she'd've": "she would have",
                     "she'll": "she will", "she'll've": "she will have","should've": "should have",
                     "shouldn't": "should not", "shouldn't've": "should not have","so've": "so have",
                     "that'd": "that would","that'd've": "that would have", "there'd": "there would",
                     "there'd've": "there would have", "they'd": "they would",
                     "they'd've": "they would have","they'll": "they will",
                     "they'll've": "they will have", "they're": "they are","they've": "they have",
                     "to've": "to have","wasn't": "was not","we'd": "we would",
                     "we'd've": "we would have","we'll": "we will","we'll've": "we will have",
                     "we're": "we are","we've": "we have", "weren't": "were not","what'll": "what will",
                     "what'll've": "what will have","what're": "what are", "what've": "what have",
                     "when've": "when have","where'd": "where did", "where've": "where have",
                     "who'll": "who will","who'll've": "who will have","who've": "who have",
                     "why've": "why have","will've": "will have","won't": "will not",
                     "won't've": "will not have", "would've": "would have","wouldn't": "would not",
                     "wouldn't've": "would not have","y'all": "you all", "y'all'd": "you all would",
                     "y'all'd've": "you all would have","y'all're": "you all are",
                     "y'all've": "you all have", "you'd": "you would","you'd've": "you would have",
                     "you'll": "you will","you'll've": "you will have", "you're": "you are",
                     "you've": "you have"}

# Regular expression for finding contractions
contractions_re=re.compile('(%s)' % '|'.join(contractions_dict.keys()))

# Function for expanding contractions
def expand_contractions(text,contractions_dict=contractions_dict):
    def replace(match):
        return contractions_dict[match.group(0)]
    return contractions_re.sub(replace, text)

print("[INFO] Expanding contraction ... ")
df['title'] = df['title'].apply(lambda x : expand_contractions(x))
df['text'] = df['text'].apply(lambda x : expand_contractions(x))
print("[INFO] Done!")

[INFO] Expanding contraction ... 
[INFO] Done!


## 3. Lowercasing text and punctuations removal

In [7]:
print('[INFO] Lowercasing text ... ')
df['title'] = df['title'].apply(lambda x : x.lower())
df['text'] = df['text'].apply(lambda x : x.lower())
print('[INFO] Done!')

[INFO] Lowercasing text ... 
[INFO] Done!


In [8]:
print('[INFO] Removing punctuations ... ')
df['title']=df['title'].apply(lambda x: re.sub('[%s]' % re.escape(string.punctuation), '', x))
df['text']=df['text'].apply(lambda x: re.sub('[%s]' % re.escape(string.punctuation), '', x))
print('[INFO] Done!')

[INFO] Removing punctuations ... 
[INFO] Done!


## 4. Removing special characters

In [9]:
print('[INFO] Removing non-text characters ...')
df['title'] = df['title'].apply(lambda x : re.sub('[^A-Za-z0-9]+', ' ', x))
df['text'] = df['text'].apply(lambda x : re.sub('[^A-Za-z0-9]+', ' ', x))
print('Done!')

[INFO] Removing non-text characters ...
Done!


In [10]:
print('[INFO] Removing extra spaces ...')
df['title']=df['title'].apply(lambda x: re.sub(' +',' ',x))
df['text']=df['text'].apply(lambda x: re.sub(' +',' ',x))
print('[INFO] Done!')

[INFO] Removing extra spaces ...
[INFO] Done!


In [11]:
print('[INFO] Removing digits ... ')
df['title']=df['title'].apply(lambda x: re.sub('\w*\d\w*','', x))
df['text']=df['text'].apply(lambda x: re.sub('\w*\d\w*','', x))
print('[INFO] Done!')

[INFO] Removing digits ... 
[INFO] Done!


## 5. Stemming & lemmatization and stopwords removal

In [12]:
stop_words = set(stopwords.words('english')) # Assuming all nltk data is installed
lemmatizer = WordNetLemmatizer()

def filter_stopwords(text):
    word_tokens = word_tokenize(text)
    filtered = [lemmatizer.lemmatize(w) for w in word_tokens if not w in stop_words]
    filtered_sentence = ' '.join(filtered)

    ### Removing all special characters ###
    special_chars = ['…', '–', '’', '‘', '”', '“']

    for char_ in special_chars:
      filtered_sentence = filtered_sentence.replace(char_, "")

    # Removal of extra spaces
    filtered_sentence = re.sub(' +',' ',filtered_sentence)
    return filtered_sentence

print('[INFO] Removing stop words and lemmatizing ... ')
df['title'] = df['title'].apply(lambda x : filter_stopwords(x))
df['text'] = df['text'].apply(lambda x : filter_stopwords(x))
print('[INFO] Done!')

[INFO] Removing stop words and lemmatizing ... 
[INFO] Done!


# II. Text data vectorization

In [13]:
features_text = df['text'].values
labels = df['label'].values
vfunc = np.vectorize(lambda x : len(x.split(' ')))
all = ' '.join(features_text)
vocab_size = len(list(set(all.split(" ")))) + 1
max_seq_len = vfunc(features_text).max()
max_vocab = 10000

tokenizer = Tokenizer(char_level=False, oov_token='<PAD>')
tokenizer.fit_on_texts(features_text)
features = tokenizer.texts_to_sequences(features_text)
features = pad_sequences(features, maxlen=max_seq_len)

print(np.max(np.array(features).flatten()))
print(f'[INFO] Vocabulary size {vocab_size}')
print(f'[INFO] Max sequence length : {max_seq_len}')

124427
[INFO] Vocabulary size 124428
[INFO] Max sequence length : 4561


In [None]:
def get_co_occurrence_frequency(x1, x2, corpus):
  if(x1 > 0 and x2 > 0): # if both words are non oov_token
    contains_x1 = [x for x in corpus if x1 in x]
    contains_x1_and_x2 = [x for x in contains_x1 if x2 in x]

    return len(contains_x1_and_x2)
  else:
    return 0

def get_docs_with_term(x, corpus):
  if(x > 0):
    contains_x = [x_ for x_ in corpus if x in x_]
    return np.array(contains_x)
  else:
    return np.array([])

word2word_cooccurrence = {}
with tqdm.tqdm(total=vocab_size) as pbar:
  for x in range(1, vocab_size):
    contains_x = [x_ for x_ in features if x in x_]
    contains_x = np.array(contains_x)
    co_occur_with_x = np.unique(contains_x.flatten())

    word2word_cooccurrence[x] = co_occur_with_x
    pbar.update(1)

class WordPairGenerator(Sequence):
  def __init__(self, corpus, vocab_size, batch_size):
    self.corpus = corpus 
    self.vocab_size = vocab_size 
    self.batch_size = batch_size
    self.vocabs = np.unique(corpus.flatten())

  def __len__(self):
    ''' Denotes number of batches per epoch '''
    return self.corpus.shape[0] // self.batch_size

  def __getitem__(self, index):
    ''' Generate one batch of data '''
    words1 = []
    words2 = []
    frequencies = []
    num_items = 0

    while(num_items < self.batch_size):
      x1 = np.random.randint(1, self.vocab_size + 1)
      docs_with_x1 = get_docs_with_term(x1, self.corpus)
      x2 = np.random.choice(np.unique(docs_with_x1.flatten()), 1)[0]

      frequency = get_co_occurrence_frequency(x1, x2, self.corpus)
      if(frequency == 0):
        continue

      words1.append(x1)
      words2.append(x2)
      frequencies.append(float(frequency))
      num_items += 1

    return [np.array(words1), np.array(words2)], np.array(frequencies)


  0%|          | 601/124428 [03:32<7:59:24,  4.30it/s]

# III. Model and loss function

In [None]:
X_MAX = 1000
a = 3.0 / 4.0

def glove_model(vocab_size=10, vector_dim=3):
    """
    A Keras implementation of the GloVe architecture
    :param vocab_size: The number of distinct words
    :param vector_dim: The vector dimension of each word
    :return:
    """
    input_target = Input((1,), name='central_word_id')
    input_context = Input((1,), name='context_word_id')

    central_embedding = Embedding(vocab_size, vector_dim, input_length=1)
    central_bias = Embedding(vocab_size, 1, input_length=1)

    context_embedding = Embedding(vocab_size, vector_dim, input_length=1)
    context_bias = Embedding(vocab_size, 1, input_length=1)

    vector_target = central_embedding(input_target)
    vector_context = context_embedding(input_context)

    bias_target = central_bias(input_target)
    bias_context = context_bias(input_context)

    dot_product = Dot(axes=-1)([vector_target, vector_context])
    dot_product = Reshape((1, ))(dot_product)
    bias_target = Reshape((1,))(bias_target)
    bias_context = Reshape((1,))(bias_context)

    prediction = Add()([dot_product, bias_target, bias_context])

    model = Model(inputs=[input_target, input_context], outputs=prediction)
    model.compile(loss=custom_loss, optimizer=Adam())

    return model


def custom_loss(y_true, y_pred):
    """
    This is GloVe's loss function
    :param y_true: The actual values, in our case the 'observed' X_ij co-occurrence values
    :param y_pred: The predicted (log-)co-occurrences from the model
    :return: The loss associated with this batch
    """
    return K.sum(K.pow(K.clip(y_true / X_MAX, 0.0, 1.0), a) * K.square(y_pred - K.log(y_true)), axis=-1)

In [None]:
model = glove_model(vocab_size=vocab_size, vector_dim=3)
train_generator = WordPairGenerator(features, vocab_size, 32)
model.fit(train_generator, epochs=10)

Epoch 1/10
 78/625 [==>...........................] - ETA: 1:05:30 - loss: 0.0116

KeyboardInterrupt: ignored