<a href="https://colab.research.google.com/github/iamdsc/HINT4/blob/master/flagNewsModel.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [0]:
# Download the zipped news corpus (CSV) from Google Cloud Storage.
# NOTE(review): the recorded output of this cell ends in ^C, so this run looks
# interrupted — confirm the archive is complete before relying on it.
!wget https://storage.googleapis.com/researchably-fake-news-recognition/news_cleaned_2018_02_13.csv.zip 

--2019-03-31 04:13:35--  https://storage.googleapis.com/researchably-fake-news-recognition/news_cleaned_2018_02_13.csv.zip
Resolving storage.googleapis.com (storage.googleapis.com)... 64.233.187.128, 2404:6800:4008:c02::80
Connecting to storage.googleapis.com (storage.googleapis.com)|64.233.187.128|:443... connected.
HTTP request sent, awaiting response... ^C


In [0]:
# Extract news_cleaned_2018_02_13.csv from the downloaded archive.
!unzip news_cleaned_2018_02_13.csv.zip

Archive:  news_cleaned_2018_02_13.csv.zip
  inflating: news_cleaned_2018_02_13.csv  


In [0]:
import pandas as pd

# Read only the first million rows and only the two columns used downstream.
news_df = pd.read_csv(
    'news_cleaned_2018_02_13.csv',
    usecols=['content','type'],
    nrows=1000000,
)

In [0]:
# Sanity check: dimensions of the loaded frame, then its first few rows.
print(news_df.shape)
print(news_df.head())

(1000000, 2)
    type                                            content
0  rumor  Life is an illusion, at least on a quantum lev...
1   hate  Unfortunately, he hasn’t yet attacked her for ...
2   hate  The Los Angeles Police Department has been den...
3   hate  The White House has decided to quietly withdra...
4   hate  “The time has come to cut off the tongues of t...


In [0]:
# Number of articles per label in the raw data.
(
    news_df
    .groupby(['type'])
    .size()
    .reset_index(name='counts')
)

Unnamed: 0,type,counts
0,2018-02-02 01:19:41.756664,34838
1,bias,111174
2,clickbait,15733
3,conspiracy,103646
4,fake,77302
5,hate,3218
6,junksci,14517
7,political,297562
8,reliable,2643
9,rumor,26916


In [0]:
# Show which columns were loaded and in what order.
print(news_df.columns)

Index(['type', 'content'], dtype='object')


In [0]:
# Discard rows whose label is a stray timestamp or 'unknown', keeping only
# the genuine category labels.
news_df = news_df.loc[
    ~news_df['type'].isin(['2018-02-02 01:19:41.756664','unknown'])
]

In [0]:
# Per-label article counts after the invalid labels were removed.
(
    news_df
    .groupby(['type'])
    .size()
    .reset_index(name='counts')
)

Unnamed: 0,type,counts
0,bias,111174
1,clickbait,15733
2,conspiracy,103646
3,fake,77302
4,hate,3218
5,junksci,14517
6,political,297562
7,reliable,2643
8,rumor,26916
9,satire,12853


In [0]:
# Drop every row that has a missing label or missing content.
# Reassigning (instead of inplace=True) yields a fresh frame, so the cell is
# safe to re-run and later column assignments do not operate on a slice of the
# previously filtered frame.
news_df = news_df.dropna(axis=0)
news_df.shape

(672089, 2)

In [0]:
import re
# preprocessing the content
# defining the helper functions
def cleanPunc(content):
    """Remove punctuation and special characters from a piece of text.

    Two passes are applied, each on the result of the previous one:

    1. ``? ! ' " #`` are deleted outright.
    2. ``. , ) ( | /`` are replaced by a single space so that the words on
       either side stay separated.

    The result is then stripped of leading/trailing whitespace and every
    remaining newline is turned into a space.

    Args:
        content: The text to clean (``str``).

    Returns:
        The cleaned text (``str``).
    """
    # Pass 1: characters that should simply vanish.
    cleaned = re.sub(r'[?!\'"#]', r'', content)
    # Pass 2 must run on `cleaned`, not on `content`, otherwise pass 1 is lost.
    cleaned = re.sub(r'[.,)(|/]', r' ', cleaned)
    cleaned = cleaned.strip()
    cleaned = cleaned.replace('\n', ' ')
    return cleaned

def keepAlpha(content):
    """Replace every run of non-alphabetic characters inside each word with a space.

    The text is split on whitespace, each word has its runs of characters
    other than ASCII letters (and spaces) collapsed to one space, the words
    are glued back together with single spaces, and the result is stripped.
    """
    scrubbed_words = (
        re.sub('[^a-z A-Z]+', ' ', token)
        for token in content.split()
    )
    return ' '.join(scrubbed_words).strip()
  
# Clean the article text in three steps: lower-case it, strip punctuation,
# then blank out anything that is not a letter.
news_df['content'] = (
    news_df['content']
    .str.lower()
    .apply(cleanPunc)
    .apply(keepAlpha)
)
print(news_df.head())

    type                                            content
0  rumor  life is an illusion at least on a quantum leve...
1   hate  unfortunately he hasn t yet attacked her for i...
2   hate  the los angeles police department has been den...
3   hate  the white house has decided to quietly withdra...
4   hate  the time has come to cut off the tongues of th...


In [0]:
import nltk
nltk.download('stopwords')
from nltk.corpus import stopwords

# Removing stop words
# Start from NLTK's English list and add a few number words and connectives.
stop_words=set(stopwords.words('english'))
stop_words.update(['zero','one','two','three','four','five','six','seven',
                   'eight','nine','ten','may','also','across','among','beside','however','yet','within'])
# Build one case-insensitive pattern that matches a whole stop word together
# with the character that follows it.
#  * sorted(...) gives a deterministic pattern (set iteration order is not).
#  * re.escape protects against entries containing regex metacharacters.
#  * (?:\W|$) also matches a stop word that is the very last token of a text,
#    which a bare trailing \W would miss.
re_stop_words=re.compile(
    r'\b(?:'
    + '|'.join(re.escape(word) for word in sorted(stop_words, key=lambda w: (-len(w), w)))
    + r')(?:\W|$)',
    re.I,
)

def removeStopWords(content):
    """Return `content` with every match of the stop-word pattern replaced by a space."""
    # Reading the module-level pattern needs no `global` declaration.
    return re.sub(re_stop_words, ' ', content)

# Strip stop words from every article, then preview the result.
news_df['content']=news_df['content'].apply(removeStopWords)
news_df.head()

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


Unnamed: 0,type,content
0,rumor,life illusion least quantum level theor...
1,hate,unfortunately attacked islamic terrorism...
2,hate,los angeles police department denied mill...
3,hate,white house decided quietly withdraw tie...
4,hate,time come cut tongues support peace ...


In [0]:
# Persist the pre-processed dataframe (not a model) to a pickle file.
news_df.to_pickle('./preprocessed_df.pkl')

In [0]:
# Read the pickle back and preview it to confirm the round trip.
clean_df = pd.read_pickle('./preprocessed_df.pkl')
clean_df.head()

Unnamed: 0,type,content
0,rumor,life illusion least quantum level theor...
1,hate,unfortunately attacked islamic terrorism...
2,hate,los angeles police department denied mill...
3,hate,white house decided quietly withdraw tie...
4,hate,time come cut tongues support peace ...


In [0]:
# Mount Google Drive at /content/gdrive so files stored there can be accessed.
# The google.colab package is used here, so this cell is Colab-specific.
from google.colab import drive
drive.mount('/content/gdrive')

In [0]:
# Copy the zipped pre-processed data from the mounted Drive into the
# current working directory.
%cp 'gdrive/My Drive/preprocessed_df.zip' './'

In [0]:
# Unpack the archive copied from Drive.
# presumably it contains preprocessed_df.pkl, which the next cell reads — verify.
!unzip preprocessed_df.zip

In [0]:
import pandas as pd

# Load the pre-processed dataframe from its pickle file.
# NOTE(review): unpickling executes arbitrary code — only load files you produced yourself.
df = pd.read_pickle('preprocessed_df.pkl')

In [0]:
# Confirm the number of rows and columns of the loaded frame.
print(df.shape)

(672089, 2)


In [0]:
# converting into binary classification
# fake: fake, junksci, unreliable
# real: clickbait, reliable, political

# removing the classes that belong to neither group
# (each excluded label is listed exactly once)
df = df[~df['type'].isin(['bias', 'satire', 'conspiracy', 'hate', 'rumor'])]

In [0]:
# Article counts for the labels that remain after the filter.
(
    df
    .groupby(['type'])
    .size()
    .reset_index(name='counts')
)

Unnamed: 0,type,counts
0,clickbait,15733
1,fake,77302
2,junksci,14517
3,political,297562
4,reliable,2643
5,unreliable,6525


In [0]:
# Map each remaining category name in the 'type' column to a binary label:
# 0 for clickbait / reliable / political, 1 for fake / junksci / unreliable.
replace_dict = {
    'type': {
        'clickbait': 0,
        'fake': 1,
        'junksci': 1,
        'reliable': 0,
        'unreliable': 1,
        'political': 0,
    },
}
df = df.replace(to_replace=replace_dict)

In [0]:
# Class balance after the labels were made binary.
(
    df
    .groupby(['type'])
    .size()
    .reset_index(name='counts')
)

Unnamed: 0,type,counts
0,0,315938
1,1,98344


In [0]:
# Balance the classes: take the first 90000 rows of label 0, then the first
# 90000 rows of label 1, and renumber the combined index.
new_df = pd.concat(
    [df[df.type==label][:90000] for label in (0, 1)],
    ignore_index=True,
)

In [0]:
# Verify that both classes now have the same number of rows.
(
    new_df
    .groupby(['type'])
    .size()
    .reset_index(name='counts')
)

Unnamed: 0,type,counts
0,0,90000
1,1,90000


In [0]:
# storing the first 100 words of articles
# Columns are addressed by name, so the result does not depend on the order
# in which 'type' and 'content' happen to appear in the frame.
labels = new_df['type'].tolist()
# Split on whitespace, keep at most the first 100 tokens, re-join with single spaces.
texts = [' '.join(text.split()[:100]) for text in new_df['content']]

In [0]:
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
import numpy as np

maxlen = 100 # cut off articles after 100 words
max_words = 10000 # only top 10000 words in dataset

# Fit the vocabulary on the truncated articles and turn each article into a
# sequence of integer word indices.
# NOTE(review): the tokenizer is fitted on all texts, including the rows that
# later become the test set — confirm this is acceptable.
tokenizer = Tokenizer(num_words=max_words)
tokenizer.fit_on_texts(texts)

sequences = tokenizer.texts_to_sequences(texts)

# Pad/truncate every sequence to exactly `maxlen` entries.
data = pad_sequences(sequences, maxlen=maxlen)
labels = np.array(labels)

print('Shape of data tensor:',data.shape)
print('Shape of label tensor:',labels.shape)

# Shuffle rows and labels with the same permutation. A dedicated, seeded
# generator makes the train/test split reproducible across kernel restarts
# without touching NumPy's global random state.
indices = np.random.RandomState(42).permutation(data.shape[0])
data = data[indices]
labels = labels[indices]

Using TensorFlow backend.


Shape of data tensor: (180000, 100)
Shape of label tensor: (180000,)


In [0]:
# Hold-out split on the shuffled data: the first 175000 rows are for training,
# the following 5000 rows are for testing.
training_samples, test_samples = 175000, 5000

x_train, y_train = data[:training_samples], labels[:training_samples]
print(len(x_train), len(y_train))

x_test, y_test = (
    arr[training_samples:training_samples+test_samples]
    for arr in (data, labels)
)
print(x_test.shape, y_test.shape)

175000 175000
(5000, 100) (5000,)


In [0]:
# required imports
from keras.models import Sequential
from keras import layers
from keras.optimizers import RMSprop
import matplotlib.pyplot as plt
from keras.callbacks import ModelCheckpoint

In [0]:
# Model: embedding -> 1D convolution -> max-pooling -> two stacked GRUs ->
# single sigmoid unit for the binary label.
model = Sequential([
    layers.Embedding(max_words, 128, input_length=maxlen),
    layers.Conv1D(32, 5, activation='relu'),
    layers.MaxPooling1D(3),
    layers.GRU(32, activation='relu', dropout=0.1, recurrent_dropout=0.5, return_sequences=True),
    layers.GRU(64, activation='relu', dropout=0.1, recurrent_dropout=0.5),
    layers.Dense(1, activation='sigmoid'),
])

model.summary()

model.compile(loss='binary_crossentropy', optimizer='rmsprop', metrics=['acc'])

# Checkpointing: write the model to disk whenever validation accuracy improves.
filepath="2nd-best-model-{epoch:02d}-{val_acc:.2f}.hdf5"
checkpoint = ModelCheckpoint(
    filepath,
    monitor='val_acc',
    mode='max',
    save_best_only=True,
    verbose=1,
)
callbacks_list = [checkpoint]

# Train for 20 epochs, holding back 20% of the training data for validation.
history = model.fit(
    x_train,
    y_train,
    batch_size=128,
    epochs=20,
    validation_split=0.2,
    callbacks=callbacks_list,
)

In [0]:
# plotting the results
def plot_results(history):
  """Plot training vs. validation accuracy and loss for a finished training run.

  Reads the 'acc', 'val_acc', 'loss' and 'val_loss' series from
  ``history.history`` and shows two figures: the accuracy curves first,
  then the loss curves, each against the 1-based epoch number.
  """
  recorded = history.history
  # Look up all four series before drawing anything, so a missing key fails early.
  charts = [
      (recorded['acc'], 'Training acc',
       recorded['val_acc'], 'Validation acc',
       'Training and validation accuracy'),
      (recorded['loss'], 'Training loss',
       recorded['val_loss'], 'Validation loss',
       'Training and validation loss'),
  ]

  epochs = range(1, len(charts[0][0])+1)

  for position, (train_series, train_label, val_series, val_label, title) in enumerate(charts):
    if position:
      # every chart after the first is drawn on a fresh figure
      plt.figure()
    plt.plot(epochs, train_series, label=train_label)
    plt.plot(epochs, val_series, label=val_label)
    plt.title(title)
    plt.legend()

  plt.show()

# Draw the accuracy and loss curves for the training run stored in `history`.
plot_results(history)

In [0]:
# Shapes of the held-out inputs and of their labels.
print(x_test.shape)
print(y_test.shape)

(5000, 100)
(5000, 100)


In [0]:
# loading the saved best model
from keras.models import load_model

# NOTE(review): this file name does not follow the "2nd-best-model-..." pattern
# used by the checkpoint callback in this notebook — confirm which training run
# produced the file being loaded.
loaded_model = load_model('best-model-05-0.96.hdf5')

# evaluate loaded model on test data
# Re-compiling attaches the loss and the accuracy metric used by evaluate().
loaded_model.compile(loss='binary_crossentropy', optimizer='rmsprop', metrics=['accuracy'])

# calculating accuracy on test data
# The print below reports index 1 of the evaluate() result, labelled with metrics_names[1].
score = loaded_model.evaluate(x_test, y_test, verbose=0)
print("%s: %.2f%%" % (loaded_model.metrics_names[1], score[1]*100))

acc: 96.56%


In [0]:
# Predict for the first test sample only; the one-row slice keeps the
# leading batch dimension that predict() receives in this cell.
loaded_model.predict(x_test[:1])

array([[1.]], dtype=float32)

In [0]:
# True label of the first test sample, for comparison with the prediction.
print(y_test[0])

1
