## Convolutional DNN

In [1]:
import numpy as np
import pandas as pd
%matplotlib inline
import matplotlib.pyplot as plt

In [2]:
MAX_SEQUENCE_LENGTH = 5000
MAX_NUM_WORDS = 25000
EMBEDDING_DIM = 300
TEST_SPLIT = 0.2
SAVE_MODEL_PATH = './save/model/'
TEXT_DATA = 'data/fake_or_real_news.csv'

In [3]:
# read in our data and preprocess it

df = pd.read_csv(TEXT_DATA)
df.drop(labels=['id','title'], axis='columns', inplace=True)
# only select stories with lengths gt 0 -- there are some texts with len = 0
mask = list(df['text'].apply(lambda x: len(x) > 0))
df = df[mask]

In [4]:
# prepare text samples and their labels

texts = df['text']
labels = df['label']

print('Found %s texts.' %texts.shape[0])

Found 6335 texts.


In [5]:
# print("Sentences for articale 1: ",texts[0].split('\n').__len__())

In [6]:
# vectorize the text samples into a 2D integer tensor 
import keras
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
from keras.utils import to_categorical

tokenizer = Tokenizer(num_words=MAX_NUM_WORDS)
tokenizer.fit_on_texts(texts)
sequences = tokenizer.texts_to_sequences(texts)

word_index = tokenizer.word_index
num_words = min(MAX_NUM_WORDS, len(word_index)) + 1
data = pad_sequences(sequences, 
                     maxlen=MAX_SEQUENCE_LENGTH, 
                     padding='pre', 
                     truncating='pre')

print('Found %s unique tokens.' % len(word_index))
print('Shape of data tensor:', data.shape)
print('Shape of label tensor:', labels.shape)

Using TensorFlow backend.


Found 99682 unique tokens.
Shape of data tensor: (6335, 5000)
Shape of label tensor: (6335,)


In [7]:
# split the data into a training set and a validation set   

from sklearn.model_selection import train_test_split

x_train, x_val, y_train, y_val = train_test_split(data, 
                                                  labels.apply(lambda x: 0 if x == 'FAKE' else 1), 
                                                  test_size=TEST_SPLIT)

In [8]:
# build a 1D convnet with global maxpooling                                                                      

from keras import layers
from keras.models import Sequential

model = Sequential(
    [
        # part 1: word and sequence processing
        layers.Embedding(num_words,
                         EMBEDDING_DIM, 
                         input_length=MAX_SEQUENCE_LENGTH,
                         trainable=True),
        layers.Conv1D(128, 5, activation='relu'),
        layers.GlobalMaxPooling1D(),
        
        # part 2: classification
        layers.Dense(128, activation='relu'),
        layers.Dense(1, activation='sigmoid')
    ])

model.compile(loss='binary_crossentropy',
              optimizer='rmsprop',
              metrics=['accuracy'])

model.summary()

Instructions for updating:
Use tf.where in 2.0, which has the same broadcast rule as np.where
Model: "sequential_1"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_1 (Embedding)      (None, 5000, 300)         7500300   
_________________________________________________________________
conv1d_1 (Conv1D)            (None, 4996, 128)         192128    
_________________________________________________________________
global_max_pooling1d_1 (Glob (None, 128)               0         
_________________________________________________________________
dense_1 (Dense)              (None, 128)               16512     
_________________________________________________________________
dense_2 (Dense)              (None, 1)                 129       
Total params: 7,709,069
Trainable params: 7,709,069
Non-trainable params: 0
_________________________________________________________________


In [9]:
# define a function that allows us to evaluate our models

from sklearn.metrics import accuracy_score

def evaluate_model(predict_fun, X_train, y_train, X_test, y_test):
    '''
    evaluate the model, both training and testing errors are reported
    '''
    # training error
    y_predict_train = predict_fun(X_train)
    train_acc = accuracy_score(y_train,y_predict_train)
    
    # testing error
    y_predict_test = predict_fun(X_test)
    test_acc = accuracy_score(y_test,y_predict_test)
    
    return train_acc, test_acc

In [27]:
# define a function that allows us to evaluate our models

from sklearn.metrics import accuracy_score

def evaluate_ourfakes(model, Fake_texts):
    '''
    evaluate the model, both training and testing errors are reported
    '''
    missclasified_fakes = []
    correct_clasified_fakes = []
    def predict(X):
        return np.rint(model.predict(X))
    # training error
    y_predict = predict(Fake_texts)
    y = np.zeros(len(y_predict))
    missclasified_fakes.append(Fake_texts[i]) if y_predict[i] == 1 else correct_clasified_fakes.append(Fake_texts[i])
    acc = accuracy_score(y,y_predict) #model.evaluate(Fake_texts,y)
    return acc, missclasified_fakes, correct_clasified_fakes

In [21]:
model.load_weights(SAVE_MODEL_PATH+'model_5.h5') # replace X with the last saved checkpoint number


In [22]:
faker = list(df['label'].apply(lambda x: x == 'FAKE'))
fakes = df[faker]
fake_texts = fakes['text']
# fake_labels = fakes['label']

In [23]:
fake_text_vectors = tokenizer.texts_to_sequences(fake_texts)
fake_text_vectors = pad_sequences(fake_text_vectors, 
                     maxlen=MAX_SEQUENCE_LENGTH, 
                     padding='pre', 
                     truncating='pre')

In [25]:
sum(model.predict(fake_text_vectors))

array([3.8440177], dtype=float32)

In [28]:
acc, missclasified_fakes, correct_clasified_fakes = evaluate_ourfakes(model, fake_text_vectors)

NameError: name 'i' is not defined

In [None]:
tokenizer.sequences_to_texts(missclasified_fakes[0])

In [None]:
print(acc)

In [None]:
print(len(missclasified_fakes),len(correct_clasified_fakes))