# Neural Network ML project

## Abstract

A collection of article headlines from the satire newspaper The Onion and headlines from the satire-of-satire subreddit r/nottheonion. Data originally collected by reddit user u/333luke.
#NOTE: Articles originally from the Onion are marked 1, and those from the subreddit are 0.

The model appeared to overfit after the first few epochs, but still achieved ~83% accuracy when tested using the test data set (which is probably a lot better than many of the readers of these subreddits!)

## Import libraries

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import re
sns.set()

import tensorflow as tf

## Get data

In [2]:
raw_data = pd.read_csv('OnionOrNot.csv') #1 = onion, 2=not onion
raw_data.head()

Unnamed: 0,text,label
0,Entire Facebook Staff Laughs As Man Tightens P...,1
1,Muslim Woman Denied Soda Can for Fear She Coul...,0
2,Bold Move: Hulu Has Announced That They’re Gon...,1
3,Despondent Jeff Bezos Realizes He’ll Have To W...,1
4,"For men looking for great single women, online...",1


### Get Rid of punctuation etc

In [3]:
data_no_punk = raw_data.copy()

data_no_punk['text'] = data_no_punk['text'].str.lower()
data_no_punk['text'] = data_no_punk['text'].str.replace(r'&amp','and')
data_no_punk['text'] = data_no_punk['text'].str.replace(r'-','')
data_no_punk['text'] = data_no_punk['text'].str.replace(r'[^\s\w]','')
#maybe drop articles in future, dunno how to write the code that won't gut other words tho

data_no_punk.head()

Unnamed: 0,text,label
0,entire facebook staff laughs as man tightens p...,1
1,muslim woman denied soda can for fear she coul...,0
2,bold move hulu has announced that theyre gonna...,1
3,despondent jeff bezos realizes hell have to wo...,1
4,for men looking for great single women online ...,1


In [4]:
data_no_punk.shape

(24000, 2)

## Convert words to dictionary and map words to numbers

In [5]:
vocab_set = set()
sentence_length = []

for i in range(len(data_no_punk)):
    sentence_words = re.split(r'\s',data_no_punk.iloc[i]['text'])
    vocab_set.update(sentence_words)
    sentence_length.append(len(sentence_words))


In [6]:
vocab_list = list(vocab_set)
vocab_dict = {vocab_list[i-1]: i for i in range(1,len(vocab_list)+1)}

In [7]:
max_length = max(sentence_length)

def toNumbers(row):
    words = re.findall(r'([\w]+)', row['text'])
    nums =  np.array([vocab_dict[words[j]] for j in range(len(words))])
    return np.pad(nums, (0, max_length - len(nums)), mode='constant')

In [8]:
nums = data_no_punk.apply(lambda row: toNumbers(row), axis=1) 
data_no_punk['nums'] = nums

data_no_punk['nums'].head()

0    [6433, 14431, 5611, 5486, 24624, 19381, 12651,...
1    [2263, 10071, 21918, 1821, 1547, 21085, 14043,...
2    [26813, 9210, 942, 4754, 24893, 13836, 23810, ...
3    [17005, 5570, 21566, 11499, 229, 6698, 24316, ...
4    [21085, 5606, 6070, 21085, 14006, 6820, 13132,...
Name: nums, dtype: object

In [9]:
data_preprocessed = data_no_punk.copy()


In [10]:
data_preprocessed.head()

Unnamed: 0,text,label,nums
0,entire facebook staff laughs as man tightens p...,1,"[6433, 14431, 5611, 5486, 24624, 19381, 12651,..."
1,muslim woman denied soda can for fear she coul...,0,"[2263, 10071, 21918, 1821, 1547, 21085, 14043,..."
2,bold move hulu has announced that theyre gonna...,1,"[26813, 9210, 942, 4754, 24893, 13836, 23810, ..."
3,despondent jeff bezos realizes hell have to wo...,1,"[17005, 5570, 21566, 11499, 229, 6698, 24316, ..."
4,for men looking for great single women online ...,1,"[21085, 5606, 6070, 21085, 14006, 6820, 13132,..."


## Split data set into train, validation and test


In [11]:
samples_count = data_preprocessed.shape[0]

train_samples_count = int(0.8 * samples_count)
validation_samples_count = int(0.1 * samples_count)
test_samples_count = samples_count - train_samples_count - validation_samples_count

train_inputs = tf.convert_to_tensor(data_preprocessed['nums'][:train_samples_count])
train_targets = tf.convert_to_tensor(data_preprocessed['label'][:train_samples_count])

validation_inputs = tf.convert_to_tensor(data_preprocessed['nums'][train_samples_count:train_samples_count+validation_samples_count])
validation_targets = tf.convert_to_tensor(data_preprocessed['label'][train_samples_count:train_samples_count+validation_samples_count])

test_inputs = tf.convert_to_tensor(data_preprocessed['nums'][train_samples_count+validation_samples_count:])
test_targets = tf.convert_to_tensor(data_preprocessed['label'][train_samples_count+validation_samples_count:])

In [12]:
print(np.sum(train_targets), train_samples_count, np.sum(train_targets) / train_samples_count)
print(np.sum(validation_targets), validation_samples_count, np.sum(validation_targets) / validation_samples_count)
print(np.sum(test_targets), test_samples_count, np.sum(test_targets) / test_samples_count)

7144 19200 0.3720833333333333
924 2400 0.385
932 2400 0.3883333333333333


In [13]:
np.savez('Onion_data_train', inputs=train_inputs, targets=train_targets)
np.savez('Onion_data_validation', inputs=validation_inputs, targets=validation_targets)
np.savez('Onion_data_test', inputs=test_inputs, targets=test_targets)

## Model

In [17]:
def get_compiled_model():

    model = tf.keras.Sequential([
        tf.keras.layers.Embedding(len(vocab_set)+1, 64),
        tf.keras.layers.Bidirectional(tf.keras.layers.LSTM(64,  return_sequences=True)),
        tf.keras.layers.Bidirectional(tf.keras.layers.LSTM(32)),
        tf.keras.layers.Dense(64, activation='relu'),
        tf.keras.layers.Dropout(0.5),
        tf.keras.layers.Dense(2, activation='softmax')
    ])


    model.compile(optimizer='adam',
                loss='sparse_categorical_crossentropy',
                metrics=['accuracy'])
    
    return model

early_stopping = tf.keras.callbacks.EarlyStopping(patience = 2)
max_epochs = 100
model = get_compiled_model()
model.fit(train_inputs,
          train_targets,
          batch_size=50,
          epochs=max_epochs, 
          callbacks = [early_stopping],
          validation_data = (validation_inputs,validation_targets),
          verbose=2)

Train on 19200 samples, validate on 2400 samples
Epoch 1/100
19200/19200 - 39s - loss: 0.4168 - accuracy: 0.8065 - val_loss: 0.3217 - val_accuracy: 0.8671
Epoch 2/100
19200/19200 - 33s - loss: 0.1674 - accuracy: 0.9394 - val_loss: 0.3711 - val_accuracy: 0.8658
Epoch 3/100
19200/19200 - 34s - loss: 0.0646 - accuracy: 0.9790 - val_loss: 0.5252 - val_accuracy: 0.8429


<tensorflow.python.keras.callbacks.History at 0x2386cb99dc8>

## Test!

In [18]:
test_loss, test_accuracy = model.evaluate(test_inputs, test_targets)
print('\nTest loss: {0:.2f}. Test accuracy: {1:.2f}%'.format(test_loss, test_accuracy*100.))


Test loss: 0.59. Test accuracy: 83.25%
