In [1]:
# loading the necessary libraries

import json
import tensorflow as tf
import numpy as np
import pandas as pd
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences

2023-01-02 11:25:16.837428: I tensorflow/stream_executor/platform/default/dso_loader.cc:49] Successfully opened dynamic library libcudart.so.10.1


In [2]:
# Read the raw headline data set into a DataFrame and preview the first rows

data_path = 'sarcasm.json'
df = pd.read_json(data_path)

df.head()

Unnamed: 0,article_link,headline,is_sarcastic
0,https://www.huffingtonpost.com/entry/versace-b...,former versace store clerk sues over secret 'b...,0
1,https://www.huffingtonpost.com/entry/roseanne-...,the 'roseanne' revival catches up to our thorn...,0
2,https://local.theonion.com/mom-starting-to-fea...,mom starting to fear son's web series closest ...,1
3,https://politics.theonion.com/boehner-just-wan...,"boehner just wants wife to listen, not come up...",1
4,https://www.huffingtonpost.com/entry/jk-rowlin...,j.k. rowling wishes snape happy birthday in th...,0


In [3]:
# The article link is not needed as a feature, so drop it and then check the work.
# NOTE - DataFrame.drop returns a new frame and leaves the original untouched
# (unless inplace=True is passed), so remember to assign the result to a
# variable name, else you'll still have that column you thought you removed.
# That totally didn't happen to me.

df = df.drop(columns=['article_link'])
df.head()

Unnamed: 0,headline,is_sarcastic
0,former versace store clerk sues over secret 'b...,0
1,the 'roseanne' revival catches up to our thorn...,0
2,mom starting to fear son's web series closest ...,1
3,"boehner just wants wife to listen, not come up...",1
4,j.k. rowling wishes snape happy birthday in th...,0


In [4]:
# Checking data distribution: share of rows labelled 1, plus the total row count

label_counts = df['is_sarcastic'].value_counts()
is_sarcastic = label_counts[1]
n_headlines = len(df)
print(is_sarcastic/n_headlines, n_headlines)

0.43895316185555433 26709


### 44% of the headlines are satirical, which makes for a fairly balanced data set. 

## Train / Test / Validate Split

The tutorial I'm using for this only focuses on a Train/Test split, but I'm hoping to do some tweaking and test the accuracy a bit. So, for that I'm going to divide into three groups. First, I want a validation set, a decent enough group to test against my final trained model just to see how it performs with brand new data. 

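Then, I want a test set, which I will use to see how well each model iteration learns. We have a decent amount of data here (26k items for a binary classifier), so I'm holding out 20% for the validation set, and then splitting the remaining 80% into testing (25%) and training (75%), so the test set ends up the same size as the validation set. (Note that the variable names in the code below are the other way around: the 20% holdout is stored in `df_test`, and `df_val` is the set checked after every epoch.) That should give me: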

Validation - 5342
Testing - 5342
Training - 16025

If the model has trouble learning, I can tweak those down to 15/15/70 or even 12/12/76

In [5]:
from sklearn.model_selection import train_test_split

# Hold out 20% of the rows in df_test; the other 80% stays in df_train_full
df_train_full, df_test = train_test_split(
    df,
    test_size=0.2,
    random_state=42,
)

In [6]:
# Row counts of the two parts produced by the first split
print(*map(len, (df_train_full, df_test)))

21367 5342


In [7]:
# Second split: a quarter of df_train_full goes to df_val, the rest to df_train
df_train, df_val = train_test_split(
    df_train_full,
    test_size=0.25,
    random_state=42,
)

# Checking my math
split_sizes = [len(part) for part in (df_val, df_test, df_train)]
print(*split_sizes)

5342 5342 16025


In [8]:
# Printing this so that, after I drop the target variable from the training
# frame, I can check that the headlines and labels still line up
df_train.head()

Unnamed: 0,headline,is_sarcastic
18631,uncle strikes out hard with book gift,1
24365,cities in this state have the worst smog,0
8924,the world economic forum is giving goosebumps ...,0
23860,period of time in which parents proud of how m...,1
11689,why shrimp scampi has been on america's mind a...,0


In [9]:
# Split the label column off each frame: pop() hands back the column and removes
# it from the frame, and .values gives the labels as a NumPy array
y_train = df_train.pop('is_sarcastic').values
y_val = df_val.pop('is_sarcastic').values

In [10]:
# with these separated out, just making sure that things line up

preview_headlines = df_train['headline'].iloc[:5]
for row_number in range(5):
    headline_text = preview_headlines.iloc[row_number]
    label = y_train.item(row_number)
    print(headline_text, "value: ", label)

uncle strikes out hard with book gift value:  1
cities in this state have the worst smog value:  0
the world economic forum is giving goosebumps to some 'game of thrones' fans value:  0
period of time in which parents proud of how much child can eat quickly dwindling value:  1
why shrimp scampi has been on america's mind all week value:  0


The values match. So, we're good to proceed.

In [11]:
# Tokenizer settings: vocabulary cap and the token used for out-of-vocabulary words
vocab_size = 10000
oov_tok = "<oov>"

# Sequence settings: pad and truncate at the end, to a fixed length of 100
max_length = 100
trunc_type = 'post'
padding_type = 'post'

# The vocabulary is fitted on the training headlines only
tokenizer = Tokenizer(num_words=vocab_size, oov_token=oov_tok)
tokenizer.fit_on_texts(df_train['headline'])

word_index = tokenizer.word_index


def _to_padded(sequences):
    """Pad/truncate integer sequences to max_length using the settings above."""
    return pad_sequences(sequences, maxlen=max_length, padding=padding_type, truncating=trunc_type)


train_sequences = tokenizer.texts_to_sequences(df_train['headline'])
train_padded = _to_padded(train_sequences)

val_sequences = tokenizer.texts_to_sequences(df_val['headline'])
val_padded = _to_padded(val_sequences)

Had to modify the tutorial again since I am working with a dataframe instead of a list, but we have tokenized values. 

Of note, I had to specify the column I wanted to tokenize, else things got wonky. Also, when running locally, the tokenizing process happened so quickly that I suspected it hadn't worked, and I spent several minutes second-guessing it before finally outputting the final padded sequences to check.

In [12]:
# Create the model

embedding_dim = 16

# Embedding -> average over the sequence axis -> small dense layer -> sigmoid output
model_layers = [
    tf.keras.layers.Embedding(vocab_size, embedding_dim, input_length=max_length),
    tf.keras.layers.GlobalAveragePooling1D(),
    tf.keras.layers.Dense(24, activation='relu'),
    tf.keras.layers.Dense(1, activation='sigmoid'),
]
model = tf.keras.Sequential(model_layers)

model.compile(
    optimizer='adam',
    loss='binary_crossentropy',
    metrics=['accuracy'],
)

model.summary()

2023-01-02 11:27:04.030790: I tensorflow/compiler/jit/xla_cpu_device.cc:41] Not creating XLA devices, tf_xla_enable_xla_devices not set
2023-01-02 11:27:04.061497: I tensorflow/stream_executor/platform/default/dso_loader.cc:49] Successfully opened dynamic library libcuda.so.1
2023-01-02 11:27:04.403676: E tensorflow/stream_executor/cuda/cuda_gpu_executor.cc:927] could not open file to read NUMA node: /sys/bus/pci/devices/0000:10:00.0/numa_node
Your kernel may have been built without NUMA support.
2023-01-02 11:27:04.426794: I tensorflow/core/common_runtime/gpu/gpu_device.cc:1720] Found device 0 with properties: 
pciBusID: 0000:10:00.0 name: NVIDIA GeForce GTX 1060 3GB computeCapability: 6.1
coreClock: 1.7085GHz coreCount: 9 deviceMemorySize: 3.00GiB deviceMemoryBandwidth: 178.99GiB/s
2023-01-02 11:27:04.426867: I tensorflow/stream_executor/platform/default/dso_loader.cc:49] Successfully opened dynamic library libcudart.so.10.1
2023-01-02 11:27:05.363515: I tensorflow/stream_executor/pl

Model: "sequential"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding (Embedding)        (None, 100, 16)           160000    
_________________________________________________________________
global_average_pooling1d (Gl (None, 16)                0         
_________________________________________________________________
dense (Dense)                (None, 24)                408       
_________________________________________________________________
dense_1 (Dense)              (None, 1)                 25        
Total params: 160,433
Trainable params: 160,433
Non-trainable params: 0
_________________________________________________________________


In [13]:
# Convert the training and validation inputs and labels to NumPy arrays for TensorFlow

train_padded = np.array(train_padded)
y_train = np.array(y_train)
val_padded = np.array(val_padded)
y_val = np.array(y_val)

In [14]:
# Train the model

num_epochs = 8

# df_val (val_padded / y_val) is scored after every epoch
history = model.fit(
    train_padded,
    y_train,
    epochs=num_epochs,
    validation_data=(val_padded, y_val),
    verbose=2,
)

2023-01-02 11:27:36.548080: I tensorflow/compiler/mlir/mlir_graph_optimization_pass.cc:116] None of the MLIR optimization passes are enabled (registered 2)
2023-01-02 11:27:36.854609: I tensorflow/core/platform/profile_utils/cpu_utils.cc:112] CPU Frequency: 3593145000 Hz


Epoch 1/8


2023-01-02 11:27:42.946369: I tensorflow/stream_executor/platform/default/dso_loader.cc:49] Successfully opened dynamic library libcublas.so.10


501/501 - 78s - loss: 0.6805 - accuracy: 0.5604 - val_loss: 0.6607 - val_accuracy: 0.5610
Epoch 2/8
501/501 - 7s - loss: 0.5255 - accuracy: 0.7574 - val_loss: 0.4280 - val_accuracy: 0.8160
Epoch 3/8
501/501 - 7s - loss: 0.3411 - accuracy: 0.8681 - val_loss: 0.3765 - val_accuracy: 0.8272
Epoch 4/8
501/501 - 6s - loss: 0.2780 - accuracy: 0.8929 - val_loss: 0.3421 - val_accuracy: 0.8486
Epoch 5/8
501/501 - 8s - loss: 0.2363 - accuracy: 0.9120 - val_loss: 0.3393 - val_accuracy: 0.8502
Epoch 6/8
501/501 - 8s - loss: 0.2033 - accuracy: 0.9266 - val_loss: 0.3401 - val_accuracy: 0.8519
Epoch 7/8
501/501 - 9s - loss: 0.1778 - accuracy: 0.9366 - val_loss: 0.3543 - val_accuracy: 0.8502
Epoch 8/8
501/501 - 8s - loss: 0.1545 - accuracy: 0.9451 - val_loss: 0.3604 - val_accuracy: 0.8512


Now validation accuracy is right at 85%, rather than the ~80% we were seeing with longer training. Feel good about stopping there for the day. 

In [15]:
# Save the trained model to disk under the name "satire_headline.h5"
model.save("satire_headline.h5")

## Same Task, But with NLTK

Deploying Tensorflow is a bit of a bear. I'm going to follow the book and try and get Tensorflow Lite running on Django, but it's clear that I need to learn Tensorflow Serving to really get Tensorflow in production. 

**However** what is clear is that Tensorflow also might be overkill for a pretty basic NLP-focused binary classification system. Towards that end, I want to do the same thing over, this time using the Natural Language Tool Kit (NLTK). 

At just 3.8 MB, NLTK is more than 10 times smaller than TF Lite and more than 300 times smaller than TensorFlow. 

Let's see what kind of accuracy I can get. 

In [18]:
'''It looks like NLTK works based on a list of tuples. So, my first step is to turn my nice modern dataframe into a list
of tuples.

But first, I want to convert the labels into words.
'''

# NOTE: plain assignment does not copy. df2 is a second name for the same
# DataFrame object as df, so any change made through df2 is also visible via df.
df2 = df
df2.head()

Unnamed: 0,headline,is_sarcastic
0,former versace store clerk sues over secret 'b...,0
1,the 'roseanne' revival catches up to our thorn...,0
2,mom starting to fear son's web series closest ...,1
3,"boehner just wants wife to listen, not come up...",1
4,j.k. rowling wishes snape happy birthday in th...,0


In [22]:
# Swap the numeric labels for readable class names
label_names = {0: "not_satire", 1: "satire"}
df2['is_sarcastic'] = df2['is_sarcastic'].replace(label_names)
df2.head()

Unnamed: 0,headline,is_sarcastic
0,former versace store clerk sues over secret 'b...,not_satire
1,the 'roseanne' revival catches up to our thorn...,not_satire
2,mom starting to fear son's web series closest ...,satire
3,"boehner just wants wife to listen, not come up...",satire
4,j.k. rowling wishes snape happy birthday in th...,not_satire


In [23]:
# Build the (headline, label) tuples from df2, the frame whose labels were just
# converted to words. index=False leaves the row index out of each tuple and
# name=None yields plain tuples instead of namedtuples.
headlines = list(df2.itertuples(index=False, name=None))
print(headlines[:5])

[("former versace store clerk sues over secret 'black code' for minority shoppers", 'not_satire'), ("the 'roseanne' revival catches up to our thorny political mood, for better and worse", 'not_satire'), ("mom starting to fear son's web series closest thing she will have to grandchild", 'satire'), ('boehner just wants wife to listen, not come up with alternative debt-reduction ideas', 'satire'), ('j.k. rowling wishes snape happy birthday in the most magical way', 'not_satire')]


Well, that was easy. Now to build a NLTK-powered Naive Bayes classifier. 

First step, build the token library. 

In [26]:
import nltk 
from nltk import word_tokenize
# Fetch the 'punkt' tokenizer data (presumably what word_tokenize relies on; confirm in the NLTK docs)
nltk.download('punkt')

def get_features(text):
    """Return a bag-of-words feature dict for ``text``.

    The text is lower-cased and split with ``word_tokenize``; every distinct
    token becomes a key whose value is ``True``.
    """
    return {token: True for token in word_tokenize(text.lower())}

# One (feature dict, label) pair per headline
all_features = [
    (get_features(text), tag)
    for text, tag in headlines
]

feature_count = len(all_features)
print(feature_count)



[nltk_data] Downloading package punkt to /home/knownhuman/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


26709


In [27]:
from nltk import NaiveBayesClassifier, classify

def train(features, proportion, seed=None):
    """Split ``features`` into train/test portions and fit a Naive Bayes classifier.

    Args:
        features: Sequence of ``(feature_dict, label)`` pairs.
        proportion: Fraction of ``features`` used for training; must lie
            between 0 and 1 inclusive.
        seed: Optional. When given, a shuffled copy of ``features`` (shuffled
            with ``random.Random(seed)``) is split, so the result does not
            depend on the order of the input. When ``None`` (the default) the
            items are split in their existing order: the leading
            ``proportion`` becomes the training set and the rest the test set.

    Returns:
        Tuple ``(train_set, test_set, classifier)``.

    Raises:
        ValueError: If ``proportion`` is not between 0 and 1.
    """
    # A negative or >1 proportion would otherwise produce a silently wrong slice
    if not 0 <= proportion <= 1:
        raise ValueError(f"proportion must be between 0 and 1, got {proportion!r}")
    if seed is not None:
        import random
        # Shuffle a copy so the caller's list is left untouched
        features = list(features)
        random.Random(seed).shuffle(features)
    train_size = int(len(features) * proportion)
    # Building training and test sets on the fly
    train_set, test_set = features[:train_size], features[train_size:]
    print(f"Training set size = {len(train_set)} headlines")
    print(f"Test set size = {len(test_set)} headlines")
    # training
    classifier = NaiveBayesClassifier.train(train_set)
    return train_set, test_set, classifier

# Use the first 80% of the feature list for training and keep the rest for testing
train_set, test_set, classifier = train(all_features, 0.8)

Training set size = 21367 headlines
Test set size = 5342 headlines


In [29]:
def evaluate(train_set, test_set, classifier):
    """Print the classifier's accuracy on both sets, then its 50 most informative features."""
    train_accuracy = classify.accuracy(classifier, train_set)
    print(f"Training Set Accuracy = {train_accuracy}")
    test_accuracy = classify.accuracy(classifier, test_set)
    print(f"Test Set Accuracy = {test_accuracy}")
    classifier.show_most_informative_features(50)

evaluate(train_set, test_set, classifier)

Training Set Accuracy = 0.9517012215098049
Test Set Accuracy = 0.8614751029576937
Most Informative Features
                    area = True           satire : not_sa =     56.4 : 1.0
                  hoping = True           satire : not_sa =     28.3 : 1.0
                  donald = True           not_sa : satire =     27.8 : 1.0
                       % = True           satire : not_sa =     26.0 : 1.0
                    2015 = True           not_sa : satire =     25.0 : 1.0
                 reasons = True           not_sa : satire =     24.4 : 1.0
                  muslim = True           not_sa : satire =     22.3 : 1.0
                 protest = True           not_sa : satire =     22.3 : 1.0
                  nation = True           satire : not_sa =     21.8 : 1.0
                  slowly = True           satire : not_sa =     21.6 : 1.0
                 elderly = True           satire : not_sa =     20.7 : 1.0
                  allows = True           satire : not_sa =     19.

Well damn, 86% accuracy in a training time of seconds. Let's see how this performs in real life. 

In [33]:
# Interactive check: classify typed headlines until an empty line is entered
while True:
    headline = input("Type in your headline: ")
    if not headline:
        break
    prediction = classifier.classify(get_features(headline))
    print(f"This headline is likely {prediction}\n")

Type in your headline: this is a test headline
This headline is likely not_satire

Type in your headline: Area man hoping his team really pulls it off this year
This headline is likely satire

Type in your headline: Invite the neighbors
This headline is likely not_satire

Type in your headline: 


Based on a small sample size, the model performed better with real news than it did with satire, and it definitely picked out some of the most common words used by the Onion. 

Alright, time to pickle this model. 

In [34]:
import pickle
# Serialize the trained classifier to 'satire_classifier.pickle'
with open('satire_classifier.pickle', 'wb') as pickle_file:
    pickle.dump(classifier, pickle_file)

