## Part 1: Data Loading

In [1]:
!ls

README.md                         playing_around.ipynb
Sarcasm_Headlines_Dataset_v2.json


In [2]:
%%capture
pip install pandas matplotlib numpy nltk sklearn keras tensorflow

In [3]:
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
import nltk
import tensorflow as tf
from nltk.corpus import stopwords
from sklearn.model_selection import train_test_split
from keras.preprocessing.text import Tokenizer
from keras.preprocessing import sequence
from sklearn.preprocessing import LabelEncoder
from tensorflow.keras.utils import pad_sequences

2022-11-11 14:41:57.058182: I tensorflow/core/platform/cpu_feature_guard.cc:193] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations:  AVX2 AVX512F AVX512_VNNI FMA
To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags.


In [4]:
data = pd.read_json("./Sarcasm_Headlines_Dataset_v2.json",lines=True)

In [5]:
data

Unnamed: 0,is_sarcastic,headline,article_link
0,1,thirtysomething scientists unveil doomsday clo...,https://www.theonion.com/thirtysomething-scien...
1,0,dem rep. totally nails why congress is falling...,https://www.huffingtonpost.com/entry/donna-edw...
2,0,eat your veggies: 9 deliciously different recipes,https://www.huffingtonpost.com/entry/eat-your-...
3,1,inclement weather prevents liar from getting t...,https://local.theonion.com/inclement-weather-p...
4,1,mother comes pretty close to using word 'strea...,https://www.theonion.com/mother-comes-pretty-c...
...,...,...,...
28614,1,jews to celebrate rosh hashasha or something,https://www.theonion.com/jews-to-celebrate-ros...
28615,1,internal affairs investigator disappointed con...,https://local.theonion.com/internal-affairs-in...
28616,0,the most beautiful acceptance speech this week...,https://www.huffingtonpost.com/entry/andrew-ah...
28617,1,mars probe destroyed by orbiting spielberg-gat...,https://www.theonion.com/mars-probe-destroyed-...


In [6]:
# check for duplicate headlines
data.describe(include='object')

Unnamed: 0,headline,article_link
count,28619,28619
unique,28503,28617
top,"'no way to prevent this,' says only nation whe...",https://politics.theonion.com/nation-not-sure-...
freq,12,2


In [7]:
# remove duplicate headlines
data=data.drop(data[data['headline'].duplicated()].index,axis=0)

In [8]:
sarc_cnt = len(data.query('is_sarcastic==1'))
non_sarc_cnt = len(data.query('is_sarcastic==0'))

# Summary of sarcastic lines
print(f'There are {sarc_cnt} sarcastic headlines and {non_sarc_cnt} non-sarcastic headlines')

There are 13552 sarcastic headlines and 14951 non-sarcastic headlines


---

## Part 2: Data Processing/Cleaning

In [9]:
# import stopwords from nltk
stwrds = set(stopwords.words('english'))


In [10]:
# method to clean a given headline by lowercasing the string, removing spaces, and removing stopwords
def clean_headlines(headline):
    headline = headline.lower()
    headline_split = headline.split()
    cleaned_headline = []
    for word in headline_split:
        if word not in stwrds:
            cleaned_headline.append(word)
    
    cleaned_line = " ".join(cleaned_headline)
    return cleaned_line

In [11]:
# applies function to all entries
data['headline'].apply(clean_headlines)

0        thirtysomething scientists unveil doomsday clo...
1        dem rep. totally nails congress falling short ...
2             eat veggies: 9 deliciously different recipes
3             inclement weather prevents liar getting work
4        mother comes pretty close using word 'streamin...
                               ...                        
28614               jews celebrate rosh hashasha something
28615    internal affairs investigator disappointed con...
28616    beautiful acceptance speech week came queer ko...
28617    mars probe destroyed orbiting spielberg-gates ...
28618                              dad clarifies food stop
Name: headline, Length: 28503, dtype: object

In [12]:
# train-test split
headline_target = data['is_sarcastic']
headline_attributes = data['headline']
attribute_train, attribute_test, labels_train, labels_test = train_test_split(headline_attributes, headline_target, test_size=0.30)


In [13]:
# form dataframes for training and test sets
att_train = pd.DataFrame(attribute_train)
label_train = pd.DataFrame(labels_train)
att_test = pd.DataFrame(attribute_test)
label_test = pd.DataFrame(labels_test)

training_set = label_train.join(att_train)
test_set = label_test.join(att_test)

In [14]:
# determines amount of unqiue words in our data
# this takes forever because set operations are slow

unqiue_words = set()
for headline in headline_attributes:
    unqiue_words = unqiue_words.union(set(headline.split()))

print(f'{len(unqiue_words)} unqiue words in the headline data')

38234 unqiue words in the headline data


In [15]:
# given that there are over 38000 unqiue words a 20,000 vocab size seems appropriate
vocab_size = 20000

# initalize tokenizer and fit encodings to our dataset
tokenizer = Tokenizer(num_words=vocab_size, oov_token="<OOV>")
tokenizer.fit_on_texts(data['headline'])

In [16]:
# tokenize and sequence the training and test splits
train_seqs = tokenizer.texts_to_sequences(training_set['headline'])
train_pad = pad_sequences(train_seqs, maxlen=50, padding='post', truncating='post')

test_seqs = tokenizer.texts_to_sequences(test_set['headline'])
test_pad = pad_sequences(test_seqs, maxlen=50, padding='post', truncating='post')