# Sarcasm Detection for News Headlines

## Imports

In [9]:
import json
import tensorflow as tf
import keras
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences

## Read News Data

In [5]:
def parse_data(file):
    """Lazily parse a JSON Lines file, yielding one decoded object per line.

    Args:
        file: Path of a text file that holds one JSON document per line.

    Yields:
        The object decoded from each non-blank line, in file order.

    Raises:
        OSError: On first iteration, if the file cannot be opened.
        json.JSONDecodeError: If a non-blank line is not valid JSON.
    """
    # The context manager closes the handle as soon as the generator is
    # exhausted or closed, instead of leaving it to the garbage collector.
    # JSON text is read as UTF-8 so decoding does not depend on the locale.
    with open(file, 'r', encoding='utf-8') as handle:
        for line in handle:
            # Tolerate blank lines (e.g. a trailing newline at end of file).
            if line.strip():
                yield json.loads(line)

# Load the whole dataset eagerly: the records produced by parse_data are
# drained into a list so that they can be indexed and iterated repeatedly.
data = [*parse_data('datasets/Sarcasm_Headlines_Dataset.json')]

# Show the first record as a quick look at the fields each entry carries.
print('Data Sample:', data[0])

Data Sample: {'article_link': 'https://www.huffingtonpost.com/entry/versace-black-code_us_5861fbefe4b0de3a08f600d5', 'headline': "former versace store clerk sues over secret 'black code' for minority shoppers", 'is_sarcastic': 0}


## Preprocessing the Data

In [21]:
# Hyperparameters shared by the splitting, tokenizing and padding cells.
training_size = 20_000  # number of leading headlines used as the training set
vocab_size = 10_000     # passed to the Tokenizer as num_words
max_len = 100           # passed to pad_sequences as maxlen
# Both padding and truncation use the same 'post' setting.
padding_type = truncating_type = 'post'

### Dividing the data

In [11]:
# Separate each record into its headline text and its sarcasm label.
# Both lists follow the order of `data`, so they stay aligned by index.
headlines = [record['headline'] for record in data]
labels = [record['is_sarcastic'] for record in data]

### Splitting the data

In [14]:
# The first `training_size` entries form the training set; everything after
# that index is held out for testing. Texts and labels are cut at the same
# position so they remain paired.
training_data, testing_data = headlines[:training_size], headlines[training_size:]
training_labels, testing_labels = labels[:training_size], labels[training_size:]

### Tokenizing

In [26]:
# Create tokenizer object
tokenizer = Tokenizer(num_words = vocab_size, oov_token = '<OOV>')

# Create a word_index dictionary
tokenizer.fit_on_texts(training_data)

# Get word_index dictionary
word_index = tokenizer.word_index

# Tokenize and pad training data
training_sequences = tokenizer.texts_to_sequences(training_data)
training_data_padded = pad_sequences(training_sequences, maxlen = max_len, padding = padding_type, truncating = truncating_type)

# Tokenize and pad testing data
testing_sequences = tokenizer.texts_to_sequences(testing_data)
testing_data_padded = pad_sequences(testing_sequences, maxlen = max_len, padding = padding_type, truncating = truncating_type)

# Sequence sample
print(training_sequences[:3])

[[328, 1, 799, 3405, 2404, 47, 389, 2214, 1, 6, 2614, 8863], [4, 6840, 3096, 3097, 23, 2, 161, 1, 390, 2842, 6, 251, 9, 889], [153, 890, 2, 891, 1445, 2215, 595, 5650, 221, 133, 36, 45, 2, 8864]]
