In [1]:
import os
import zipfile
import json
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences

In [5]:
# download from kaggle
# ref https://rishabhmisra.github.io/publications/
# Kaggle "<owner>/<dataset>" identifier handed to the kaggle CLI
KAGGLE_DATASET = "rmisra/news-headlines-dataset-for-sarcasm-detection"
# Working directory for extracted data, resolved under the current user's home.
# A literal "$user" placeholder is never expanded by os.path / zipfile, so the
# Python cells and the `!` shell cells would end up looking at different paths.
BASE_PATH = os.path.join(os.path.expanduser("~"), "data", "tmp")
# Directory (relative to the notebook) where the downloaded zip is stored
DATASETS_DIR = "./datasets"
# File name of the downloaded archive inside DATASETS_DIR
DATASET_NAME = "news-headlines-dataset-for-sarcasm-detection.zip"
# JSON-lines file inside the archive that the notebook loads
JSON_FILE = "Sarcasm_Headlines_Dataset.json"

In [3]:
# Kaggle Dataset Download :) 
def download_kaggle_dataset(dataset_name, path_to_download=DATASETS_DIR):
    """Download a Kaggle dataset through the ``kaggle`` command-line tool.

    Args:
        dataset_name (str): value passed to ``kaggle datasets download -d``.
        path_to_download (str): directory passed to ``-p``; defaults to
            DATASETS_DIR.

    Note:
        Runs as an IPython shell escape, so this only works inside
        IPython/Jupyter. Presumably requires the kaggle CLI to be installed
        and authenticated -- TODO confirm for your environment.
    """
    print("Downloading at: {}".format(path_to_download))
    # --force if you want to force download
    # IPython substitutes the Python values of $dataset_name / $path_to_download
    # into the command line before handing it to the shell.
    !kaggle datasets download -d $dataset_name -p $path_to_download 

In [4]:
# Fetch the archive into the default download directory (DATASETS_DIR)
download_kaggle_dataset(KAGGLE_DATASET)

Downloading at: ./datasets
news-headlines-dataset-for-sarcasm-detection.zip: Skipping, found more recently modified local copy (use --force to force download)


In [7]:
def extract_dataset(filename, extract_dir=BASE_PATH):
    '''Extract a zip archive stored under DATASETS_DIR into ``extract_dir``.

      Args:
        filename (str): zip file name, resolved relative to DATASETS_DIR
        extract_dir (str): directory to extract the zip file into;
          defaults to BASE_PATH

      Raises:
        FileNotFoundError: if the archive does not exist under DATASETS_DIR
        zipfile.BadZipFile: if the file is not a valid zip archive
    '''
    file_zip = os.path.join(DATASETS_DIR, filename)
    # The context manager closes the archive even when extraction fails
    # part-way, so the file handle is never leaked.
    with zipfile.ZipFile(file_zip, 'r') as zip_ref:
        zip_ref.extractall(extract_dir)

In [8]:
# Filename without extension
name = os.path.splitext(os.path.basename(DATASET_NAME))[0]
# Base path + new dir
new_dir = os.path.join(BASE_PATH, name)
# create the directory; makedirs also creates BASE_PATH itself when it is
# missing, and still raises FileExistsError if new_dir is already there
try:
    os.makedirs(new_dir)
except FileExistsError:
    print("Directory {} already exist, so skipping creation".format(new_dir))
# extract the dataset
extract_dataset(filename=DATASET_NAME, extract_dir=new_dir)

In [9]:
# List the extraction directory; IPython substitutes the Python values of
# BASE_PATH and name into the shell command.
!ls $BASE_PATH/$name

Sarcasm_Headlines_Dataset.json	Sarcasm_Headlines_Dataset_v2.json


In [23]:
# Preview the first three lines of the dataset file (the file name is
# hard-coded here rather than taken from JSON_FILE).
!head -n 3 $BASE_PATH/$name/Sarcasm_Headlines_Dataset.json

{"article_link": "https://www.huffingtonpost.com/entry/versace-black-code_us_5861fbefe4b0de3a08f600d5", "headline": "former versace store clerk sues over secret 'black code' for minority shoppers", "is_sarcastic": 0}
{"article_link": "https://www.huffingtonpost.com/entry/roseanne-revival-review_us_5ab3a497e4b054d118e04365", "headline": "the 'roseanne' revival catches up to our thorny political mood, for better and worse", "is_sarcastic": 0}
{"article_link": "https://local.theonion.com/mom-starting-to-fear-son-s-web-series-closest-thing-she-1819576697", "headline": "mom starting to fear son's web series closest thing she will have to grandchild", "is_sarcastic": 1}


In [11]:
def read_dataset_as_json(filename):
    """Load a JSON Lines file (one JSON document per line) into a list.

    Args:
        filename (str): path to the file to read.

    Returns:
        list: the decoded objects, in file order.

    Raises:
        json.JSONDecodeError: if a non-blank line is not valid JSON.
    """
    # Read as UTF-8 explicitly instead of relying on the platform's default
    # encoding, which is not UTF-8 everywhere.
    with open(filename, encoding="utf-8") as f:
        # Blank lines (e.g. an extra newline at the end of the file) carry no
        # record and would make json.loads raise, so they are skipped.
        return [json.loads(line) for line in f if line.strip()]

In [26]:
# Location of the JSON file inside the extracted dataset directory
fullpath = os.path.join(BASE_PATH, name, JSON_FILE)

# Parse the file into a list of records, one per line
dataset = read_dataset_as_json(fullpath)

# Report how many records were loaded
record_count = len(dataset)
print("Records on headlines dataset :{}".format(record_count))

Records on headlines dataset :26709


In [14]:
# Pull the headline text out of every record
headlines = [record['headline'] for record in dataset]
# Peek at the first headline
headlines[0]

"former versace store clerk sues over secret 'black code' for minority shoppers"

In [46]:
def fit_tokenizer(tokenizer, sentences, on_text=True):
    """Fit the tokenizer on ``sentences`` or encode them, depending on ``on_text``.

    Args:
        tokenizer: object exposing ``fit_on_texts`` and ``texts_to_sequences``.
        sentences: texts handed to the tokenizer.
        on_text (bool): when truthy (default) the tokenizer is fitted and
            nothing is returned; when falsy the sentences are encoded instead.

    Returns:
        The result of ``tokenizer.texts_to_sequences(sentences)`` when
        ``on_text`` is falsy, otherwise ``None``.
    """
    # Encoding mode: hand back the sequences straight away
    if not on_text:
        return tokenizer.texts_to_sequences(sentences)
    # Fitting mode: update the tokenizer in place
    tokenizer.fit_on_texts(sentences)

In [47]:
def get_word_index(tokenizer):
    """Return the ``word_index`` attribute of ``tokenizer``."""
    word_index = tokenizer.word_index
    return word_index

In [48]:
# Create the tokenizer, using "<oov>" as its out-of-vocabulary token
tokenizer = Tokenizer(oov_token="<oov>")

# Build the vocabulary from the headlines
fit_tokenizer(tokenizer, headlines)

# Show the first 10 entries of the word index
word_index = get_word_index(tokenizer)
print(list(word_index)[:10])

['<oov>', 'to', 'of', 'the', 'in', 'for', 'a', 'on', 'and', 'with']


In [51]:
# Encode every headline as a sequence of word indices
seq_no_padding = fit_tokenizer(tokenizer, sentences=headlines, on_text=False)
# Pad the sequences to a common length, adding the padding at the end
padded_seq = pad_sequences(sequences=seq_no_padding, padding='post')

In [52]:
# Show one headline next to its padded encoding, then the overall shape
for template, value in (
    ("Headlines[2]: {}", headlines[2]),
    ("Padded[2]: {}", padded_seq[2]),
    ("Shape of padded seq: {}", padded_seq.shape),
):
    print(template.format(value))


Headlines[2]: mom starting to fear son's web series closest thing she will have to grandchild
Padded[2]: [  145   838     2   907  1749  2093   582  4719   221   143    39    46
     2 10736     0     0     0     0     0     0     0     0     0     0
     0     0     0     0     0     0     0     0     0     0     0     0
     0     0     0     0]
Shape of padded seq: (26709, 40)
