In [59]:
import os
import zipfile
import json
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences

In [43]:
# Dataset source and local paths used by the rest of the notebook.
# Reference: https://rishabhmisra.github.io/publications/

# Kaggle dataset identifier handed to the kaggle CLI
KAGGLE_DATASET = "rmisra/news-headlines-dataset-for-sarcasm-detection"
# Directory under which the archive is extracted
BASE_PATH = "/data/tmp"
# Directory that receives the downloaded zip archive
DATASETS_DIR = "./datasets"
# File name of the downloaded zip archive
DATASET_NAME = "news-headlines-dataset-for-sarcasm-detection.zip"
# JSON file (inside the archive) that the notebook reads
JSON_FILE = "Sarcasm_Headlines_Dataset.json"

In [44]:
# Kaggle Dataset Download :)
def download_kaggle_dataset(dataset_name, path_to_download=DATASETS_DIR):
    '''Download a Kaggle dataset archive with the kaggle command-line tool.

      Args:
        dataset_name (str): dataset identifier passed to
          `kaggle datasets download -d`
        path_to_download (str): destination directory passed to `-p`;
          defaults to DATASETS_DIR
    '''
    print("Downloading at: {}".format(path_to_download))
    # IPython shell escape: $dataset_name and $path_to_download are expanded
    # from the Python variables before the command is run.
    # Add --force to the command if you want to force the download.
    !kaggle datasets download -d $dataset_name -p $path_to_download 

In [45]:
# Download the archive into the default destination (DATASETS_DIR)
download_kaggle_dataset(KAGGLE_DATASET)

Downloading at: ./datasets
news-headlines-dataset-for-sarcasm-detection.zip: Skipping, found more recently modified local copy (use --force to force download)


In [46]:
def extract_dataset(filename, extract_dir=BASE_PATH):
    '''Extract a zip archive stored under DATASETS_DIR into extract_dir.

      Args:
        filename (str): zip filename, resolved relative to DATASETS_DIR
        extract_dir (str): path to extract the zip file into;
          defaults to BASE_PATH

      Raises:
        FileNotFoundError: if the archive does not exist
        zipfile.BadZipFile: if the file is not a valid zip archive
    '''
    file_zip = os.path.join(DATASETS_DIR, filename)
    # The context manager closes the archive even when extraction fails
    # part-way, so the file handle is never leaked.
    with zipfile.ZipFile(file_zip, 'r') as zip_ref:
        zip_ref.extractall(extract_dir)

In [47]:
# Filename without extension
name = os.path.splitext(os.path.basename(DATASET_NAME))[0]
# Base path + new dir
new_dir = os.path.join(BASE_PATH, name)
# Create the directory. os.makedirs also creates BASE_PATH itself when it is
# missing, where os.mkdir would fail with FileNotFoundError. It still raises
# FileExistsError when new_dir is already there, so the message is unchanged.
try:
    os.makedirs(new_dir)
except FileExistsError:
    print("Directory {} already exist, so skipping creation".format(new_dir))
# extract the dataset
extract_dataset(filename=DATASET_NAME, extract_dir=new_dir)

Directory /data/tmp/news-headlines-dataset-for-sarcasm-detection already exist, so skipping creation


In [48]:
!ls $BASE_PATH/$name

Sarcasm_Headlines_Dataset.json	Sarcasm_Headlines_Dataset_v2.json


In [49]:
!head $BASE_PATH/$name/Sarcasm_Headlines_Dataset.json

{"article_link": "https://www.huffingtonpost.com/entry/versace-black-code_us_5861fbefe4b0de3a08f600d5", "headline": "former versace store clerk sues over secret 'black code' for minority shoppers", "is_sarcastic": 0}
{"article_link": "https://www.huffingtonpost.com/entry/roseanne-revival-review_us_5ab3a497e4b054d118e04365", "headline": "the 'roseanne' revival catches up to our thorny political mood, for better and worse", "is_sarcastic": 0}
{"article_link": "https://local.theonion.com/mom-starting-to-fear-son-s-web-series-closest-thing-she-1819576697", "headline": "mom starting to fear son's web series closest thing she will have to grandchild", "is_sarcastic": 1}
{"article_link": "https://politics.theonion.com/boehner-just-wants-wife-to-listen-not-come-up-with-alt-1819574302", "headline": "boehner just wants wife to listen, not come up with alternative debt-reduction ideas", "is_sarcastic": 1}
{"article_link": "https://www.huffingtonpost.com/entry/jk-rowling-wishes-snape-happy

In [54]:
def read_dataset_as_json(filename):
    '''Read a file holding one JSON document per line into a list.

      Args:
        filename (str): path of the file to read

      Returns:
        list: the decoded value of every non-blank line, in file order

      Raises:
        json.JSONDecodeError: if a non-blank line is not valid JSON
    '''
    # Explicit UTF-8 so decoding does not depend on the platform's locale.
    with open(filename, encoding="utf-8") as f:
        # Blank lines (e.g. a trailing empty line) are skipped because
        # json.loads rejects an empty document.
        return [json.loads(line) for line in f if line.strip()]

In [53]:
# Locate the JSON file inside the extracted directory and parse it
fullpath = os.path.join(os.path.join(BASE_PATH, name), JSON_FILE)
dataset = read_dataset_as_json(fullpath)

# Number of records that were loaded
len(dataset)

26709

In [58]:
# extract sentences: keep only the 'headline' text of every record
headlines = [obj['headline'] for obj in dataset]

# Show the first headline as a sanity check; an assignment alone displays
# nothing, so this expression is what produces the cell's output.
headlines[0]

"former versace store clerk sues over secret 'black code' for minority shoppers"

In [None]:
# do preprocessing with Tokenizer
# Vocabulary cap handed to the Tokenizer. The original cell left num_words
# without a value (a SyntaxError); 10000 is a placeholder, tune as needed.
# NOTE(review): confirm the intended vocabulary size.
NUM_WORDS = 10000
tokenizer = Tokenizer(num_words=NUM_WORDS)