#### Step 1) Download the existing dataset and create tuples of labelled sentences

In [69]:
import os
import requests

# Local cache of the sarcasm headlines dataset; the download only happens when
# this file does not exist yet.
file_name = "sarcastic_news_headlines_dataset.json"
if not os.path.exists(file_name):
    print("Download dataset from GitHub.")
    data_url = "https://raw.githubusercontent.com/rishabhmisra/News-Headlines-Dataset-For-Sarcasm-Detection/master/Sarcasm_Headlines_Dataset.json"
    # Fetch and validate BEFORE opening the target file: opening it first would
    # leave an empty file behind on a failed request, and the existence check
    # above would then skip the download on every later run.
    response = requests.get(data_url, timeout=30)
    # Fail loudly on 4xx/5xx instead of saving an error page as the dataset.
    response.raise_for_status()
    contents = response.text
    lines = contents.split("\n")
    with open(file_name, 'w') as f:
        # Strip each line so the file holds one record per line without
        # stray carriage returns or padding.
        f.write("\n".join([line.strip() for line in lines]))
else:
    print("Dataset already downloaded. Skipping.")

Dataset already downloaded. Skipping.


In [70]:
def parseJson(fname):
    """Lazily yield one dict per non-empty line of a JSON-lines file.

    Each line is decoded as a JSON object and its "article_link" entry is
    discarded before the dict is yielded.

    Args:
        fname: Path of the file to read; one JSON object per line.

    Yields:
        dict: The decoded record without the "article_link" key.

    Raises:
        json.JSONDecodeError: If a non-empty line is not valid JSON.
    """
    import json  # local import so the function does not rely on another cell

    # The context manager closes the file once the generator is exhausted.
    with open(fname, 'r') as f:
        for line in f:
            line = line.strip()
            if not line:
                # Blank lines carry no record.
                continue
            # json.loads instead of eval: the file content comes from the
            # network, so it must never be executed as Python code, and JSON
            # literals such as null/true/false are not valid Python names.
            d = json.loads(line)
            # The link is dropped anyway, so a record lacking it is harmless.
            d.pop("article_link", None)
            yield d
        
# Materialise every parsed record so the data can be traversed more than once.
data = [record for record in parseJson(file_name)]

# Reduce each record to a (headline, label) pair.
data_tuples = [
    (record["headline"], record["is_sarcastic"])
    for record in data
]

#### Step 2) Import raw scraped texts

In [71]:
# Raw scraped sources, one sentence per line.
file_srcs = [
    "raw_scraped/babylonbee.txt",
    "raw_scraped/onion.txt",
    "raw_scraped/politico.txt",
]
# file_labels[k] is the label given to every line of file_srcs[k].
file_labels = [1, 1, 0]

sents = []
labels = []
for src_path, src_label in zip(file_srcs, file_labels):
    with open(src_path, "r", encoding="utf-8") as handle:
        stripped = [raw_line.strip() for raw_line in handle]
    # One label per line read, so sents and labels stay aligned.
    sents.extend(stripped)
    labels.extend([src_label] * len(stripped))

#### Step 3) Preprocessing and merging of data

In [72]:
from unidecode import unidecode

def preprocess(sent: str) -> str:
    """Normalise a sentence to lowercase ASCII without surrounding whitespace.

    Args:
        sent: The raw sentence.

    Returns:
        The transliterated, lowercased and stripped sentence.
    """
    # Transliterate first, lowercase second: unidecode can itself emit capital
    # letters for characters that str.lower() leaves untouched (its documented
    # example maps CJK text to "Bei Jing "), so lowercasing beforehand would
    # let those capitals survive.
    ascii_sent = unidecode(sent)
    # Strip last because transliteration may append whitespace.
    return ascii_sent.lower().strip()
    
def filter_cond(sent, min_words=5, max_words=15):
    """Decide whether a preprocessed sentence is kept in the dataset.

    A sentence is kept when it does not contain the phrase
    "politico playbook" and its whitespace-separated word count lies in
    [min_words, max_words], both ends inclusive.

    Args:
        sent: The sentence to check. The phrase match is case-sensitive, so
            the sentence is expected to be lowercased already.
        min_words: Smallest accepted word count (default 5).
        max_words: Largest accepted word count (default 15).

    Returns:
        bool: True if the sentence passes both checks.
    """
    if "politico playbook" in sent:
        return False
    return min_words <= len(sent.split()) <= max_words

In [73]:
# Merge both sources into one list of (sentence, label) pairs, preprocess every
# sentence, drop the ones rejected by filter_cond, then remove duplicates.
labelled = data_tuples + list(zip(sents, labels))
processed = [(preprocess(sent), label) for sent, label in labelled]
filtered = [(sent, label) for sent, label in processed if filter_cond(sent)]
# dict.fromkeys keeps the first occurrence of each pair in input order, so the
# result is identical on every run. list(set(...)) ordered the rows by string
# hashes, which are randomised per interpreter process, so the saved row order
# changed from run to run.
unique = list(dict.fromkeys(filtered))

In [74]:
# Number of distinct (sentence, label) pairs left after filtering.
len(unique)

53925

#### Step 4) Save to CSV

In [75]:
import pandas as pd

target = "dataset.csv"

# One row per (sentence, label) pair; the positional index is not written out.
df = pd.DataFrame(data=unique, columns=["headline", "is_sarcastic"])
df.to_csv(target, index=False)

# Last expression of the cell, so the notebook shows it as the cell output.
f"Saved {len(unique)} labelled examples to {target}."

'Saved 53925 labelled examples to dataset.csv.'