There were some broken URLs in the original dataset. Upon inspection, many of the broken URLs seem to have https://www.huffingtonpost.com prepended to an otherwise valid address. This notebook goes through the broken URLs, strips the leading HuffPost address from them, and attempts to access each website again.

In [None]:
import os, re, pickle
from urllib.request import Request, urlopen
from bs4 import BeautifulSoup
import html5lib

In [None]:
def deEmojify(text):
    """Return ``text`` with every run of emoji characters removed.

    Only code points inside the four Unicode ranges listed below are
    stripped; all other characters are returned untouched.

    Credit: https://stackoverflow.com/users/6579239/abdul-razak-adam
    """
    emoji_ranges = (
        "\U0001F600-\U0001F64F"  # emoticons
        "\U0001F300-\U0001F5FF"  # symbols & pictographs
        "\U0001F680-\U0001F6FF"  # transport & map symbols
        "\U0001F1E0-\U0001F1FF"  # flags (iOS)
    )
    # One character class, matched greedily so a whole run goes in one pass.
    emoji_run = re.compile("[" + emoji_ranges + "]+", flags=re.UNICODE)
    return emoji_run.sub("", text)

def clean_html(raw_html):
  """Flatten an iterable of HTML fragments into one lowercase text string.

  Each item is converted with ``str()``, has anything matching ``<.*?>``
  removed, has non-breaking spaces turned into plain spaces, is stripped of
  leading whitespace and is passed through ``deEmojify``. Every cleaned item
  is followed by a single space in the result.
  """
  tag_pattern = re.compile('<.*?>')
  pieces = []
  for fragment in raw_html:
    text = tag_pattern.sub('', str(fragment))
    text = text.replace(u'\xa0', u' ').lstrip()
    pieces.append(deEmojify(text) + ' ')
  # the tokenizer converts to lowercase as a default
  return ''.join(pieces).lower()

In [None]:
# Load the previously scraped page texts (one entry per dataset record).
# NOTE(review): pickle.load can execute arbitrary code from the file;
# presumably this pickle was written by an earlier step of this project -
# confirm it comes from a trusted source.
content_path = '/content/drive/MyDrive/Colab Notebooks/Sarcasm/content.pickle'
with open(content_path, 'rb') as pickled:
  website_contents = pickle.load(pickled)

In [None]:
import zipfile

path = '/content/drive/MyDrive/Colab Notebooks/Sarcasm/Sarcasm_Headlines_Dataset_v2.json.zip'
# Unpack the dataset archive into /tmp. The context manager closes the
# archive even when extraction raises, which the manual close() did not.
with zipfile.ZipFile(path, 'r') as z:
  z.extractall('/tmp')

import json

def parse_data(file):
    """Yield one decoded JSON value per non-blank line of ``file``.

    Args:
        file: Path of a JSON-lines text file (one JSON document per line).

    Yields:
        The object produced by ``json.loads`` for each line, in file order.

    Raises:
        OSError: If the file cannot be opened.
        json.JSONDecodeError: If a non-blank line is not valid JSON.
    """
    # Read explicitly as UTF-8 (the JSON interchange encoding) and let the
    # context manager close the handle once the generator finishes.
    with open(file, 'r', encoding='utf-8') as handle:
        for line in handle:
            # Blank lines (e.g. a trailing newline) carry no record.
            if line.strip():
                yield json.loads(line)

# Read every record of the extracted dataset into memory.
data = list(parse_data('/tmp/Sarcasm_Headlines_Dataset_v2.json'))

# One label per record, in dataset order.
labels = [record['is_sarcastic'] for record in data]

Roughly 47.6% of the articles are labeled as sarcastic.

In [None]:
# Fraction of records labelled sarcastic. Normalise by the number of labels
# themselves rather than by the length of a different list.
sum(labels) / len(labels)

0.476396799329117

Some links could not be opened; for those, the stored content is a "Failed to retrieve: <url>" placeholder instead of page text.

In [None]:
# Display one entry whose page could not be fetched (placeholder string).
website_contents[9]

'Failed to retrieve: https://www.huffingtonpost.comhttp://pubx.co/6IXxhm'

In [None]:
# First pass over the scraped contents:
#   failed    - one bool per entry, True when the entry is a failure placeholder
#   fail_urls - (index, url) pairs for the entries that failed
failed = []
fail_urls = []
for i, c in enumerate(website_contents):
  is_failure = c.startswith('Failed to retrieve')
  failed.append(is_failure)
  if is_failure:
    # Everything after the first ':' of the placeholder is the URL.
    url = c.partition(':')[2].strip()
    fail_urls.append((i, url))

We could just remove these observations from our dataset, but many of the failed URLs seem to have https://www.huffingtonpost.com prepended to an otherwise valid web address. We can try removing this prefix from the URL strings and see whether that allows us to open the website.

In [None]:
# Number of entries that failed on the first scrape.
len(fail_urls)

904

In [None]:
# Retry every failed URL, first removing a spuriously prepended HuffPost
# address. Successful fetches overwrite the placeholder in website_contents;
# failures are re-recorded with the URL that was actually tried.
update_fail_urls = []  # filled by the re-scan that follows this loop
prefix = 'https://www.huffingtonpost.com'
n_chars = len(prefix)
for idx, url in fail_urls:
  # Strip the prefix only when what follows is itself an absolute URL;
  # a genuine HuffPost link must keep its scheme and host to be fetchable.
  remainder = url[n_chars:]
  if url.startswith(prefix) and remainder.startswith(('http://', 'https://')):
    url = remainder
  try:
    req = Request(url, headers={'User-Agent': 'Mozilla/5.0'})
    # The context manager closes the connection even if read() raises.
    with urlopen(req, timeout=5) as stream:
      raw = stream.read()
      # Honour the charset declared by the server; fall back to UTF-8.
      charset = stream.headers.get_content_charset() or 'utf-8'
    html = raw.decode(charset)
    soup = BeautifulSoup(html, 'html5lib')
    website_contents[idx] = clean_html(soup.find_all('h1') + soup.find_all('p'))
  except Exception:
    # Best effort: any fetch/parse error marks the entry as failed, while
    # KeyboardInterrupt/SystemExit still stop the long-running loop.
    website_contents[idx] = 'Failed to retrieve: {}'.format(url)

In [None]:
# Second pass: collect (index, url) for entries that are still failure
# placeholders after the retry. The first-pass `failed` mask is deliberately
# left untouched; appending to it here would make it longer than
# website_contents.
update_fail_urls = []
for i, c in enumerate(website_contents):
  if c.startswith('Failed to retrieve'):
    # Everything after the first ':' of the placeholder is the URL.
    url = c.partition(':')[2].strip()
    update_fail_urls.append((i, url))

In [None]:
# Number of entries that still failed after the retry.
len(update_fail_urls)

419

In [None]:
# Positions of the entries that are still failure placeholders.
fail_idxs = [position for position, _ in update_fail_urls]
# Delete from the highest index down so earlier deletions do not shift the
# positions still waiting to be removed; contents and labels stay aligned.
for idx in sorted(fail_idxs, reverse=True):
  del website_contents[idx]
  del labels[idx]

In [None]:
# Persist the cleaned page texts and their matching labels side by side.
outputs = (
  ('/content/drive/MyDrive/Colab Notebooks/Sarcasm/website_contents.pickle', website_contents),
  ('/content/drive/MyDrive/Colab Notebooks/Sarcasm/labels.pickle', labels),
)
for out_path, payload in outputs:
  with open(out_path, 'wb') as f:
    pickle.dump(payload, f)