In [1]:
import json
import re
from urllib.request import urlopen
import json
import html2text
from bs4 import BeautifulSoup
from urllib.parse import urlparse
import pickle
import signal
import random

In [3]:
'''
Collect all comments by the post
Only keep posts with at least 5 comments
Takes about 3 minutes to run on timan107

To get the reddit data (month chosen arbitrarily), go to the data dir and run

mkdir reddit
cd reddit
wget https://files.pushshift.io/reddit/comments/RC_2009-05.bz2
bzip2 -d RC_2009-05.bz2

NOTE: the commands above fetch the 2009-05 dump, while the code below reads
REDDIT_DUMP_PATH (currently the 2013-09 dump) -- substitute the month you want.
'''

REDDIT_DUMP_PATH = '../data/reddit/RC_2013-09'
MIN_COMMENTS_PER_POST = 5

# post id (the comment's 'link_id') -> {comment id -> comment record}
comments_by_post = {}
with open(REDDIT_DUMP_PATH, 'r', encoding='utf-8') as f:
    # the dump holds one JSON-encoded comment per line
    for line in f:
        d = json.loads(line)
        # the body is passed through html2text before being stored
        d['body'] = html2text.html2text(d['body'])
        comments_by_post.setdefault(d['link_id'], {})[d['id']] = d

# drop posts with too few comments (collect the keys first so the dict is not
# resized while it is being iterated)
sparse_posts = [key for key, comments in comments_by_post.items()
                if len(comments) < MIN_COMMENTS_PER_POST]
for key in sparse_posts:
    del comments_by_post[key]

In [6]:
# Cache the filtered comments on disk; the context manager makes sure the file is
# flushed and closed even if pickling fails part-way.
with open('../data/reddit/comments.pkl', 'wb') as f:
    pickle.dump(comments_by_post, f)

In [2]:
# Reload the cached comments written by the dump cell.
# NOTE: unpickling can execute arbitrary code -- only load a comments.pkl you created yourself.
with open('../data/reddit/comments.pkl', 'rb') as f:
    comments_by_post = pickle.load(f)

In [3]:
def collect_ancestors(comments: dict, comment_id: str) -> list:
    '''
    For a given dict of comments (comment ID -> comment record with a 'parent_id')
    and a comment in it, reconstruct the path to the top-level comment.

    Returns a list of comment IDs ordered from the top-most ancestor that could be
    found down to the given comment itself (inclusive). The walk stops when

    * the parent is a link ('t3_' prefix), i.e. the top of the chain is reached,
    * a comment ID (or its 'parent_id' field) is missing from `comments`, or
    * a comment ID is seen twice (cyclic parent links in malformed data).

    This method isn't very efficient but good enough for now
    '''
    ancestors = []
    seen = set()
    while True:

        if comment_id[:3] == 't3_':
            # refers to a link (top-level comment)
            # means we've reached the top of the chain
            return ancestors[::-1]

        if comment_id[:3] == 't1_':
            # parent ids carry a 't1_' type prefix; the dict is keyed by the bare id
            comment_id = comment_id[3:]

        if comment_id in seen:
            # cyclic parent links would otherwise loop forever
            return ancestors[::-1]

        try:
            # the comment id is sometimes not present in the dict; in that case
            # the chain is simply cut at this point
            parent_id = comments[comment_id]['parent_id']
        except KeyError:
            return ancestors[::-1]

        seen.add(comment_id)
        ancestors.append(comment_id)
        comment_id = parent_id

In [4]:
'''
Cycle through all posts and comments, find any URL mentions, and save the mention location + the comment's ancestors
'''
all_urls = []

# Matches a URL in a comment body. Accounts for the edge case where a dash is at the
# end of the line: group 1 is the URL part up to and including "-\n", group 2 the
# continuation on the next line; otherwise group 3 holds the whole URL.
url_pattern = re.compile(r'(https?://\S+-\n)?(?(1)([\S]*)|(https?://\S+))')
# heuristics for parsing errors: these characters are stripped from every match
url_noise = str.maketrans('', '', ')]\n')
non_alphanumeric = re.compile(r'[^0-9a-zA-Z]')
# links to PDFs and images are ignored
ignored_extensions = ('pdf', 'jpg', 'png', 'gif')

for post_id, post_comments in comments_by_post.items():
    for comment_id, comment in post_comments.items():

        # findall yields one (group 1, group 2, group 3) tuple per URL mention
        urls = url_pattern.findall(comment['body'])
        if not urls:
            continue

        # computed once per comment and shared by all URLs mentioned in it
        ancestors = collect_ancestors(post_comments, comment_id)

        for url in urls:
            # unmatched groups are empty strings, so joining rebuilds the URL
            url = "".join(url).translate(url_noise)

            # compare the extension on letters/digits only, so trailing
            # punctuation does not hide it
            url_letters = non_alphanumeric.sub('', url)
            if url_letters.endswith(ignored_extensions):
                continue

            all_urls.append({'post_id': post_id, 'comment_id': comment_id, 'url': url, 'ancestors': ancestors})

# keep only mentions whose ancestor chain has more than 2 entries
urls_with_context = [x for x in all_urls if len(x['ancestors']) > 2]

In [6]:
# sanity check: number of URL mentions kept in urls_with_context vs. number of posts in comments_by_post
print(len(urls_with_context), len(comments_by_post))

594695 958634


In [5]:
def get_context(url_obj: dict, comments: dict = None) -> list:
    '''
    Loops through all ancestors of a URL comment and returns the chain of comment
    bodies up to the top comment.

    url_obj:  dict with a 'post_id' and an 'ancestors' list of comment IDs
    comments: mapping post ID -> {comment ID -> comment record with a 'body'};
              defaults to the notebook-level `comments_by_post`

    Returns the list of comment bodies, in the same order as url_obj['ancestors'].
    Raises KeyError if the post or one of the ancestors is not in `comments`.
    '''
    if comments is None:
        # fall back to the global built earlier in the notebook
        comments = comments_by_post
    post_id = url_obj['post_id']
    return [comments[post_id][ancestor]['body'] for ancestor in url_obj['ancestors']]

In [7]:
'''
Simple method to scrape text from URLs. Not very robust. Need to handle exceptions. YouTube links take very long
'''

class TimeoutException(Exception):
    '''Signals that an operation ran past its time limit.'''

def handler(signum, frame):
    '''Signal-handler callback: aborts whatever is running by raising TimeoutException.

    Both arguments (signal number and current stack frame) are ignored.
    '''
    raise TimeoutException()

def scrape(url: str) -> str:
    '''
    Download `url` and return a plain-text extract of the page.

    * For URLs containing 'wikipedia.org': the text of the first <p> element that
      contains at least one ASCII letter.
    * Otherwise (or if a Wikipedia page has no such paragraph): the concatenation,
      without separators, of all "prose-like" chunks of the page text -- chunks
      longer than 50 characters of which more than 80% are word characters,
      spaces or '*'. <script> and <style> elements are removed first.

    Returns "" if the page cannot be downloaded. NOTE: every Exception raised while
    downloading is swallowed, including one raised by a signal handler during the fetch.
    '''
    try:
        # context manager so the HTTP response is always closed
        with urlopen(url) as response:
            html = response.read()
    except Exception:
        return ""

    soup = BeautifulSoup(html, features="html.parser")

    if 'wikipedia.org' in url:
        # 'A-Z', not 'A-z': the latter also matches the punctuation between 'Z' and 'a'
        has_letter = re.compile('[a-zA-Z]')
        for paragraph in soup.find_all("p"):
            paragraph_text = paragraph.get_text()
            if has_letter.search(paragraph_text):
                return paragraph_text

    # kill all script and style elements
    for script in soup(["script", "style"]):
        script.extract()    # rip it out

    # get text
    text = soup.get_text()

    # break into lines and remove leading and trailing space on each
    lines = (line.strip() for line in text.splitlines())
    # break multi-headlines into a line each
    chunks = (phrase.strip() for line in lines for phrase in line.split("  "))

    # drop blank lines, short lines and lines that are mostly punctuation/symbols
    # inspired by https://bigscience.huggingface.co/blog/building-a-tb-scale-multilingual-dataset-for-language-modeling#:~:text=Filters%2C%20tools%2C%20and%20indicators%20of%20data%20quality
    not_text = re.compile(r'[^ \w\*]')
    kept = []
    for chunk in chunks:
        if len(chunk) > 50:
            only_text = not_text.sub('', chunk)
            if len(only_text) / len(chunk) > 0.8:
                kept.append(chunk)

    return "".join(kept)

In [8]:
# explore distribution of domains (help us with parsing)
domains = {}
for url in urls_with_context:
    try:
        domain = urlparse(url['url']).netloc
    except ValueError:
        # unparseable URL: skip it instead of counting a stale `domain`
        # left over from the previous iteration
        # print(url['url'])
        continue
    domains[domain] = domains.get(domain, 0) + 1

# (domain, count) pairs, most frequent first
sorted_domains = sorted(domains.items(), reverse=True, key=lambda x: x[1])
print(sorted_domains[:100])

[('www.youtube.com', 78919), ('www.reddit.com', 72305), ('en.wikipedia.org', 63635), ('para.ms', 25160), ('imgur.com', 20784), ('pcpartpicker.com', 9461), ('youtu.be', 8754), ('www.amazon.com', 6752), ('www.google.com', 4161), ('github.com', 2490), ('en.m.wikipedia.org', 2411), ('www.imdb.com', 2369), ('i.imgur.com', 2311), ('www.ncbi.nlm.nih.gov', 2295), ('twitter.com', 2219), ('www.nytimes.com', 2173), ('www.theguardian.com', 2007), ('redditenhancementsuite.com', 1727), ('www.huffingtonpost.com', 1669), ('www.facebook.com', 1412), ('xkcd.com', 1394), ('uk.pcpartpicker.com', 1306), ('m.youtube.com', 1180), ('www.ebay.com', 1172), ('www.bbc.co.uk', 1101), ('www.forbes.com', 1081), ('gatherer.wizards.com', 1059), ('steamcommunity.com', 1054), ('www.newegg.com', 1021), ('coinflipbot.re', 1018), ('play.google.com', 942), ('lmgtfy.com', 940), ('www.washingtonpost.com', 909), ('www.dailymail.co.uk', 880), ('www.urbandictionary.com', 859), ('soundcloud.com', 833), ('www.telegraph.co.uk', 821

In [9]:
# make a basic training data set from links to the sites listed in url_keywords
# url_keywords = ['youtube.com']

url_keywords = ['bbc.co.uk', 'cnn.com', 'wikipedia.org']

MAX_TRAINING_EXAMPLES = 6000   # stop once this many pages have been scraped
SCRAPE_TIMEOUT_SECONDS = 5     # per-URL time budget, enforced with SIGALRM

training_data = []

# install the timeout handler once; the alarm itself is armed per URL below
signal.signal(signal.SIGALRM, handler)

for i, url in enumerate(urls_with_context):

    if i % 100 == 0:
        # progress: URLs visited so far, examples collected so far
        print(i, len(training_data))

    if len(training_data) >= MAX_TRAINING_EXAMPLES:
        break

    # scrape each URL at most once, even if it matches several keywords
    if not any(keyword in url['url'] for keyword in url_keywords):
        continue

    try:
        signal.alarm(SCRAPE_TIMEOUT_SECONDS)
        text = scrape(url['url'])
    except Exception as e:
        print(url['url'], e.__class__.__name__)
        continue
    finally:
        # always disable the alarm so it cannot fire later, outside the try block
        signal.alarm(0)

    if text != '':
        # NOTE: this adds the scraped text to the dict held in urls_with_context
        url['text'] = text
        training_data.append(url)

0 0
http://en.wikipedia.org/wiki/Aegis_Combat_System TimeoutException
100 9
200 19
300 21
400 43
500 58
http://en.wikipedia.org/wiki/2013%E2%80%9314_UEFA_Champions_League_qualifying_phase_and_play-off_round TimeoutException
600 71
700 80
800 84
900 91
1000 104
1100 115
1200 122
1300 132
http://en.wikipedia.org/wiki/Robert_H._Jackson TimeoutException
1400 137
1500 146
1600 157
1700 169
1800 173
1900 176
2000 183
http://en.wikipedia.org/wiki/Cantons_of_Switzerland TimeoutException
2100 196
2200 213
2300 223
2400 231
http://en.wikipedia.org/wiki/7_July_2005_London_bombings TimeoutException
2500 243
2600 253
2700 265
2800 278
2900 283
3000 285
3100 296
3200 308
http://en.wikipedia.org/wiki/Mass_murder#Mass_murder_by_a_state TimeoutException
3300 315
3400 327
3500 333
3600 342
3700 354
3800 362
3900 372
http://en.wikipedia.org/wiki/Rape_statistics TimeoutException
4000 375
4100 383
4200 396
4300 405
4400 410
4500 424
4600 433
4700 438
4800 443
4900 449
http://en.wikipedia.org/wiki/Uncanny_v

In [10]:
# sets up a general training data set by adding context of all comments

for example in training_data:
    # text of the comment + its ancestor comments
    example['full_context'] = get_context(example)

# context manager so the pickle file is flushed and closed
with open('../data/reddit/bbc_news_scrape_raw.pkl', 'wb') as f:
    pickle.dump(training_data, f)

In [11]:
# format training data for pyserini (https://github.com/castorini/pyserini/)

# one retrieval document per training example, identified by its position in training_data
pyserini_retrieval_docs = [
    {"id": index, "contents": item['text']}
    for index, item in enumerate(training_data)
]

# one line per example: "<i> 0 <i> 1"
# (presumably a qrels-style line marking document i as relevant for query i -- confirm)
relevance_scores = [
    ' '.join([str(index), '0', str(index), '1'])
    for index in range(len(training_data))
]

# documents as JSON lines
with open('../data/reddit/pyserini/bbc_news_pyserini.jsonl', 'w') as docs_file:
    docs_file.writelines(json.dumps(entry) + '\n' for entry in pyserini_retrieval_docs)

# relevance lines, one per row
with open('../data/reddit/bbc_news_rel.txt', 'w') as rel_file:
    rel_file.writelines(row + '\n' for row in relevance_scores)