In [1]:
import json
import re
from urllib.request import urlopen
import json
import html2text
from bs4 import BeautifulSoup
from urllib.parse import urlparse
import pickle
import signal
import random

In [2]:
'''
Collect all comments by the post
Only keep posts with more than 5 comments
Takes about 3 minutes to run on timan107

To get the reddit data (chosen arbitrarily), go to the data dir and run

mkdir reddit
cd reddit
wget https://files.pushshift.io/reddit/comments/RC_2009-05.bz2
bzip2 -d RC_2009-05.bz2
'''

comments_by_post = {}
with open('../data/reddit/RC_2009-05', 'r') as f:
    for line in f:
        d = json.loads(line)
        link_id = d['link_id']
        if link_id not in comments_by_post:
            comments_by_post[link_id] = {}
        d['body'] = html2text.html2text(d['body'])
        comments_by_post[link_id][d['id']] = d
        
for key in list(comments_by_post.keys()):
    if len(comments_by_post[key]) < 5:
        del comments_by_post[key]

In [3]:
def collect_ancestors(comments: dict, comment_id: str) -> list:
    '''
    For a given list of comments and a comment in the list, reconstruct a path to the top-level comment
    Returns a list of comment IDs

    This method isn't very efficient but good enough for now
    '''
    ancestors = []
    while True:
        
        if comment_id[:2] == 't3': 
            # refers to a link (top-level comment)
            # means we've reached the top of the chain
            return ancestors[::-1]

        if comment_id[:2] == 't1':
            comment_id = comment_id[3:]

        try:
            # there is an error here sometimes where the comment id is not present in the list
            # probably fine for now, but may need to address in the future
            old_comment_id = comment_id
            comment_id = comments[comment_id]['parent_id']
            ancestors.append(old_comment_id)
        except:
            return ancestors[::-1]

In [4]:
'''
Cycle through all posts and comments, find any URL mentions, and save the mention location + the comment's ancestors
'''
all_urls = []
for post_id in comments_by_post:
    for comment_id in comments_by_post[post_id]:
        
        # first check if post body contains URL
#         urls = re.findall(r'(https?://\S+)', comments_by_post[post_id][comment_id]['body'])
#         len1 = len(urls)

        # check if post body contains URL, accounts for edge case when dash is at the end of the line
        current_comment_text = comments_by_post[post_id][comment_id]['body']
        urls = re.findall(r'(https?://\S+-\n)?(?(1)([\S]*)|(https?://\S+))', current_comment_text)
        
#         len2 = len(urls)
        
        #if len1 != len2:
        #    print(current_comment_text)
                
        if urls:
            ancestors = collect_ancestors(comments_by_post[post_id], comment_id)
            
            for url in urls:
                url = "".join(list(url))

                # heuristics for parsing errors
                url = re.sub('\)', '', url)
                url = re.sub('\]', '', url)
                url = re.sub('\n', '', url)
                
                # remove non-alphnumeric characters
                url_letters = re.sub('[^0-9a-zA-Z]', '', url)
                                
                # ignore pdfs
                if 'pdf' == url_letters[-3:] or 'jpg' in url_letters[-3:] or 'png' in url_letters[-3:] or 'gif' in url_letters[-3:]:
                    continue
                
                all_urls.append({'post_id': post_id, 'comment_id': comment_id, 'url': url, 'ancestors': ancestors})

urls_with_context = [x for x in all_urls if len(x['ancestors']) > 2]

In [5]:
'''
Loops through all ancestors of a URL comment and returns the chain up to to the top comment
'''

def get_context(url_obj: dict) -> list:
    post_id = url_obj['post_id']
    context = []
    for ancestor in url_obj['ancestors']:
        context.append(comments_by_post[post_id][ancestor]['body'])
    return context

In [42]:
'''
Simple method to scrape text from URLs. Not very robust. Need to handle exceptions. YouTube links take very long
'''

class TimeoutException(Exception):
    pass

def handler(signum, frame):
    # print("Time Exceeded!")
    raise TimeoutException

def scrape(url: str) -> str:
    try:
        html = urlopen(url).read()
    except Exception as e:
        return ""

    soup = BeautifulSoup(html, features="html.parser")

    paragraphs = soup.find_all("p")

    alphabet_checker = re.compile('[a-zA-z]')

    for paragraph in paragraphs:
        paragraph_text = paragraph.get_text()
        if alphabet_checker.findall(paragraph_text):
            if 'wikipedia.org' in url:
                return paragraph_text
            # print(paragraph.get_text())

    # kill all script and style elements
    for script in soup(["script", "style"]):
        script.extract()    # rip it out

    # get text
    text = soup.get_text()

    # break into lines and remove leading and trailing space on each
    lines = (line.strip() for line in text.splitlines())
    # break multi-headlines into a line each
    chunks = (phrase.strip() for line in lines for phrase in line.split("  "))
    # drop blank lines and short lines
    text = ""
    for chunk in chunks:
        if chunk and len(chunk) > 50:
            # inspired by https://bigscience.huggingface.co/blog/building-a-tb-scale-multilingual-dataset-for-language-modeling#:~:text=Filters%2C%20tools%2C%20and%20indicators%20of%20data%20quality
            only_text = re.sub('[^ \w\*]', '', chunk)
            if len(only_text) / len(line) > 0.8:
                text += chunk

    return text

In [43]:
scrape("http://en.wikipedia.org/wiki/H1N1")

'In virology, influenza A virus subtype H1N1 (A/H1N1) is a subtype of Influenza A virus. Well known outbreaks of H1N1 strains in humans include the Spanish flu, the 1977 Russian flu pandemic and the 2009 swine flu pandemic. It is an orthomyxovirus that contains the glycoproteins hemagglutinin and neuraminidase. For this reason, they are described as H1N1, H1N2 etc., depending on the type of H or N antigens they express with metabolic synergy. Hemagglutinin causes red blood cells to clump together and binds the virus to the infected cell. Neuraminidase is a type of glycoside hydrolase enzyme which helps to move the virus particles through the infected cell and assist in budding from the host cells.[1]\n'

In [9]:
# explore distribution of domains (help us with parsing)
domains = {}
for url in urls_with_context:
    try:
        domain = urlparse(url['url']).netloc
    except:
        print(url['url'])
    if domain not in domains:
        domains[domain] = 0
    domains[domain] += 1

sorted_domains = sorted([(domain, domains[domain]) for domain in domains], reverse=True, key=lambda x: x[1])
print(sorted_domains[-100:])

http://www.enamerique.com[(2(http://www.enamerique.net
[('browsehappy.com', 1), ('itiz.in', 1), ('altavista.com', 1), ('www.yahoo.com', 1), ('www.thesidebar.org', 1), ('www.pickntools.com', 1), ('ni4d.us', 1), ('nemesis.thewavelength.net', 1), ('www.northernsun.com', 1), ('www.schoolphysics.co.uk', 1), ('lost-found.se', 1), ('netradio.dr.dk', 1), ('www.classicfm.co.uk', 1), ('www.cyriak.co.uk', 1), ('manga.clone-army.org', 1), ('cbc.ca', 1), ('usgovinfo.about.com', 1), ('www.partyben.com', 1), ('data.bls.gov', 1), ('www.salem-news.com', 1), ('z.about.com', 1), ('svetlana14s.narod.ru', 1), ('www.clubsnap.com', 1), ('www.economistsubscriptions.com', 1), ('www.economistacademic.com', 1), ('bastiat.org', 1), ('www.solwise.co.uk', 1), ('www.scibooks.org', 1), ('www.crcbermuda.com', 1), ('bushwells.wordpress.com', 1), ('www.stiffs.com', 1), ('www.famous-people-', 1), ('realcostofprisons.org', 1), ('moultano.blogspot.com', 1), ('lucumr.pocoo.org', 1), ('www.houseofnumbers.com', 1), ('www.lega

In [55]:
# make a basic training data set for bbc
# url_keywords = ['youtube.com']

# url_keywords = ['bbc.co.uk', 'cnn.com', 'wikipedia.org', 'www.youtube.com', 'www.imdb.com', 'www.washingtonpost.com']

training_data = []

for i, url in enumerate(urls_with_context):
    
    if i % 100 == 0: 
        # print(i / len(urls_with_context))
        print(i, len(training_data))
    
    if len(training_data) == 6000:
        break

    try:
        signal.signal(signal.SIGALRM, handler)
        signal.alarm(5) # 5 second timeout
        
        text = scrape(url['url'])
        
        signal.alarm(0) # disable alarm
        
        if text != '':
            url['text'] = text
            training_data.append(url)
        # else:
            # print(url['url'])
    except Exception as e:
        print(url['url'], e.__class__.__name__)

#     for keyword in url_keywords:
#         if keyword in url['url']:
            
# #             print(url['url'])

#             try:
#                 signal.signal(signal.SIGALRM, handler)
#                 signal.alarm(5) # 5 second timeout
                
#                 text = scrape(url['url'])
                
#                 signal.alarm(0) # disable alarm
                
#                 if text != '':
#                     url['text'] = text
#                     training_data.append(url)
#             except:
#                 print(url['url'])

0
45
106
138
181
210
247
288
327
365
404


Some characters could not be decoded, and were replaced with REPLACEMENT CHARACTER.


435
478
512


Some characters could not be decoded, and were replaced with REPLACEMENT CHARACTER.


543


Some characters could not be decoded, and were replaced with REPLACEMENT CHARACTER.
Some characters could not be decoded, and were replaced with REPLACEMENT CHARACTER.


581
612


Some characters could not be decoded, and were replaced with REPLACEMENT CHARACTER.
Some characters could not be decoded, and were replaced with REPLACEMENT CHARACTER.


652
688


Some characters could not be decoded, and were replaced with REPLACEMENT CHARACTER.


717
752
790


Some characters could not be decoded, and were replaced with REPLACEMENT CHARACTER.


820
861


Some characters could not be decoded, and were replaced with REPLACEMENT CHARACTER.
Some characters could not be decoded, and were replaced with REPLACEMENT CHARACTER.


897
928
964
1006
1044


Some characters could not be decoded, and were replaced with REPLACEMENT CHARACTER.


1082




1123


Some characters could not be decoded, and were replaced with REPLACEMENT CHARACTER.


1166
1194
1240
1278
1303
1341


  warn('Trying to detect encoding from a tiny portion of ({}) byte(s).'.format(length))
Some characters could not be decoded, and were replaced with REPLACEMENT CHARACTER.


1371
1413
1449
1488
1521
1552
1590


Some characters could not be decoded, and were replaced with REPLACEMENT CHARACTER.


1622


Some characters could not be decoded, and were replaced with REPLACEMENT CHARACTER.


1662
1702


Some characters could not be decoded, and were replaced with REPLACEMENT CHARACTER.


1740
1776
1809


Some characters could not be decoded, and were replaced with REPLACEMENT CHARACTER.


1849
1885
1911
1940
1986
2029
2052


Some characters could not be decoded, and were replaced with REPLACEMENT CHARACTER.


2092


Some characters could not be decoded, and were replaced with REPLACEMENT CHARACTER.
Some characters could not be decoded, and were replaced with REPLACEMENT CHARACTER.


2117


Some characters could not be decoded, and were replaced with REPLACEMENT CHARACTER.


2141


Some characters could not be decoded, and were replaced with REPLACEMENT CHARACTER.


2163
2196
2235


Some characters could not be decoded, and were replaced with REPLACEMENT CHARACTER.


2276


Some characters could not be decoded, and were replaced with REPLACEMENT CHARACTER.
Some characters could not be decoded, and were replaced with REPLACEMENT CHARACTER.
Some characters could not be decoded, and were replaced with REPLACEMENT CHARACTER.


2315


Some characters could not be decoded, and were replaced with REPLACEMENT CHARACTER.


http://www.datejesus.com/ TimeoutException
2336


Some characters could not be decoded, and were replaced with REPLACEMENT CHARACTER.


2368
2408


Some characters could not be decoded, and were replaced with REPLACEMENT CHARACTER.


2451


Some characters could not be decoded, and were replaced with REPLACEMENT CHARACTER.


2481
2514
2548
http://www.thestranger.com/seattle/Content?oid=30811 TimeoutException
2583


Some characters could not be decoded, and were replaced with REPLACEMENT CHARACTER.


2622


Some characters could not be decoded, and were replaced with REPLACEMENT CHARACTER.


2663
2699
2740
2776
2805
2839
2877
2915
2950


Some characters could not be decoded, and were replaced with REPLACEMENT CHARACTER.
Some characters could not be decoded, and were replaced with REPLACEMENT CHARACTER.


2990


Some characters could not be decoded, and were replaced with REPLACEMENT CHARACTER.


3035


Some characters could not be decoded, and were replaced with REPLACEMENT CHARACTER.


3079
3126
3159
3207
3246


Some characters could not be decoded, and were replaced with REPLACEMENT CHARACTER.


3277
3319
http://www.visual-memory.co.uk/amk/doc/0055.html TimeoutException
3356
http://www.sphinxsearch.com/docs/current.html#conf-sql-query TimeoutException
3385
3426
3463
http://rkba.org/research/cramer/shall-issue.html TimeoutException
3502
3548
3593
3627
3657
3693
3728
3765
3797
3823


Some characters could not be decoded, and were replaced with REPLACEMENT CHARACTER.


3868
3898
3923
3964


Some characters could not be decoded, and were replaced with REPLACEMENT CHARACTER.


4005
4044
4077
4116
4148


Some characters could not be decoded, and were replaced with REPLACEMENT CHARACTER.
Some characters could not be decoded, and were replaced with REPLACEMENT CHARACTER.


4181


Some characters could not be decoded, and were replaced with REPLACEMENT CHARACTER.


4223
4256
4296
4341


Some characters could not be decoded, and were replaced with REPLACEMENT CHARACTER.


4375
4409
4438
4475


Some characters could not be decoded, and were replaced with REPLACEMENT CHARACTER.


4511
4543


Some characters could not be decoded, and were replaced with REPLACEMENT CHARACTER.


4577


Some characters could not be decoded, and were replaced with REPLACEMENT CHARACTER.


http://terminatorsalvation.pizzahut.com.edgesuite.net/media/flvs/TerminatorETRL_High.flv TimeoutException


Some characters could not be decoded, and were replaced with REPLACEMENT CHARACTER.


4612


Some characters could not be decoded, and were replaced with REPLACEMENT CHARACTER.


4636


Some characters could not be decoded, and were replaced with REPLACEMENT CHARACTER.


4661
4691
4725
4763
4802


Some characters could not be decoded, and were replaced with REPLACEMENT CHARACTER.


4831
4860
4894
4943
4976
5022
5067


Some characters could not be decoded, and were replaced with REPLACEMENT CHARACTER.
Some characters could not be decoded, and were replaced with REPLACEMENT CHARACTER.


5104
5137
5177
5215


Some characters could not be decoded, and were replaced with REPLACEMENT CHARACTER.


5248
http://dev.w3.org/html5/spec/Overview.html. TimeoutException
http://www.whatwg.org/specs/web-apps/current-work/ TimeoutException
5277
http://stuffwhitepeoplelike.com/2008/11/18/116-black-music-that-black-people-dont-listen-to-anymore/ TimeoutException
5301
5333
5376
5414


Some characters could not be decoded, and were replaced with REPLACEMENT CHARACTER.
Some characters could not be decoded, and were replaced with REPLACEMENT CHARACTER.


5464


Some characters could not be decoded, and were replaced with REPLACEMENT CHARACTER.


5503
5535


Some characters could not be decoded, and were replaced with REPLACEMENT CHARACTER.


5577
5609


Some characters could not be decoded, and were replaced with REPLACEMENT CHARACTER.


5636


Some characters could not be decoded, and were replaced with REPLACEMENT CHARACTER.


5675


Some characters could not be decoded, and were replaced with REPLACEMENT CHARACTER.


5708


Some characters could not be decoded, and were replaced with REPLACEMENT CHARACTER.


5735


Some characters could not be decoded, and were replaced with REPLACEMENT CHARACTER.


5770
5805
5839


Some characters could not be decoded, and were replaced with REPLACEMENT CHARACTER.
Some characters could not be decoded, and were replaced with REPLACEMENT CHARACTER.


5879
5922


Some characters could not be decoded, and were replaced with REPLACEMENT CHARACTER.


5959
5991


In [61]:
len(training_data)

6000

In [56]:
# sets up a general training data set by adding context of all comments

for example in training_data:
    # get text of comment + ancestor comments
    context = get_context(example)
    example['full_context'] = context

pickle.dump(training_data, open('../data/reddit/6000_scrape_raw.pkl', 'wb'))

In [59]:
# format training data for pyserini (https://github.com/castorini/pyserini/)

pyserini_retrieval_docs = []
relevance_scores = []
for i,example in enumerate(training_data):
    doc = {"id": i, "contents": example['text']}
    pyserini_retrieval_docs.append(doc)
    relevance_score = str(i) + ' 0 ' + str(i) + ' 1'
    relevance_scores.append(relevance_score)

with open('../data/reddit/pyserini/bbc_news_pyserini.jsonl', 'w') as f:
    for doc in pyserini_retrieval_docs:
        f.write(json.dumps(doc) + '\n')

with open('../data/reddit/bbc_news_rel.txt', 'w') as f:
    for rs in relevance_scores:
        f.write(rs + '\n')

In [64]:
import bz2
import _pickle as cPickle

def compressed_pickle(title, data):
    with bz2.BZ2File(title + '.pbz2', 'w') as f: 
        cPickle.dump(data, f)

In [66]:
compressed_pickle('../data/reddit/6000_scrape_raw', training_data)