In [1]:
import json
import re
from urllib.request import urlopen
import json
import html2text
from bs4 import BeautifulSoup
from urllib.parse import urlparse
import pickle

In [31]:
'''
Collect all comments by the post
Only keep posts with more than 5 comments
Takes about 3 minutes to run on timan107

To get the reddit data (chosen arbitrarily), go to the data dir and run

mkdir reddit
cd reddit
wget https://files.pushshift.io/reddit/comments/RC_2009-05.bz2
bzip2 -d RC_2009-05.bz2
'''

comments_by_post = {}
with open('../data/reddit/RC_2009-05', 'r') as f:
    for line in f:
        d = json.loads(line)
        link_id = d['link_id']
        if link_id not in comments_by_post:
            comments_by_post[link_id] = {}
        d['body'] = html2text.html2text(d['body'])
        comments_by_post[link_id][d['id']] = d
        
for key in list(comments_by_post.keys()):
    if len(comments_by_post[key]) < 5:
        del comments_by_post[key]

In [None]:
comments_by_post['t3_8gupc']

In [10]:
def collect_ancestors(comments: dict, comment_id: str) -> list:
    '''
    For a given list of comments and a comment in the list, reconstruct a path to the top-level comment
    Returns a list of comment IDs

    This method isn't very efficient but good enough for now
    '''
    ancestors = []
    while True:
        
        if comment_id[:2] == 't3': 
            # refers to a link (top-level comment)
            # means we've reached the top of the chain
            return ancestors[::-1]

        if comment_id[:2] == 't1':
            comment_id = comment_id[3:]

        try:
            # there is an error here sometimes where the comment id is not present in the list
            # probably fine for now, but may need to address in the future
            old_comment_id = comment_id
            comment_id = comments[comment_id]['parent_id']
            ancestors.append(old_comment_id)
        except:
            return ancestors[::-1]

In [201]:
# re.findall(r'https?:\/\/(www\.)?[-a-zA-Z0-9@:%._\+~#=]{1,256}\.[a-zA-Z0-9()]{1,6}\b([-a-zA-Z0-9()@:%_\+.~#?&//=]*)', '[http://en.wikipedia.org/wiki/List_of_best-selling_video_games#PC](http://en.wikipedia.org/wiki/List_of_best-selling_video_games#PC)')
re.findall(r'(https?://\S+-\n)?(?(1)([\S]*)|(https?://\S+))', '[http://en.wikipedia.org/wiki/List_of_best-selling_video_games#PC](http://en.wikipedia.org/wiki/List_of_best-selling_video_games#PC)')

[('',
  '',
  'http://en.wikipedia.org/wiki/List_of_best-selling_video_games#PC](http://en.wikipedia.org/wiki/List_of_best-selling_video_games#PC)')]

In [195]:
'''
Cycle through all posts and comments, find any URL mentions, and save the mention location + the comment's ancestors
'''
all_urls = []
for post_id in comments_by_post:
    for comment_id in comments_by_post[post_id]:
        
        # first check if post body contains URL
        urls = re.findall(r'(https?://\S+)', comments_by_post[post_id][comment_id]['body'])
        len1 = len(urls)

        # check if post body contains URL, accounts for edge case when dash is at the end of the line
        current_comment_text = comments_by_post[post_id][comment_id]['body']
        urls = re.findall(r'(https?://\S+-\n)?(?(1)([\S]*)|(https?://\S+))', current_comment_text)
        len2 = len(urls)
        
        if len1 != len2:
            print(current_comment_text)
                
        if urls:
            ancestors = collect_ancestors(comments_by_post[post_id], comment_id)
            
            for url in urls:
                url = "".join(list(url))

                # heuristics for parsing errors
                url = re.sub('\)', '', url)
                url = re.sub('\]', '', url)
                url = re.sub('\n', '', url)
                
                # remove non-alphnumeric characters
                url_letters = re.sub('[^0-9a-zA-Z]', '', url)
                                
                # ignore pdfs
                if 'pdf' == url_letters[-3:] or 'jpg' in url_letters[-3:] or 'png' in url_letters[-3:] or 'gif' in url_letters[-3:]:
                    continue
                
                all_urls.append({'post_id': post_id, 'comment_id': comment_id, 'url': url, 'ancestors': ancestors})

urls_with_context = [x for x in all_urls if len(x['ancestors']) > 2]

[http://en.wikipedia.org/wiki/List_of_best-
selling_video_games#PC](http://en.wikipedia.org/wiki/List_of_best-
selling_video_games#PC)


You could try YesScript. It allows JS by default but you can blacklist any
site with a click of its icon. [https://addons.mozilla.org/en-
US/firefox/addon/4922](https://addons.mozilla.org/en-US/firefox/addon/4922)


You don't need a keel to track a canoe. That's a myth.
[http://ask.metafilter.com/19260/Wheres-the-
keel](http://ask.metafilter.com/19260/Wheres-the-keel)


The really bad thing is they didn't even try to make it work:
[https://bugs.launchpad.net/ubuntu/+source/update-
manager/+bug/369706](https://bugs.launchpad.net/ubuntu/+source/update-
manager/+bug/369706)


This seems to be saying that women like trivial shiny things that are
extremely easy to use and don't have any real purpose or function. I guess the
author must have realized this halfway through writing the post, because this
nonsense about "depth" and "soul" is a load of doublethi

She has a deviant art site, too, with other good stuff:
[http://cedarseed.deviantart.com/gallery/#Tutorials-and-
Stock](http://cedarseed.deviantart.com/gallery/#Tutorials-and-Stock)


I don't know, but I'd guess it probably is better overall. That said, I know
there have been issues where they've tested the compiled code in (buggy) ARM
emulators rather than on real hardware, resulting in the compiler generating
code that would [_crash_ on real
hardware](http://74.125.95.132/search?q=cache:JP4fggHMijMJ:hardwarebug.org/2008/10/11/codesourcery-
gcc-2008q3-fail/+http://hardwarebug.org/2008/10/11/codesourcery-
gcc-2008q3-fail/&cd=1&hl=en&ct=clnk&gl=us&client=firefox-a). And I'm aware of
at least [one
other](http://74.125.95.132/search?q=cache:3XnB69GJecMJ:hardwarebug.org/2008/11/28/codesourcery-
fails-again/+http://hardwarebug.org/2008/11/28/codesourcery-fails-
again/&cd=1&hl=en&ct=clnk&gl=us&client=firefox-a) bug resulting in incorrect
code generation. Presumably these bugs have been fixed

>it's well known that these can be trivially opened by using the shaft of a
ballpoint pen It's not "well known", it was a fluke with a particular type of
kryptonite lock. IIRC, the company graciously allowed people to return them
and be sent new ones. Anyone with big enough bolt cutters or a good hacksaw
can get through *any* lock. Nothing is secure, you just have to make it as
annoying as possible to steal something, and lower the payoff. Do it the way
that the Dutch do it. Get a sturdy bike, but one with zero bling. Hell, get a
few cans of spray paint and trash the frame. No quick-release anything, it
should take some tools to take the seat, wheels, etc off the bike. Always use
two locks. I use a convenient [rear-wheel
lock](http://www.jstevenwood.com/images/bikelock.jpg), which immobilizes the
rear wheel and (more importantly) forces the habit of locking your bike (works
like a car ignition system). Second lock should be a [big fucking
chain](http://images.google.com/imgres?imgurl=h

Ubuntu Pre-Installed [https://help.ubuntu.com/community/UbuntuPre-
installed](https://help.ubuntu.com/community/UbuntuPre-installed)


Liberty means, "I decide, not the government - even if it's a decision to be
foolish". Edit: I'm stating a definition, not a personal opinion or political
philosophy. Here are some
[d](http://www.answers.com/topic/liberty)ic[ti](http://www.merriam-
webster.com/dictionary/liberty)on[ary](http://www.thefreedictionary.com/liberty)
li[nks](http://www.google.com/search?q=define:liberty) if you have never
looked the term up.


InfoQ offers a transcript of the interview, but as costumary it's inside an
impossibly small text box. I've posted to the full transcript in a more
readable format to my blog:
[http://olifante.blogs.com/covil/2009/05/transcript-of-rich-hickey-
interview.html!](http://olifante.blogs.com/covil/2009/05/transcript-of-rich-
hickey-interview.html)


Uh oh... the aliens have fired back:
[http://timesofindia.indiatimes.com/Blast-rocks-Lahore-ma

In [187]:
len(all_urls)

78216

In [185]:
len(all_urls)

79297

In [175]:
all_urls

[{'post_id': 't3_8gupc',
  'comment_id': 'c098olp',
  'url': 'http://65.127.124.62/south_asia/4483241.stm.htm',
  'ancestors': ['c098olp']},
 {'post_id': 't3_8gupc',
  'comment_id': 'c098pvz',
  'url': 'http://www.reddit.com/r/AskReddit/comments/8gt1l/question_redditors_how_many_people_besides_myself/c098ann...',
  'ancestors': ['c098pvz']},
 {'post_id': 't3_8gupc',
  'comment_id': 'c098pwy',
  'url': 'http://www.reddit.com/r/pics/comments/8gx7f/swineflu_zombie_strikes_again/',
  'ancestors': ['c098pwy']},
 {'post_id': 't3_8gupc',
  'comment_id': 'c098ss8',
  'url': 'http://www.youtube.com/watch?v=9k18Q_63KpE&feature=PlayList&p=68814390F8D6077D&playnext=1&playnext_from=PL&index=2',
  'ancestors': ['c098ss8']},
 {'post_id': 't3_8gupc',
  'comment_id': 'c098tgm',
  'url': 'http://tinyurl.com/cceu2y',
  'ancestors': ['c098tgm']},
 {'post_id': 't3_8gupc',
  'comment_id': 'c09915p',
  'url': 'http://en.wikipedia.org/wiki/Thanatos',
  'ancestors': ['c098nlj', 'c09915p']},
 {'post_id': 't3_8g

In [17]:
all_urls

[{'post_id': 't3_8gupc',
  'comment_id': 'c098olp',
  'url': 'http://65.127.124.62/south_asia/4483241.stm.htm',
  'ancestors': ['c098olp']},
 {'post_id': 't3_8gupc',
  'comment_id': 'c098os3',
  'url': 'http://imgur.com/2NXoe.jpg.',
  'ancestors': ['c098os3']},
 {'post_id': 't3_8gupc',
  'comment_id': 'c098pvz',
  'url': 'http://www.reddit.com/r/AskReddit/comments/8gt1l/question_redditors_how_many_people_besides_myself/c098ann...',
  'ancestors': ['c098pvz']},
 {'post_id': 't3_8gupc',
  'comment_id': 'c098pwy',
  'url': 'http://www.reddit.com/r/pics/comments/8gx7f/swineflu_zombie_strikes_again/',
  'ancestors': ['c098pwy']},
 {'post_id': 't3_8gupc',
  'comment_id': 'c098ss8',
  'url': 'http://www.youtube.com/watch?v=9k18Q_63KpE&feature=PlayList&p=68814390F8D6077D&playnext=1&playnext_from=PL&index=2',
  'ancestors': ['c098ss8']},
 {'post_id': 't3_8gupc',
  'comment_id': 'c098tgm',
  'url': 'http://tinyurl.com/cceu2y',
  'ancestors': ['c098tgm']},
 {'post_id': 't3_8gupc',
  'comment_id':

In [176]:
'''
Loops through all ancestors of a URL comment and returns the chain up to to the top comment
'''

def get_context(url_obj: dict) -> list:
    post_id = url_obj['post_id']
    context = []
    for ancestor in url_obj['ancestors']:
        context.append(comments_by_post[post_id][ancestor]['body'])
    return context

In [177]:
'''
Simple method to scrape text from URLs. Not very robust. Need to handle exceptions. YouTube links take very long
'''

# add a timeout

def scrape(url: str) -> str:
    try:
        html = urlopen(url).read()
    except Exception as e:
        return ""

    soup = BeautifulSoup(html, features="html.parser")

    # kill all script and style elements
    for script in soup(["script", "style"]):
        script.extract()    # rip it out

    # get text
    text = soup.get_text()

    # break into lines and remove leading and trailing space on each
    lines = (line.strip() for line in text.splitlines())
    # break multi-headlines into a line each
    chunks = (phrase.strip() for line in lines for phrase in line.split("  "))
    # drop blank lines and short lines
    text = '\n'.join(chunk for chunk in chunks if chunk and len(chunk) > 50)

    return text

In [178]:
# explore distribution of domains (help us with parsing)
domains = {}
for url in urls_with_context:
    try:
        domain = urlparse(url['url']).netloc
    except:
        print(url['url'])
    if domain not in domains:
        domains[domain] = 0
    domains[domain] += 1

sorted_domains = sorted([(domain, domains[domain]) for domain in domains], reverse=True, key=lambda x: x[1])
print(sorted_domains[:100])

http://www.enamerique.com[(2(http://www.enamerique.net
[('en.wikipedia.org', 6012), ('www.reddit.com', 3141), ('www.youtube.com', 2591), ('www.google.com', 386), ('www.amazon.com', 340), ('www.imdb.com', 251), ('scriptures.lds.org', 217), ('news.bbc.co.uk', 166), ('www.biblegateway.com', 160), ('www.nytimes.com', 147), ('www.flickr.com', 137), ('tinyurl.com', 126), ('video.google.com', 119), ('www.guardian.co.uk', 106), ('dictionary.reference.com', 104), ('xkcd.com', 103), ('www.merriam-webster.com', 95), ('images.google.com', 92), ('lokonline.com', 76), ('lmgtfy.com', 75), ('books.google.com', 73), ('www.urbandictionary.com', 73), ('addons.mozilla.org', 64), ('maps.google.com', 56), ('www.cnn.com', 52), ('www.washingtonpost.com', 51), ('www.timesonline.co.uk', 50), ('www.msnbc.msn.com', 49), ('www.snopes.com', 48), ('pickyourhours.com', 48), ('www.theonion.com', 47), ('www.cdc.gov', 45), ('www.newegg.com', 44), ('www.yooouuutuuube.com', 44), ('code.google.com', 43), ('www.ted.com', 43

In [None]:
# make a basic training data set for bbc
url_keywords = ['bbc.co.uk']
training_data = []

for i, url in enumerate(urls_with_context):
    if i % 1000 == 0: print(i / len(urls_with_context))
    for keyword in url_keywords:
        if keyword in url['url']:
            print(url['url'])
            try:
                text = scrape(url['url']).split('\n')
                if text != '' and text != ['']:
                    url['text'] = text
                    training_data.append(url)
            except:
                break

In [180]:
# sets up a general training data set by adding context of all comments

for example in training_data:
    # get text of comment + ancestor comments
    context = get_context(example)
    example['full_context'] = context

pickle.dump(training_data, open('../data/reddit/bbc_news_scrape_raw.pkl', 'wb'))

In [182]:
for example in training_data[2:3]:
    print(example['text'], 2 * '\n', "\n".join(example['full_context']))

["BBC News | AFRICA | Tutu condemns Israeli 'apartheid'", 'South African Archbishop Desmond Tutu has accused Israel of practising apartheid in its policies towards the Palestinians.', 'The Nobel peace laureate said he was "very deeply distressed" by a visit to the Holy Land, adding that "it reminded me so much of what happened to us black people in South Africa".', 'The Jewish lobby is powerful - very powerful. Well, so what?', 'In a speech in the United States, carried in the UK\'s Guardian newspaper, Archbishop Tutu said he saw "the humiliation of the Palestinians at checkpoints and roadblocks, suffering like us when young white police officers prevented us from moving about".', 'The archbishop, who was a leading opponent of apartheid in South Africa, said Israel would "never get true security and safety through oppressing another people".', 'Archbishop Tutu said his criticism of the Israeli Government did not mean he was anti-Semitic.', '"I am not even anti-white, despite the madnes

In [183]:
# format training data for pyserini (https://github.com/castorini/pyserini/)

pyserini_retrieval_docs = []
relevance_scores = []
for i,example in enumerate(training_data):
    doc = {"id": i, "contents": " ".join(example['text'])}
    pyserini_retrieval_docs.append(doc)
    relevance_score = str(i) + ' 0 ' + str(i) + ' 1'
    relevance_scores.append(relevance_score)

with open('../data/reddit/pyserini/bbc_news_pyserini.jsonl', 'w') as f:
    for doc in pyserini_retrieval_docs:
        f.write(json.dumps(doc) + '\n')

with open('../data/reddit/bbc_news_rel.txt', 'w') as f:
    for rs in relevance_scores:
        f.write(rs + '\n')