In [2]:
import json
import re
from urllib.request import urlopen
import json
import html2text
from bs4 import BeautifulSoup
import requests

In [4]:
'''
Collect all comments by the post
Only keep posts with at least 5 comments
Takes about 3 minutes to run

To get the reddit data (chosen arbitrarily), go to the data dir and run

mkdir reddit
cd reddit
wget https://files.pushshift.io/reddit/comments/RC_2009-05.bz2
bzip2 -d RC_2009-05.bz2
'''

# Posts with fewer comments than this are discarded after loading.
MIN_COMMENTS_PER_POST = 5

# Maps a post's link_id -> {comment id -> full comment record (dict parsed from one JSON line)}.
comments_by_post = {}

# Explicit UTF-8 so decoding does not depend on the platform's default locale encoding
# (assumes the dump is UTF-8 encoded JSON lines - TODO confirm for other dumps).
with open('../data/reddit/RC_2009-05', 'r', encoding='utf-8') as f:
    for line in f:
        # Tolerate blank lines (e.g. a trailing newline) instead of crashing in json.loads.
        if not line.strip():
            continue
        d = json.loads(line)
        # Store the html2text-converted body rather than the raw one.
        d['body'] = html2text.html2text(d['body'])
        comments_by_post.setdefault(d['link_id'], {})[d['id']] = d

# Keep only posts that have at least MIN_COMMENTS_PER_POST comments.
comments_by_post = {
    link_id: post_comments
    for link_id, post_comments in comments_by_post.items()
    if len(post_comments) >= MIN_COMMENTS_PER_POST
}

In [35]:
def collect_ancestors(comments: dict, comment_id: str) -> list:
    '''
    Reconstruct the chain of comments leading from the top of a thread down to `comment_id`.

    Parameters:
        comments:   mapping of comment ID -> comment record; each record is expected
                    to carry a 'parent_id' entry
        comment_id: ID of the comment to start from, either bare or prefixed with 't1_'

    Returns:
        A list of bare comment IDs ordered from the top-most ancestor found down to
        (and including) the starting comment.

    The walk stops when:
      * the parent reference starts with 't3_' (a link, i.e. the comment was top-level),
      * a referenced comment (or its 'parent_id') is missing from `comments`; the
        partial chain collected so far is returned,
      * a comment is encountered twice (guards against looping forever on cyclic data).
    '''
    ancestors = []
    visited = set()
    current = comment_id

    while True:
        # Match the full 't3_' / 't1_' prefixes (including the underscore) so a bare
        # ID that merely happens to begin with 't3' or 't1' is not misinterpreted.
        if current.startswith('t3_'):
            # refers to a link: we've reached the top of the chain
            break

        if current.startswith('t1_'):
            current = current[3:]

        if current in visited:
            # malformed data: the parent chain loops back on itself
            break

        try:
            # the referenced comment is sometimes not present in `comments`;
            # in that case return whatever part of the chain was found
            parent_id = comments[current]['parent_id']
        except KeyError:
            break

        visited.add(current)
        ancestors.append(current)
        current = parent_id

    # collected bottom-up; callers expect top-down order
    ancestors.reverse()
    return ancestors

In [79]:
'''
Cycle through all posts and comments, find any URL mentions, and save the mention location + the comment's ancestors
'''
# Matches an http(s) URL up to the next whitespace character. Compiled once, reused for every comment.
URL_PATTERN = re.compile(r'(https?://\S+)')

# One entry per URL mention: where it was found plus the chain of comments leading to it.
all_urls = []
for post_id, post_comments in comments_by_post.items():
    for comment_id, comment in post_comments.items():
        # first check if the comment body contains any URL
        urls = URL_PATTERN.findall(comment['body'])
        if not urls:
            continue

        # the ancestor chain is the same for every URL in this comment, so compute it once
        ancestors = collect_ancestors(post_comments, comment_id)
        for url in urls:
            # Bodies contain markdown links of the form [text](url), so the match can end
            # with a closing parenthesis; drop every ')' from the URL.
            # NOTE(review): this also strips parentheses that legitimately belong to a URL.
            url = url.replace(')', '')
            all_urls.append({'post_id': post_id, 'comment_id': comment_id, 'url': url, 'ancestors': ancestors})

# Keep only mentions whose chain (which includes the mentioning comment itself) has more than 2 entries.
urls_with_context = [x for x in all_urls if len(x['ancestors']) > 2]

In [84]:
'''
Loops through all ancestors of a URL comment and returns the chain up to the top comment
'''

def display_context(url_obj: dict) -> list:
    '''
    Return the comment bodies for every ID in url_obj['ancestors'], in the same order.

    `url_obj` must provide 'post_id' and 'ancestors'; the bodies are looked up in the
    module-level `comments_by_post` under that post.
    '''
    owning_post = url_obj['post_id']
    return [
        comments_by_post[owning_post][ancestor_id]['body']
        for ancestor_id in url_obj['ancestors']
    ]

In [67]:
'''
Simple method to scrape text from URLs. Not very robust. Need to handle exceptions. YouTube links take very long
'''

def scrape(url: str, timeout: float = 30.0) -> str:
    '''
    Download `url` and return the visible text of the page.

    Parameters:
        url:     address to fetch
        timeout: seconds to wait on the connection before giving up, so that a
                 single slow or unresponsive host cannot stall the whole run

    Returns:
        The page text with <script>/<style> content removed, one non-empty chunk
        per line. If the download fails for any reason, the error message is
        returned as a string instead (best-effort: this function never raises
        for download errors).
    '''
    try:
        # the context manager makes sure the connection is closed once the body is read
        with urlopen(url, timeout=timeout) as response:
            html = response.read()
    except Exception as e:
        # honour the declared `-> str` return type instead of handing back the exception object
        return str(e)

    soup = BeautifulSoup(html, features="html.parser")

    # remove all script and style elements so their contents don't end up in the text
    for tag in soup(["script", "style"]):
        tag.extract()

    text = soup.get_text()

    # strip leading and trailing whitespace from each line
    lines = (line.strip() for line in text.splitlines())
    # split each line further wherever a double space occurs
    chunks = (phrase.strip() for line in lines for phrase in line.split("  "))
    # drop empty chunks and join the rest, one per line
    return '\n'.join(chunk for chunk in chunks if chunk)

In [83]:
# Take one sample URL mention, then show the comment chain leading to it
# followed by the text scraped from the linked page.
url_finding = urls_with_context[7]

context_chain = display_context(url_finding)
print(context_chain)

page_text = scrape(url_finding['url'])
print(page_text)

{'post_id': 't3_8gsuf', 'comment_id': 'c098rmv', 'url': 'http://en.wikipedia.org/wiki/Jeet_Kune_Do', 'ancestors': ['c098ocj', 'c098r4w', 'c098rmv']}
["JKD Concepts would say he didn't do what's right. Ideally you want to\nannihilate your opponent, and then run like hell, but that gets you into jail\neven faster :(.\n\n", "What's JKD?\n\n", '[Jeet Kun Do](http://en.wikipedia.org/wiki/Jeet_Kune_Do)\n\n']
Jeet Kune Do - Wikipedia
Jeet Kune Do
From Wikipedia, the free encyclopedia
Jump to navigation
Jump to search
Hybrid martial art
This article has multiple issues. Please help improve it or discuss these issues on the talk page. (Learn how and when to remove these template messages)
This article possibly contains original research. Please improve it by verifying the claims made and adding inline citations. Statements consisting only of original research should be removed. (March 2019) (Learn how and when to remove this template message)This article needs additional citations for verificat