In [3]:
import requests
from multiprocessing import Process, Manager
from fuzzywuzzy import fuzz
from textblob import TextBlob
import re

In [4]:
def get_story(story_id, stories):
    url = 'https://hacker-news.firebaseio.com/v0/item/%d.json' % story_id
    resp = requests.get(url)
    stories.append(resp.json())
    return stories

In [5]:
def get_top_stories():
    manager = Manager()
    stories = manager.list()
    url = 'https://hacker-news.firebaseio.com/v0/topstories.json'
    ids = requests.get(url)
    processes = [Process(target=get_story, args=(sid, stories))
                 for sid in ids.json()[:40]]
    for p in processes:
        p.start()
    for p in processes:
        p.join()
    return stories

In [6]:
def get_all_comments(sid):
    manager = Manager()
    comments = manager.list()
    story = get_story(sid, [])
    if not story[0].get('kids'):
        return []
    processes = [Process(target=get_story, args=(cid, comments))
                 for cid in story[0].get('kids')]
    for p in processes:
        p.start()
    for p in processes:
        p.join()
    return [c for c in comments if not c.get('deleted')]

In [7]:
def remove_html(text):
    try:
        return re.sub('<[^<]+?>', '', text)
    except:
        print text
    return text

In [8]:
def is_match(first, second):
    ratio = fuzz.token_sort_ratio(first, second)
    if ratio > 50:
        return True
    return False

In [9]:
def find_matching_comments():
    stories = get_top_stories()
    comments = []
    while len(comments) < 1:
        for s in stories:
            comments.extend(get_all_comments(s.get('id')))
    matches = []
    comment_text = ['%s - %s' % (c.get('by'),
                                 remove_html(c.get('text'))) for c in comments]
    for c in comments:
        ctext = remove_html(c.get('text'))
        comment_text.remove('%s - %s' % (c.get('by'), ctext))
        for txt in comment_text:
            if is_match(ctext, txt):
                matches.append((c, txt))
    return matches

In [10]:
def comment_sentiment():
    stories = get_top_stories()
    comments = get_all_comments(stories[0].get('id'))
    for comm in comments:
        comm['sentiment'] = TextBlob(comm.get(
            'text')).sentiment.polarity
    comments.sort(key=lambda x: x.get('sentiment'))
    return comments

In [11]:
comment_sentiment()

[{u'by': u'seqizz',
  u'id': 9921709,
  u'kids': [9921739],
  u'parent': 9921335,
  'sentiment': -1.0,
  u'text': u'Don&#x27;t be evil (with exceptions).',
  u'time': 1437481801,
  u'type': u'comment'},
 {u'by': u'HN_bias',
  u'id': 9921714,
  u'kids': [9921748],
  u'parent': 9921335,
  'sentiment': -0.45,
  u'text': u'fuck you silicon valley money obsessed programmer. you are a traitor to hacker culture.',
  u'time': 1437481849,
  u'type': u'comment'},
 {u'by': u'dudul',
  u'id': 9921780,
  u'parent': 9921335,
  'sentiment': -0.3333333333333333,
  u'text': u'That may be shallow, but I can&#x27;t take her seriously because of the medium she picked to convey her story.<p>This is unreadable.',
  u'time': 1437482551,
  u'type': u'comment'},
 {u'by': u'deskamess',
  u'id': 9921793,
  u'parent': 9921335,
  'sentiment': -0.15833333333333333,
  u'text': u'For some reason I was under the impression the spreadsheet was set up to detect gender&#x2F;ethnicity based inequality in salaries. Not sur

In [12]:
find_matching_comments()

None
None


TypeError: s1 is None