In [1]:
import requests
import numpy as np
import pandas as pd
from datetime import datetime

# Module-level HTTP session; Scraper.get and Scraper.get_max send every
# request through this one object.
client = requests.Session()

class Scraper:
    """Accumulates Hacker News API items into column-oriented tables.

    Every table attribute (``users``, ``stories``, ``jobs``, ``comments``,
    ``polls``, ``pollopts``, ``parents``, ``deleted``, ``dead``, ``scrape``)
    is a dict mapping a column name to a list of values, so it can be handed
    directly to ``pd.DataFrame``. ``skipped`` is a plain list of item ids
    whose API response could not be used.

    HTTP calls go through the module-level ``client`` session.
    """

    # Seconds to wait for the API before a request is abandoned; without a
    # timeout a stalled connection would block the scrape forever.
    request_timeout = 30

    # Column layout of each table attribute, in DataFrame column order.
    _TABLE_COLUMNS = {
        'users': ('id', 'username'),
        'stories': ('id', 'title', 'by_id', 'descendants', 'score', 'time', 'url'),
        'jobs': ('id', 'title', 'text', 'by_id', 'score', 'time', 'url'),
        'comments': ('id', 'text', 'by_id', 'time'),
        'polls': ('id', 'title', 'text', 'by_id', 'descendants', 'score', 'time'),
        'pollopts': ('id', 'text', 'by_id', 'poll', 'score', 'time'),
        'parents': ('id', 'parent', 'type'),
        'deleted': ('item',),
        'dead': ('item',),
        'scrape': ('last_id',),
    }

    # API ``type`` value -> name of the table attribute that stores it.
    _TYPE_TABLES = {
        'story': 'stories',
        'job': 'jobs',
        'comment': 'comments',
        'poll': 'polls',
        'pollopt': 'pollopts',
    }

    def __init__(self):
        """Start with every table empty."""
        self.empty_dicts()

    def empty_dicts(self):
        """Reset every table to empty column lists and clear ``skipped``."""
        for table_name, columns in self._TABLE_COLUMNS.items():
            # Fresh list objects per column so tables never share storage.
            setattr(self, table_name, {column: [] for column in columns})
        self.skipped = []

    def get(self, id):
        """Fetch one item from the API and return its decoded JSON body.

        The decoded body is whatever the endpoint returned; ``to_dict``
        treats anything that is not a dict with ``id`` and ``type`` as
        unusable.
        """
        url = f'https://hacker-news.firebaseio.com/v0/item/{id}.json'
        response = client.get(url, timeout=self.request_timeout)
        return response.json()

    def get_max(self):
        """Return the API's current largest item id as an int."""
        url = 'https://hacker-news.firebaseio.com/v0/maxitem.json'
        response = client.get(url, timeout=self.request_timeout)
        return int(response.text)

    def _user_id(self, username):
        """Return the 1-based surrogate id for ``username``, registering it if new.

        A missing author arrives here as ``np.nan`` and is registered like
        any other name; list membership matches it by identity, so all
        author-less items share one users row.
        """
        usernames = self.users['username']
        try:
            return usernames.index(username) + 1
        except ValueError:
            by_id = len(self.users['id']) + 1
            self.users['id'].append(by_id)
            usernames.append(username)
            return by_id

    def to_dict(self, input_id):
        """Fetch item ``input_id`` and append it to the table for its type.

        Records ``input_id`` as ``scrape['last_id']`` before fetching. If the
        response is not a dict carrying both ``id`` and ``type`` (for example
        a JSON ``null``), ``input_id`` is appended to ``skipped`` and nothing
        else changes. Otherwise:

        * the id is added to ``deleted`` / ``dead`` when those flags are truthy;
        * the author is mapped to a surrogate ``by_id`` via the users table;
        * one row is appended to the table matching the item type, with
          ``np.nan`` for any field the response lacks;
        * comments additionally get a row in ``parents``.

        Items of an unrecognised type still register their author but are
        otherwise dropped, as before.
        """
        self.scrape['last_id'] = [input_id]
        response = self.get(input_id)

        # Sanity check. Use input_id for the skip record: the response's own
        # id is exactly what may be missing here.
        if (
            not isinstance(response, dict)
            or 'id' not in response
            or 'type' not in response
        ):
            self.skipped.append(input_id)
            return

        item_id = response['id']
        item_type = response['type']

        if response.get('deleted'):
            self.deleted['item'].append(item_id)
        if response.get('dead'):
            self.dead['item'].append(item_id)

        by_id = self._user_id(response.get('by', np.nan))

        table_name = self._TYPE_TABLES.get(item_type)
        if table_name is None:
            return

        if item_type == 'comment':
            self.parents['id'].append(item_id)
            self.parents['parent'].append(response.get('parent', np.nan))
            self.parents['type'].append('comment')

        table = getattr(self, table_name)
        for column in self._TABLE_COLUMNS[table_name]:
            if column == 'id':
                value = item_id
            elif column == 'by_id':
                value = by_id
            else:
                # Remaining columns share their name with the API field.
                value = response.get(column, np.nan)
            table[column].append(value)

In [2]:
# Fetch items 1 through 99 one request at a time; each call appends the item
# to the matching table on `scraper`.
scraper = Scraper()
for i in range(1, 100):
    scraper.to_dict(i)

In [3]:
# Show the comment rows collected so far as a DataFrame.
pd.DataFrame(scraper.comments)

Unnamed: 0,id,text,by_id,time
0,15,&#34;the rising star of venture capital&#34; -...,8,1160423461
1,17,Is there anywhere to eat on Sandhill Road?,1,1160423565
2,22,It's kind of funny that Sevin Rosen is giving ...,1,1160446702
3,23,"This is interesting, but the limitations becom...",9,1160447453
4,30,Stay tuned...,11,1160494499
5,31,I'm tuned...,1,1160494805
6,33,winnar winnar chicken dinnar!,11,1160495440
7,34,what do you mean? this story's still not #1,1,1160495633
8,35,perhaps if i hadn't told you it was coming\r\n...,11,1160495862
9,36,Can you do it again?,1,1160496061


In [6]:
from sqlalchemy import create_engine
import os
from urllib.parse import quote

# Credentials come from the environment so they never appear in the notebook.
user = os.environ['DBUSER']
pw = os.environ['DBPW']

# Percent-encode both parts before embedding them in the URL: a raw '@', ':',
# '/' or '%' in the password would otherwise be read as URL structure and the
# connection string would be mis-parsed. safe='' encodes every reserved
# character; plain alphanumeric credentials pass through unchanged.
engine = create_engine(
    f'postgresql://{quote(user, safe="")}:{quote(pw, safe="")}@localhost:5432/hn'
)

In [None]:
# Read the entire `users` table from the database into a DataFrame, using a
# connection obtained from engine.begin() for the duration of the block.
with engine.begin() as con:
    df = pd.read_sql(sql='SELECT * FROM users;', con=con)