# Dataquest Project: Hacker News Pipeline
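The goal of this project is to apply the data pipeline built in the lesson portion of this unit to a real data project. We will use JSON data from a Hacker News API and run it through a sequence of pipeline tasks that ends with the 100 most frequent keywords found in story titles.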

In [178]:
from pipeline import Pipeline
from pipeline import build_csv
import json
import io
import csv
import string
from stop_words import stop_words
from datetime import datetime

In [179]:
# Single pipeline instance for this notebook; the functions below attach
# themselves to it through the @pipeline.task(...) decorator.
pipeline = Pipeline()

In [180]:
@pipeline.task()
def file_to_json():
    """Load the Hacker News stories from ``hn_stories_2014.json``.

    The file path is relative, so it is resolved against the current
    working directory.

    Returns:
        The value stored under the top-level ``'stories'`` key of the
        parsed JSON document.

    Raises:
        FileNotFoundError: If the file cannot be found.
        json.JSONDecodeError: If the file does not contain valid JSON.
        KeyError: If the document has no ``'stories'`` key.
    """
    # JSON text is UTF-8; pin the encoding so decoding does not depend on
    # the platform's default locale (e.g. cp1252 on Windows).
    with open('hn_stories_2014.json', 'r', encoding='utf-8') as file:
        data = json.load(file)
    return data['stories']

In [184]:
@pipeline.task(depends_on=filter_stories)
def json_to_csv(filtered_stories):
    """Turn story dicts into CSV rows and hand them to ``build_csv``.

    Each story contributes one row of
    ``(objectID, created_at, url, points, title)``, where ``created_at`` is
    parsed into a ``datetime`` from the ``%Y-%m-%dT%H:%M:%SZ`` format.

    Args:
        filtered_stories: Iterable of mappings providing the five keys
            listed above.

    Returns:
        Whatever ``build_csv`` returns when given the rows, the header and a
        fresh in-memory ``io.StringIO`` buffer as its ``file`` argument.
    """
    columns = ['objectID', 'created_at', 'url', 'points', 'title']
    timestamp_format = "%Y-%m-%dT%H:%M:%SZ"
    rows = [
        (
            item['objectID'],
            datetime.strptime(item['created_at'], timestamp_format),
            item['url'],
            item['points'],
            item['title'],
        )
        for item in filtered_stories
    ]
    buffer = io.StringIO()
    return build_csv(rows, header=columns, file=buffer)

In [186]:
@pipeline.task(depends_on=json_to_csv)
def extract_titles(csv_file):
    """Lazily produce the ``title`` column of a CSV file object.

    The header row is consumed immediately to locate the ``title`` column;
    the remaining rows are only read as the returned generator is iterated.

    Args:
        csv_file: An iterable of CSV lines whose first row is a header
            containing ``'title'``.

    Returns:
        A generator yielding the title field of every data row.

    Raises:
        StopIteration: If ``csv_file`` yields no header row.
        ValueError: If the header has no ``'title'`` column.
    """
    rows = csv.reader(csv_file)
    columns = next(rows)
    title_idx = columns.index('title')

    def _titles():
        for record in rows:
            yield record[title_idx]

    return _titles()

In [189]:
@pipeline.task(depends_on=extract_titles)
def clean_titles(titles):
    """Yield each title lower-cased with punctuation removed.

    Every character in ``string.punctuation`` is deleted; all other
    characters (including whitespace) are kept.

    Args:
        titles: Iterable of title strings.

    Yields:
        str: The normalized form of each title, in input order.
    """
    # The translation table never changes, so build it once rather than
    # once per title.
    strip_punctuation = str.maketrans('', '', string.punctuation)
    for title in titles:
        yield title.lower().translate(strip_punctuation)

In [192]:
@pipeline.task(depends_on=clean_titles)
def build_keyword_dictionary(titles):
    """Count occurrences of every non-stop word across the given titles.

    Titles are split on whitespace; words found in ``stop_words`` are
    ignored.

    Args:
        titles: Iterable of title strings.

    Returns:
        dict: Maps each remaining word to the number of times it occurred.
    """
    frequencies = {}
    for headline in titles:
        for token in headline.split():
            if token in stop_words:
                continue
            frequencies[token] = frequencies.get(token, 0) + 1
    return frequencies

In [193]:
@pipeline.task(depends_on=build_keyword_dictionary)
def get_top_hundred_words(word_dict):
    """Return the 100 most frequent words as ``(word, count)`` tuples.

    Args:
        word_dict: Mapping of word to occurrence count.

    Returns:
        list: At most 100 ``(word, count)`` pairs ordered by descending
        count; words with equal counts keep the mapping's iteration order.
    """
    def by_count(pair):
        return pair[1]

    ranked = list(word_dict.items())
    # list.sort is stable, also with reverse=True, so ties stay in their
    # original relative order.
    ranked.sort(key=by_count, reverse=True)
    return ranked[:100]

In [194]:
# Run the pipeline and keep what run() returns; the next cell indexes this
# value by task function to read a task's output.
ran = pipeline.run()

In [195]:
# Look up the entry stored for the final task; as the last expression in the
# cell, the notebook echoes it below.
ran[get_top_hundred_words]

[('new', 185),
 ('google', 167),
 ('bitcoin', 101),
 ('open', 92),
 ('programming', 90),
 ('web', 88),
 ('data', 85),
 ('video', 79),
 ('python', 76),
 ('code', 72),
 ('facebook', 71),
 ('released', 71),
 ('using', 70),
 ('2013', 65),
 ('javascript', 65),
 ('free', 64),
 ('source', 64),
 ('game', 63),
 ('internet', 62),
 ('microsoft', 59),
 ('c', 59),
 ('linux', 58),
 ('app', 57),
 ('pdf', 55),
 ('work', 54),
 ('language', 54),
 ('software', 52),
 ('2014', 52),
 ('startup', 51),
 ('apple', 50),
 ('use', 50),
 ('make', 50),
 ('time', 48),
 ('yc', 48),
 ('security', 48),
 ('nsa', 45),
 ('github', 45),
 ('windows', 44),
 ('1', 41),
 ('world', 41),
 ('way', 41),
 ('like', 41),
 ('project', 40),
 ('computer', 40),
 ('heartbleed', 40),
 ('git', 37),
 ('users', 37),
 ('dont', 37),
 ('design', 37),
 ('ios', 37),
 ('developer', 36),
 ('os', 36),
 ('twitter', 36),
 ('ceo', 36),
 ('vs', 36),
 ('life', 36),
 ('big', 35),
 ('day', 35),
 ('android', 34),
 ('online', 34),
 ('years', 33),
 ('simple', 