### Hacker News Data Analysis

We will analyze the data from Hacker News to understand if posts for `Ask HN` or `Show HN` receive more comments than average.

In [1]:
from csv import reader

# Read the whole CSV into memory. The `with` block guarantees the file handle
# is closed once the rows have been materialised, and newline="" lets the csv
# module handle line endings inside quoted fields itself.
with open("hacker_news.csv", newline="") as hacker_news_file:
    read_file = reader(hacker_news_file)
    hn = list(read_file)

# The first row holds the column names; everything after it is data.
headers = hn[0]
hn = hn[1:]

print(headers)
print(hn[:5])

['id', 'title', 'url', 'num_points', 'num_comments', 'author', 'created_at']
[['12224879', 'Interactive Dynamic Video', 'http://www.interactivedynamicvideo.com/', '386', '52', 'ne0phyte', '8/4/2016 11:52'], ['10975351', 'How to Use Open Source and Shut the Fuck Up at the Same Time', 'http://hueniverse.com/2016/01/26/how-to-use-open-source-and-shut-the-fuck-up-at-the-same-time/', '39', '10', 'josep2', '1/26/2016 19:30'], ['11964716', "Florida DJs May Face Felony for April Fools' Water Joke", 'http://www.thewire.com/entertainment/2013/04/florida-djs-april-fools-water-joke/63798/', '2', '1', 'vezycash', '6/23/2016 22:20'], ['11919867', 'Technology ventures: From Idea to Enterprise', 'https://www.amazon.com/Technology-Ventures-Enterprise-Thomas-Byers/dp/0073523429', '3', '1', 'hswarna', '6/17/2016 0:01'], ['10301696', 'Note by Note: The Making of Steinway L1037 (2007)', 'http://www.nytimes.com/2007/11/07/movies/07stein.html?_r=0', '8', '2', 'walterbell', '9/30/2015 4:12']]


In [2]:
# Zero-based position of each field within a record, listed in the same
# order as the CSV header row.
(
    ID_COL,
    TITLE_COL,
    URL_COL,
    NUM_POINTS_COL,
    NUM_COMMENTS_COL,
    AUTHOR_COL,
    CREATED_AT_COL,
) = range(7)

In [3]:
# Split the posts into three buckets based on how the title starts
# (the comparison ignores case).
ask_posts, show_posts, other_posts = [], [], []

# Position of the title field within each record.
TITLE_COL = 1

for post in hn:
    lowered_title = post[TITLE_COL].lower()
    if lowered_title.startswith('ask hn'):
        bucket = ask_posts
    elif lowered_title.startswith('show hn'):
        bucket = show_posts
    else:
        bucket = other_posts
    bucket.append(post)

# Report how many posts landed in each bucket.
for label, bucket in (
    ('Ask Posts: ', ask_posts),
    ('Show Posts: ', show_posts),
    ('Other Posts: ', other_posts),
):
    print(label, len(bucket))

Ask Posts:  1744
Show Posts:  1162
Other Posts:  17194


In [4]:
def calculate_comments(dataset, comment_col):
    """Return the total and the per-record average of a numeric column.

    Args:
        dataset: Sized iterable of records (e.g. a list of CSV rows).
        comment_col: Index of the field holding the comment count; the
            field must be convertible with ``int()``.

    Returns:
        A ``(total_comments, avg_comments)`` tuple. For an empty dataset
        the result is ``(0, 0.0)`` instead of a ``ZeroDivisionError``.

    Raises:
        ValueError: If a field cannot be converted to ``int``.
    """
    record_count = len(dataset)
    if record_count == 0:
        # Nothing to average over; avoid dividing by zero.
        return 0, 0.0

    total_comments = sum(int(record[comment_col]) for record in dataset)
    avg_comments = total_comments / record_count

    return total_comments, avg_comments

# Total and average comment counts for each category of post.
(total_ask_comments, avg_ask_comments) = calculate_comments(
    ask_posts, NUM_COMMENTS_COL
)
(total_show_comments, avg_show_comments) = calculate_comments(
    show_posts, NUM_COMMENTS_COL
)
(total_other_comments, avg_other_comments) = calculate_comments(
    other_posts, NUM_COMMENTS_COL
)

# One line per category: label, total comments, average comments per post.
for label, total, average in (
    ('Ask Posts: ', total_ask_comments, avg_ask_comments),
    ('Show Posts: ', total_show_comments, avg_show_comments),
    ('Other Posts: ', total_other_comments, avg_other_comments),
):
    print(label, total, average)


Ask Posts:  24483 14.038417431192661
Show Posts:  11988 10.31669535283993
Other Posts:  462055 26.8730371059672


We observe that the posts that are of type `Ask HN` receive, on average, approximately 14 comments for each post. Posts of type `Show HN` receive on average only 10 comments per post.

Compared to the average for other posts, which is approximately 27 comments per post, these numbers indicate that posts of type `Ask HN` or `Show HN` receive considerably fewer comments on average than other posts.


#### Calculating the Counts & Comments by Hour

In [11]:
import datetime as dt

def build_freq_table(dataset, keyextractor, valueextractor):
    """Accumulate a per-key running total over ``dataset``.

    Args:
        dataset: Iterable of records.
        keyextractor: Callable mapping a record to its (hashable) key.
        valueextractor: Callable mapping a record to the amount to add
            to that key's total.

    Returns:
        A dict mapping each key to the sum of the extracted values,
        starting from 0.
    """
    totals = {}
    for item in dataset:
        bucket = keyextractor(item)
        # A key seen for the first time starts from zero.
        totals[bucket] = totals.get(bucket, 0) + valueextractor(item)
    return totals


def extract_date_part(input_date, date_format, date_part):
    """Parse a date string and re-render it with another format.

    Args:
        input_date: The date/time text to parse.
        date_format: ``strptime`` pattern describing ``input_date``.
        date_part: ``strftime`` pattern for the piece to return.

    Returns:
        The parsed date formatted with ``date_part``, as a string.
    """
    parsed = dt.datetime.strptime(input_date, date_format)
    return parsed.strftime(date_part)


# Layout of the created_at field, e.g. '8/4/2016 11:52'.
DATE_FORMAT = '%m/%d/%Y %H:%M'


def hour_created(record):
    """Return the hour portion ('%H') of a record's creation timestamp."""
    return extract_date_part(record[CREATED_AT_COL], DATE_FORMAT, '%H')


# Number of Ask HN posts created in each hour.
counts_by_hour = build_freq_table(ask_posts, hour_created, lambda _record: 1)

# Total comments received by Ask HN posts created in each hour.
comments_by_hour = build_freq_table(
    ask_posts,
    hour_created,
    lambda record: int(record[NUM_COMMENTS_COL]),
)

print('Counts by Hour: ', counts_by_hour)
print('Comments by Hour: ', comments_by_hour)


Counts by Hour:  {'04': 47, '16': 108, '17': 100, '00': 55, '20': 80, '13': 85, '19': 110, '05': 46, '12': 73, '07': 34, '18': 109, '10': 59, '22': 71, '02': 58, '09': 45, '21': 109, '08': 48, '23': 68, '01': 60, '11': 58, '15': 116, '03': 54, '06': 44, '14': 107}
Comments by Hour:  {'04': 337, '16': 1814, '17': 1146, '00': 447, '20': 1722, '13': 1253, '19': 1188, '05': 464, '12': 687, '07': 267, '18': 1439, '10': 793, '22': 479, '02': 1381, '09': 251, '21': 1745, '08': 492, '23': 543, '01': 683, '11': 641, '15': 4477, '03': 421, '06': 397, '14': 1416}


#### Calculating the Average Comments by Hour

In [13]:
def calculate_average_comments(count_set, comment_set):
    """Divide each key's comment total by its post count.

    Args:
        count_set: Mapping of key -> number of posts.
        comment_set: Mapping of key -> total comments; must contain
            every key present in ``count_set``.

    Returns:
        A list of ``[key, average]`` pairs, in ``count_set`` iteration order.
    """
    return [
        [bucket, comment_set[bucket] / count_set[bucket]]
        for bucket in count_set
    ]

# Combine the two hourly tables into [hour, average comments per post] pairs.
avg_by_hour = calculate_average_comments(
    count_set=counts_by_hour,
    comment_set=comments_by_hour,
)

print(avg_by_hour)

[['04', 7.170212765957447], ['16', 16.796296296296298], ['17', 11.46], ['00', 8.127272727272727], ['20', 21.525], ['13', 14.741176470588234], ['19', 10.8], ['05', 10.08695652173913], ['12', 9.41095890410959], ['07', 7.852941176470588], ['18', 13.20183486238532], ['10', 13.440677966101696], ['22', 6.746478873239437], ['02', 23.810344827586206], ['09', 5.5777777777777775], ['21', 16.009174311926607], ['08', 10.25], ['23', 7.985294117647059], ['01', 11.383333333333333], ['11', 11.051724137931034], ['15', 38.5948275862069], ['03', 7.796296296296297], ['06', 9.022727272727273], ['14', 13.233644859813085]]


In [22]:
def print_nice(dataset):
    """Print ``[hour, average]`` pairs, largest average first.

    Args:
        dataset: Iterable of sequences whose first item is the hour label
            and whose second item is the numeric average.
    """
    line_template = '{:<2}00 hours: {:>5.2f} comments'
    ranked = sorted(dataset, key=lambda pair: pair[1], reverse=True)
    for pair in ranked:
        print(line_template.format(pair[0], pair[1]))
        
# Show the hourly averages for Ask HN posts, highest average first.
print_nice(avg_by_hour)

1500 hours: 38.59 comments
0200 hours: 23.81 comments
2000 hours: 21.52 comments
1600 hours: 16.80 comments
2100 hours: 16.01 comments
1300 hours: 14.74 comments
1000 hours: 13.44 comments
1400 hours: 13.23 comments
1800 hours: 13.20 comments
1700 hours: 11.46 comments
0100 hours: 11.38 comments
1100 hours: 11.05 comments
1900 hours: 10.80 comments
0800 hours: 10.25 comments
0500 hours: 10.09 comments
1200 hours:  9.41 comments
0600 hours:  9.02 comments
0000 hours:  8.13 comments
2300 hours:  7.99 comments
0700 hours:  7.85 comments
0300 hours:  7.80 comments
0400 hours:  7.17 comments
2200 hours:  6.75 comments
0900 hours:  5.58 comments


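Based on the above analysis, `Ask HN` posts created during the 15:00 hour as recorded in the dataset (roughly 2 PM Central Time, assuming the `created_at` timestamps are in US Eastern Time) receive the most comments on average, at about 38.6 comments per post.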