# HackerNews Analytics


1. Initialization

In [1]:
from csv import reader

def load_data(file_name):
    """Read a UTF-8 encoded CSV file and return all of its rows.

    Args:
        file_name: Path of the CSV file to read.

    Returns:
        A list of rows, each row being a list of strings. The header row,
        if the file has one, is returned as the first element.

    Raises:
        OSError: If the file cannot be opened.
    """
    # The context manager closes the handle even if parsing fails; the
    # previous version left the file open for the lifetime of the kernel.
    # newline='' lets the csv module handle line endings itself, so quoted
    # fields that contain line breaks are read back unchanged.
    with open(file_name, encoding='utf8', newline='') as csv_file:
        return list(reader(csv_file))

def explore_data(dataset, start, end, rows_and_columns=False):
    """Print a slice of the dataset and, optionally, its dimensions.

    Args:
        dataset: List of rows (each row a list of values).
        start: Index of the first row to print (inclusive).
        end: Index one past the last row to print (exclusive).
        rows_and_columns: When True, also print the total number of rows
            and the number of columns, taken from the first row.
    """
    for row in dataset[start:end]:
        print(row)
        # print() appends its own newline, so this leaves two blank lines
        # between consecutive rows.
        print('\n')

    if rows_and_columns:
        print('Number of rows:', len(dataset))
        # An empty dataset has no first row to measure; report 0 columns
        # instead of raising IndexError.
        print('Number of columns:', len(dataset[0]) if dataset else 0)
        
# Load the full CSV, then peel the first row off as the column headers so
# that `hn` holds only the remaining rows.
hn = load_data("HN_posts_year_to_Sep_26_2016.csv")
headers, hn = hn[0], hn[1:]
# Uncomment to preview the first five rows and the dataset dimensions:
# explore_data(hn, 0, 5, True)

2. Filter data

In [2]:
# Split the posts into three buckets based on the lower-cased title prefix
# (the title is read from column index 1).
ask_posts, show_posts, other_posts = [], [], []

for row in hn:
    title = row[1].lower()
    if title.startswith("ask hn"):
        bucket = ask_posts
    elif title.startswith("show hn"):
        bucket = show_posts
    else:
        bucket = other_posts
    bucket.append(row)

for label, bucket in (
    ("Ask HN number:", ask_posts),
    ("Show HN number:", show_posts),
    ("Other number:", other_posts),
):
    print(label, len(bucket))



Ask HN number: 9139
Show HN number: 10158
Other number: 273822


3. Calculate average number of comments per post type

In [3]:
def calc_avg_comments(posts):
    """Return the mean number of comments across the given posts.

    Args:
        posts: List of post rows. The comment count is read from column
            index 4 of each row and converted with int().

    Returns:
        The average comment count as a float, or 0.0 when `posts` is empty
        (previously an empty list raised ZeroDivisionError).

    Raises:
        ValueError: If a comment count cannot be converted to an integer.
    """
    if not posts:
        return 0.0
    total_comments = sum(int(post[4]) for post in posts)
    return total_comments / len(posts)

# Average comments per post for the Ask HN and Show HN groups.
avg_ask_comments, avg_show_comments = (
    calc_avg_comments(group) for group in (ask_posts, show_posts)
)

print("Avg ask comments:", avg_ask_comments)
print("Avg show comments:", avg_show_comments)

Avg ask comments: 10.393478498741656
Avg show comments: 4.886099625910612


Result: Ask HN posts receive more comments on average (about 10.4 per post) than Show HN posts (about 4.9 per post).

4. Calculate number of posts and comments per hour

In [4]:
import datetime as dt
# Keep only what the hourly breakdown needs from each Ask HN post: the value
# at index 6 (parsed below as a "%m/%d/%Y %H:%M" timestamp) and the value at
# index 4 converted to an integer comment count.
result_list = [[post[6], int(post[4])] for post in ask_posts]

print(result_list[1:5])

# Tally, per zero-padded hour string ("00".."23"), how many posts were
# created in that hour and how many comments those posts received in total.
counts_by_hour = {}
comments_by_hour = {}
for created_at, n_comments in result_list:
    hour = dt.datetime.strptime(created_at, "%m/%d/%Y %H:%M").strftime("%H")
    counts_by_hour[hour] = counts_by_hour.get(hour, 0) + 1
    comments_by_hour[hour] = comments_by_hour.get(hour, 0) + n_comments

print(counts_by_hour)
print(comments_by_hour)

[['9/26/2016 1:17', 3], ['9/25/2016 22:57', 0], ['9/25/2016 22:48', 3], ['9/25/2016 21:50', 2]]
{'02': 269, '01': 282, '22': 383, '21': 518, '19': 552, '17': 587, '15': 646, '14': 513, '13': 444, '11': 312, '10': 282, '09': 222, '07': 226, '03': 271, '23': 343, '20': 510, '16': 579, '08': 257, '00': 301, '18': 614, '12': 342, '04': 243, '06': 234, '05': 209}
{'02': 2996, '01': 2089, '22': 3372, '21': 4500, '19': 3954, '17': 5547, '15': 18525, '14': 4972, '13': 7245, '11': 2797, '10': 3013, '09': 1477, '07': 1585, '03': 2154, '23': 2297, '20': 4462, '16': 4466, '08': 2362, '00': 2277, '18': 4877, '12': 4234, '04': 2360, '06': 1587, '05': 1838}


5. Calculate average comments per post per hour

In [5]:
# Build [hour, average comments per post] pairs, in the same order in which
# the hours were first recorded in counts_by_hour.
avg_by_hour = [
    [hour, comments_by_hour[hour] / counts_by_hour[hour]]
    for hour in counts_by_hour
]

print("Average comments per post per hour:", avg_by_hour)

Average comments per post per hour: [['02', 11.137546468401487], ['01', 7.407801418439717], ['22', 8.804177545691905], ['21', 8.687258687258687], ['19', 7.163043478260869], ['17', 9.449744463373083], ['15', 28.676470588235293], ['14', 9.692007797270955], ['13', 16.31756756756757], ['11', 8.96474358974359], ['10', 10.684397163120567], ['09', 6.653153153153153], ['07', 7.013274336283186], ['03', 7.948339483394834], ['23', 6.696793002915452], ['20', 8.749019607843136], ['16', 7.713298791018998], ['08', 9.190661478599221], ['00', 7.5647840531561465], ['18', 7.94299674267101], ['12', 12.380116959064328], ['04', 9.7119341563786], ['06', 6.782051282051282], ['05', 8.794258373205741]]


6. Rank and format the top hours by average comments per post

In [6]:
# Put the average first in each pair so that a plain descending sort ranks
# the hours by average comments (ties fall back to the hour string).
swap_avg = [[average, hour] for hour, average in avg_by_hour]

sorted_swap = sorted(swap_avg, reverse = True)
print("Top 5 Hours:")
for average, hour in sorted_swap[:5]:
    # Render the two-digit hour as HH:MM for display.
    hour_label = dt.datetime.strptime(hour, "%H").strftime("%H:%M")
    print(hour_label, ":", "{:.2f} average comments per post".format(average))

Top 5 Hours:
15:00 : 28.68 average comments per post
13:00 : 16.32 average comments per post
12:00 : 12.38 average comments per post
02:00 : 11.14 average comments per post
10:00 : 10.68 average comments per post
