What I aim to demonstrate:
- date and time manipulation by datetime methods and classes
- isolating relevant columns/rows from large datasets
- basic calculations such as average number of posts per hour
- format information for easier presentation with str.format and sorted()

In [1]:
from csv import reader
# Read the whole Hacker News CSV export into memory as a list of rows
# (each row is a list of strings; the first row is the header).
# The `with` block closes the file handle as soon as the rows are loaded, and
# newline='' lets the csv module deal with line endings inside quoted fields itself.
with open('../input/hacker-news-posts/HN_posts_year_to_Sep_26_2016.csv', newline='') as open_file:
    read_file = reader(open_file)
    hn = list(read_file)

In [2]:
# Peek at the first five rows (header row included) to confirm the file parsed as expected.
print(hn[:5])

[['id', 'title', 'url', 'num_points', 'num_comments', 'author', 'created_at'], ['12579008', 'You have two days to comment if you want stem cells to be classified as your own', 'http://www.regulations.gov/document?D=FDA-2015-D-3719-0018', '1', '0', 'altstar', '9/26/2016 3:26'], ['12579005', 'SQLAR  the SQLite Archiver', 'https://www.sqlite.org/sqlar/doc/trunk/README.md', '1', '0', 'blacksqr', '9/26/2016 3:24'], ['12578997', 'What if we just printed a flatscreen television on the side of our boxes?', 'https://medium.com/vanmoof/our-secrets-out-f21c1f03fdc8#.ietxmez43', '1', '0', 'pavel_lishin', '9/26/2016 3:19'], ['12578989', 'algorithmic music', 'http://cacm.acm.org/magazines/2011/7/109891-algorithmic-composition/fulltext', '1', '0', 'poindontcare', '9/26/2016 3:16']]


In [3]:
# Separate the header row from the data rows.
# The guard makes this cell safe to re-run: once the header has been removed,
# hn[0] is a post (its first field is a numeric id, not the literal 'id'),
# so a second execution no longer throws away a real data row.
if hn and hn[0] and hn[0][0] == 'id':
    headers = hn[0]
    hn = hn[1:]


In [4]:
# Confirm the split: show the first five remaining rows, a blank gap,
# and then the header row that now lives on its own in `headers`.
# sep='\n' reproduces the line breaks three separate print calls would emit.
print(hn[:5], '\n', headers, sep='\n')

[['12579008', 'You have two days to comment if you want stem cells to be classified as your own', 'http://www.regulations.gov/document?D=FDA-2015-D-3719-0018', '1', '0', 'altstar', '9/26/2016 3:26'], ['12579005', 'SQLAR  the SQLite Archiver', 'https://www.sqlite.org/sqlar/doc/trunk/README.md', '1', '0', 'blacksqr', '9/26/2016 3:24'], ['12578997', 'What if we just printed a flatscreen television on the side of our boxes?', 'https://medium.com/vanmoof/our-secrets-out-f21c1f03fdc8#.ietxmez43', '1', '0', 'pavel_lishin', '9/26/2016 3:19'], ['12578989', 'algorithmic music', 'http://cacm.acm.org/magazines/2011/7/109891-algorithmic-composition/fulltext', '1', '0', 'poindontcare', '9/26/2016 3:16'], ['12578979', 'How the Data Vault Enables the Next-Gen Data Warehouse and Data Lake', 'https://www.talend.com/blog/2016/05/12/talend-and-Â\x93the-data-vaultÂ\x94', '1', '0', 'markgainor1', '9/26/2016 3:14']]


['id', 'title', 'url', 'num_points', 'num_comments', 'author', 'created_at']


In [5]:
# Buckets for the three kinds of post. Each name gets its own fresh list
# (three separate objects, so appending to one never affects the others).
ask_posts, show_posts, other_posts = [], [], []

## Filter Data - filtering posts beginning with either Ask HN or Show HN

In [6]:
# Sort every post into one of three buckets based on how its title starts.
# The buckets are rebuilt from scratch here so that re-running this cell
# does not append every post a second time and inflate the counts.
ask_posts = []
show_posts = []
other_posts = []

for row in hn:
    # Lower-case once so the prefix check is case-insensitive.
    title = row[1].lower()
    # Append the entire row (not just the title): later cells need the other columns.
    if title.startswith('ask hn'):
        ask_posts.append(row)
    elif title.startswith('show hn'):
        show_posts.append(row)
    else:
        other_posts.append(row)

# Spot-check the comment counts (column 4) of the first few Ask HN posts only;
# printing one line per post would flood the output with thousands of lines.
for i in ask_posts[:5]:
    print(i[4])

Checking the number of posts:

In [7]:
# Report how many posts landed in each bucket, with a blank gap between the lines.
ask_count = str(len(ask_posts))
show_count = str(len(show_posts))
other_count = str(len(other_posts))

print(" ".join(['Ask posts count:', ask_count]))
print('\n')
print(" ".join(['Show posts count:', show_count]))
print('\n')
print(" ".join(['Other posts count:', other_count]))


Ask posts count: 9139


Show posts count: 10158


Other posts count: 273822


## Finding the total number of comments within ask and show posts

In [8]:
# Add up the comment counts of every Ask HN post.
# Column 4 holds num_comments as text, so each value is converted to int first.
total_ask_comments = sum(int(row[4]) for row in ask_posts)

print(total_ask_comments)

94986


In [9]:
# Mean comments per Ask HN post = total comments / number of Ask HN posts.
ask_post_count = len(ask_posts)
avg_ask_comments = total_ask_comments / ask_post_count
print(avg_ask_comments)

10.393478498741656


In [10]:
# Add up the comment counts of every Show HN post.
# Column 4 holds num_comments as text, so each value is converted to int first.
total_show_comments = sum(int(row[4]) for row in show_posts)

print(total_show_comments)

49633


In [11]:
# Mean comments per Show HN post = total comments / number of Show HN posts.
show_post_count = len(show_posts)
avg_show_comments = total_show_comments / show_post_count
print(avg_show_comments)

4.886099625910612


Ask HN posts receive more comments on average than Show HN posts (about 10.4 versus 4.9 per post). Because of this, the rest of the analysis focuses on Ask HN posts.

Next, I aim to calculate:
- the number of ask posts created in each hour of the day, along with the number of comments received
- the average number of comments ask posts received by hour created

In [12]:
import datetime as dt

In [13]:
# Starts empty; a later cell fills it with one [created_at, num_comments] pair per ask post.
results_list = list()

In [14]:
# Collect [created_at, num_comments] for every Ask HN post.
# Each pair is kept together as its own inner list (a list of lists) so that
# index 0 is always the timestamp and index 1 the comment count of the same post.
# The list is rebuilt in one go rather than appended to, so re-running this
# cell cannot duplicate every entry.
results_list = [[row[6], int(row[4])] for row in ask_posts]
    

In [15]:
# Show the first five [created_at, num_comments] pairs as a quick sanity check.
print(results_list[:5])

[['9/26/2016 2:53', 7], ['9/26/2016 1:17', 3], ['9/25/2016 22:57', 0], ['9/25/2016 22:48', 3], ['9/25/2016 21:50', 2]]


In [16]:
# Two empty lookup tables, both to be keyed by hour of day:
#   counts_by_hour   -> number of ask posts created during that hour
#   comments_by_hour -> number of comments received by the ask posts created in that hour
counts_by_hour, comments_by_hour = {}, {}

In [17]:
# Build two frequency tables keyed by the zero-padded hour of day ('00'-'23'):
#   counts_by_hour   -> number of ask posts created in that hour
#   comments_by_hour -> total comments those posts received
# Both tables are reset first so re-running this cell does not double every tally.
counts_by_hour = {}
comments_by_hour = {}

for created_at, comment in results_list:
    # Parse the timestamp text into a datetime, then keep only the hour as 'HH'.
    hour_time = dt.datetime.strptime(created_at, "%m/%d/%Y %H:%M").strftime('%H')
    # .get(..., 0) covers both the first post seen for an hour and every later one.
    counts_by_hour[hour_time] = counts_by_hour.get(hour_time, 0) + 1
    comments_by_hour[hour_time] = comments_by_hour.get(hour_time, 0) + comment

In [18]:
# Total comments received by ask posts, keyed by the hour ('HH') in which they were created.
print(comments_by_hour)
   

{'02': 2996, '01': 2089, '22': 3372, '21': 4500, '19': 3954, '17': 5547, '15': 18525, '14': 4972, '13': 7245, '11': 2797, '10': 3013, '09': 1477, '07': 1585, '03': 2154, '23': 2297, '20': 4462, '16': 4466, '08': 2362, '00': 2277, '18': 4877, '12': 4234, '04': 2360, '06': 1587, '05': 1838}


In [19]:
# Number of ask posts created in each hour, keyed by the same 'HH' strings.
print(counts_by_hour)

{'02': 269, '01': 282, '22': 383, '21': 518, '19': 552, '17': 587, '15': 646, '14': 513, '13': 444, '11': 312, '10': 282, '09': 222, '07': 226, '03': 271, '23': 343, '20': 510, '16': 579, '08': 257, '00': 301, '18': 614, '12': 342, '04': 243, '06': 234, '05': 209}


### Calculating average number of comments per post, for posts created during each hour of the day

In [20]:
# For each hour, divide the total comments by the number of posts created in
# that hour to get the average number of comments per post.
# Each entry is an [hour, average] pair, in the dictionary's own key order.
avg_by_hour = [
    [hour, comments_by_hour[hour] / counts_by_hour[hour]]
    for hour in counts_by_hour
]

In [21]:
# Display the [hour, average comments per post] pairs before sorting.
print(avg_by_hour)

[['02', 11.137546468401487], ['01', 7.407801418439717], ['22', 8.804177545691905], ['21', 8.687258687258687], ['19', 7.163043478260869], ['17', 9.449744463373083], ['15', 28.676470588235293], ['14', 9.692007797270955], ['13', 16.31756756756757], ['11', 8.96474358974359], ['10', 10.684397163120567], ['09', 6.653153153153153], ['07', 7.013274336283186], ['03', 7.948339483394834], ['23', 6.696793002915452], ['20', 8.749019607843136], ['16', 7.713298791018998], ['08', 9.190661478599221], ['00', 7.5647840531561465], ['18', 7.94299674267101], ['12', 12.380116959064328], ['04', 9.7119341563786], ['06', 6.782051282051282], ['05', 8.794258373205741]]


#### Sorting the list of lists
- creating a list that equals avg_by_hour but with the columns swapped
- needed because sorting the original list of lists would order it by its first element (the hour) rather than by the average comments per post

In [22]:
# Flip each [hour, average] pair into [average, hour] so the average comes first.
swap_avg_by_hour = [[average, hour] for hour, average in avg_by_hour]

In [23]:
# Work on a copy so swap_avg_by_hour itself keeps its original order,
# then sort the copy in descending order (highest average first).
sorted_swap = list(swap_avg_by_hour)
sorted_swap.sort(reverse=True)
print(sorted_swap)

[[28.676470588235293, '15'], [16.31756756756757, '13'], [12.380116959064328, '12'], [11.137546468401487, '02'], [10.684397163120567, '10'], [9.7119341563786, '04'], [9.692007797270955, '14'], [9.449744463373083, '17'], [9.190661478599221, '08'], [8.96474358974359, '11'], [8.804177545691905, '22'], [8.794258373205741, '05'], [8.749019607843136, '20'], [8.687258687258687, '21'], [7.948339483394834, '03'], [7.94299674267101, '18'], [7.713298791018998, '16'], [7.5647840531561465, '00'], [7.407801418439717, '01'], [7.163043478260869, '19'], [7.013274336283186, '07'], [6.782051282051282, '06'], [6.696793002915452, '23'], [6.653153153153153, '09']]


In [24]:
# Present the five hours with the highest average comments per ask post.
print("Top 5 Hours for Ask Post Comments")

# The template never changes, so it is defined once, outside the loop.
template = "{0}: {1:.2f} average comments per post"

for comments, hour_text in sorted_swap[:5]:
    # Parse the 'HH' text as a datetime, then render it as 'HH:MM'.
    time = dt.datetime.strptime(hour_text, "%H").strftime("%H:%M")
    # str.format drops the time and the average (2 decimal places) into the template.
    print(template.format(time, comments))

Top 5 Hours for Ask Post Comments
15:00: 28.68 average comments per post
13:00: 16.32 average comments per post
12:00: 12.38 average comments per post
02:00: 11.14 average comments per post
10:00: 10.68 average comments per post
