In [1]:
import pandas as pd 
import datetime
import csv
import os 
import requests 
import datetime as dt
import time

# Exercise

For your exercise do the following:

1. Choose a reddit page you want to crawl
2. The following fields should be present when you crawl **(10 points)**:
    - author
    - subreddit
    - date created 
    - number of comments 
    - score
    - submission title 
    - submission description
3. After crawling, save your results to a pandas dataframe **(3 points)**. 
4. Answer the following questions **(12 points)**:
    - How many submissions were you able to gather? 
    - Who has the most submissions? 
    - Which submission has the highest score? 
    - Which submission has the highest number of comments?
    - Which day of the week has the most submissions? 
    
**Tip:** _For item#4, recall how to use the aggregation functions in `pandas` like count, value_counts, sum, etc. For getting the day of the week, look into how to get the `dayofweek` from a datetime object in `pandas`. (Hint: You may need to use `pd.to_datetime` to convert your date column...)_

In [6]:
def to_utc(date):
    #This function converts an object to UTC. This is to automate the conversion 
    #of dates instead of going to https://www.unixtimeconverter.io/ 
    return int(date.replace(tzinfo=dt.timezone.utc).timestamp())
    
def to_readable_date(timestamp):
    #This function converts the UTC format to a Year-Month-Day format 
    return dt.datetime.fromtimestamp(timestamp).strftime("%Y-%m-%d")

#Declare start and end of reddit posts to extract 
start_date = dt.datetime.strptime("2020-04-01", "%Y-%m-%d")
end_date = dt.datetime.strptime("2020-04-05", "%Y-%m-%d")

#Create a range of dates to iterate 
#Note: Periods here represents the number of days it will create from the start date 
#We also do a +2 since it will only generate up to April 29. We inlcude May 1 
#since we want to get data from the last day which is April 30 to May 1 
date_range = (pd.date_range(
                start_date, 
                periods=(end_date - start_date).days + 2)
              .tolist())

#prepare the parameters needed to call the API
sort_type="score"
sort="desc"
fields=["author","title","selftext","score","num_comments","subreddit","created_utc"]
subreddit = 'singapore'
url = "https://api.pushshift.io/reddit/submission/search/"
results = []
#loop through the dates 
for i, s_date in enumerate(date_range):
    #prevents us from getting an index out of range error
    if i != len(date_range)-1:
        #declare end date 
        e_date = date_range[i+1]
        #call the API
        r = requests.get(url = url, params={
            'after': to_utc(s_date),
            'before': to_utc(e_date),
            'sort_type': sort_type,
            'sort': sort,
            'subreddit': subreddit,
            'fields': fields,
            "size": 500
        })

        #add logs 
        print(f"Doing {s_date.strftime('%Y-%m-%d')} to {e_date.strftime('%Y-%m-%d')}")
        if r.status_code == 200:
            results.append(r.json()['data'])
            print("=====Done")
        else:
            print("=====Skipped")
        #so that we dont get blocked from abusing the API we call it after pausing for 1 second
        time.sleep(1)

Doing 2020-04-01 to 2020-04-02
=====Done
Doing 2020-04-02 to 2020-04-03
=====Done
Doing 2020-04-03 to 2020-04-04
=====Done
Doing 2020-04-04 to 2020-04-05
=====Done
Doing 2020-04-05 to 2020-04-06
=====Done


In [13]:
results

[[{'author': 'princetower',
   'created_utc': 1585701774,
   'num_comments': 165,
   'score': 9,
   'selftext': "This sub has been filled with news of the virus daily and rightly so. We've also given quite a lot of attention to the individuals suffering from Covid-19. I don't want to come across as minimising the severity of the situation esp with regards to human life, but there's an aspect that I want to bring up: getting by.\n\nI got laid off 2 weeks ago. Many firms are not hiring at the moment and I'm concerned about my survival. Any others of you out there facing the same situation? Do share your stories.\n\nEdit: I'm on an EP.",
   'subreddit': 'singapore',
   'title': "Those laid off or still job hunting - what's your story?"},
  {'author': 'LuciferBiscuit',
   'created_utc': 1585715003,
   'num_comments': 55,
   'score': 3,
   'selftext': "True, at the expend of a vast amount of economy and disruptions, but surely it's a cost we're willing to bear with the help of the govt rath

In [7]:
flat_list = []
#loop through the reddit results
for sublist in results:
    #check if sublist is not empty. The reason we have empty lists is because there are days wherein there are no submissions
    if sublist is not None:
        #for each dictionary in the sublist add it to the flat list 
        for item in sublist:
            flat_list.append(item)

#pandas has a useful function called from_dict which will convert a list of dictionary objects into a dataframe
df = pd.DataFrame.from_dict(flat_list)
display(df.head())
df.to_csv("reddit_singapore.csv")
df

Unnamed: 0,author,created_utc,num_comments,score,selftext,subreddit,title
0,princetower,1585701774,165,9,This sub has been filled with news of the viru...,singapore,Those laid off or still job hunting - what's y...
1,LuciferBiscuit,1585715003,55,3,"True, at the expend of a vast amount of econom...",singapore,If the next two weeks truly will be the crucia...
2,Zukiff,1585700368,4,3,,singapore,Electricity tariff in S’pore to fall 5.1% from...
3,Aspirant2,1585701563,17,3,,singapore,First N95 medical mask imports finally reachin...
4,Winterhymns,1585710046,10,2,Title. With regards to yesterday’s report on s...,singapore,How to report company to MOM


Unnamed: 0,author,created_utc,num_comments,score,selftext,subreddit,title
0,princetower,1585701774,165,9,This sub has been filled with news of the viru...,singapore,Those laid off or still job hunting - what's y...
1,LuciferBiscuit,1585715003,55,3,"True, at the expend of a vast amount of econom...",singapore,If the next two weeks truly will be the crucia...
2,Zukiff,1585700368,4,3,,singapore,Electricity tariff in S’pore to fall 5.1% from...
3,Aspirant2,1585701563,17,3,,singapore,First N95 medical mask imports finally reachin...
4,Winterhymns,1585710046,10,2,Title. With regards to yesterday’s report on s...,singapore,How to report company to MOM
...,...,...,...,...,...,...,...
495,krnrd,1586085227,0,1,,singapore,What the horse!
496,keksikus,1586084092,0,1,[removed],singapore,singaporeans need to be way more empathetic.
497,rdjedd,1586080937,1,1,Inspired by a work group chat(s) that popped u...,singapore,Online Chat/Social Support Groups during these...
498,Bcpjw,1586106949,4,1,,singapore,Singaporeans are out and about before new coro...


In [14]:
#Total number of submissions gathered?

total_number = df.index
total_submission= len(total_number)
print (total_submission)


500


In [23]:
#Who has the most submissions?

author = df.author.mode()

df['author'].value_counts()
print (author[0])


Jammy_buttons2


In [18]:
#Which submission has the highest score? 

highscore=df['score'].argmax()
print(df.loc[highscore])


author                        AnonDooDoo
created_utc                   1585927534
num_comments                           3
score                                 35
selftext                                
subreddit                      singapore
title           Different language juice
Dates                         04-03-2020
Posting_Day                       Friday
Name: 200, dtype: object


In [19]:
#Which submission has the highest number of comments?

highcomments=df['num_comments'].argmax()

#highcommentsindex

print(df.loc[highcomments])

author                                              AutoModerator
created_utc                                            1585865094
num_comments                                                 1843
score                                                           1
selftext        Talk about your day. Anything goes, but subred...
subreddit                                               singapore
title           /r/singapore random discussion and small quest...
Dates                                                  04-03-2020
Posting_Day                                                Friday
Name: 133, dtype: object


In [24]:
#Which day of the week has the most submissions

from datetime import datetime
from datetime import date 
import calendar 
from pprint import pprint


def to_readable_date(timestamp):
    #This function converts the UTC format to a Year-Month-Day format 
    return dt.datetime.fromtimestamp(timestamp).strftime("%Y-%m-%d")

#extracting utc date of creation into a separate list
utcs = df['created_utc'].to_list()

#converting the utcs list into integers
utcsint = [int(i) for i in utcs] 

#utcsint

#runthrough conversion of UTC into date stamps
converted = []

for i in utcsint:
    converted.append(to_readable_date(i))

#print (converted)

#switch of date formats into strings

dates = (datetime.strptime(ts, '%Y-%m-%d') for ts in converted)
date_strings = [datetime.strftime(d, '%m-%d-%Y') for d in dates]

#pprint(date_strings)

df["Dates"] = date_strings

#df

def findDay(date): 
    dayname = datetime.strptime(date, '%Y-%m-%d').weekday() 
    return (calendar.day_name[dayname]) 

daylist = []

for i in converted:
    daylist.append(findDay(i))

#daylist

df['Posting_Day'] = daylist

#df

Day_mode = df.Posting_Day.mode()


print (Day_mode[0])




Friday
