In [1]:
import pandas as pd 
import datetime
import csv
import os 
import requests 
import datetime as dt
import time

# Exercise

For your exercise do the following:

1. Choose a reddit page you want to crawl
2. The following fields should be present when you crawl **(10 points)**:
    - author
    - subreddit
    - date created 
    - number of comments 
    - score
    - submission title 
    - submission description
3. After crawling, save your results to a pandas dataframe **(3 points)**. 
4. Answer the following questions **(12 points)**:
    - How many submissions were you able to gather? 
    - Who has the most submissions? 
    - Which submission has the highest score? 
    - Which submission has the highest number of comments?
    - Which day of the week has the most submissions? 
    
**Tip:** _For item#4, recall how to use the aggregation functions in `pandas` like count, value_counts, sum, etc. For getting the day of the week, look into how to get the `dayofweek` from a datetime object in `pandas`. (Hint: You may need to use `pd.to_datetime` to convert your date column...)_

In [2]:
def to_utc(date):
    #This function converts a date_time object to UTC. This is to automate the conversion 
    #of dates instead of going to https://www.unixtimeconverter.io/ 
    return int(date.replace(tzinfo=dt.timezone.utc).timestamp())
    
def to_readable_date(timestamp):
    #This function converts the UTC format to a Year-Month-Day format 
    return dt.datetime.fromtimestamp(timestamp).strftime("%Y-%m-%d")

#Declare start and end of reddit posts to extract 
start_date = dt.datetime.strptime("2020-08-01", "%Y-%m-%d")
end_date = dt.datetime.strptime("2020-08-22", "%Y-%m-%d")

#Create a range of dates to iterate 
#Note: Periods here represents the number of days it will create from the start date 
#We also do a +2 since it will only generate up to April 29. We inlcude May 1 
#since we want to get data from the last day which is April 30 to May 1 
date_range = (pd.date_range(
                start_date, 
                periods=(end_date - start_date).days)
              .tolist())

#prepare the parameters needed to call the API
sort_type="score"
sort="desc"
fields=["id", "author","title", "selftext","score","num_comments","created_utc", "subreddit"]
subreddit = 'ADMU'
url = "https://api.pushshift.io/reddit/submission/search/"
results = []
#loop through the dates 
for i, s_date in enumerate(date_range):
    #prevents us from getting an index out of range error
    if i != len(date_range)-1:
        #declare end date 
        e_date = date_range[i+1]
        #call the API
        r = requests.get(url = url, params={
            'after': to_utc(s_date),
            'before': to_utc(e_date),
            'sort_type': sort_type,
            'sort': sort,
            'subreddit': subreddit,
            'fields': fields,
            "size": 500
        })

        #add logs 
        print(f"Doing {s_date.strftime('%Y-%m-%d')} to {e_date.strftime('%Y-%m-%d')}")
        if r.status_code == 200:
            results.append(r.json()['data'])
            print("=====Done")
        else:
            print("=====Skipped")
        #so that we dont get blocked from abusing the API we call it after pausing for 1 second
        time.sleep(1)

Doing 2020-08-01 to 2020-08-02
=====Done
Doing 2020-08-02 to 2020-08-03
=====Done
Doing 2020-08-03 to 2020-08-04
=====Done
Doing 2020-08-04 to 2020-08-05
=====Done
Doing 2020-08-05 to 2020-08-06
=====Done
Doing 2020-08-06 to 2020-08-07
=====Done
Doing 2020-08-07 to 2020-08-08
=====Done
Doing 2020-08-08 to 2020-08-09
=====Done
Doing 2020-08-09 to 2020-08-10
=====Done
Doing 2020-08-10 to 2020-08-11
=====Done
Doing 2020-08-11 to 2020-08-12
=====Done
Doing 2020-08-12 to 2020-08-13
=====Done
Doing 2020-08-13 to 2020-08-14
=====Done
Doing 2020-08-14 to 2020-08-15
=====Done
Doing 2020-08-15 to 2020-08-16
=====Done
Doing 2020-08-16 to 2020-08-17
=====Done
Doing 2020-08-17 to 2020-08-18
=====Done
Doing 2020-08-18 to 2020-08-19
=====Done
Doing 2020-08-19 to 2020-08-20
=====Done
Doing 2020-08-20 to 2020-08-21
=====Done


In [3]:
results

[[{'author': 'ddazai',
   'created_utc': 1596267569,
   'id': 'i1oeot',
   'num_comments': 0,
   'score': 1,
   'selftext': 'Hello! I am an incoming freshman currently enrolled in the course of AB Psych. I’ve tried shifting to BS Chem (Honors course) before starting the school year, but my request was unfortunately rejected. Though, I am still planning to shift again to BS Chem after a year or so. I would just like to know if ADMU would still give me the chance to let me shift from a non-honors course to an honors course? And also, would I be held back by a year due to the differences in subjects and requirements?',
   'subreddit': 'ADMU',
   'title': 'Shifting Courses'},
  {'author': 'hldnfrd',
   'created_utc': 1596277467,
   'id': 'i1q257',
   'num_comments': 0,
   'score': 1,
   'selftext': 'sorry if this isn’t for this subreddit but i’m desperate )’:\n\ndebating between upd cssp and admu soss, anyone who went through the same thing? shoot a dm (?)',
   'subreddit': 'ADMU',
   'tit

In [4]:
flat_list = []
#loop through the reddit results
for sublist in results:
    #check if sublist is not empty. The reason we have empty lists is because there are days wherein there are no submissions
    if sublist is not None:
        #for each dictionary in the sublist add it to the flat list 
        for item in sublist:
            flat_list.append(item)

#pandas has a useful function called from_dict which will convert a list of dictionary objects into a dataframe
df = pd.DataFrame.from_dict(flat_list)
display(df.head())
df.to_csv("reddit_ADMU.csv")

Unnamed: 0,author,created_utc,id,num_comments,score,selftext,subreddit,title
0,ddazai,1596267569,i1oeot,0,1,Hello! I am an incoming freshman currently enr...,ADMU,Shifting Courses
1,hldnfrd,1596277467,i1q257,0,1,sorry if this isn’t for this subreddit but i’m...,ADMU,enlightenment regarding college choice
2,groundhogday123,1596297592,i1uid9,4,1,thoughts guys? have any other recons done this...,ADMU,DP blasting as a recon?
3,chelly__t,1596293774,i1thjv,6,1,To those who have applied for a graduate progr...,ADMU,AISIS Details for Graduate Programs
4,92gravities,1596264372,i1nvbt,15,1,i hear about orgs a lot and how it can contrib...,ADMU,Orgs?


In [5]:
### 4.1 ####
len (df)  # I gathered 247 submissions.

247

In [6]:
### 4.2 ###
df ['freq'] = df.groupby ('author') ['author'].transform('count')
df.sort_values ('freq', ascending = False)  # This allows me to see the highest value of 'freq' among the submissions.
df.loc[df['freq'] == 6]   # Author "mytoesmyknees" has the most number of submissions at 6.

Unnamed: 0,author,created_utc,id,num_comments,score,selftext,subreddit,title,freq
25,mytoesmyknees,1596434634,i2rsev,2,1,Are other Ateneo schools (Ateneo de Naga Unive...,ADMU,Other Ateneo Schools,6
61,honeyimmad,1596743167,i4yqrg,2,1,I'm taking BSM AMF and I have no idea about ho...,ADMU,BSM AMF Job Opportunities,6
71,mytoesmyknees,1596762149,i54aaf,4,1,How are they? Do most people pass? Most import...,ADMU,Chemistry and Math Diagnostics,6
77,mytoesmyknees,1596762073,i549ls,0,1,"Coverage, what type of questions, what you sug...",ADMU,How is the Chem and Math Diagnostic?,6
81,mytoesmyknees,1596764870,i54z0i,0,1,How are the chemistry and math diagnostics? Do...,ADMU,Chem and Math Diagnostics?,6
103,honeyimmad,1596995440,i6nmk7,5,1,soooo what are the different subjs offered und...,ADMU,Nat Sci,6
135,honeyimmad,1597262454,i8kzda,2,1,I've been searching for 30 mins and I still ha...,ADMU,Can someone give me the link of the 'Ateneo Pr...,6
161,mytoesmyknees,1597371617,i9dj70,9,1,What side hustles/part-time jobs do you recomm...,ADMU,Side Hustles in ADMU,6
198,mytoesmyknees,1597544416,iakh13,11,1,"What are some inside jokes, stories, or slang ...",ADMU,What are some Atenean inside jokes?,6
202,honeyimmad,1597662453,ibc3v3,3,1,im not really a fan of those things :&lt;,ADMU,what are the things that i'll miss if i dont a...,6


In [7]:
### 4.3 ###
print(df)

                   author  created_utc      id  num_comments  score  \
0                  ddazai   1596267569  i1oeot             0      1   
1                 hldnfrd   1596277467  i1q257             0      1   
2         groundhogday123   1596297592  i1uid9             4      1   
3               chelly__t   1596293774  i1thjv             6      1   
4             92gravities   1596264372  i1nvbt            15      1   
..                    ...          ...     ...           ...    ...   
242          anonymous319   1597952403  idh965             3      1   
243            ortho56789   1597900652  id4b0z             0      1   
244        stony_deakin02   1597941456  iddpxp             2      1   
245  throwaway_forposting   1597900401  id4929             1      1   
246           Tater__thot   1597924974  id8z55             4      1   

                                              selftext subreddit  \
0    Hello! I am an incoming freshman currently enr...      ADMU   
1    sorry 

In [8]:
### 4.4 ###
df.pop ('freq')
df.sort_values ('num_comments', ascending = False)   # The post with id:i5wdv0 by author: klutzyknee has the most number of comments with 40.

Unnamed: 0,author,created_utc,id,num_comments,score,selftext,subreddit,title
90,klutzyknee,1596878998,i5wdv0,40,1,,ADMU,Confessions that can get your Atenean card rev...
70,Boston-fordham-admu,1596690046,i4ljdj,29,1,[ https://www.usnews.com/best-colleges/ranking...,ADMU,Top 50 US college vs. Ateneo Manila
48,thr0w4w4y4ccz,1596641461,i47f67,27,1,"inc freshie here! do some, if not all, people ...",ADMU,Atenean Environment
231,honeyimmad,1597862010,ictfgi,25,1,i saw this from other univs and i wanna try it...,ADMU,Expectations vs. Reality (ADMU edition)
147,imhiroswife,1597295286,i8tvor,21,1,,ADMU,is this final?
...,...,...,...,...,...,...,...,...
152,rootofimaginary,1597315317,i8xmue,0,1,Title question.,ADMU,How is the Legal Management program in Ateneo?
155,emman0129,1597322134,i8z3y2,0,1,Sharing this FB post from Loyola Gaming: [http...,ADMU,ADMU LS on Minecraft
160,blehbleh223,1597291426,i8t1fe,0,1,Hello! When we go to campus and we have to sub...,ADMU,Confirmation Requirements
180,Lucy-Ford-,1597557652,iand3b,0,1,"Hi folks. Incoming transferee here, looking to...",ADMU,Any Transfer Students From Previous School Yea...


In [9]:
### 4.5 ###
