In [1]:
import pandas as pd 
import datetime
import csv
import os 
import requests 
import datetime as dt
import time

# Exercise

For your exercise do the following:

1. Choose a Reddit page (subreddit) you want to crawl
2. The following fields should be present when you crawl **(10 points)**:
    - author
    - subreddit
    - date created 
    - number of comments 
    - score
    - submission title 
    - submission description
3. After crawling, save your results to a pandas dataframe **(3 points)**. 
4. Answer the following questions **(12 points)**:
    - How many submissions were you able to gather? 
    - Who has the most submissions? 
    - Which submission has the highest score? 
    - Which submission has the highest number of comments?
    - Which day of the week has the most submissions? 
    
**Tip:** _For item #4, recall how to use the aggregation functions in `pandas` such as `count`, `value_counts`, and `sum`. To get the day of the week, look into how to get the `dayofweek` from a datetime object in `pandas`. (Hint: You may need to use `pd.to_datetime` to convert your date column first.)_

In [2]:
def to_utc(date):
    """Convert a datetime into an integer Unix timestamp (seconds since the epoch).

    This automates the conversion of dates instead of going to
    https://www.unixtimeconverter.io/

    Parameters
    ----------
    date : datetime.datetime or pandas.Timestamp
        A naive value is interpreted as already being expressed in UTC.
        A timezone-aware value keeps pointing at the same instant.

    Returns
    -------
    int
        The timestamp in whole seconds (the fractional part is dropped by ``int``).
    """
    if date.utcoffset() is None:
        # Naive datetime: label its wall-clock time as UTC so that .timestamp()
        # does not fall back to the machine's local timezone.
        date = date.replace(tzinfo=dt.timezone.utc)
    # Aware datetimes are left untouched: .timestamp() already accounts for their
    # offset, whereas overwriting tzinfo would shift them to a different instant.
    return int(date.timestamp())
    
def to_readable_date(timestamp):
    """Format a Unix timestamp as a Year-Month-Day string, evaluated in UTC.

    Parameters
    ----------
    timestamp : int or float
        Seconds since the Unix epoch.

    Returns
    -------
    str
        The calendar date in ``"%Y-%m-%d"`` format.
    """
    # Pin the timezone to UTC: without tz= the conversion uses the machine's local
    # timezone, so the same timestamp could land on a different calendar day
    # depending on where the notebook is run.
    return dt.datetime.fromtimestamp(timestamp, tz=dt.timezone.utc).strftime("%Y-%m-%d")

#Declare start and end of reddit posts to extract 
start_date = dt.datetime.strptime("2020-05-01", "%Y-%m-%d")
end_date = dt.datetime.strptime("2020-05-15", "%Y-%m-%d")

#Create the daily boundaries to iterate over.
#Every request below covers one window from a boundary to the next one, so
#covering N days needs N+1 boundaries.
#(end_date - start_date).days is the number of days between the two dates (14 here);
#+1 makes end_date itself a boundary and the second +1 adds the day after it, so
#the last window (May 15 to May 16) picks up the posts made on end_date.
date_range = (pd.date_range(
                start_date, 
                periods=(end_date - start_date).days + 2)
              .tolist())

#prepare the parameters needed to call the API
sort_type="score"
sort="desc"
#NOTE(review): the exercise lists "subreddit" as a required field but only
#"subreddit_id" is requested here - confirm whether "subreddit" should be added
fields=["author","subreddit_id","created_utc","num_comments","score","title", "selftext"]
subreddit = 'otomeisekai'
url = "https://api.pushshift.io/reddit/submission/search/"
#seconds to wait for the server before giving up on a request;
#requests has no default timeout and would otherwise wait indefinitely
request_timeout = 30
results = []
#loop through consecutive pairs of boundaries; zip stops at the shorter sequence,
#so the last date is only ever used as an end date and no index check is needed
for s_date, e_date in zip(date_range, date_range[1:]):
    #add logs 
    print(f"Doing {s_date.strftime('%Y-%m-%d')} to {e_date.strftime('%Y-%m-%d')}")
    try:
        #call the API
        r = requests.get(url = url, params={
            'after': to_utc(s_date),
            'before': to_utc(e_date),
            'sort_type': sort_type,
            'sort': sort,
            'subreddit': subreddit,
            'fields': fields,
            "size": 500
        }, timeout=request_timeout)
    except requests.RequestException as err:
        #a connection problem or timeout on one day should not abort the whole crawl;
        #treat it like any other skipped day but show the reason
        print(f"=====Skipped ({err})")
    else:
        if r.status_code == 200:
            #keep one list of submissions per day; the lists are flattened later
            results.append(r.json()['data'])
            print("=====Done")
        else:
            print("=====Skipped")
    #so that we dont get blocked from abusing the API we call it after pausing for 1 second
    time.sleep(1)

Doing 2020-05-01 to 2020-05-02
=====Done
Doing 2020-05-02 to 2020-05-03
=====Done
Doing 2020-05-03 to 2020-05-04
=====Done
Doing 2020-05-04 to 2020-05-05
=====Done
Doing 2020-05-05 to 2020-05-06
=====Done
Doing 2020-05-06 to 2020-05-07
=====Done
Doing 2020-05-07 to 2020-05-08
=====Done
Doing 2020-05-08 to 2020-05-09
=====Done
Doing 2020-05-09 to 2020-05-10
=====Done
Doing 2020-05-10 to 2020-05-11
=====Done
Doing 2020-05-11 to 2020-05-12
=====Done
Doing 2020-05-12 to 2020-05-13
=====Done
Doing 2020-05-13 to 2020-05-14
=====Done
Doing 2020-05-14 to 2020-05-15
=====Done
Doing 2020-05-15 to 2020-05-16
=====Done


In [3]:
#inspect the raw crawl output: a list holding one entry per successful request,
#each entry being the 'data' value taken from that response's JSON
results


[[{'author': 'KuramaReinara',
   'created_utc': 1588303976,
   'num_comments': 15,
   'score': 1,
   'selftext': '',
   'subreddit_id': 't5_1zqfgn',
   'title': 'NEW STORY Death is the only ending for the Villainess'},
  {'author': 'PeachBlossomBee',
   'created_utc': 1588357979,
   'num_comments': 14,
   'score': 1,
   'selftext': 'Or at least with a genuine grudging/resentful lead, like in To Be You, Even Just For A Day, a little less so in The Villainess Reverses the Hourglass.\n\nI like stories with characters trying to outwit each other, backstabbings and assassinations, etc but I only see them in stories with romance heavy plots. \n\nI was enjoying Empress of Another World, (since at first there were minimal feelings, a wily villain, and a morally grey character) but after a while there were just too many uncomfortably gratuitous descriptions that got in the way of the intrigue. \n\nI’ve accepted that there’s likely going to be romance regardless, but still, any stories where the

In [4]:
#flatten the per-request result lists into one list of submission dictionaries.
#`if sublist` skips entries that are missing (None) as well as empty lists,
#which is what we get for days wherein there are no submissions
flat_list = [item for sublist in results if sublist for item in sublist]

#a list of dictionary objects converts directly into a dataframe:
#one row per dictionary and one column per key
df = pd.DataFrame(flat_list)
display(df.head())
#index=False keeps the row index out of the file; otherwise it is written as an
#unnamed first column that shows up as "Unnamed: 0" when the csv is read back
df.to_csv("reddit_otomeisekai.csv", index=False)

Unnamed: 0,author,created_utc,num_comments,score,selftext,subreddit_id,title
0,KuramaReinara,1588303976,15,1,,t5_1zqfgn,NEW STORY Death is the only ending for the Vil...
1,PeachBlossomBee,1588357979,14,1,Or at least with a genuine grudging/resentful ...,t5_1zqfgn,Any novels/manga heavy on the politics and lig...
2,tahlyn,1588366693,27,1,The [Old](https://www.reddit.com/r/OtomeIsekai...,t5_1zqfgn,Otome Isekai Wiki and the List of Series
3,Herwaerts-over,1588445722,4,2,*Otome Game no Hametsu Flag shika Nai Akuyaku ...,t5_1zqfgn,[New Episode] - Otome Game no Hametsu Flag shi...
4,tahlyn,1588428895,1,1,,t5_1zqfgn,[New Chapter] - Beware of the Villainess! - Ch...


In [19]:
#Q1 - How many submissions were you able to gather?
#every row of the dataframe is one submission
submission_count = len(df)
print(submission_count) # number of submissions

#Q2 - Who has the most submissions?
#value_counts sorts authors by number of rows, largest first, so the first entry wins
top_author = df['author'].value_counts().head(1)
print(top_author) # most submissions

#Q3 - Which submission has the highest score?
#keep every row that reaches the maximum so ties are all shown
best_score = df['score'].max()
print(df.loc[df['score'] == best_score]) #highest score

#Q4 - Which submission has the highest number of comments?
most_comments = df['num_comments'].max()
print(df.loc[df['num_comments'] == most_comments]) #highest number of comments


114
tahlyn    65
Name: author, dtype: int64
    Unnamed: 0  author  created_utc  num_comments  score selftext  \
74          74  tahlyn   1589136461             4      9      NaN   

   subreddit_id                                              title  
74    t5_1zqfgn  [new chapter] - Who Made Me a Princess - Chapt...  
    Unnamed: 0               author  created_utc  num_comments  score  \
27          27  Heronic_Birdwatcher   1588572543            36      1   
48          48   ChampionOfKirkwall   1588885688            36      3   

                                             selftext subreddit_id  \
27  I need help. I've gone through pretty much all...    t5_1zqfgn   
48  Basically, I'm looking for any story where the...    t5_1zqfgn   

                                              title  
27                    Recommendations for new manga  
48  Non "european royalty" setting recommendations?  


In [29]:
#Which day of the week has the most submissions?
