In [None]:
import pandas as pd 
import datetime
import csv
import os 
import requests 
import datetime as dt
import time

# Exercise

For your exercise do the following:

1. Choose a Reddit page (subreddit) you want to crawl
2. The following fields should be present when you crawl **(10 points)**:
    - author
    - subreddit
    - date created 
    - number of comments 
    - score
    - submission title 
    - submission description
3. After crawling, save your results to a pandas dataframe **(3 points)**. 
4. Answer the following questions **(12 points)**:
    - How many submissions were you able to gather? 
    - Who has the most submissions? 
    - Which submission has the highest score? 
    - Which submission has the highest number of comments?
    - Which day of the week has the most submissions? 
    
**Tip:** _For item 4, recall how to use the aggregation functions in `pandas`, such as `count`, `value_counts`, and `sum`. To get the day of the week, look into how to get the `dayofweek` from a datetime object in `pandas`. (Hint: You may need to use `pd.to_datetime` to convert your date column first.)_

In [None]:
def to_utc(date):
    """Return the Unix timestamp (whole seconds) of ``date`` read as UTC.

    Whatever ``tzinfo`` the value carries is overwritten with UTC without
    shifting the wall-clock fields, and fractional seconds are truncated.
    """
    stamped = date.replace(tzinfo=dt.timezone.utc)
    epoch_seconds = stamped.timestamp()
    return int(epoch_seconds)
    
def to_readable_date(timestamp):
    """Format a Unix timestamp as a ``YYYY-MM-DD`` date string in UTC.

    Parameters
    ----------
    timestamp : int or float
        Seconds since the Unix epoch.

    Returns
    -------
    str
        The UTC calendar date of ``timestamp``.

    The conversion is pinned to UTC so that it agrees with ``to_utc``, which
    interprets dates as UTC. Without an explicit ``tz`` the result would
    depend on the machine's local time zone and could be off by one day.
    """
    moment = dt.datetime.fromtimestamp(timestamp, tz=dt.timezone.utc)
    return moment.strftime("%Y-%m-%d")

# Crawl window: every day from start_date through end_date, inclusive.
start_date = dt.datetime.strptime("2020-07-01", "%Y-%m-%d")
end_date = dt.datetime.strptime("2020-08-20", "%Y-%m-%d")

# Daily boundaries. The "+ 2" produces one boundary past end_date so that
# consecutive (after, before) pairs form one-day windows that include end_date.
date_range = (pd.date_range(
                start_date,
                periods=(end_date - start_date).days + 2)
              .tolist())

sort_type = "created_utc"
sort = "desc"
fields = ["author", "subreddit", "created_utc", "num_comments", "score", "title", "selftext"]
subreddit = "nba"
url = "https://api.pushshift.io/reddit/submission/search/"
request_timeout = 30  # seconds; without it a stalled connection would hang the cell
results = []  # one list of submission dicts per successfully fetched day

# NOTE(review): "size" is only a request and the server may cap it lower. The
# 5,100 rows recorded later in this notebook equal 51 daily windows x 100,
# which suggests each day was truncated to 100 submissions -- TODO confirm,
# and paginate on created_utc if complete coverage is required.
for s_date, e_date in zip(date_range, date_range[1:]):
    day_label = s_date.strftime("%Y-%m-%d")
    try:
        r = requests.get(
            url=url,
            params={
                "after": to_utc(s_date),
                "before": to_utc(e_date),
                "sort_type": sort_type,
                "sort": sort,
                "subreddit": subreddit,
                "fields": fields,
                "size": 500,
            },
            timeout=request_timeout,
        )
        if r.status_code == 200:
            # A payload without "data" contributes an empty day instead of
            # aborting the whole crawl with a KeyError.
            results.append(r.json().get("data", []))
        else:
            # Report skipped days so gaps in the dataset are visible.
            print(f"{day_label}: skipped, HTTP {r.status_code}")
    except (requests.RequestException, ValueError) as exc:
        # Network failure, timeout, or a body that is not valid JSON: keep
        # crawling the remaining days (best effort) but say what was lost.
        print(f"{day_label}: request failed ({exc})")

    # Throttle to roughly one request per second to stay polite to the API.
    time.sleep(1)

In [None]:
# Flatten the per-day batches into a single list of submission records,
# ignoring any batch that is None.
nba_list = [
    submission
    for batch in results
    if batch is not None
    for submission in batch
]

# One row per submission; preview the frame and persist it to disk.
table = pd.DataFrame(nba_list)
display(table.head())
table.to_csv("reddit_nba.csv")

In [None]:
# Dimensions of the crawl result as (rows, columns); each row is one submission.
# Recorded result: 5,100 submissions.
# NOTE(review): 5,100 is exactly 51 daily windows x 100, so the count may
# reflect a per-request cap rather than the true volume -- TODO confirm.
table.shape

In [None]:
# Number of submissions per author, most frequent first.
# Recorded result: the top entry is "deleted", followed by auscrisos.
# NOTE(review): "deleted" looks like a placeholder for removed accounts rather
# than a real user -- confirm before reporting it as the top author.
table["author"].value_counts()

In [None]:
# Full row of the submission with the highest score: idxmax() returns the index
# label of the maximum and .loc fetches that row.
# Recorded result: the title beginning "Stan Van Gundy ..." has the highest score.
table.loc[table["score"].idxmax()]

In [None]:
# Full row of the submission with the most comments, located the same way
# (idxmax() for the index label, .loc for the row).
# Recorded result: "Game Thread: Phila 76ers ..." has the most comments.
table.loc[table["num_comments"].idxmax()]

In [None]:
# Interpret created_utc as epoch seconds and store it back as datetimes, then
# derive the weekday number (Monday=0 ... Sunday=6) into a new "day" column.
created_at = pd.to_datetime(table["created_utc"], unit="s")
table["created_utc"] = created_at
table["day"] = created_at.dt.dayofweek
table.head()

In [None]:
# Number of submissions per weekday number stored in the "day" column.
# Recorded result: Thursday (3) and Wednesday (2) have the most submissions.
table["day"].value_counts()