In [28]:
import requests
import time
import csv
import pandas as pd 
import pytz
import datetime as dt
from pprint import pprint

In [2]:
# Pushshift endpoint for searching Reddit submissions; crawl_page reads this global.
url = "https://api.pushshift.io/reddit/search/submission"

In [5]:
def crawl_page(subreddit:str, last_page=None, size=500, timeout=30):
    """Fetch one page of submissions for a subreddit from the search API at `url`.

    Submissions are requested sorted by ``created_utc`` in descending order.
    Passing the page returned by the previous call continues the crawl from
    the ``created_utc`` of that page's last entry.

    Args:
        subreddit: Name of the subreddit to query.
        last_page: The list returned by the previous call, or None for the
            first request. An empty list means there is nothing left to crawl.
        size: Number of submissions to ask for per request.
        timeout: Seconds to wait for the server before giving up, so that a
            stalled connection cannot hang the crawl indefinitely.

    Returns:
        The list stored under the ``"data"`` key of the JSON response, or an
        empty list (without issuing a request) when ``last_page`` is empty.

    Raises:
        Exception: If the server answers with a non-success status code.
        requests.exceptions.Timeout: If no response arrives within ``timeout``.
    """
    # An empty previous page signals the end of the listing: skip the request.
    if last_page is not None and len(last_page) == 0:
        return []

    params = {"subreddit": subreddit,
              "size": size,
              "sort": "desc",
              "sort_type": "created_utc"}

    # Continue from the last submission of the previous page.
    if last_page is not None:
        params["before"] = last_page[-1]["created_utc"]

    results = requests.get(url, params=params, timeout=timeout)
    if not results.ok:
        raise Exception("Server returned status code {}".format(results.status_code))

    return results.json()["data"]


In [48]:
def crawl_subreddit(subreddit, max_submissions=500, pause=3):
    """Collect up to ``max_submissions`` submissions from a subreddit.

    Pages are fetched with ``crawl_page`` until it returns an empty page or
    enough submissions have been gathered.

    Args:
        subreddit: Name of the subreddit to crawl.
        max_submissions: Upper bound on the number of submissions returned.
        pause: Seconds to sleep between consecutive requests so the API is
            not overloaded.

    Returns:
        A list of at most ``max_submissions`` submissions, in the order the
        pages were received.
    """
    submissions = []
    last_page = None
    while len(submissions) < max_submissions:
        last_page = crawl_page(subreddit, last_page)
        if not last_page:
            # No more pages available.
            break
        submissions.extend(last_page)
        # Only pause when another request is actually going to follow.
        if len(submissions) < max_submissions:
            time.sleep(pause)

    # A page may overshoot the requested amount; trim to honour the cap.
    return submissions[:max_submissions]

In [49]:
# Crawl r/wallstreetbets using crawl_subreddit's default max_submissions.
latest_submissions = crawl_subreddit("wallstreetbets")

In [50]:
def get_date(created):
    """Convert a POSIX timestamp into a timezone-aware US/Eastern datetime.

    Args:
        created: POSIX timestamp, as accepted by ``datetime.fromtimestamp``.

    Returns:
        A ``datetime.datetime`` carrying the ``US/Eastern`` pytz timezone.
    """
    est = pytz.timezone('US/Eastern')
    return dt.datetime.fromtimestamp(created, tz=est)

In [51]:
# Load the raw submissions into a frame, then swap the created_utc column
# for its converted datetimes in a copy (df itself is left untouched).
df = pd.DataFrame(latest_submissions)
_timestamp = df["created_utc"].map(get_date)
new_df = df.assign(
    created_utc=_timestamp,
)

In [52]:
# Keep only the columns needed for the export, then preview the first rows.
columns_to_keep = ["id", "created_utc", "selftext", "title"]
final_df = new_df.loc[:, columns_to_keep]
final_df.head(20)

Unnamed: 0,id,created_utc,selftext,title
0,mrv2ws,2021-04-15 23:40:35-04:00,,Quit options after losing several thousand yea...
1,mrv2r5,2021-04-15 23:40:18-04:00,,PHARMACY SECTOR UPDATE: Pfizer CEO says third ...
2,mrv1qz,2021-04-15 23:38:28-04:00,,"In honor of GOT 10 year anniversary: Citadel, ..."
3,mrv1pe,2021-04-15 23:38:23-04:00,"Hey all, \n\nNeed help finding a great video. ...",Need help finding video
4,mrv1or,2021-04-15 23:38:21-04:00,,Pleas fly again.
5,mrv0df,2021-04-15 23:35:47-04:00,,DOG COIN
6,mruytr,2021-04-15 23:32:56-04:00,,"Bought 100,000 INKW today. I think they have a..."
7,mruxps,2021-04-15 23:30:55-04:00,,We need a new stock target SOS is one of the m...
8,mruxkr,2021-04-15 23:30:39-04:00,,Does EXPR have any potential? I mean if BlackB...
9,mruxiy,2021-04-15 23:30:33-04:00,,So i did a thing.


In [19]:
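# Disabled alternative: write every field of the raw submissions with csv.DictWriter.
# The notebook exports the selected columns with final_df.to_csv instead.
# keys = set().union(*(d.keys() for d in latest_submissions))
# with open('wsb_fresh_topics.csv', 'w', newline='')  as output_file:
#     dict_writer = csv.DictWriter(output_file, keys)
#     dict_writer.writeheader()
#     dict_writer.writerows(latest_submissions)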

In [55]:
# Write the selected columns to CSV; index=False leaves out the DataFrame index.
final_df.to_csv("wsb_fresh_topics.csv", index=False)