# Reddit Data Collection

This notebook automatically downloads Reddit data (i.e., posts) via the Pushshift API and stores it as CSV files so that it can be used as input for further processing.

### Configuration

The code can be configured with the following parameters:

* `subreddit` ... The subreddit the posts are downloaded from.
* `start_date`, `end_date` ... The first and last day of the time period in question. Format: `YYYY-MM-DD`
* `output_path` ... The directory the downloaded data is stored in.

In [None]:
# Subreddit whose posts are downloaded.
subreddit = "wallstreetbets"

# First and last day of the time period to download (format: YYYY-MM-DD).
start_date, end_date = "2020-12-01", "2021-05-01"

# Directory the downloaded data is stored in.
output_path = "data_wsb"

### Fetch Reddit Data 

In [None]:
from datetime import datetime 
import pandas as pd
import requests
import time
import os
import random

def fetch_wsb_posts(start_timestamp, end_timestamp, max_retries=None, retry_delay=5, timeout=30):
    """Fetch one page of submissions from the Pushshift API, retrying on failure.

    The request asks for submissions of the module-level `subreddit`, sorted
    ascending by `created_utc`, with `after=start_timestamp`,
    `before=end_timestamp` and `limit=1000`.

    Parameters
    ----------
    start_timestamp : int
        Unix timestamp sent as the `after` filter.
    end_timestamp : int
        Unix timestamp sent as the `before` filter.
    max_retries : int or None, optional
        Number of retries after a failed attempt. `None` (the default)
        retries forever.
    retry_delay : int or float, optional
        Seconds to sleep between attempts (default: 5).
    timeout : int or float, optional
        Timeout in seconds passed to `requests.get`, so that a stalled
        connection fails (and is retried) instead of blocking indefinitely.

    Returns
    -------
    pandas.DataFrame
        `pd.json_normalize` of the response's `data` list; it has zero rows
        when that list is empty.

    Raises
    ------
    RuntimeError
        Only when `max_retries` is set and every attempt failed.
    """
    failed_attempts = 0
    while True:
        try:
            print("fetch_wsb_posts(" + str(datetime.fromtimestamp(start_timestamp)) + " - " + str(datetime.fromtimestamp(end_timestamp)) + ")")
            response = requests.get(
                url='https://api.pushshift.io/reddit/submission/search/',
                params={
                    'sort_type': 'created_utc',
                    'sort': 'asc',
                    'limit': 1000,
                    'subreddit': subreddit,
                    'after': start_timestamp,
                    'before': end_timestamp,
                },
                # A randomised user agent is sent with every attempt.
                headers={'User-agent': 'reddit sentiment ' + str(random.randint(3, 9999))},
                timeout=timeout,
            )
            if response.status_code == 200:
                return pd.json_normalize(response.json()['data'])
            # `reason` may be None, hence the explicit str().
            print("ERROR: " + str(response.reason))
        except Exception as error:
            # Deliberately best-effort: any failure (network error, timeout,
            # malformed payload) is retried, but the cause is now reported
            # instead of being swallowed silently.
            print("ERROR: " + repr(error))

        failed_attempts += 1
        if max_retries is not None and failed_attempts > max_retries:
            raise RuntimeError(
                "fetch_wsb_posts failed after " + str(failed_attempts) + " attempts"
            )

        print(f"Wait for {retry_delay} seconds and try again...")
        time.sleep(retry_delay)

# Download the posts day by day and append them to one CSV file per day.

# Make sure the target directory exists before the first write.
os.makedirs(output_path, exist_ok=True)

for day in pd.date_range(start=start_date, end=end_date):
    next_day = day + pd.DateOffset(1)

    # The day window as Unix timestamps; the lower bound advances as pages are fetched.
    timestamp_of_last_fetched_post = int(day.timestamp())
    timestamp_of_next_day = int(next_day.timestamp())

    # Build the file path in its own variable. Re-assigning `output_path` here
    # would nest the file name under itself on the next write and fail with
    # NotADirectoryError.
    output_file = os.path.join(output_path, f'wsb_posts__{day}_{next_day}.csv')

    while True:
        df_next_posts = fetch_wsb_posts(timestamp_of_last_fetched_post, timestamp_of_next_day)

        # An empty page means there is nothing left to fetch for this day.
        if len(df_next_posts) == 0:
            break

        # Continue the next request after the newest post of this page.
        timestamp_of_last_fetched_post = int(df_next_posts['created_utc'].iloc[-1])

        # Re-format the response data to make it fit the application's requirements:
        df_posts = (
            df_next_posts
            # reindex (rather than [[...]]) yields an empty column instead of a
            # KeyError when a page lacks one of the expected fields.
            .reindex(columns=['created_utc', 'id', 'title', 'selftext', 'num_comments', 'score'])
            # Interpret the epoch seconds as UTC, independent of the machine's
            # local time zone, to match the UTC-based day window above.
            .assign(created_utc=lambda df: pd.to_datetime(df['created_utc'], unit='s'))
            .rename(columns={'created_utc': 'timestamp', 'selftext': 'body'})
            .reset_index(drop=True)
        )

        # Store the data (the header is only written when the file is created):
        df_posts.to_csv(output_file, header=not os.path.exists(output_file), index=False, mode='a')

    print("---")

fetch_wsb_posts(2020-12-01 00:00:00 - 2020-12-02 00:00:00)
fetch_wsb_posts(2020-12-01 01:08:53 - 2020-12-02 00:00:00)


NotADirectoryError: [Errno 20] Not a directory: 'data_wsb/wsb_posts__2020-12-01 00:00:00_2020-12-02 00:00:00.csv/wsb_posts__2020-12-01 00:00:00_2020-12-02 00:00:00.csv'

<a style='text-decoration:none;line-height:16px;display:flex;color:#5B5B62;padding:10px;justify-content:end;' href='https://deepnote.com?utm_source=created-in-deepnote-cell&projectId=d7886875-5320-40a2-af00-cc1d95e2b7d3' target="_blank">
Created in <span style='font-weight:600;margin-left:4px;'>Deepnote</span></a>