# Datamining Reddit Posts from CryptoCurrency Subreddit
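This notebook mines Reddit posts from the r/CryptoCurrency subreddit. To do so, it uses Pushshift (a third-party archive and search service for Reddit data, accessed here through the `psaw` wrapper on top of a `praw` Reddit client) to retrieve all post submissions for each day in the configured date range.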

In [2]:
import os
import datetime
import pandas as pd
import datetime as dt
import praw
from psaw import PushshiftAPI
import pandas as pd
import pickle

In [8]:
# Build the Reddit client; credentials are read from the CLIENT_ID,
# CLIENT_SECRET and USER_AGENT environment variables so that no secret
# is stored in the notebook itself.
r = praw.Reddit(
    client_id=os.environ.get('CLIENT_ID'),
    client_secret=os.environ.get('CLIENT_SECRET'),
    user_agent=os.environ.get('USER_AGENT'),
)

# Hand the praw client to psaw's PushshiftAPI; submission searches in this
# notebook go through this object.
reddit_api = PushshiftAPI(r)

In [None]:
# Collect every r/CryptoCurrency submission from 1 Jan 2021 through
# 30 Jan 2022 (both days included), querying one calendar day at a time.
# NOTE: these are naive datetimes, so .timestamp() interprets them in the
# machine's local timezone; the day boundaries are therefore local, not UTC.
start_date = dt.datetime(2021,1,1)
end_date =  dt.datetime(2022,1,30)
one_day = dt.timedelta(days=1)
SUBMISSION_COLUMNS = ['id','title','link','score','comments_num','date']

# Holds one DataFrame per day that returned posts and one None per day whose
# request failed; a day that succeeds but returns no posts adds nothing.
submissions_by_day = []
current_date = start_date
while current_date <= end_date:
    # The following midnight is the end bound for the current day's query
    next_day = current_date + one_day

    # The search call takes its time bounds as integer epoch seconds
    curr_date_timestamp = int(current_date.timestamp())
    next_day_timestamp = int(next_day.timestamp())

    try:
        # Get all posts between curr_date_timestamp and next_day_timestamp from the CryptoCurrency subreddit
        results = list(reddit_api.search_submissions(before=next_day_timestamp,
                                                     after=curr_date_timestamp,
                                                     subreddit="CryptoCurrency"))

        if results:
            rows = [[post.name, post.title, post.permalink, post.score, post.num_comments, post.created]
                    for post in results]
            # Keep a single row per submission id
            df = pd.DataFrame(rows, columns=SUBMISSION_COLUMNS).drop_duplicates(subset=['id'])
            submissions_by_day.append(df)

    except Exception as exc:
        # Catch ordinary failures only (e.g. a problem with the Pushshift API) so that
        # KeyboardInterrupt/SystemExit can still stop this long-running loop.
        # The failed day is reported and recorded as None instead of aborting the scrape.
        print(f"Failed to fetch submissions for {current_date:%Y-%m-%d}: {exc!r}")
        submissions_by_day.append(None)

    current_date = next_day

# Save all reddit posts in a pickle file; create the output directory first so
# the scrape is not lost to a missing 'data/' folder.
os.makedirs('data', exist_ok=True)
with open('data/submission_data.pickle', 'wb') as handle:
    pickle.dump(submissions_by_day, handle, protocol = pickle.HIGHEST_PROTOCOL)

In [9]:
# Total rows across all per-day frames; days recorded as None (failed requests)
# contribute nothing.
number_of_posts = sum(len(day) for day in submissions_by_day if day is not None)
print("Total number of Reddit posts scraped: ", number_of_posts)

# Preview the first collected day's dataframe
submissions_by_day[0].head()

Total number of Reddit posts scraped:  636009


Unnamed: 0,id,title,link,score,comments_num,date
0,t3_knxk6p,Bitcoin Closes 2020 As Best Performing Asset O...,/r/CryptoCurrency/comments/knxk6p/bitcoin_clos...,3112,418,1609447000.0
1,t3_koa2d5,Eight members of Congress asking for a 60-day ...,/r/CryptoCurrency/comments/koa2d5/eight_member...,1145,209,1609502000.0
2,t3_knuml8,Darknet Marketplace Has Stopped Supporting Pay...,/r/CryptoCurrency/comments/knuml8/darknet_mark...,781,337,1609437000.0
3,t3_knr2yv,Don't transaction fees and confirmation time b...,/r/CryptoCurrency/comments/knr2yv/dont_transac...,395,738,1609426000.0
4,t3_koatip,"Michael Saylor Says He's Bought $2,500,000,000...",/r/CryptoCurrency/comments/koatip/michael_sayl...,391,111,1609505000.0
