In [1]:
# imports
import time
from urllib.parse import quote

from decouple import config
import pandas as pd
import praw
import psycopg2
import schedule
from sqlalchemy import create_engine



# Stamp the run: announce the calendar date, then capture the wall-clock start
# that the runtime report at the end of this cell subtracts from.
DATE_FORMAT = "%m/%d/%Y"
current_day = time.strftime(DATE_FORMAT)
print(f"Performing job on {current_day}")
startTime = time.time()  # seconds, as returned by time.time()

# connecting to reddit API
# Tunables for the listing pulled below: which subreddit, and the limit passed to hot().
SUBREDDIT_NAME = "wallstreetbets"
HOT_LIMIT = 1000

# Every credential is looked up through decouple's config(); nothing secret is
# written in the notebook itself.
reddit = praw.Reddit(
    client_id=config("CLIENT_ID"),
    client_secret=config("SECRET"),
    user_agent=config("USER"),
    username=config("USERNAME"),
    password=config("PASSWORD"),
)

subreddit = reddit.subreddit(SUBREDDIT_NAME)

# "hot" listing of the subreddit; it is iterated once by the submissions loop further down
hot_wsb = subreddit.hot(limit=HOT_LIMIT)

# storing submission data in a dictionary
# (one list per output column; every list receives exactly one entry per submission,
# so they stay aligned row-for-row)
submissions = {
    "title": [],
    "subreddit": [],
    "submission_author": [],
    "submission_score": [],
    "submission_id": [],
    "url": [],
    "num_comments": [],
    "submission_created": [],
    "submission_body": []
}

# iterate over each submission and store data in the submissions dictionary
for submission in hot_wsb:
    submissions["title"].append(submission.title)
    submissions["subreddit"].append(submission.subreddit)
    submissions["submission_author"].append(submission.author)
    submissions["submission_score"].append(submission.score)
    submissions["submission_id"].append(submission.id)
    submissions["url"].append(submission.url)
    submissions["num_comments"].append(submission.num_comments)
    # created_utc is the creation-time attribute PRAW documents (Unix time);
    # the previously used `created` field is undocumented, so it is not relied
    # on for the epoch-seconds parsing below.
    submissions["submission_created"].append(submission.created_utc)
    submissions["submission_body"].append(submission.selftext)

# transform the submissions dictionary into a pandas dataframe
df = pd.DataFrame(submissions)

# convert created (epoch seconds) to datetime
df['submission_created'] = pd.to_datetime(df['submission_created'], unit='s')

# convert the subreddit and author columns to plain strings: the loop stored
# whatever objects PRAW returned for them, which are cast to text here.
# NOTE(review): author is presumably None for deleted accounts, which this cast
# turns into the text "None" - confirm that is acceptable downstream.
df = df.astype({'subreddit': str, 'submission_author': str})

# connect to postgresql database
# NOTE(review): this reads the same "PASSWORD" key that the Reddit login above
# uses - confirm the two services are really meant to share one secret.
db_pass = config("PASSWORD")
# Percent-encode the password before embedding it in the URL so that characters
# such as "@", ":", "/", "%" or spaces cannot be mistaken for URL delimiters.
# safe="" makes quote() escape "/" as well.
engine = create_engine(
    f'postgresql://postgres:{quote(db_pass, safe="")}@localhost:5432/postgres')

# store pandas dataframe in sql database
# df.to_sql('submissions', engine, if_exists='append')

# create dictionary to store comments
# (one list per output column, filled in lockstep - one entry per comment)
comments = {
    "submission_id": [],
    "comment_id": [],
    "comment_score": [],
    "comment_author": [],
    "comment_created": [],
    "comment_body": []
}

# collecting relevant comment data for one hard-coded submission
# (the id lives in a single constant so the fetch and the stored column cannot drift apart)
SUBMISSION_ID = 'n4oegm'

submission = reddit.submission(id=SUBMISSION_ID)
# NOTE(review): replace_more(limit=None) presumably expands every "load more
# comments" placeholder before the tree is flattened by .list() - confirm
# against the PRAW docs.
submission.comments.replace_more(limit=None)
for comment in submission.comments.list():
    comments["submission_id"].append(SUBMISSION_ID)
    comments["comment_id"].append(comment.id)
    comments["comment_score"].append(comment.score)
    comments["comment_author"].append(comment.author)
    # created_utc is the creation-time attribute PRAW documents (Unix time);
    # the previously used `created` field is undocumented.
    comments["comment_created"].append(comment.created_utc)
    comments["comment_body"].append(comment.body)

# converting comments dictionary to a pandas dataframe
comments_df = pd.DataFrame(comments)

comments_df["submission_id"] = comments_df["submission_id"].astype(str)

# convert created (epoch seconds) to datetime
comments_df["comment_created"] = pd.to_datetime(comments_df["comment_created"], unit='s')

# convert author to string
comments_df["comment_author"] = comments_df["comment_author"].astype(str)


# store comments_df in sql table
# if_exists='replace' overwrites the 'test' table on every run
comments_df.to_sql('test', engine, if_exists='replace', index=False)

# calculate time it takes for script to run
# startTime was captured with time.time() at the top of the cell, so the
# difference is elapsed wall-clock seconds.
executionTime = time.time() - startTime
elapsed_minutes = executionTime / 60
print('Execution time in minutes: ' + str(elapsed_minutes))

Performing job on 05/07/2021
Execution time in minutes: 0.17266361316045126


In [2]:
# preview the first rows of the submissions dataframe
df.head()

Unnamed: 0,title,subreddit,submission_author,submission_score,submission_id,url,num_comments,submission_created,submission_body
0,"Daily Discussion Thread for May 07, 2021",wallstreetbets,OPINION_IS_UNPOPULAR,159,n6uyxz,https://www.reddit.com/r/wallstreetbets/commen...,8245,2021-05-07 18:00:15,Your daily trading discussion thread. Please k...
1,When DFV posts literally anything,wallstreetbets,squeezingyourboobs,27447,n6uwqn,https://v.redd.it/v4iz4czv7ox61,416,2021-05-07 17:56:05,
2,That's right,wallstreetbets,keenfeed,1460,n6y97z,https://v.redd.it/wuvrmysu7px61,183,2021-05-07 21:16:15,
3,It's coming people,wallstreetbets,keenfeed,1069,n6zbc0,https://v.redd.it/2lafkg5wgpx61,144,2021-05-07 22:07:00,
4,So uncivilized ft. DFV & Shitadel,wallstreetbets,Jmeshareholder,630,n6vlft,https://v.redd.it/wen033pkgox61,42,2021-05-07 18:43:01,


In [3]:
# preview the first rows of the comments dataframe
comments_df.head()

Unnamed: 0,submission_id,comment_id,comment_score,comment_author,comment_created,comment_body
0,n4oegm,gwwopgp,557,monsterbangster,2021-05-04 22:02:37,May THE stock be with you!
1,n4oegm,gwwo9hk,140,Wonderful_Court1076,2021-05-04 21:59:21,And also with you.
2,n4oegm,gwwr9o5,277,Low-Hovercraft-9849,2021-05-04 22:21:03,Game force 400 gang getting a better average e...
3,n4oegm,gwwn1u7,265,BlazinWarrior,2021-05-04 21:50:29,Now this is the kinda content I like to see in...
4,n4oegm,gwwppxl,30,fredkarlsson,2021-05-04 22:09:59,Never give me the odds!


In [4]:
# brainstorming what to do next now that I have the data pipeline set up

# make sure to select distinct comment ids to avoid duplicate data [x]
# take the data in the sql databases and practice queries [x]
# export sql databases to csv files - load them back into a notebook 
# or just reverse using psycopg2? [x]
# could create visualizations of submissions, comments over time [x]
# find most popular words/symbols across submissions and comments [x]
# could match symbols with a list of company names/tickers (maybe from RH project)
# this way even if a symbol isn't as popular as a word I can still track it
# could maybe find what's rising early on and front run
# could create a barchart race of the most mentioned tickers over time
# could find rising tickers from one day to the next with higher % mentions!
# could rank tickers by the most upvotes on submissions and comments
# sentiment analysis around certain stocks/tickers - might be tough given sarcasm/reddit speak
# could compare popularity of stocks or sentiment around stocks vs price of a stock
# create a watchlist of reddit stocks to monitor and keep track of data around those
# i.e. valuation, business model viability, comps, short interest, etc
# see if there are any users with a following similar to /u/deepfuckingvalue that
# consistently have a lot of upvotes on their submissions/comments
# does price drive narrative?
# put to call ratio word mentions