In [1]:
from collections import Counter
from datetime import datetime
from decouple import config
from nltk.corpus import stopwords
import numpy as np
from os import path
import pandas as pd
import praw
from praw.models import MoreComments
from profanity_filter import remove_bad_words
from PIL import Image
import psycopg2
import re
import sqlalchemy
from sqlalchemy import create_engine
import time
from wordcloud import WordCloud, STOPWORDS, ImageColorGenerator

import matplotlib.pyplot as plt
%matplotlib inline

In [2]:
# connecting to reddit API
reddit = praw.Reddit(
    client_id = config("CLIENT_ID"),
    client_secret = config("SECRET"),
    user_agent = config("USER"),
    username = config("USERNAME"),
    password = config("PASSWORD")
)

subreddit = reddit.subreddit("wallstreetbets")

hot_wsb = subreddit.hot(limit=1000)

In [3]:
# storing data in a pandas dataframe
dict = {"title": [],
        "subreddit": [],
        "author": [],
        "score": [],
        "upvote_ratio": [],
        "id": [],
        "url": [],
        "num_comments": [],
        "created": [],
        "body": []}

for submission in hot_wsb:
    dict["title"].append(submission.title)
    dict['subreddit'].append(submission.subreddit)
    dict['author'].append(submission.author)
    dict["score"].append(submission.score)
    dict["upvote_ratio"].append(submission.upvote_ratio)
    dict["id"].append(submission.id)
    dict["url"].append(submission.url)
    dict["num_comments"].append(submission.num_comments)
    dict["created"].append(submission.created)
    dict["body"].append(submission.selftext)
    
df = pd.DataFrame(dict)

In [4]:
# function that cleans the text in the submission
def clean_submission(text):
    text = text.lower()
    text = ' '.join(
        re.sub("(@[A-Za-z0-9]+)|([^0-9A-Za-z \t'])|(\w+:\/\/\S+)", " ", text).split())
    return text

In [5]:
# applying clean submission function to the title and body columns
df['title'] = df['title'].apply(lambda x: clean_submission(x))
df['body'] = df['body'].apply(lambda x: clean_submission(x))

body_text = " ".join(body for body in df.body)
# combining title and body text
title_text = " ".join(title for title in df.title) + body_text

# set stop words/letters
# stopwords = set(STOPWORDS)
# stopwords.add("I'm, It's, s, m")

# remove stopwords
stop = stopwords.words('english')

# Exclude stopwords with Python's list comprehension and pandas.DataFrame.apply.
df['title'] = df['title'].apply(lambda x: ' '.join([word for word in x.split() if word not in (stop)]))
df['body'] = df['body'].apply(lambda x: ' '.join([word for word in x.split() if word not in (stop)]))

print(df.shape)
df.head()

(169, 10)


Unnamed: 0,title,subreddit,author,score,upvote_ratio,id,url,num_comments,created,body
0,daily discussion thread may 03 2021,wallstreetbets,OPINION_IS_UNPOPULAR,366,0.83,n3sdrh,https://www.reddit.com/r/wallstreetbets/commen...,10947,1620065000.0,daily trading discussion thread please keep sh...
1,lost sec filing citadel llc,wallstreetbets,steglitsen,9346,0.95,n3tufd,https://v.redd.it/gwy3rqy26ww61,157,1620071000.0,
2,bought first shares aapl almost exactly 5 year...,wallstreetbets,Credit-Limit,941,0.95,n3ym6h,https://i.redd.it/0gae3vn99xw61.png,80,1620084000.0,
3,early redemption senior notes,wallstreetbets,ExpressionNaive1923,619,0.95,n3tby1,https://gamestop.gcs-web.com/news-releases/new...,48,1620069000.0,
4,nok potential reach 10,wallstreetbets,TheDevelopedDeed,439,0.79,n3vlr0,https://www.reddit.com/r/wallstreetbets/commen...,166,1620076000.0,nok potential reach 10 here's revenue earnings...


In [6]:
# applying profanity filter to text
# title_text = remove_bad_words(title_text)

In [7]:
df.head()

Unnamed: 0,title,subreddit,author,score,upvote_ratio,id,url,num_comments,created,body
0,daily discussion thread may 03 2021,wallstreetbets,OPINION_IS_UNPOPULAR,366,0.83,n3sdrh,https://www.reddit.com/r/wallstreetbets/commen...,10947,1620065000.0,daily trading discussion thread please keep sh...
1,lost sec filing citadel llc,wallstreetbets,steglitsen,9346,0.95,n3tufd,https://v.redd.it/gwy3rqy26ww61,157,1620071000.0,
2,bought first shares aapl almost exactly 5 year...,wallstreetbets,Credit-Limit,941,0.95,n3ym6h,https://i.redd.it/0gae3vn99xw61.png,80,1620084000.0,
3,early redemption senior notes,wallstreetbets,ExpressionNaive1923,619,0.95,n3tby1,https://gamestop.gcs-web.com/news-releases/new...,48,1620069000.0,
4,nok potential reach 10,wallstreetbets,TheDevelopedDeed,439,0.79,n3vlr0,https://www.reddit.com/r/wallstreetbets/commen...,166,1620076000.0,nok potential reach 10 here's revenue earnings...


In [8]:
df.columns

Index(['title', 'subreddit', 'author', 'score', 'upvote_ratio', 'id', 'url',
       'num_comments', 'created', 'body'],
      dtype='object')

In [9]:
# brainstorming
# what do we want to get from this data?
# perhaps some insight as to what wsb is thinking/doing in regards to certain stocks
# word frequency
# sentiment

In [10]:
# frequency for title
title_freq = Counter(" ".join(df['title']).split()).most_common(30)
title_freq = pd.DataFrame(title_freq, columns=['Word', 'Frequency'])
# add current date column
title_freq["date"] = time.strftime("%m/%d/%Y")
# drop index
title_freq = title_freq.set_index('Word')
print(title_freq)

            Frequency        date
Word                             
yolo               21  05/03/2021
may                19  05/03/2021
clov               16  05/03/2021
earnings           16  05/03/2021
dd                 11  05/03/2021
gme                10  05/03/2021
2021                9  05/03/2021
wsb                 9  05/03/2021
mvis                9  05/03/2021
calls               9  05/03/2021
5                   8  05/03/2021
nok                 8  05/03/2021
first               7  05/03/2021
shares              7  05/03/2021
week                7  05/03/2021
like                7  05/03/2021
30                  7  05/03/2021
play                7  05/03/2021
discussion          6  05/03/2021
bought              6  05/03/2021
rkt                 6  05/03/2021
today               6  05/03/2021
long                6  05/03/2021
sell                6  05/03/2021
time                6  05/03/2021
april               6  05/03/2021
aapl                5  05/03/2021
back          

In [11]:
body_freq = Counter(" ".join(df['body']).split()).most_common(30)
body_freq = pd.DataFrame(body_freq, columns=['Word', 'Frequency'])
# add current date column
body_freq["date"] = time.strftime("%m/%d/%Y")
# drop index
body_freq = body_freq.set_index('Word')
print(body_freq)

          Frequency        date
Word                           
1               172  05/03/2021
x200b           172  05/03/2021
market          159  05/03/2021
company         148  05/03/2021
2               144  05/03/2021
stock           144  05/03/2021
price           118  05/03/2021
3               114  05/03/2021
also            103  05/03/2021
5               100  05/03/2021
4               100  05/03/2021
earnings         95  05/03/2021
time             95  05/03/2021
see              92  05/03/2021
revenue          89  05/03/2021
may              87  05/03/2021
one              87  05/03/2021
year             82  05/03/2021
new              80  05/03/2021
like             78  05/03/2021
us               73  05/03/2021
would            70  05/03/2021
saphyr           70  05/03/2021
best             69  05/03/2021
short            69  05/03/2021
10               66  05/03/2021
growth           66  05/03/2021
value            65  05/03/2021
000              65  05/03/2021
long    

In [12]:
# is there a a way i can automatically update this 
# by having the script run everyday at a certain time
# and store data to track it over time
# see how trends change over time
# might help in spotting opportunities earlier
# could front run bubbles/capitulation 

In [13]:
db_pass = config("PASSWORD")
engine = create_engine(f'postgresql://postgres:{db_pass}@localhost:5432/postgres')

In [14]:
# convert created to date
df['created'] = pd.to_datetime(df['created'], unit='s')
df.head()

Unnamed: 0,title,subreddit,author,score,upvote_ratio,id,url,num_comments,created,body
0,daily discussion thread may 03 2021,wallstreetbets,OPINION_IS_UNPOPULAR,366,0.83,n3sdrh,https://www.reddit.com/r/wallstreetbets/commen...,10947,2021-05-03 18:00:20,daily trading discussion thread please keep sh...
1,lost sec filing citadel llc,wallstreetbets,steglitsen,9346,0.95,n3tufd,https://v.redd.it/gwy3rqy26ww61,157,2021-05-03 19:37:37,
2,bought first shares aapl almost exactly 5 year...,wallstreetbets,Credit-Limit,941,0.95,n3ym6h,https://i.redd.it/0gae3vn99xw61.png,80,2021-05-03 23:15:20,
3,early redemption senior notes,wallstreetbets,ExpressionNaive1923,619,0.95,n3tby1,https://gamestop.gcs-web.com/news-releases/new...,48,2021-05-03 19:04:27,
4,nok potential reach 10,wallstreetbets,TheDevelopedDeed,439,0.79,n3vlr0,https://www.reddit.com/r/wallstreetbets/commen...,166,2021-05-03 21:08:58,nok potential reach 10 here's revenue earnings...


In [15]:
# convert subreddit column to string
df['subreddit'] = df['subreddit'].astype(str)
# convert author column to string
df['author'] = df['author'].astype(str)

In [16]:
df.to_sql('sample_table', engine, if_exists='replace', index=False)
#           dtype={'author': sqlalchemy.String(50)})

In [17]:
# Brainstorming

# What else can we do with this data?

# get comments
# get all data?


In [18]:
df.head()

Unnamed: 0,title,subreddit,author,score,upvote_ratio,id,url,num_comments,created,body
0,daily discussion thread may 03 2021,wallstreetbets,OPINION_IS_UNPOPULAR,366,0.83,n3sdrh,https://www.reddit.com/r/wallstreetbets/commen...,10947,2021-05-03 18:00:20,daily trading discussion thread please keep sh...
1,lost sec filing citadel llc,wallstreetbets,steglitsen,9346,0.95,n3tufd,https://v.redd.it/gwy3rqy26ww61,157,2021-05-03 19:37:37,
2,bought first shares aapl almost exactly 5 year...,wallstreetbets,Credit-Limit,941,0.95,n3ym6h,https://i.redd.it/0gae3vn99xw61.png,80,2021-05-03 23:15:20,
3,early redemption senior notes,wallstreetbets,ExpressionNaive1923,619,0.95,n3tby1,https://gamestop.gcs-web.com/news-releases/new...,48,2021-05-03 19:04:27,
4,nok potential reach 10,wallstreetbets,TheDevelopedDeed,439,0.79,n3vlr0,https://www.reddit.com/r/wallstreetbets/commen...,166,2021-05-03 21:08:58,nok potential reach 10 here's revenue earnings...


In [28]:
%%time

comments = {"submission_id": [],
            "comment": []}

for id in df['id']:
    submission = reddit.submission(id=id)
    submission.comments.replace_more(limit=None)
    for comment in submission.comments.list():
        comments["submission_id"].append(id)
        comments["comment"].append(comment.body)
        
comments_df = pd.DataFrame(comments)

CPU times: user 2min 31s, sys: 23.5 s, total: 2min 54s
Wall time: 4h 41min 20s


In [29]:
print(comments_df.shape)
comments_df.head()

(107874, 2)


Unnamed: 0,submission_id,comment
0,n3sdrh,What random shitstock Will moon 500% today whi...
1,n3sdrh,In starting to think MVIS at $26 was a poor in...
2,n3sdrh,I can confirm there will be a 30% correction i...
3,n3sdrh,I hate wsb for making me unimpressed by +10%. ...
4,n3sdrh,* Me: Buy $1000 worth of something -33%.in a w...


In [33]:
len(comments_df["submission_id"].unique())

168

In [34]:
df['num_comments'].sum()

120072