In [1]:
from collections import Counter
from datetime import datetime
from decouple import config
from nltk.corpus import stopwords
import numpy as np
from os import path
import pandas as pd
import praw
from praw.models import MoreComments
from profanity_filter import remove_bad_words
from PIL import Image
import psycopg2
import re
import sqlalchemy
from sqlalchemy import create_engine
import time
from wordcloud import WordCloud, STOPWORDS, ImageColorGenerator

import matplotlib.pyplot as plt
%matplotlib inline

In [2]:
# connecting to reddit API
reddit = praw.Reddit(
    client_id = config("CLIENT_ID"),
    client_secret = config("SECRET"),
    user_agent = config("USER"),
    username = config("USERNAME"),
    password = config("PASSWORD")
)

subreddit = reddit.subreddit("wallstreetbets")

hot_wsb = subreddit.hot(limit=1000)

In [3]:
# storing data in a pandas dataframe
dict = {"title": [],
        "subreddit": [],
        "author": [],
        "score": [],
        "upvote_ratio": [],
        "id": [],
        "url": [],
        "num_comments": [],
        "created": [],
        "body": []}

for submission in hot_wsb:
    dict["title"].append(submission.title)
    dict['subreddit'].append(submission.subreddit)
    dict['author'].append(submission.author)
    dict["score"].append(submission.score)
    dict["upvote_ratio"].append(submission.upvote_ratio)
    dict["id"].append(submission.id)
    dict["url"].append(submission.url)
    dict["num_comments"].append(submission.num_comments)
    dict["created"].append(submission.created)
    dict["body"].append(submission.selftext)
    
df = pd.DataFrame(dict)

In [4]:
# function that cleans the text in the submission
def clean_submission(text):
    text = text.lower()
    text = ' '.join(
        re.sub("(@[A-Za-z0-9]+)|([^0-9A-Za-z \t'])|(\w+:\/\/\S+)", " ", text).split())
    return text

In [5]:
# applying clean submission function to the title and body columns
df['title'] = df['title'].apply(lambda x: clean_submission(x))
df['body'] = df['body'].apply(lambda x: clean_submission(x))

body_text = " ".join(body for body in df.body)
# combining title and body text
title_text = " ".join(title for title in df.title) + body_text

# set stop words/letters
# stopwords = set(STOPWORDS)
# stopwords.add("I'm, It's, s, m")

# remove stopwords
stop = stopwords.words('english')

# Exclude stopwords with Python's list comprehension and pandas.DataFrame.apply.
df['title'] = df['title'].apply(lambda x: ' '.join([word for word in x.split() if word not in (stop)]))
df['body'] = df['body'].apply(lambda x: ' '.join([word for word in x.split() if word not in (stop)]))

print(df.shape)
df.head()

(155, 10)


Unnamed: 0,title,subreddit,author,score,upvote_ratio,id,url,num_comments,created,body
0,daily discussion thread may 04 2021,wallstreetbets,OPINION_IS_UNPOPULAR,439,0.82,n4klyx,https://www.reddit.com/r/wallstreetbets/commen...,17477,1620151000.0,daily trading discussion thread please keep sh...
1,scam alert wallstreetbets telegram crypto,wallstreetbets,OPINION_IS_UNPOPULAR,13483,0.95,mub6t7,https://www.reddit.com/r/wallstreetbets/commen...,971,1618897000.0,investmemt wsb name unrelated wsb loser ex mod...
2,may fourth,wallstreetbets,steglitsen,11530,0.93,n4oegm,https://v.redd.it/5xqufm6zv3x61,239,1620164000.0,
3,bought appl early 20's can't wait early retire...,wallstreetbets,kingpaul0003,1113,0.91,n4q1nv,https://i.redd.it/ijep8xpg84x61.jpg,215,1620168000.0,
4,msft infinite money glitch,wallstreetbets,mullerel,17949,0.89,n4dvka,https://www.reddit.com/r/wallstreetbets/commen...,701,1620124000.0,disclaimer financial investment advisor subred...


In [6]:
# applying profanity filter to text
# title_text = remove_bad_words(title_text)

In [7]:
df.head()

Unnamed: 0,title,subreddit,author,score,upvote_ratio,id,url,num_comments,created,body
0,daily discussion thread may 04 2021,wallstreetbets,OPINION_IS_UNPOPULAR,439,0.82,n4klyx,https://www.reddit.com/r/wallstreetbets/commen...,17477,1620151000.0,daily trading discussion thread please keep sh...
1,scam alert wallstreetbets telegram crypto,wallstreetbets,OPINION_IS_UNPOPULAR,13483,0.95,mub6t7,https://www.reddit.com/r/wallstreetbets/commen...,971,1618897000.0,investmemt wsb name unrelated wsb loser ex mod...
2,may fourth,wallstreetbets,steglitsen,11530,0.93,n4oegm,https://v.redd.it/5xqufm6zv3x61,239,1620164000.0,
3,bought appl early 20's can't wait early retire...,wallstreetbets,kingpaul0003,1113,0.91,n4q1nv,https://i.redd.it/ijep8xpg84x61.jpg,215,1620168000.0,
4,msft infinite money glitch,wallstreetbets,mullerel,17949,0.89,n4dvka,https://www.reddit.com/r/wallstreetbets/commen...,701,1620124000.0,disclaimer financial investment advisor subred...


In [8]:
df.columns

Index(['title', 'subreddit', 'author', 'score', 'upvote_ratio', 'id', 'url',
       'num_comments', 'created', 'body'],
      dtype='object')

In [9]:
# brainstorming
# what do we want to get from this data?
# perhaps some insight as to what wsb is thinking/doing in regards to certain stocks
# word frequency
# sentiment

In [10]:
# frequency for title
title_freq = Counter(" ".join(df['title']).split()).most_common(30)
title_freq = pd.DataFrame(title_freq, columns=['Word', 'Frequency'])
# add current date column
title_freq["date"] = time.strftime("%m/%d/%Y")
# drop index
title_freq = title_freq.set_index('Word')
print(title_freq)

             Frequency        date
Word                              
yolo                24  05/04/2021
may                 18  05/04/2021
earnings            18  05/04/2021
rkt                 14  05/04/2021
dd                  13  05/04/2021
2021                 9  05/04/2021
bought               9  05/04/2021
play                 9  05/04/2021
gme                  9  05/04/2021
wsb                  8  05/04/2021
clov                 7  05/04/2021
moon                 6  05/04/2021
go                   6  05/04/2021
week                 6  05/04/2021
update               5  05/04/2021
stocks               5  05/04/2021
stock                5  05/04/2021
nok                  5  05/04/2021
shares               5  05/04/2021
discussion           4  05/04/2021
early                4  05/04/2021
day                  4  05/04/2021
undervalued          4  05/04/2021
sell                 4  05/04/2021
mvis                 4  05/04/2021
time                 4  05/04/2021
tsla                

In [11]:
body_freq = Counter(" ".join(df['body']).split()).most_common(30)
body_freq = pd.DataFrame(body_freq, columns=['Word', 'Frequency'])
# add current date column
body_freq["date"] = time.strftime("%m/%d/%Y")
# drop index
body_freq = body_freq.set_index('Word')
print(body_freq)

          Frequency        date
Word                           
1               189  05/04/2021
stock           176  05/04/2021
market          166  05/04/2021
company         154  05/04/2021
2               143  05/04/2021
x200b           139  05/04/2021
price           116  05/04/2021
3               112  05/04/2021
year            111  05/04/2021
5               106  05/04/2021
also            104  05/04/2021
time            101  05/04/2021
like             95  05/04/2021
revenue          93  05/04/2021
4                92  05/04/2021
earnings         89  05/04/2021
may              89  05/04/2021
see              88  05/04/2021
short            85  05/04/2021
million          82  05/04/2021
one              82  05/04/2021
growth           77  05/04/2021
new              76  05/04/2021
us               74  05/04/2021
2021             73  05/04/2021
2020             72  05/04/2021
10               72  05/04/2021
saphyr           70  05/04/2021
would            68  05/04/2021
high    

In [None]:
# is there a a way i can automatically update this 
# by having the script run everyday at a certain time
# and store data to track it over time
# see how trends change over time
# might help in spotting opportunities earlier
# could front run bubbles/capitulation 

In [12]:
db_pass = config("PASSWORD")
engine = create_engine(f'postgresql://postgres:{db_pass}@localhost:5432/postgres')

In [13]:
# convert created to date
df['created'] = pd.to_datetime(df['created'], unit='s')
df.head()

Unnamed: 0,title,subreddit,author,score,upvote_ratio,id,url,num_comments,created,body
0,daily discussion thread may 04 2021,wallstreetbets,OPINION_IS_UNPOPULAR,439,0.82,n4klyx,https://www.reddit.com/r/wallstreetbets/commen...,17477,2021-05-04 18:00:18,daily trading discussion thread please keep sh...
1,scam alert wallstreetbets telegram crypto,wallstreetbets,OPINION_IS_UNPOPULAR,13483,0.95,mub6t7,https://www.reddit.com/r/wallstreetbets/commen...,971,2021-04-20 05:35:18,investmemt wsb name unrelated wsb loser ex mod...
2,may fourth,wallstreetbets,steglitsen,11530,0.93,n4oegm,https://v.redd.it/5xqufm6zv3x61,239,2021-05-04 21:32:00,
3,bought appl early 20's can't wait early retire...,wallstreetbets,kingpaul0003,1113,0.91,n4q1nv,https://i.redd.it/ijep8xpg84x61.jpg,215,2021-05-04 22:43:45,
4,msft infinite money glitch,wallstreetbets,mullerel,17949,0.89,n4dvka,https://www.reddit.com/r/wallstreetbets/commen...,701,2021-05-04 10:25:08,disclaimer financial investment advisor subred...


In [14]:
# convert subreddit column to string
df['subreddit'] = df['subreddit'].astype(str)
# convert author column to string
df['author'] = df['author'].astype(str)

In [15]:
df.to_sql('sample_table', engine, if_exists='replace', index=False)
#           dtype={'author': sqlalchemy.String(50)})

In [16]:
# Brainstorming

# What else can we do with this data?

# get comments
# get all data?


In [17]:
df.head()

Unnamed: 0,title,subreddit,author,score,upvote_ratio,id,url,num_comments,created,body
0,daily discussion thread may 04 2021,wallstreetbets,OPINION_IS_UNPOPULAR,439,0.82,n4klyx,https://www.reddit.com/r/wallstreetbets/commen...,17477,2021-05-04 18:00:18,daily trading discussion thread please keep sh...
1,scam alert wallstreetbets telegram crypto,wallstreetbets,OPINION_IS_UNPOPULAR,13483,0.95,mub6t7,https://www.reddit.com/r/wallstreetbets/commen...,971,2021-04-20 05:35:18,investmemt wsb name unrelated wsb loser ex mod...
2,may fourth,wallstreetbets,steglitsen,11530,0.93,n4oegm,https://v.redd.it/5xqufm6zv3x61,239,2021-05-04 21:32:00,
3,bought appl early 20's can't wait early retire...,wallstreetbets,kingpaul0003,1113,0.91,n4q1nv,https://i.redd.it/ijep8xpg84x61.jpg,215,2021-05-04 22:43:45,
4,msft infinite money glitch,wallstreetbets,mullerel,17949,0.89,n4dvka,https://www.reddit.com/r/wallstreetbets/commen...,701,2021-05-04 10:25:08,disclaimer financial investment advisor subred...


In [18]:
# what other comment data do we want?

# score - would be helpful to maybe determine how many people agree 
# author ? - maybe I could identify certain authors with consistent upvotes/influence
# created - would be helpful to know if the comment was made the same day as the submission

%%time

comments = {"submission_id": [],
            "comment_id": [],
            "score": [], 
            "author": [],
            "created": [],
            "comment": []}

for id in df['id']:
    submission = reddit.submission(id=id)
    submission.comments.replace_more(limit=None)
    for comment in submission.comments.list():
        comments["submission_id"].append(id) # should I change this to comment.submission ? maybe faster to leave
        comments["comment_id"].append(comment.id)
        comments["score"].append(comment.score)
        comments["author"].append(comment.author)
        comments["created"].append(comment.created)
        comments["comment"].append(comment.body)
        
comments_df = pd.DataFrame(comments)

CPU times: user 2min 4s, sys: 13.4 s, total: 2min 17s
Wall time: 3h 35min 50s


In [19]:
# comments_df = pd.read_csv('reddit_comments.csv')

In [20]:
print(comments_df.shape)
comments_df.head()

(86484, 2)


Unnamed: 0,submission_id,comment
0,n4klyx,"As per rule 4, we do not allow microcap (<$1B ..."
1,n4klyx,Jeff Bezos and Bill Gates should start dating ...
2,n4klyx,"Market is at ATH, and my portfolio is down 45%..."
3,n4klyx,PLTR is red in case you were expecting somethi...
4,n4klyx,"welcome to the stock market, the show where ev..."


In [21]:
len(comments_df["submission_id"].unique())

155

In [22]:
len(df)

155

In [23]:
df['num_comments'].sum()

97562

In [24]:
df["num_comments"].sum() - len(comments_df)

11078

In [25]:
df.sort_values(by=["num_comments"], ascending=False).head()

Unnamed: 0,title,subreddit,author,score,upvote_ratio,id,url,num_comments,created,body
71,daily discussion thread may 03 2021,wallstreetbets,OPINION_IS_UNPOPULAR,541,0.83,n3sdrh,https://www.reddit.com/r/wallstreetbets/commen...,17504,2021-05-03 18:00:20,daily trading discussion thread please keep sh...
0,daily discussion thread may 04 2021,wallstreetbets,OPINION_IS_UNPOPULAR,439,0.82,n4klyx,https://www.reddit.com/r/wallstreetbets/commen...,17477,2021-05-04 18:00:18,daily trading discussion thread please keep sh...
40,moves tomorrow may 04 2021,wallstreetbets,OPINION_IS_UNPOPULAR,495,0.88,n45qyv,https://www.reddit.com/r/wallstreetbets/commen...,13324,2021-05-04 04:00:20,daily trading discussion thread please keep sh...
97,moves tomorrow may 03 2021,wallstreetbets,OPINION_IS_UNPOPULAR,482,0.86,n3ekxx,https://www.reddit.com/r/wallstreetbets/commen...,10090,2021-05-03 04:00:13,daily trading discussion thread please keep sh...
64,wsb analyzes charts,wallstreetbets,yolocallking,46274,0.85,n3dujk,https://v.redd.it/2qjyeacadrw61,2714,2021-05-03 03:27:36,


In [26]:
comments_df.to_sql('comments', engine, if_exists='append', index=False)

In [27]:
# comments_df["comment"] = str(comments_df["comment"])

In [29]:
print(comments_df.shape)
comments_df.head()

(86484, 2)


Unnamed: 0,submission_id,comment
0,n4klyx,"As per rule 4, we do not allow microcap (<$1B ..."
1,n4klyx,Jeff Bezos and Bill Gates should start dating ...
2,n4klyx,"Market is at ATH, and my portfolio is down 45%..."
3,n4klyx,PLTR is red in case you were expecting somethi...
4,n4klyx,"welcome to the stock market, the show where ev..."


In [90]:
# applying clean submission function to comments
comments_df['comment'] = comments_df['comment'].apply(lambda x: clean_submission(x))

comments_text = " ".join(comment for comment in comments_df.comment)

# set stop words/letters
# stopwords = set(STOPWORDS)
# stopwords.add("I'm, It's, s, m")

# remove stopwords
stop = stopwords.words('english')

# Exclude stopwords with Python's list comprehension and pandas.DataFrame.apply.
comments_df['comment'] = comments_df['comment'].apply(lambda x: ' '.join([word for word in x.split() if word not in (stop)]))


print(comments_df.shape)
comments_df.head()

(107874, 2)


Unnamed: 0,submission_id,comment
0,n3sdrh,random shitstock moon 500 today carry pltr bags
1,n3sdrh,starting think mvis 26 poor investment
2,n3sdrh,confirm 30 correction market sure correct also...
3,n3sdrh,hate wsb making unimpressed 10 i'm like got
4,n3sdrh,buy 1000 worth something 33 week buy 250 somet...


In [92]:
comment_freq = Counter(" ".join(comments_df['comment']).split()).most_common(30)
comment_freq = pd.DataFrame(comment_freq, columns=['Word', 'Frequency'])
# add current date column
# comment_freq["date"] = time.strftime("%m/%d/%Y")
# drop index
# comment_freq = comment_freq.set_index('Word')
comment_freq

Unnamed: 0,Word,Frequency
0,like,8157
1,buy,5293
2,get,5179
3,money,4795
4,good,4524
5,stock,4103
6,go,4084
7,calls,3947
8,going,3850
9,one,3774


In [None]:
# counts not matching, but according to praw docs:

# You can now properly extract and parse all (or most) of the comments belonging to a single submission. 
# Combine this with submission iteration and you can build some really cool stuff.
# Finally, note that the value of submission.num_comments may not match up 100% with the number of 
# comments extracted via PRAW. This discrepancy is normal as that count includes deleted, removed, and spam comments.

In [None]:
# brainstorm 
# work on streaming posts and comments as they come in

In [None]:
stream_comments = {"submission_id": [],
            "comment": []}

for id in df['id']:
    submission = reddit.submission(id=id)
    submission.comments.replace_more(limit=None)
    for comment in submission.comments.list():
        comments["submission_id"].append(id)
        comments["comment"].append(comment.body)
        
comments_df = pd.DataFrame(comments)

In [None]:
# this is not working the way I want right now
# gets 1st submission then just streams comments from all posts (I think)

subreddit = reddit.subreddit('wallstreetbets')


for submission in subreddit.stream.submissions():
    print('************ SUBMISSION ************')
    print(submission.title)
#     for comment in subreddit.stream.comments():
#         print('************ COMMENT ************')
#         print(comment.body)

In [None]:
# if i get comments for hot page everyday there will be overlap

# goal is to get current and future submission and comment data
# analyze that data - most common ticker mentions, phrase mentions, etc
# visualize the data to see how it changes over time