In [1]:
from collections import Counter
from datetime import datetime
from decouple import config
from nltk.corpus import stopwords
import numpy as np
from os import path
import pandas as pd
import praw
from profanity_filter import remove_bad_words
from PIL import Image
import psycopg2
import re
from sqlalchemy import create_engine
import time
from wordcloud import WordCloud, STOPWORDS, ImageColorGenerator

import matplotlib.pyplot as plt
%matplotlib inline

In [2]:
# connecting to reddit API
reddit = praw.Reddit(
    client_id = config("CLIENT_ID"),
    client_secret = config("SECRET"),
    user_agent = config("USER"),
    username = config("USERNAME"),
    password = config("PASSWORD")
)

subreddit = reddit.subreddit("wallstreetbets")

hot_wsb = subreddit.hot(limit=1000)

In [3]:
# storing data in a pandas dataframe
dict = {"title": [],
        "subreddit": [],
        "score": [],
        "id": [],
        "url": [],
        "comms_num": [],
        "created": [],
        "body": []}

for submission in hot_wsb:
    dict["title"].append(submission.title)
    dict['subreddit'].append(submission.subreddit)
    dict["score"].append(submission.score)
    dict["id"].append(submission.id)
    dict["url"].append(submission.url)
    dict["comms_num"].append(submission.num_comments)
    dict["created"].append(submission.created)
    dict["body"].append(submission.selftext)
    
df = pd.DataFrame(dict)

In [4]:
# function that cleans the text in the submission
def clean_submission(text):
    text = text.lower()
    text = ' '.join(
        re.sub("(@[A-Za-z0-9]+)|([^0-9A-Za-z \t'])|(\w+:\/\/\S+)", " ", text).split())
    return text

In [5]:
# applying clean submission function to the title and body columns
df['title'] = df['title'].apply(lambda x: clean_submission(x))
df['body'] = df['body'].apply(lambda x: clean_submission(x))

body_text = " ".join(body for body in df.body)
# combining title and body text
title_text = " ".join(title for title in df.title) + body_text

# set stop words/letters
# stopwords = set(STOPWORDS)
# stopwords.add("I'm, It's, s, m")

# remove stopwords
stop = stopwords.words('english')

# Exclude stopwords with Python's list comprehension and pandas.DataFrame.apply.
df['title'] = df['title'].apply(lambda x: ' '.join([word for word in x.split() if word not in (stop)]))
df['body'] = df['body'].apply(lambda x: ' '.join([word for word in x.split() if word not in (stop)]))

print(df.shape)
df.head()

(157, 8)


Unnamed: 0,title,subreddit,score,id,url,comms_num,created,body
0,moves tomorrow april 27 2021,wallstreetbets,516,mz6iks,https://www.reddit.com/r/wallstreetbets/commen...,13044,1619496000.0,u cashflow 's dog remy pretty cool remy federe...
1,gme squeeze incoming,wallstreetbets,28744,mz69gk,https://i.redd.it/j0awzqpxnkv61.png,2957,1619495000.0,
2,got feeling whole family going,wallstreetbets,15501,mz27qe,https://v.redd.it/tggz9iaosjv61,305,1619485000.0,
3,gme technical analysis waves go brrr,wallstreetbets,3501,mz5e3m,https://www.reddit.com/r/wallstreetbets/commen...,366,1619493000.0,hi fam fam something going gme may imminent in...
4,fellowship gme,wallstreetbets,3230,mz5q2r,https://v.redd.it/ljhfhlgfjkv61,183,1619494000.0,


In [6]:
# applying profanity filter to text
# title_text = remove_bad_words(title_text)

In [7]:
df.head()

Unnamed: 0,title,subreddit,score,id,url,comms_num,created,body
0,moves tomorrow april 27 2021,wallstreetbets,516,mz6iks,https://www.reddit.com/r/wallstreetbets/commen...,13044,1619496000.0,u cashflow 's dog remy pretty cool remy federe...
1,gme squeeze incoming,wallstreetbets,28744,mz69gk,https://i.redd.it/j0awzqpxnkv61.png,2957,1619495000.0,
2,got feeling whole family going,wallstreetbets,15501,mz27qe,https://v.redd.it/tggz9iaosjv61,305,1619485000.0,
3,gme technical analysis waves go brrr,wallstreetbets,3501,mz5e3m,https://www.reddit.com/r/wallstreetbets/commen...,366,1619493000.0,hi fam fam something going gme may imminent in...
4,fellowship gme,wallstreetbets,3230,mz5q2r,https://v.redd.it/ljhfhlgfjkv61,183,1619494000.0,


In [8]:
df.columns

Index(['title', 'subreddit', 'score', 'id', 'url', 'comms_num', 'created',
       'body'],
      dtype='object')

In [9]:
# brainstorming
# what do we want to get from this data?
# perhaps some insight as to what wsb is thinking/doing in regards to certain stocks
# word frequency
# sentiment

In [10]:
# frequency for title
title_freq = Counter(" ".join(df['title']).split()).most_common(30)
title_freq = pd.DataFrame(title_freq, columns=['Word', 'Frequency'])
# add current date column
title_freq["date"] = time.strftime("%m/%d/%Y")
# drop index
title_freq = title_freq.set_index('Word')
print(title_freq)

            Frequency        date
Word                             
mvis               37  04/26/2021
yolo               26  04/26/2021
update             14  04/26/2021
gme                13  04/26/2021
dd                 12  04/26/2021
2021                9  04/26/2021
stock               9  04/26/2021
shares              8  04/26/2021
april               7  04/26/2021
1                   7  04/26/2021
last                6  04/26/2021
moon                6  04/26/2021
today               6  04/26/2021
earnings            6  04/26/2021
market              5  04/26/2021
5                   5  04/26/2021
get                 5  04/26/2021
week                5  04/26/2021
discussion          5  04/26/2021
4                   5  04/26/2021
calls               5  04/26/2021
time                5  04/26/2021
apes                5  04/26/2021
lidar               5  04/26/2021
tesla               5  04/26/2021
company             5  04/26/2021
tomorrow            4  04/26/2021
go            

In [11]:
body_freq = Counter(" ".join(df['body']).split()).most_common(30)
body_freq = pd.DataFrame(body_freq, columns=['Word', 'Frequency'])
# add current date column
body_freq["date"] = time.strftime("%m/%d/%Y")
# drop index
body_freq = body_freq.set_index('Word')
print(body_freq)

           Frequency        date
Word                            
company          114  04/26/2021
price             95  04/26/2021
market            89  04/26/2021
stock             75  04/26/2021
1                 57  04/26/2021
one               57  04/26/2021
going             54  04/26/2021
still             53  04/26/2021
shares            52  04/26/2021
year              52  04/26/2021
also              51  04/26/2021
like              50  04/26/2021
see               49  04/26/2021
10                49  04/26/2021
new               47  04/26/2021
revenue           47  04/26/2021
dd                47  04/26/2021
2                 46  04/26/2021
time              46  04/26/2021
could             43  04/26/2021
share             41  04/26/2021
x200b             41  04/26/2021
companies         41  04/26/2021
million           41  04/26/2021
3                 40  04/26/2021
get               40  04/26/2021
earnings          39  04/26/2021
play              39  04/26/2021
growth    

In [12]:
# is there a a way i can automatically update this 
# by having the script run everyday at a certain time
# and store data to track it over time
# see how trends change over time
# might help in spotting opportunities earlier
# could front run bubbles/capitulation 

In [13]:
db_pass = config("PASSWORD")
engine = create_engine(f'postgresql://postgres:{db_pass}@localhost:5432/postgres')

In [15]:
title_freq.to_sql('title_freq', engine, if_exists='replace')