In [1]:
from collections import Counter
from datetime import datetime
from os import path
import re
import time
from urllib.parse import quote

from decouple import config
import matplotlib.pyplot as plt
from nltk.corpus import stopwords
import numpy as np
import pandas as pd
from PIL import Image
import praw
from profanity_filter import remove_bad_words
import psycopg2
import sqlalchemy
from sqlalchemy import create_engine
from wordcloud import WordCloud, STOPWORDS, ImageColorGenerator

%matplotlib inline

In [2]:
# Build the Reddit API client; every credential is looked up through
# decouple's `config` rather than being written into the notebook.
reddit_credentials = {
    "client_id": config("CLIENT_ID"),
    "client_secret": config("SECRET"),
    "user_agent": config("USER"),
    "username": config("USERNAME"),
    "password": config("PASSWORD"),
}
reddit = praw.Reddit(**reddit_credentials)

# "hot" listing of r/wallstreetbets, requested with a limit of 1000 submissions
subreddit = reddit.subreddit("wallstreetbets")
hot_wsb = subreddit.hot(limit=1000)

In [3]:
# Store the submissions in a pandas DataFrame, one row per submission.
# The accumulator has its own name (it used to be called `dict`), so the
# builtin `dict` type stays usable in every later cell.
SUBMISSION_COLUMNS = ["title", "subreddit", "score", "id",
                      "url", "comms_num", "created", "body"]

submission_records = [
    {
        "title": submission.title,
        # str() stores the subreddit's name instead of the PRAW Subreddit object
        "subreddit": str(submission.subreddit),
        "score": submission.score,
        "id": submission.id,
        "url": submission.url,
        "comms_num": submission.num_comments,
        "created": submission.created,
        "body": submission.selftext,
    }
    for submission in hot_wsb
]

# Passing `columns` keeps the expected schema even when the listing is empty.
df = pd.DataFrame(submission_records, columns=SUBMISSION_COLUMNS)

In [4]:
# Text removed by clean_submission. Alternatives are tried left to right:
#   1. @mentions
#   2. any single character that is not an ASCII letter/digit, space, tab
#      or apostrophe
#   3. scheme://... URLs
# Written as a raw string so that \w, \/ and \S reach the regex engine
# without triggering Python's invalid-escape-sequence warning.
_SUBMISSION_NOISE = re.compile(r"(@[A-Za-z0-9]+)|([^0-9A-Za-z \t'])|(\w+:\/\/\S+)")


def clean_submission(text):
    """Lower-case a submission's text and strip mentions, URLs and punctuation.

    Parameters
    ----------
    text : str
        Raw title or body text of a submission.

    Returns
    -------
    str
        The lower-cased text with every match of ``_SUBMISSION_NOISE``
        replaced by a space and all runs of whitespace collapsed to single
        spaces (no leading or trailing whitespace).
    """
    without_noise = _SUBMISSION_NOISE.sub(" ", text.lower())
    # split()/join collapses the gaps left behind by the substitutions
    return " ".join(without_noise.split())

In [5]:
# Normalise the title and body columns with clean_submission.
df['title'] = df['title'].apply(clean_submission)
df['body'] = df['body'].apply(clean_submission)

# Corpus strings built from the cleaned text (before stopword removal).
body_text = " ".join(df['body'])
# Titles followed by bodies; the explicit " " keeps the last title word and
# the first body word from being fused into one token.
title_text = " ".join(df['title']) + " " + body_text

# English stopword list from NLTK, plus a set copy so that each membership
# test below is a hash lookup instead of a scan of the whole list.
stop = stopwords.words('english')
stop_lookup = frozenset(stop)


def _drop_stopwords(text):
    """Return `text` without the whitespace-separated words found in `stop_lookup`."""
    return ' '.join(word for word in text.split() if word not in stop_lookup)


df['title'] = df['title'].apply(_drop_stopwords)
df['body'] = df['body'].apply(_drop_stopwords)

print(df.shape)
df.head()

(211, 8)


Unnamed: 0,title,subreddit,score,id,url,comms_num,created,body
0,moves tomorrow april 28 2021,wallstreetbets,326,mzx686,https://www.reddit.com/r/wallstreetbets/commen...,9403,1619582000.0,daily trading discussion thread please keep sh...
1,never bet doctor,wallstreetbets,8862,mzy4qn,https://i.redd.it/3mboejkf2sv61.gif,241,1619585000.0,
2,sec filing today amc taken 500 million share d...,wallstreetbets,2788,mzzm6c,https://www.sec.gov/Archives/edgar/data/000141...,311,1619589000.0,
3,mindmed cnn people slowly realizing potential ...,wallstreetbets,6355,mzubm7,https://v.redd.it/w0g60exj7rv61,745,1619575000.0,
4,gamestop raises 551 million accelerate e comme...,wallstreetbets,24923,mznz4d,https://finance.yahoo.com/news/gamestop-raises...,835,1619557000.0,


In [6]:
# applying profanity filter to text
# title_text = remove_bad_words(title_text)

In [7]:
# Preview the first rows of df after the cleaning and stopword-removal steps.
df.head()

Unnamed: 0,title,subreddit,score,id,url,comms_num,created,body
0,moves tomorrow april 28 2021,wallstreetbets,326,mzx686,https://www.reddit.com/r/wallstreetbets/commen...,9403,1619582000.0,daily trading discussion thread please keep sh...
1,never bet doctor,wallstreetbets,8862,mzy4qn,https://i.redd.it/3mboejkf2sv61.gif,241,1619585000.0,
2,sec filing today amc taken 500 million share d...,wallstreetbets,2788,mzzm6c,https://www.sec.gov/Archives/edgar/data/000141...,311,1619589000.0,
3,mindmed cnn people slowly realizing potential ...,wallstreetbets,6355,mzubm7,https://v.redd.it/w0g60exj7rv61,745,1619575000.0,
4,gamestop raises 551 million accelerate e comme...,wallstreetbets,24923,mznz4d,https://finance.yahoo.com/news/gamestop-raises...,835,1619557000.0,


In [8]:
# List the column labels currently present in df.
df.columns

Index(['title', 'subreddit', 'score', 'id', 'url', 'comms_num', 'created',
       'body'],
      dtype='object')

In [9]:
# brainstorming
# what do we want to get from this data?
# perhaps some insight into what wsb is thinking/doing with regard to certain stocks
# word frequency
# sentiment

In [10]:
# 30 most common words across all cleaned titles
title_counts = Counter(" ".join(df['title']).split())
title_freq = (
    pd.DataFrame(title_counts.most_common(30), columns=['Word', 'Frequency'])
    # stamp every row with today's date (MM/DD/YYYY)
    .assign(date=time.strftime("%m/%d/%Y"))
    # use the word itself as the row label instead of the default integer index
    .set_index('Word')
)
print(title_freq)

          Frequency        date
Word                           
mvis             50  04/27/2021
yolo             36  04/27/2021
mnmd             17  04/27/2021
2021             12  04/27/2021
today            10  04/27/2021
gme              10  04/27/2021
go               10  04/27/2021
buy              10  04/27/2021
update           10  04/27/2021
shares            9  04/27/2021
5                 9  04/27/2021
stock             9  04/27/2021
2                 9  04/27/2021
company           9  04/27/2021
earnings          9  04/27/2021
1                 8  04/27/2021
week              8  04/27/2021
april             7  04/27/2021
clov              7  04/27/2021
gains             7  04/27/2021
tomorrow          6  04/27/2021
million           6  04/27/2021
gamestop          6  04/27/2021
market            6  04/27/2021
dd                6  04/27/2021
moon              6  04/27/2021
call              6  04/27/2021
moves             5  04/27/2021
4                 5  04/27/2021
amd     

In [11]:
# 30 most common words across all cleaned submission bodies
body_counts = Counter(" ".join(df['body']).split())
body_freq = (
    pd.DataFrame(body_counts.most_common(30), columns=['Word', 'Frequency'])
    # stamp every row with today's date (MM/DD/YYYY)
    .assign(date=time.strftime("%m/%d/%Y"))
    # use the word itself as the row label instead of the default integer index
    .set_index('Word')
)
print(body_freq)

           Frequency        date
Word                            
market            96  04/27/2021
company           87  04/27/2021
x200b             79  04/27/2021
price             74  04/27/2021
stock             73  04/27/2021
shares            67  04/27/2021
earnings          63  04/27/2021
lsd               60  04/27/2021
mindmed           60  04/27/2021
one               59  04/27/2021
best              58  04/27/2021
like              57  04/27/2021
also              56  04/27/2021
time              54  04/27/2021
data              52  04/27/2021
people            51  04/27/2021
new               51  04/27/2021
could             50  04/27/2021
going             50  04/27/2021
1                 50  04/27/2021
2                 50  04/27/2021
3                 50  04/27/2021
share             49  04/27/2021
short             45  04/27/2021
see               45  04/27/2021
companies         44  04/27/2021
use               44  04/27/2021
would             44  04/27/2021
get       

In [12]:
# Is there a way to update this automatically,
# e.g. by having the script run every day at a certain time
# and store the data so it can be tracked over time?
# That would show how trends change over time,
# might help in spotting opportunities earlier,
# and could help front-run bubbles/capitulation.

In [13]:
# NOTE(review): "PASSWORD" is the same config key that is passed to
# praw.Reddit as the Reddit account password — confirm that the Postgres
# password is really meant to come from this key.
db_pass = config("PASSWORD")
# Percent-encode the password so that characters such as "@", ":", "/" or "%"
# cannot be mistaken for URL delimiters when the connection string is parsed.
engine = create_engine(
    f'postgresql://postgres:{quote(db_pass, safe="")}@localhost:5432/postgres'
)

In [17]:
# Remove the subreddit column before df is written to the database.
# errors='ignore' keeps this cell re-runnable: executing it a second time no
# longer raises KeyError because the column has already been dropped.
df = df.drop(columns=['subreddit'], errors='ignore')
df.head()

Unnamed: 0,title,score,id,url,comms_num,created,body
0,moves tomorrow april 28 2021,326,mzx686,https://www.reddit.com/r/wallstreetbets/commen...,9403,1619582000.0,daily trading discussion thread please keep sh...
1,never bet doctor,8862,mzy4qn,https://i.redd.it/3mboejkf2sv61.gif,241,1619585000.0,
2,sec filing today amc taken 500 million share d...,2788,mzzm6c,https://www.sec.gov/Archives/edgar/data/000141...,311,1619589000.0,
3,mindmed cnn people slowly realizing potential ...,6355,mzubm7,https://v.redd.it/w0g60exj7rv61,745,1619575000.0,
4,gamestop raises 551 million accelerate e comme...,24923,mznz4d,https://finance.yahoo.com/news/gamestop-raises...,835,1619557000.0,


In [18]:
# Write df to the `sample_table` table through `engine`; if a table with that
# name already exists it is replaced.
df.to_sql(name='sample_table', con=engine, if_exists='replace')

In [None]:
# Inspect the dtype of each column in df.
df.dtypes