In [13]:
from collections import Counter
from datetime import datetime
from decouple import config
from nltk.corpus import stopwords
import numpy as np
from os import path
import pandas as pd
import praw
from profanity_filter import remove_bad_words
from PIL import Image
import psycopg2
import re
from sqlalchemy import create_engine
from wordcloud import WordCloud, STOPWORDS, ImageColorGenerator

import matplotlib.pyplot as plt
%matplotlib inline

In [2]:
# connecting to reddit API
reddit = praw.Reddit(
    client_id = config("CLIENT_ID"),
    client_secret = config("SECRET"),
    user_agent = config("USER"),
    username = config("USERNAME"),
    password = config("PASSWORD")
)

subreddit = reddit.subreddit("wallstreetbets")

hot_wsb = subreddit.hot(limit=1000)

In [3]:
# storing data in a pandas dataframe
dict = {"title": [],
        "subreddit": [],
        "score": [],
        "id": [],
        "url": [],
        "comms_num": [],
        "created": [],
        "body": []}

for submission in hot_wsb:
    dict["title"].append(submission.title)
    dict['subreddit'].append(submission.subreddit)
    dict["score"].append(submission.score)
    dict["id"].append(submission.id)
    dict["url"].append(submission.url)
    dict["comms_num"].append(submission.num_comments)
    dict["created"].append(submission.created)
    dict["body"].append(submission.selftext)
    
df = pd.DataFrame(dict)

In [4]:
# function that cleans the text in the submission
def clean_submission(text):
    text = text.lower()
    text = ' '.join(
        re.sub("(@[A-Za-z0-9]+)|([^0-9A-Za-z \t'])|(\w+:\/\/\S+)", " ", text).split())
    return text

In [5]:
# applying clean submission function to the title and body columns
df['title'] = df['title'].apply(lambda x: clean_submission(x))
df['body'] = df['body'].apply(lambda x: clean_submission(x))

body_text = " ".join(body for body in df.body)
# combining title and body text
title_text = " ".join(title for title in df.title) + body_text

# set stop words/letters
# stopwords = set(STOPWORDS)
# stopwords.add("I'm, It's, s, m")

# remove stopwords
stop = stopwords.words('english')

# Exclude stopwords with Python's list comprehension and pandas.DataFrame.apply.
df['title'] = df['title'].apply(lambda x: ' '.join([word for word in x.split() if word not in (stop)]))
df['body'] = df['body'].apply(lambda x: ' '.join([word for word in x.split() if word not in (stop)]))

print(df.shape)
df.head()

(173, 8)


Unnamed: 0,title,subreddit,score,id,url,comms_num,created,body
0,moves tomorrow april 27 2021,wallstreetbets,186,mz6iks,https://www.reddit.com/r/wallstreetbets/commen...,5492,1619496000.0,daily trading discussion thread please keep sh...
1,got feeling whole family going,wallstreetbets,10318,mz27qe,https://v.redd.it/tggz9iaosjv61,238,1619485000.0,
2,gme squeeze incoming,wallstreetbets,4845,mz69gk,https://i.redd.it/j0awzqpxnkv61.png,813,1619495000.0,
3,even smallest person change course future,wallstreetbets,28801,myw1zz,https://v.redd.it/1v0moiy9civ61,625,1619467000.0,
4,rare copy melvil citadels medias bible,wallstreetbets,5913,myym53,https://i.redd.it/o7pqdntxziv61.jpg,147,1619475000.0,


In [6]:
# applying profanity filter to text
# title_text = remove_bad_words(title_text)

In [7]:
df.head()

Unnamed: 0,title,subreddit,score,id,url,comms_num,created,body
0,moves tomorrow april 27 2021,wallstreetbets,186,mz6iks,https://www.reddit.com/r/wallstreetbets/commen...,5492,1619496000.0,daily trading discussion thread please keep sh...
1,got feeling whole family going,wallstreetbets,10318,mz27qe,https://v.redd.it/tggz9iaosjv61,238,1619485000.0,
2,gme squeeze incoming,wallstreetbets,4845,mz69gk,https://i.redd.it/j0awzqpxnkv61.png,813,1619495000.0,
3,even smallest person change course future,wallstreetbets,28801,myw1zz,https://v.redd.it/1v0moiy9civ61,625,1619467000.0,
4,rare copy melvil citadels medias bible,wallstreetbets,5913,myym53,https://i.redd.it/o7pqdntxziv61.jpg,147,1619475000.0,


In [8]:
df.columns

Index(['title', 'subreddit', 'score', 'id', 'url', 'comms_num', 'created',
       'body'],
      dtype='object')

In [9]:
# brainstorming
# what do we want to get from this data?
# perhaps some insight as to what wsb is thinking/doing in regards to certain stocks
# word frequency
# sentiment

In [10]:
# frequency for title
title_freq = Counter(" ".join(df['title']).split()).most_common(30)
title_freq = pd.DataFrame(title_freq, columns=['Word', 'Frequency'])
print(title_freq)

        Word  Frequency
0       mvis         34
1       yolo         30
2        gme         16
3     update         16
4         dd         13
5     market         11
6      april          9
7      stock          9
8          1          8
9       2021          7
10      like          7
11   holding          6
12         5          6
13    shares          6
14     today          6
15      apes          6
16  earnings          6
17  gamestop          5
18     since          5
19       get          5
20     hands          5
21     lidar          5
22     calls          5
23      last          5
24         4          5
25      moon          5
26     tesla          5
27    coming          5
28      week          5
29  tomorrow          4


In [11]:
body_freq = Counter(" ".join(df['body']).split()).most_common(30)
body_freq = pd.DataFrame(body_freq, columns=['Word', 'Frequency'])
print(body_freq)

         Word  Frequency
0     company        113
1       price        101
2      market         95
3       stock         87
4      shares         65
5         one         64
6        like         63
7           1         60
8         see         58
9        time         58
10      going         57
11          3         57
12       also         56
13         10         56
14        new         55
15         dd         54
16      still         53
17        get         48
18       best         47
19      x200b         47
20       make         46
21       year         46
22     people         45
23      share         45
24       long         45
25  companies         45
26   earnings         44
27    million         44
28      could         43
29       play         43


In [12]:
# is there a a way i can automatically update this 
# by having the script run everyday at a certain time
# and store data to track it over time
# see how trends change over time
# might help in spotting opportunities earlier
# could front run bubbles/capitulation 

In [30]:
db_pass = config("PASSWORD")
engine = create_engine(f'postgresql://postgres:{db_pass}@localhost:5432/postgres')

In [32]:
df.to_sql('sample_table', engine, if_exists='replace')

ProgrammingError: (psycopg2.ProgrammingError) can't adapt type 'Subreddit'
[SQL: INSERT INTO sample_table (index, title, subreddit, score, id, url, comms_num, created, body) VALUES (%(index)s, %(title)s, %(subreddit)s, %(score)s, %(id)s, %(url)s, %(comms_num)s, %(created)s, %(body)s)]
[parameters: ({'index': 0, 'title': 'moves tomorrow april 27 2021', 'subreddit': Subreddit(display_name='wallstreetbets'), 'score': 186, 'id': 'mz6iks', 'url': 'https://www.reddit.com/r/wallstreetbets/comments/mz6iks/what_are_your_moves_tomorrow_april_27_2021/', 'comms_num': 5492, 'created': 1619496022.0, 'body': 'daily trading discussion thread please keep shitposting minimum navigate wsb recommend best daily dd dd best daily best weekly discussion best daily  ... (95 characters truncated) ...  weekly earnings discussion thread read rules make sure people follow try meme mode also accessible top bar follow wsb twitter accounts impersonators'}, {'index': 1, 'title': 'got feeling whole family going', 'subreddit': Subreddit(display_name='wallstreetbets'), 'score': 10318, 'id': 'mz27qe', 'url': 'https://v.redd.it/tggz9iaosjv61', 'comms_num': 238, 'created': 1619484837.0, 'body': ''}, {'index': 2, 'title': 'gme squeeze incoming', 'subreddit': Subreddit(display_name='wallstreetbets'), 'score': 4845, 'id': 'mz69gk', 'url': 'https://i.redd.it/j0awzqpxnkv61.png', 'comms_num': 813, 'created': 1619495355.0, 'body': ''}, {'index': 3, 'title': 'even smallest person change course future', 'subreddit': Subreddit(display_name='wallstreetbets'), 'score': 28801, 'id': 'myw1zz', 'url': 'https://v.redd.it/1v0moiy9civ61', 'comms_num': 625, 'created': 1619467293.0, 'body': ''}, {'index': 4, 'title': 'rare copy melvil citadels medias bible', 'subreddit': Subreddit(display_name='wallstreetbets'), 'score': 5913, 'id': 'myym53', 'url': 'https://i.redd.it/o7pqdntxziv61.jpg', 'comms_num': 147, 'created': 1619475233.0, 'body': ''}, {'index': 5, 'title': 'gme technical analysis waves go brrr', 'subreddit': Subreddit(display_name='wallstreetbets'), 'score': 1699, 'id': 'mz5e3m', 'url': 'https://www.reddit.com/r/wallstreetbets/comments/mz5e3m/gme_technical_analysis_waves_go_brrr/', 'comms_num': 228, 'created': 1619493094.0, 'body': "hi fam fam something going gme may imminent interpretation gme thru wave counting elliotwave refer image first yearly point view accomplished wave 1  ... (1728 characters truncated) ... r shooting blanks think mom told lost chromosome dividing meiosis mitosis per sfwsosa93 phase edit look go literally erection time 40 mins past close"}, {'index': 6, 'title': 'trying convince family invest using wsb dd', 'subreddit': Subreddit(display_name='wallstreetbets'), 'score': 4547, 'id': 'myxh44', 'url': 'https://v.redd.it/p059q7m8qiv61', 'comms_num': 104, 'created': 1619471933.0, 'body': ''}, {'index': 7, 'title': 'gamestop completes market equity offering program', 'subreddit': Subreddit(display_name='wallstreetbets'), 'score': 839, 'id': 'mz7exv', 'url': 'https://www.stocktitan.net/news/GME/game-stop-completes-at-the-market-equity-offering-v0ie6tw2d6y0.html', 'comms_num': 187, 'created': 1619498394.0, 'body': ''}  ... displaying 10 of 173 total bound parameter sets ...  {'index': 171, 'title': 'cvs technicals', 'subreddit': Subreddit(display_name='wallstreetbets'), 'score': 54, 'id': 'mx4pv4', 'url': 'https://www.reddit.com/r/wallstreetbets/comments/mx4pv4/cvs_technicals/', 'comms_num': 27, 'created': 1619239611.0, 'body': "unless living rock probably know cvs probably need explain anything anyways think cvs might great opportunity take long position p e ratio cvs pe 13x ... (915 characters truncated) ... might get iv crushed x200b tl dr cvs 80c 5 21 careful earnings soon could iv crush good idea get cheap premiums rn i'm financial advisor i'm retarded"}, {'index': 172, 'title': '3500 less 5 minutes holding weekend ever learn', 'subreddit': Subreddit(display_name='wallstreetbets'), 'score': 57, 'id': 'mx3yh1', 'url': 'https://i.redd.it/q4rtsmrxczu61.jpg', 'comms_num': 34, 'created': 1619237387.0, 'body': ''})]
(Background on this error at: http://sqlalche.me/e/14/f405)