In [1]:
from collections import Counter
from datetime import datetime
from decouple import config
from nltk.corpus import stopwords
import numpy as np
from os import path
import pandas as pd
import praw
from profanity_filter import remove_bad_words
from PIL import Image
import re
from wordcloud import WordCloud, STOPWORDS, ImageColorGenerator

import matplotlib.pyplot as plt
%matplotlib inline

In [2]:
# connecting to reddit API
reddit = praw.Reddit(
    client_id = config("CLIENT_ID"),
    client_secret = config("SECRET"),
    user_agent = config("USER"),
    username = config("USERNAME"),
    password = config("PASSWORD")
)

subreddit = reddit.subreddit("wallstreetbets")

hot_wsb = subreddit.hot(limit=1000)

In [3]:
# storing data in a pandas dataframe
dict = {"title": [],
        "subreddit": [],
        "score": [],
        "id": [],
        "url": [],
        "comms_num": [],
        "created": [],
        "body": []}

for submission in hot_wsb:
    dict["title"].append(submission.title)
    dict['subreddit'].append(submission.subreddit)
    dict["score"].append(submission.score)
    dict["id"].append(submission.id)
    dict["url"].append(submission.url)
    dict["comms_num"].append(submission.num_comments)
    dict["created"].append(submission.created)
    dict["body"].append(submission.selftext)
    
df = pd.DataFrame(dict)

In [4]:
# function that cleans the text in the submission
def clean_submission(text):
    text = text.lower()
    text = ' '.join(
        re.sub("(@[A-Za-z0-9]+)|([^0-9A-Za-z \t'])|(\w+:\/\/\S+)", " ", text).split())
    return text

In [5]:
# applying clean submission function to the title and body columns
df['title'] = df['title'].apply(lambda x: clean_submission(x))
df['body'] = df['body'].apply(lambda x: clean_submission(x))

body_text = " ".join(body for body in df.body)
# combining title and body text
title_text = " ".join(title for title in df.title) + body_text

# set stop words/letters
# stopwords = set(STOPWORDS)
# stopwords.add("I'm, It's, s, m")

# remove stopwords
stop = stopwords.words('english')

# Exclude stopwords with Python's list comprehension and pandas.DataFrame.apply.
df['title'] = df['title'].apply(lambda x: ' '.join([word for word in x.split() if word not in (stop)]))
df['body'] = df['body'].apply(lambda x: ' '.join([word for word in x.split() if word not in (stop)]))

print(df.shape)
df.head()

(385, 8)


Unnamed: 0,title,subreddit,score,id,url,comms_num,created,body
0,daily discussion thread march 18 2021,wallstreetbets,538,m7nj0c,https://www.reddit.com/r/wallstreetbets/commen...,9449,1616090000.0,daily trading discussion thread please keep sh...
1,wsb rules please read posting,wallstreetbets,87,m7qjwf,https://www.reddit.com/r/wallstreetbets/commen...,23,1616102000.0,welcome wsb additional reading world wallstree...
2,gme megathread march 18 2021,wallstreetbets,12150,m7pr7b,https://www.reddit.com/r/wallstreetbets/commen...,11231,1616099000.0,
3,shout u lampzworldwg22 borrowed money drug dea...,wallstreetbets,9177,m7okje,https://www.reddit.com/gallery/m7okje,783,1616095000.0,
4,positive wsb ape publicity uk news,wallstreetbets,12010,m7mh56,https://i.redd.it/l94h9ns60rn61.png,266,1616086000.0,


In [6]:
# applying profanity filter to text
# title_text = remove_bad_words(title_text)

In [7]:
df.head()

Unnamed: 0,title,subreddit,score,id,url,comms_num,created,body
0,daily discussion thread march 18 2021,wallstreetbets,538,m7nj0c,https://www.reddit.com/r/wallstreetbets/commen...,9449,1616090000.0,daily trading discussion thread please keep sh...
1,wsb rules please read posting,wallstreetbets,87,m7qjwf,https://www.reddit.com/r/wallstreetbets/commen...,23,1616102000.0,welcome wsb additional reading world wallstree...
2,gme megathread march 18 2021,wallstreetbets,12150,m7pr7b,https://www.reddit.com/r/wallstreetbets/commen...,11231,1616099000.0,
3,shout u lampzworldwg22 borrowed money drug dea...,wallstreetbets,9177,m7okje,https://www.reddit.com/gallery/m7okje,783,1616095000.0,
4,positive wsb ape publicity uk news,wallstreetbets,12010,m7mh56,https://i.redd.it/l94h9ns60rn61.png,266,1616086000.0,


In [8]:
df.columns

Index(['title', 'subreddit', 'score', 'id', 'url', 'comms_num', 'created',
       'body'],
      dtype='object')

In [9]:
# brainstorming
# what do we want to get from this data?
# perhaps some insight as to what wsb is thinking/doing in regards to certain stocks
# word frequency
# sentiment

In [13]:
# frequency for title
title_freq = Counter(" ".join(df['title']).split()).most_common(25)
title_freq = pd.DataFrame(title_freq, columns=['Word', 'Frequency'])
print(title_freq)

      Word  Frequency
0      gme         76
1     uwmc         35
2     yolo         25
3     2021         21
4   shares         21
5       dd         21
6    march         20
7      wsb         18
8    stock         18
9     like         17
10   today         17
11       3         17
12   short         16
13     amc         16
14     rkt         16
15      17         13
16     get         12
17     one         12
18     100         12
19     day         11
20  market         11
21  update         11
22  bought         11
23    made         10
24    apes         10


In [11]:
body_freq = Counter(" ".join(df['body']).split()).most_common(10)
body_freq = pd.DataFrame(body_freq, columns=['Word', 'Frequency'])
print(body_freq)

     Word  Frequency
0  market        289
1   stock        284
2   price        279
3     gme        261
4  shares        260
5       1        218
6   x200b        211
7   short        207
8       2        205
9       3        192
