## 1. Scraping Current Subreddits for Comments

This notebook 
1. Scraps the top posts from the wallstreetbets subreddit currently
2. Creates a frequency table of unique words and number of mentions
3. Scraps the top level comments from the daily discussion thread
4. Scraps the top comments from the 'what are you moves" thread
5. Outputs a csv with the frequency of ticker mentions
6. Applies a sentiment analyzer to the comments
7. Outputs a csv with the polarity of tickers 

#### Importing Necessary Modules

In [25]:
import pandas as pd
from pandas import DataFrame
import praw
from praw.models import MoreComments
import re
import requests
from datetime import datetime
from textblob import TextBlob
from nltk.sentiment.vader import SentimentIntensityAnalyzer
import nltk
import warnings
warnings.filterwarnings('ignore')
from dotenv import load_dotenv
import os

In [2]:
# Exclude common words from analysis
exclude_words = (['VERY','A','B','GO','ARE','ON','FOR','THE','TO',' ','SO','IT','AT','BE','OR','SO','ALL','HAS','BY','CAN','AN','OUT','NOW'])

# Set to current time
current_time = datetime.now()

In [72]:
# Setting up Reddit Client for Requests 
load_dotenv() 
client_id = os.environ['CLIENT_ID']
client_secret = os.environ['CLIENT_SECRET']
user_agent = os.environ['USER_AGENT']

reddit = praw.Reddit(client_id=client_id,
                     client_secret=client_secret,
                     user_agent=user_agent)

In [61]:
# This function is used to create a frequency table of the scraped comments
def frequency_table_comments(clean):
    for (index, row) in clean.iterrows(): # iterate over dataframe
  # titles 
        title = row['comments'].upper() # get title in lowercase
        title = regex.sub('', title)  # clean with reges
        title_words = title.split(' ') # split titles at whitespace
    for words in title_words:
        if x in exclude_words: # common words that are also stock tickers or uneccesary. 
            word_dict[x] += 1
        else:
            word_dict[x] = 1
    return pd.DataFrame.from_dict(list(word_dict.items())).rename(columns = {0:"Term", 1:"Frequency"})

In [62]:
# Defining funtion to append comments to a list and creating a dataframe from them
def appending_to_list(submission):
    comments= []
    for top_level_comment in submission.comments[:-1]: #leaving out the last comment, since it creates an error
        comments.append(top_level_comment.body) # append comment to list
# return dataframe of the list
    return DataFrame(comments,columns=['comments'])

### 1.1 Scrape current posts into dataframe.
Scapes the current hot, new, and top posts in the wallstreetbets subreddit

In [63]:
# getting posts into a dataframe
df = [] # define empty list that will hold dictionarys
#scraper =     
for post in reddit.subreddit('wallstreetbets').hot(limit=500): # call wallstreetbets subreddit "hot" section and get first 1000 posts
    content = {  # create dictionary for results
    "title" : post.title, # store title
    "text" : post.selftext # store text of the post
    }
    df.append(content) # append dataframe
for post in reddit.subreddit('wallstreetbets').new(limit=2000): # call wallstreetbets subreddit "new" section and get first 1000 posts
    content = {  # create dictionary for results
    "title" : post.title, # store title
    "text" : post.selftext # store text of the post
    }
    df.append(content) # append dataframe
for post in reddit.subreddit('wallstreetbets').top(limit=500): # call wallstreetbets subreddit "top" section and get first 1000 posts
    content = {  # create dictionary for results
    "title" : post.title, # store title
    "text" : post.selftext # store text of the post
    }
    df.append(content) # append dataframe
    df_posts = pd.DataFrame(df) # convert to pandas dataframe

### 1.2 Create frequency table of unique words and the number of their mentions

In [64]:
regex = re.compile('[^a-zA-Z ]') # remove everything thats not a letter or space aka numbers and emojis
word_dict = {} # create dictionary
for (index, row) in df_posts.iterrows(): # iterate over dataframe
  # titles 
    title = row['title'].upper() # get title in lowercase
    title = regex.sub('', title)  # clean with reges
    title_words = title.split(' ') # split titles at whitespace
  # content
    content = row['text'].upper() # get text from post in lowercase
    content = regex.sub('', content) # clean with regex
    content_words = content.split(' ') # split titles at whitespace
  # combine titles and comments
    words = title_words + content_words
    for x in words:
        if x in exclude_words:
            pass
        elif x in word_dict:
            word_dict[x] += 1
        else:
            word_dict[x] = 1
posts_freq= pd.DataFrame.from_dict(list(word_dict.items())).rename(columns = {0:"Term", 1:"Frequency"})
# We now have a frequency table of the most often used words in the top 500 hot posts

### 1.3 Scraping the top level comments from the daily discussion thread

In [26]:
# append these comments to a list
daily_comments= [] # create empty list
for top_level_comment in submission.comments[:-1]: #leaving out the last comment, since it creates an error
        daily_comments.append(top_level_comment.body) # append comment to list
len(daily_comments)

# create dataframe of the comments
df_comments = DataFrame(daily_comments,columns=['comments'])

In [22]:
# get comments of daily discussion thread
url = ("https://www.reddit.com/r/wallstreetbets/comments/n9th6n/daily_discussion_thread_for_"+
str(current_time.strftime("%B")).lower() + "_"+  # constructing the link with daytime function
str(current_time.strftime("%d")).lower() +"_" +  # since this post is created new on a daily basis
str(current_time.strftime("%Y")).lower() + "/")  # Month, day, year

submission = reddit.submission(url=url)

In [67]:
df_comments = appending_to_list(submission)

In [68]:
# calling cleaning/splitting function to create frequency table
comments_freq = frequency_table_comments(df_comments)

### 1.4 Scraping the "what are your moves" thread

In [32]:
# get comments of daily discussion thread
url = ("https://www.reddit.com/r/wallstreetbets/comments/n9eiyu/what_are_your_moves_tomorrow_"+
str(current_time.strftime("%B")).lower() + "_"+  # constructing the link with daytime function
str(current_time.strftime("%d")).lower() +"_" +  # since this post is created new on a daily basis
str(current_time.strftime("%Y")).lower() + "/")  # Month, day, year
submission = reddit.submission(url=url)

In [33]:
df_comments_move = appending_to_list(submission)

In [34]:
# calling cleaning/splitting function from above
moves_freq = frequency_table_comments(df_comments_move)

In [35]:
# merging all frequency tables on the term, performing an inner join 
merged_table = pd.merge(pd.merge(posts_freq,comments_freq,on='Term', how='left'),moves_freq,on='Term', how = 'left')
# replace NA#s with zeros
merged_table = merged_table.fillna(0)

In [36]:
# calculating overall frequency in a new column
merged_table['frequency']=merged_table['Frequency_x'] + merged_table['Frequency_y'] + merged_table['Frequency']
# drop unnecessary frequency columns, only keeping the main one
merged_table = merged_table.drop(['Frequency_x', 'Frequency_y', 'Frequency'], axis = 1) 

In [38]:
# importing a tickerlist from the nasdaq
ticker_df = pd.read_csv('NasdaqTickers.csv', index_col=None, delimiter=';').rename(columns = {"Symbol":"Term", "Name":"Company_Name"})
ticker_df

Unnamed: 0,Term,Company_Name,Sector
0,A,Agilent Technologies Inc. Common Stock,Capital Goods
1,AA,Alcoa Corporation Common Stock,Basic Industries
2,AAC,Ares Acquisition Corporation Class A Ordinary ...,Finance
3,AACG,ATA Creativity Global American Depositary Shares,Miscellaneous
4,AACQ,Artius Acquisition Inc. Class A Common Stock,Finance
...,...,...,...
7636,ZWRKW,Z-Work Acquisition Corp. Warrant,Finance
7637,ZY,Zymergen Inc. Common Stock,Basic Industries
7638,ZYME,Zymeworks Inc. Common Shares,
7639,ZYNE,Zynerba Pharmaceuticals Inc. Common Stock,Health Care


In [69]:
# merging tickerlist with mentions on reddit, inner join, so we will loose all words that are not tickers and all tickers that arent mentioned
stocks_df = pd.merge(merged_table, ticker_df, on="Term")
stocks_df 

Unnamed: 0,Term,frequency,Company_Name,Sector
0,AM,534,Antero Midstream Corporation Common Stock,Public Utilities
1,TH,168,Target Hospitality Corp. Common Stock,Energy
2,AI,312,C3.ai Inc. Class A Common Stock,Technology
3,OPEN,174,Opendoor Technologies Inc Common Stock,Finance
4,MOVE,180,Movano Inc. Common Stock,Health Care
...,...,...,...,...
502,BAND,3,Bandwidth Inc. Class A Common Stock,Technology
503,MN,3,Manning & Napier Inc. Class A Common Stock,Finance
504,YY,3,JOYY Inc. American Depositary Shares,Technology
505,PRPL,3,Purple Innovation Inc. Common Stock,Consumer Durables


### 1.5 Output ticker Frequency CSV

In [70]:
# frequency data to csv 
stocks_df.to_csv('frequency_of_tickers_' + str(current_time.strftime("%d"))+'_'+str(current_time.strftime("%m"))+ '_'+ str(current_time.strftime("%H"))+'.csv', index=False)

### 1.6 Analyzing Sentiment in comments for tickers

In [42]:
# get list of tickers that are in the above dataframe
tickers = ticker_df['Term'].tolist()

In [48]:
# using Vader to analyse social media sentiment better than with nltk
sia = SentimentIntensityAnalyzer()

In [49]:
def sentiment(text):
    try:
        scores = sia.polarity_scores(text)
        return  scores['compound']
    except: return none

In [50]:
# add polarity column to posts
df_posts['polarity'] = df_posts['text'].apply(sentiment)
# drop title column
df_posts = df_posts.drop(['title'], axis=1)
# add polarity column 
df_comments['polarity'] = df_comments['comments'].apply(sentiment)
df_comments_move['polarity'] = df_comments_move['comments'].apply(sentiment)

In [51]:
# creating a list of the three dataframes to make iterating for sentiment extraction easier
list_df = [df_posts, df_comments, df_comments]

In [52]:
# extracting sentiment and associated ticker out of the dataframe
sentiment_to_df= [] # list for sentiment score
tickers_to_df = [] # list for tickers

for dataframe in list_df: 
    for index, row in dataframe.iterrows(): # for every row in dataframe
        title_words = row[0].split(' ') # split comment at whitespace and put into list
        for word in title_words:
            if word in tickers: 
                tickers_to_df.append(word) # append the word to the word list
                sentiment_to_df.append(row[1]) # append the sentiment to the sentiment list

In [53]:
# create sentiment dataframe 
sentiment_df1 = pd.DataFrame(list(zip(tickers_to_df, sentiment_to_df)),
              columns =['Term', 'Polarity'])

In [54]:
# group table by ticker, get mean of sentiment, create new dataframe with new index
sentiment_table=sentiment_df1.groupby(['Term']).mean()
sentiment_table['Term'] = sentiment_table.index
sentiment_table=sentiment_table.reset_index(drop=True)
sentiment_table

Unnamed: 0,Polarity,Term
0,0.355886,A
1,0.000000,AAPL
2,0.901737,AI
3,0.998500,ALK
4,-0.340000,ALL
...,...,...
169,0.996900,XOM
170,0.973100,Y
171,0.075800,Z
172,0.255557,ZIM


### 1.7 Output CSV with Polarity Score for each ticker

In [58]:
# Export to CSV
sentiment_table.to_csv('sentiment_of_tickers_' + str(current_time.strftime("%d"))+'_'+str(current_time.strftime("%m"))+ '_'+ str(current_time.strftime("%H"))+'.csv', index=False)