# Scraping r/wallstreetbets from Reddit

In this notebook, we are going to use PRAW, the Python Reddit API Wrapper, to scrape comments from r/wallstreetbets.

In [1]:
import os
import time
import praw
import json
import string
import requests
import http.client
import pandas as pd
from os import getcwd

# paths and keys
# Credentials are read from the environment so real secrets never need to be
# typed into (and committed with) the notebook. The placeholder strings are
# kept as fallbacks, so behaviour is unchanged when the variables are not set.
file_dir = getcwd()
client_id = os.environ.get('REDDIT_CLIENT_ID', 'INSERT CLIENT ID')
user_agent = os.environ.get('REDDIT_USER_AGENT', 'INSERT USER AGENT')
secret = os.environ.get('REDDIT_CLIENT_SECRET', 'INSERT SECRET KEY')
redirect_uri = 'http://localhost:8080'

# Initialize reddit connection
reddit = praw.Reddit(client_id=client_id, client_secret=secret, user_agent=user_agent)

Our goal is to compile all of the posts we are interested in collecting comments from. We want posts whose titles include "Daily Discussion Thread", "Moves Tomorrow", or "Weekend Discussion Thread". These headings are used regularly for the recurring daily and weekend threads.

In [2]:
# Get dataframe of WSB posts
wsb = reddit.subreddit('wallstreetbets')

# Search phrases for the recurring threads, and the columns recorded per post.
search_queries = ['Daily Discussion Thread', 'Moves Tomorrow', 'Weekend Discussion Thread']
post_columns = ['title', 'score', 'id', 'subreddit', 'url', 'num_comments', 'body', 'created']
search_limit = 400


def search_posts(subreddit, query, limit=search_limit):
    """Search `subreddit` for `query` and return the hits as a DataFrame.

    One row is produced per post yielded by `subreddit.search(query, limit=limit)`,
    with the columns listed in `post_columns`.
    """
    rows = [
        [post.title, post.score, post.id, post.subreddit, post.url,
         post.num_comments, post.selftext, post.created]
        for post in subreddit.search(query, limit=limit)
    ]
    return pd.DataFrame(rows, columns=post_columns)


# The per-query frames keep their original names in case later cells use them.
week_posts, tomorrow_posts, weekend_posts = [search_posts(wsb, query) for query in search_queries]

wsb_posts = pd.concat([week_posts, tomorrow_posts, weekend_posts], ignore_index=True)

# The same post can be returned by more than one query; keep a single row per
# post id so it is not scraped twice later on.
wsb_posts = wsb_posts.drop_duplicates(subset='id').reset_index(drop=True)

#Filter for only relevant wsb posts
title_markers = ['DAILY DISCUSSION', 'WEEKEND DISCUSSION', 'MOVES TOMORROW']
is_relevant = wsb_posts['title'].str.upper().str.contains('|'.join(title_markers), na=False)
# Every surviving row carries flag == 'Y', matching the column the original filter left behind.
wsb_posts = wsb_posts[is_relevant].assign(flag='Y').reset_index(drop=True)

PRAW is great for scraping post titles. However, it is incredibly slow when scraping posts with high volumes of comments. Because of this, we are going to query the Pushshift API. Pushshift is a project that warehouses all data from Reddit, allowing us to query the data more efficiently using our post IDs.

Before scraping all posts, we will use the code below to omit any posts we have already stored.

In [5]:
# Get list of already scraped posts
# Build the folder path with os.path.join so it works on any OS, and anchor it
# on file_dir, the same base directory the scraping cell writes its CSVs under.
warehouse_dir = os.path.join(file_dir, 'Sentiment Analysis', 'WallStreetBets')

# On a first run the folder does not exist yet; create it instead of letting
# os.listdir raise FileNotFoundError.
os.makedirs(warehouse_dir, exist_ok=True)

# Each stored file is named '<post title>.csv'; strip the extension to recover the title.
warehoused_posts = [
    filename[:-4]
    for filename in os.listdir(warehouse_dir)
    if filename.endswith('.csv')
]

# Keep only the posts that have not been stored yet.
wsb_posts = wsb_posts[~wsb_posts['title'].isin(warehoused_posts)]

Now we are ready to query the pushshift API and store our data in flat files for further processing.

In [2]:
# Iterate through posts in r/wallstreetbets and warehouse data
pushshift_url = 'https://api.pushshift.io/reddit/comment/search/'
request_timeout = 60   # seconds; without a timeout a stalled request hangs the loop forever
max_attempts = 3       # tries per post before moving on to the next one
retry_delay = 5        # seconds to wait after a failed attempt

# Filter for TSLA comments (matched case-insensitively by upper-casing the body)
key_words = ['TSLA','TESLA','MUSK','ELON']
key_word_pattern = '|'.join(key_words)

# Build the output folder portably and make sure it exists before writing.
output_dir = os.path.join(file_dir, 'Sentiment Analysis', 'WallStreetBets')
os.makedirs(output_dir, exist_ok=True)

count = 0
for post_id, post_name in zip(wsb_posts['id'], wsb_posts['title']):

    for attempt in range(max_attempts):
        try:
            # Query comments
            r = requests.get(
                pushshift_url,
                params={'link_id': post_id, 'limit': 100000},
                timeout=request_timeout,
            )
            r.raise_for_status()  # turn HTTP error responses into exceptions so they are retried
            wsb_comments = r.json()['data']

            # An empty result is not an error that retrying fixes; skip the post
            # without writing a file so it is picked up again on a later run.
            if not wsb_comments:
                print(f"No comments returned for {post_name}... skipping")
                break

            wsb_df = pd.DataFrame(wsb_comments)[['body','created_utc','score']]

            # Keep only comments mentioning a key word. The .copy() makes the
            # column assignments below act on an independent frame rather than
            # a slice (avoids SettingWithCopyWarning).
            mentions = wsb_df['body'].str.upper().str.contains(key_word_pattern, na=False)
            wsb_df = wsb_df[mentions].copy()

            wsb_df['id'] = post_id
            wsb_df['post'] = post_name
            # NOTE(review): on Python 3 this stores bytes objects, so the CSV
            # contains b'...' literals. Left unchanged because consumers of
            # these files are not visible here - confirm before removing.
            wsb_df['body'] = wsb_df['body'].apply(lambda x: x.encode('utf-8'))

            wsb_df = wsb_df[['id','post','created_utc','body','score']]
            wsb_df.to_csv(os.path.join(output_dir, post_name + '.csv'), index=False)
            print(f"Successfully scraped - {post_name}")
            break
        except Exception as exc:
            # Best-effort scraping: report the failure and retry. Catching
            # Exception (not a bare except) lets Ctrl-C / kernel interrupt
            # still stop the loop.
            print(f"Error encountered for {post_name}... restarting iteration {count} ({exc!r})")
            time.sleep(retry_delay)
    else:
        # Reached only when every attempt raised.
        print(f"Giving up on {post_name} after {max_attempts} failed attempts")

    count += 1