# Identify DD Posts
- Identify all due diligence (DD) posts on WallStreetBets
- Collect all comments on those posts (used to identify disclosure changes)
- Collect user-identifying information that will be used to pull each user's information

In [2]:
# load packages
import datetime as dt
import os
import re
from datetime import timedelta

import numpy as np
import pandas as pd
import praw
from psaw import PushshiftAPI

### Setup:
Initialize a Reddit API (PRAW) instance and wrap it with the Pushshift API wrapper (psaw).

In [6]:
# Load the Reddit (PRAW) client and wrap it with the Pushshift API for accessing posts.
#
# Credentials are read from environment variables so that no secret is stored in the
# notebook. A missing variable raises KeyError right here, which is easier to diagnose
# than an authentication failure on the first request.
#
# Required environment variables:
#   REDDIT_CLIENT_ID, REDDIT_CLIENT_SECRET, REDDIT_USERNAME, REDDIT_PASSWORD
#
# NOTE: the credentials that used to be hard-coded in this cell must be treated as
# compromised and rotated (new client secret, new account password).
reddit = praw.Reddit(client_id = os.environ['REDDIT_CLIENT_ID'],
                    client_secret = os.environ['REDDIT_CLIENT_SECRET'],
                    user_agent = 'wsb_app',
                    username = os.environ['REDDIT_USERNAME'],
                    password = os.environ['REDDIT_PASSWORD'])
api = PushshiftAPI(reddit)

Version 7.1.0 of praw is outdated. Version 7.4.0 was released Friday July 30, 2021.


### Find DD Posts:
1. Search for instances of DD by flair
2. Store dictionary of vars of interest
3. Store as data frame and clean

In [5]:
# Search every submission in r/wallstreetbets, asking only for the fields of interest.
# NOTE(review): 'body' is requested here, but the submission text is read from
# `selftext` later in this notebook -- confirm whether 'body' is ever populated.
submission_fields = ['url', 'author', 'title', 'body', 'created', 'num_comments', 'score', 'link_flair_text']
sub_search = api.search_submissions(subreddit='wallstreetbets', filter=submission_fields)

In [6]:
# Collect the variables of interest from every search result.
# Maps each output key to the submission attribute it is read from
# (insertion order fixes the column order of the resulting dictionary).
field_to_attr = {
    "title": "title",
    "author": "author",
    "score": "score",
    "id": "id",
    "url": "url",
    "comms_num": "num_comments",
    "created": "created",
    "flair": "link_flair_text",
}

# one empty list per output key
topics_dict = {field: [] for field in field_to_attr}

# walk the search results once, appending each attribute to its list
for submission in sub_search:
    for field, attr in field_to_attr.items():
        topics_dict[field].append(getattr(submission, attr))



In [7]:
# Turn topics_dict into a pandas data frame: build it with one row per key,
# then flip it so that every key becomes a column and every submission a row.
discussions_df = pd.DataFrame.from_dict(topics_dict, orient='index').T

In [8]:
# Sanity check: (number of submissions, number of columns) in the full frame
discussions_df.shape

(1643412, 8)

In [11]:
# Fix dates - Reddit dates are in UNIX timestamps
def get_date(created):
    """Convert a UNIX epoch timestamp (in seconds) to a ``datetime``.

    The conversion calls ``datetime.fromtimestamp`` without a ``tz`` argument,
    so the returned value is naive and expressed in the local timezone of the
    machine that runs the notebook.
    """
    as_datetime = dt.datetime.fromtimestamp(created)
    return as_datetime

# Convert the raw epoch values in 'created' to datetimes ...
created_epochs = discussions_df['created']
_timestamp = created_epochs.apply(get_date)

# ... and attach them as a new 'timestamp' column (returns a new frame)
discussions_df = discussions_df.assign(timestamp=_timestamp)

In [12]:
# Keep only the submissions whose flair is exactly 'DD'
is_dd_post = discussions_df['flair'].eq('DD')
dd_df = discussions_df[is_dd_post]

In [13]:
# Sanity check: (number of DD submissions, number of columns) after the flair filter
dd_df.shape

(73535, 9)

In [15]:
# Persist the DD submissions so later sections can reload them without repeating the search.
# NOTE(review): absolute path on an external volume -- adjust for your machine.
dd_df.to_pickle('/Volumes/Elements/Research/Reddit_Credibility/Data/dd_df.pkl')

### Pull all comments on DD posts 

In [19]:
# Renumber the rows 0..n-1: the comment-collection loop below looks up ids with
# dd_df.loc[i, "id"] for i in range(len(dd_df)), which needs consecutive integer labels.
# The previous index is kept as a column because `drop` is not set.
dd_df = dd_df.reset_index()

In [22]:
# For every DD submission, fetch all of its comments and keep them in a searchable list.

# One (submission id, {comment id: comment fields}) tuple per submission
comments_list = []

n_submissions = len(dd_df)

for i in range(n_submissions):
    # progress marker every 1000 submissions
    if i % 1000 == 0:
        print("-" * 80)
        print(f"On submission {i}/{n_submissions}")

    # id of the submission at row label i
    id_sub = dd_df.loc[i, "id"]

    # all comments attached to this submission
    search = api.search_comments(
        link_id=id_sub,
        subreddit='wallstreetbets',
        filter=['id', 'body', 'author', 'parent_id', 'score', 'replies'],
    )

    # comment id -> fields of interest for that comment
    comm_dict = {
        comment.id: {
            "body": comment.body,
            "author": comment.author,
            "parent_id": comment.parent_id,
            "score": comment.score,
            "replies": comment.replies,
        }
        for comment in search
    }

    comments_list.append((id_sub, comm_dict))

--------------------------------------------------------------------------------
On submission 0/73535
--------------------------------------------------------------------------------
On submission 1000/73535
--------------------------------------------------------------------------------
On submission 2000/73535
--------------------------------------------------------------------------------
On submission 3000/73535
--------------------------------------------------------------------------------
On submission 4000/73535




--------------------------------------------------------------------------------
On submission 5000/73535
--------------------------------------------------------------------------------
On submission 6000/73535
--------------------------------------------------------------------------------
On submission 7000/73535
--------------------------------------------------------------------------------
On submission 8000/73535
--------------------------------------------------------------------------------
On submission 9000/73535
--------------------------------------------------------------------------------
On submission 10000/73535
--------------------------------------------------------------------------------
On submission 11000/73535
--------------------------------------------------------------------------------
On submission 12000/73535
--------------------------------------------------------------------------------
On submission 13000/73535
------------------------------------------



--------------------------------------------------------------------------------
On submission 67000/73535
--------------------------------------------------------------------------------
On submission 68000/73535
--------------------------------------------------------------------------------
On submission 69000/73535
--------------------------------------------------------------------------------
On submission 70000/73535
--------------------------------------------------------------------------------
On submission 71000/73535
--------------------------------------------------------------------------------
On submission 72000/73535
--------------------------------------------------------------------------------
On submission 73000/73535


### Transform comments_list into a data frame 

In [82]:
# Flat list that will hold one tuple per comment across all submissions.
# NOTE: the loop in the next cell appends to this list, so re-run this cell before
# re-running that loop; otherwise every comment is added a second time.
comments_list2 = []

In [83]:
# Flatten comments_list into one tuple per comment
for i, (post_id, post_comments) in enumerate(comments_list):

    # progress marker every 1000 submissions
    if i % 1000 == 0:
        print('======================================')
        print(f'On submission {i} out of {len(comments_list)}')

    # one row per comment on this post, tagged with the post id and the comment id
    comments_list2.extend(
        (
            post_id,
            comment_id,
            fields['author'],
            fields['body'],
            fields['parent_id'],
            fields['score'],
            fields['replies'],
        )
        for comment_id, fields in post_comments.items()
    )
        
 

On submission 0 out of 73535
On submission 1000 out of 73535
On submission 2000 out of 73535
On submission 3000 out of 73535
On submission 4000 out of 73535
On submission 5000 out of 73535
On submission 6000 out of 73535
On submission 7000 out of 73535
On submission 8000 out of 73535
On submission 9000 out of 73535
On submission 10000 out of 73535
On submission 11000 out of 73535
On submission 12000 out of 73535
On submission 13000 out of 73535
On submission 14000 out of 73535
On submission 15000 out of 73535
On submission 16000 out of 73535
On submission 17000 out of 73535
On submission 18000 out of 73535
On submission 19000 out of 73535
On submission 20000 out of 73535
On submission 21000 out of 73535
On submission 22000 out of 73535
On submission 23000 out of 73535
On submission 24000 out of 73535
On submission 25000 out of 73535
On submission 26000 out of 73535
On submission 27000 out of 73535
On submission 28000 out of 73535
On submission 29000 out of 73535
On submission 30000 out

In [84]:
# Column names, in the same order as the tuples stored in comments_list2
comment_columns = [
    'submission_id',
    'comment_id',
    'comment_author',
    'comment_body',
    'parent_id',
    'score',
    'replies',
]
comments_df = pd.DataFrame(comments_list2, columns=comment_columns)

In [87]:
# Persist the flattened comments (one row per comment) next to the DD submissions file.
comments_df.to_pickle('/Volumes/Elements/Research/Reddit_Credibility/Data/dd_comments_df.pkl')

### The submission text body was not pulled originally, so fetch it here and re-save dd_df with the text

In [3]:
# Read the original dd_df back from disk (the version pickled right after the flair filter)
dd_df = pd.read_pickle('/Volumes/Elements/Research/Reddit_Credibility/Data/dd_df.pkl')

In [36]:
# Will hold one (submission id, text body, removed flag) tuple per DD submission;
# filled by the loop further down.
text_list = []

In [37]:
# Running count of processed submissions; incremented by the loop below and used
# only to print a progress message every 1000 submissions.
count = 0

In [38]:
# Fetch the text body and removal status of every submission in dd_df
for sub_id in dd_df['id']:

    # progress marker every 1000 submissions
    count += 1
    if count % 1000 == 0:
        print(f'On submission {count} out of {dd_df.shape[0]}')

    # look the submission up by id
    submission = reddit.submission(id=sub_id)

    # text body of the post
    body = submission.selftext

    # flag the post as removed (1) when it is not robot-indexable, otherwise 0
    removed = 0 if submission.is_robot_indexable else 1

    text_list.append((sub_id, body, removed))

On submission 1000 out of 73535
On submission 2000 out of 73535
On submission 3000 out of 73535
On submission 4000 out of 73535
On submission 5000 out of 73535
On submission 6000 out of 73535
On submission 7000 out of 73535
On submission 8000 out of 73535
On submission 9000 out of 73535
On submission 10000 out of 73535
On submission 11000 out of 73535
On submission 12000 out of 73535
On submission 13000 out of 73535
On submission 14000 out of 73535
On submission 15000 out of 73535
On submission 16000 out of 73535
On submission 17000 out of 73535
On submission 18000 out of 73535
On submission 19000 out of 73535
On submission 20000 out of 73535
On submission 21000 out of 73535
On submission 22000 out of 73535
On submission 23000 out of 73535
On submission 24000 out of 73535
On submission 25000 out of 73535
On submission 26000 out of 73535
On submission 27000 out of 73535
On submission 28000 out of 73535
On submission 29000 out of 73535
On submission 30000 out of 73535
On submission 31000

In [39]:
# One row per fetched submission: id, text body and removed flag
text_columns = ['submission_id', 'body', 'removed']
text_df = pd.DataFrame(text_list, columns=text_columns)

# Attach the text to the DD submissions by matching dd_df['id'] to text_df['submission_id']
dd_df = pd.merge(dd_df, text_df, left_on='id', right_on='submission_id')

# Overwrite the saved DD file with the enriched version
dd_df.to_pickle('/Volumes/Elements/Research/Reddit_Credibility/Data/dd_df.pkl')


In [None]:
# Re-load the enriched dd_df (with 'body' and 'removed') in order to build the sample
dd_df = pd.read_pickle('/Volumes/Elements/Research/Reddit_Credibility/Data/dd_df.pkl')

In [10]:
# Start the sample from the posts that were not flagged as removed
not_removed = dd_df['removed'].eq(0)
dd_sample = dd_df[not_removed]

In [11]:
# Drop the posts whose author is missing
has_author = dd_sample['author'].notna()
dd_sample = dd_sample[has_author]

In [13]:
# Drop every post by the user Kremfloete (authors are compared by their string form)
author_names = dd_sample['author'].astype(str)
dd_sample = dd_sample[author_names.ne('Kremfloete')]

In [14]:
# Persist the filtered sample (not removed, author present, Kremfloete excluded)
dd_sample.to_pickle('/Volumes/Elements/Research/Reddit_Credibility/Data/dd_df_sample.pkl')

### Save dd_df as csv to use in R 

In [15]:
# Reload the full (unfiltered) DD frame; the CSV export below uses this, not dd_sample
dd_df = pd.read_pickle('/Volumes/Elements/Research/Reddit_Credibility/Data/dd_df.pkl')

In [16]:
# Export the full DD frame as CSV for use in R (the index is written too, since `index` is not set)
dd_df.to_csv('/Volumes/Elements/Research/Reddit_Credibility/Data/full_dd_df.csv')