In [49]:
import pandas as pd
import numpy as np

In [50]:
# Load the submission and comment CSVs for both subreddits so they can be cleaned.
wsb_posts, wsb_com = (
    pd.read_csv('./data/wsb_posts.csv'),
    pd.read_csv('./data/wsb_comments.csv'),
)
btc_posts, btc_com = (
    pd.read_csv('./data/btc_posts.csv'),
    pd.read_csv('./data/btc_comments.csv'),
)

In [51]:
# Preview the first rows of the wsb comments frame (columns: id, comment).
wsb_com.head()

Unnamed: 0,id,comment
0,eld4i6,Can't buy stock in the company that made the p...
1,eld4i6,Honestly if someone can find me an underground...
2,eld4i6,Are there any ETFs that has all three companie...
3,eld4i6,Wait why do the lobsters still have the rubber...
4,eld4i6,Raytheon is merging with United technology \n\...


In [52]:
# Preview the first rows of the wsb submissions frame.
wsb_posts.head()

Unnamed: 0,title,id,url,body,date_created,number_comments,author
0,All In The War Machine,eld4i6,https://i.redd.it/cj2vw26nmd941.jpg,,1578440000.0,780,everythingorange9
1,How to get oil back up,g55or2,https://i.redd.it/w5iqqihjo2u41.jpg,,1587461000.0,516,futuretrollshark
2,Type yy into google.,c75d5x,https://www.reddit.com/r/wallstreetbets/commen...,That’s all I’m saying.,1561875000.0,17281,Alopez2897
3,Oil is now expenzive,d51f4o,https://i.redd.it/2j386s5iuym31.png,,1568673000.0,1019,
4,"My dad, working through a 15-hour time zone di...",d29nov,https://i.redd.it/60asaz4zhsl31.jpg,,1568160000.0,1633,SerraTL


In [53]:
# Drop comments whose text is the literal placeholder '[removed]'. Those rows
# hold a string rather than NaN, so dropna() alone would not catch them.
# Afterwards, drop any row that still has a missing value.
wsb_com = wsb_com.loc[wsb_com['comment'].ne('[removed]')].dropna()
btc_com = btc_com.loc[btc_com['comment'].ne('[removed]')].dropna()

In [54]:
# Count missing values per column in the wsb submissions.
# No rows are dropped here: many of the blank bodies mean the submission
# was either a URL or an image/video, because reddit does not allow for both.
wsb_posts.isna().sum()

title                0
id                   0
url                  0
body               812
date_created         0
number_comments      0
author              33
dtype: int64

In [55]:
# Add subreddit column to each dataframe
def add_subred(df, sub):
    """Write ``sub`` into a ``'sub'`` column on ``df``, in place.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to tag; it is modified in place.
    sub : object
        Value assigned to the ``'sub'`` column (the notebook passes a
        subreddit label such as ``'wsb'`` or ``'btc'``).

    Returns
    -------
    None
    """
    column_name = 'sub'
    df[column_name] = sub

In [56]:
# Add column for submission/comment
def add_subcol(df, sub):
    """Write a constant 0/1 ``'submission'`` column onto ``df``, in place.

    The column is 1 when ``sub`` equals the string ``'sub'`` and 0 for any
    other value.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to tag; it is modified in place.
    sub : str
        Kind marker; only the exact value ``'sub'`` produces a 1.

    Returns
    -------
    None
    """
    flag = 1 if sub == 'sub' else 0
    df['submission'] = flag

In [57]:
# Add column to submissions for if self-post
def self_post(df):
    """Flag self (text) posts with a 0/1 ``'self_post'`` column, in place.

    A row gets 1 when its ``body`` is not missing and its ``url`` contains
    the literal text ``reddit.com``; every other row gets 0.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame with ``'body'`` and ``'url'`` columns; it is modified in place.

    Returns
    -------
    None
    """
    has_body = df['body'].notna()
    # regex=False: match 'reddit.com' literally. As a regex, the '.' would
    # match any character (e.g. 'redditXcom').
    # na=False: a missing url counts as "no match" instead of putting NaN
    # into the boolean mask.
    on_reddit = df['url'].str.contains('reddit.com', regex=False, na=False)
    df['self_post'] = np.where(has_body & on_reddit, 1, 0)

In [58]:
# Flag self-posts on both submission frames (each is modified in place).
for posts in (wsb_posts, btc_posts):
    self_post(posts)

In [59]:
# Tag every frame with the subreddit it came from ('sub') and with whether its
# rows are submissions (1) or comments (0).
for frame, subreddit, is_submission in (
    (wsb_com, 'wsb', 0),
    (btc_com, 'btc', 0),
    (wsb_posts, 'wsb', 1),
    (btc_posts, 'btc', 1),
):
    add_subred(frame, subreddit)
    frame['submission'] = is_submission

# Comment text lives under 'comment'; rename it to 'body' so it uses the same
# column name as the submissions' text.
btc_com = btc_com.rename(columns={'comment' : 'body'})
wsb_com = wsb_com.rename(columns={'comment' : 'body'})

In [60]:
# Make dataframe with all info combined.
# pd.concat is used instead of DataFrame.append, which was deprecated in
# pandas 1.4 and removed in pandas 2.0. The original row labels are kept
# (no ignore_index), so labels repeat across the stacked frames.
wsb = pd.concat([wsb_posts, wsb_com])
btc = pd.concat([btc_posts, btc_com])
both = pd.concat([wsb, btc])

In [61]:
# Export as csvs
def export_csv(df, name):
    """Write ``df`` to ``./data/<name>.csv`` without the index column.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to write.
    name : str
        File stem; the ``.csv`` extension and ``./data/`` prefix are added here.

    Returns
    -------
    None
    """
    destination = f'./data/{name}.csv'
    df.to_csv(destination, index=False)


In [62]:
# Map each output file stem to the frame that is written under that name,
# then export them one by one in the order listed.
dfs = dict(
    wsb_com_cleaned=wsb_com,
    wsb_posts_cleaned=wsb_posts,
    wsb=wsb,
    btc_com_cleaned=btc_com,
    btc_posts_cleaned=btc_posts,
    btc=btc,
    wsb_and_btc_all=both,
)
for name in dfs:
    df = dfs[name]
    export_csv(df, name)
    print(f'Exported {name}')

Exported wsb_com_cleaned
Exported wsb_posts_cleaned
Exported wsb
Exported btc_com_cleaned
Exported btc_posts_cleaned
Exported btc
Exported wsb_and_btc_all
