In [107]:
# import statements
import pandas as pd
import numpy as np

### Cleaning the Comments Dataset

In [108]:
# load the raw comments export into a DataFrame
comments = pd.read_json(path_or_buf='trp_comments.json')

In [109]:
# inspect the distinct column labels of the loaded comments frame
comments.columns.unique()

Index(['author', 'author_flair_background_color', 'author_flair_css_class',
       'author_flair_richtext', 'author_flair_template_id',
       'author_flair_text', 'author_flair_text_color', 'author_flair_type',
       'author_fullname', 'author_patreon_flair', 'body', 'created_utc',
       'gildings', 'id', 'is_submitter', 'link_id', 'locked', 'no_follow',
       'parent_id', 'permalink', 'retrieved_on', 'score', 'send_replies',
       'stickied', 'subreddit', 'subreddit_id', 'updated_utc', 'distinguished',
       'author_created_utc', 'can_gild', 'collapsed', 'collapsed_reason',
       'controversiality', 'gilded', 'nest_level', 'reply_delay',
       'subreddit_name_prefixed', 'subreddit_type', 'user_removed', 'edited',
       'mod_removed', 'author_cakeday', 'score_hidden', 'rte_mode',
       'all_awardings', 'associated_award', 'author_premium', 'awarders',
       'collapsed_because_crowd_control', 'total_awards_received',
       'treatment_tags', 'top_awarded_type'],
      dtype='

In [110]:
# keep only the columns used by the rest of the cleaning steps
comments = comments.loc[
    :,
    ['id', 'created_utc', 'user_removed', 'author', 'author_fullname', 'body'],
]

In [111]:
# use the comment id as the row index (rebinding instead of mutating in place)
comments = comments.set_index('id')

In [112]:
# count the rows whose `user_removed` value is null, i.e. the comments that
# were not removed; this count feeds the removed-percentage calculation
comments['user_removed'].isna().sum()

189806

In [113]:
# total number of comment rows in the frame
comments.shape[0]

200139

In [114]:
# Percentage of comments flagged as removed, i.e. rows whose `user_removed`
# value is non-null: (removed comments / total comments) * 100.
# Computed from the frame instead of hand-copied counts so the figure cannot
# go stale when the underlying data changes.
# Must run before the nulls in `user_removed` are filled; afterwards every
# row is non-null and this would report 100.
# NOTE(review): treats a non-null `user_removed` as "no body data" — confirm.
float(comments['user_removed'].notna().mean() * 100)

5.16291177631546

In [115]:
# replace missing `user_removed` values with 0 so the column has no nulls
comments['user_removed'] = comments['user_removed'].fillna(0)

### Cleaning the Submissions Dataset

In [116]:
# load the raw submissions export into a DataFrame
submissions = pd.read_json(path_or_buf='trp_submissions.json')

In [117]:
# inspect the distinct column labels of the loaded submissions frame
submissions.columns.unique()

Index(['author', 'author_flair_css_class', 'author_flair_richtext',
       'author_flair_text', 'author_flair_type', 'brand_safe', 'can_mod_post',
       'contest_mode', 'created_utc', 'domain', 'full_link', 'id',
       'is_crosspostable', 'is_original_content', 'is_reddit_media_domain',
       'is_self', 'is_video', 'link_flair_background_color',
       'link_flair_css_class', 'link_flair_richtext', 'link_flair_template_id',
       'link_flair_text', 'link_flair_text_color', 'link_flair_type', 'locked',
       'no_follow', 'num_comments', 'num_crossposts', 'over_18',
       'parent_whitelist_status', 'permalink', 'pinned', 'retrieved_on',
       'rte_mode', 'score', 'selftext', 'send_replies', 'spoiler', 'stickied',
       'subreddit', 'subreddit_id', 'subreddit_subscribers', 'subreddit_type',
       'thumbnail', 'title', 'url', 'whitelist_status', 'edited',
       'author_flair_background_color', 'author_flair_text_color', 'banned_by',
       'post_hint', 'preview', 'distinguished',

In [119]:
# keep only the columns used by the rest of the cleaning steps
submissions = submissions.loc[
    :,
    ['id', 'created_utc', 'author', 'selftext'],
]

In [121]:
# use the submission id as the row index (rebinding instead of mutating in place)
submissions = submissions.set_index('id')

In [122]:
# preview the first five cleaned submission rows
submissions.head(n=5)

Unnamed: 0_level_0,created_utc,author,selftext
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
8blq21,1523489914,Heathcliff--,Just fucking lift already.\n\nThe vast majorit...
8bl1d2,1523484118,timber_ghost,[removed]
8bknm0,1523481125,Leg_Of_Lamb,[removed]
8bkimw,1523480066,SexdictatorLucifer,"A long time ago, the human male decided to emb..."
8bk5r4,1523477234,stacysmomlovesme,I recently wrote how water fasting can improve...


In [123]:
# show the submissions whose text was replaced by the '[removed]' placeholder
submissions.loc[submissions['selftext'].eq('[removed]')]

Unnamed: 0_level_0,created_utc,author,selftext
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
8bl1d2,1523484118,timber_ghost,[removed]
8bknm0,1523481125,Leg_Of_Lamb,[removed]
8bjypu,1523475770,SlightlyCyborg,[removed]
8bjv74,1523475008,Saberinbed,[removed]
8bjui4,1523474855,traktor28,[removed]
...,...,...,...
9f750q,1536753939,Classy_Amir,[removed]
9f6pae,1536750037,akkimadhuri94,[removed]
9f5q18,1536738549,Southiesir,[removed]
9f5gbm,1536735492,loveinterbeingwisdom,[removed]


In [None]:
# total number of submission rows in the frame
len(submissions)