# Data Extract and Transform Process

In [None]:
# import statements
import pandas as pd
import numpy as np
import seaborn as sns
from  datetime import datetime

### Cleaning the Comments Dataset

In [None]:
# load the comments JSON file into a DataFrame
comments = pd.read_json(path_or_buf='trp_comments.json')

In [None]:
# list the distinct column labels available in the comments data
comments.columns.unique()

In [None]:
# restrict the comments frame to the relevant columns (all rows kept)
comments = comments.loc[:, ['id', 'created_utc', 'user_removed', 'author', 'author_fullname', 'body']]

In [None]:
# use the comment id as the row index
comments = comments.set_index('id')

In [None]:
# Change from UNIX time (seconds) to human-readable format.
# `infer_datetime_format` is intentionally not passed: it is deprecated since
# pandas 2.0 and has no effect when `unit` is given.
comments['datetime'] = pd.to_datetime(comments['created_utc'], unit='s')

# 'datetime' is already a datetime column, so the calendar fields are read
# straight from its .dt accessor instead of re-parsing it each time.
comments['date'] = comments['datetime'].dt.date
comments['day'] = comments['datetime'].dt.day
comments['month'] = comments['datetime'].dt.month
comments['year'] = comments['datetime'].dt.year

In [None]:
# Removed comments will not be used for toxicity parsing, so first establish
# how many comments are affected.

# count of comments that weren't removed (null `user_removed`)
comments['user_removed'].isna().sum()

In [None]:
# total number of comment rows
comments.shape[0]

In [None]:
# calculate percentage of rows with no body data
# (num of comments that were removed / total comments) * 100
# Derived from the frame instead of hard-coded counts so the figure tracks the
# data: a non-null `user_removed` marks a removed comment, and the mean of the
# boolean mask is removed / total.
# NOTE: must run before the nulls in `user_removed` are filled with 0.
comments['user_removed'].notna().mean() * 100

~5.16% of all comments in the dataset do not have text that can be parsed.

In [None]:
# replace missing `user_removed` values with 0 so the column is binary
comments = comments.fillna({'user_removed': 0})

### Cleaning the Submissions Dataset

In [None]:
# load the submissions JSON file into a DataFrame
submissions = pd.read_json(path_or_buf='trp_submissions.json')

In [None]:
# list the distinct column labels available in the submissions data
submissions.columns.unique()

In [None]:
# restrict the submissions frame to the relevant columns (all rows kept)
submissions = submissions.loc[:, ['id', 'created_utc', 'author', 'selftext']]

In [None]:
# use the submission id as the row index
submissions = submissions.set_index('id')

In [None]:
# preview the first five rows of the filtered, re-indexed submissions frame
submissions.head(n=5)

In [None]:
# Change from UNIX time (seconds) to human-readable format.
# `infer_datetime_format` is intentionally not passed: it is deprecated since
# pandas 2.0 and has no effect when `unit` is given.
submissions['datetime'] = pd.to_datetime(submissions['created_utc'], unit='s')

# 'datetime' is already a datetime column, so the calendar fields are read
# straight from its .dt accessor instead of re-parsing it each time.
submissions['date'] = submissions['datetime'].dt.date
submissions['day'] = submissions['datetime'].dt.day
submissions['month'] = submissions['datetime'].dt.month
submissions['year'] = submissions['datetime'].dt.year

In [None]:
# count the submissions that were removed / are no longer viewable, i.e. whose
# `selftext` equals the '[removed]' placeholder; int() keeps a plain Python int
int((submissions['selftext'] == '[removed]').sum())

In [None]:
# total number of submission rows
len(submissions)

3646 out of 7118 rows (~51%) have had their posts removed.

That is too large a share of the submissions to be missing their text.

# Data Sanity Check

Check which months are missing data from the collection process.

In [None]:
# number of comments per (year, month)
comments.groupby(['year', 'month']).size()

In [None]:
# number of submissions per (year, month)
submissions.groupby(['year', 'month']).size()

In [None]:
# plot the per-(year, month) comment counts to make collection gaps visible
comments.groupby(['year', 'month']).size().plot(figsize=(10, 3))

In [None]:
# plot the per-(year, month) submission counts to make collection gaps visible
submissions.groupby(['year', 'month']).size().plot(figsize=(10, 3))