In [None]:
# Import libraries
import pandas as pd
import numpy as np
import re
import os 
import datetime as dt

# Import viz
import plotly.graph_objects as go
from plotly.offline import download_plotlyjs, init_notebook_mode, plot, iplot
# Initialise plotly's offline notebook mode once, up front.
# NOTE(review): connected=False presumably embeds plotly.js in the notebook instead of
# loading it from a CDN — confirm against the plotly.offline documentation.
init_notebook_mode(connected=False)

# Pandas display options
pd.options.display.max_columns = 50
pd.options.display.max_rows = 25
pd.options.display.max_colwidth = 50
# `pd.set_option.precision = 3` only attached an attribute to the set_option callable and
# never touched the option itself; assign the real display option instead.
pd.options.display.precision = 3

### Load the article data

In [None]:
# List the contents of the raw-data folder, then read the collected articles w/ comments
print(os.listdir('../data/raw/'))
comments = pd.read_csv(
    '../data/raw/pft_comments_collected_20201028.csv',
    header=0,  # column names come from the first row of the file
)
# (rows, columns) of the freshly loaded frame
comments.shape

In [None]:
# Peek at the first five rows of the loaded data
comments.head(5)

### Check for null values

In [None]:
# Count the missing values in each column — looks like we're missing some commentors
comments.isna().sum()

### Check for uniqueness

In [None]:
# Looks like no columns are entirely unique but article URL is the most unique
# (`datetime_is_numeric` was dropped from the call: no column is datetime-typed at this
# point, so it had no effect, and the keyword was removed in pandas 2.0 where passing it
# raises a TypeError.)
comments.describe()

### Check for correct dtypes

In [None]:
# Both date columns are being treated as objects, but we really only care about
# comment_datetime, so that is the one we fix
comments.dtypes

### Fix the formatting of the comment datetime

In [None]:
# Look at the last five raw comment_datetime strings to see how they are formatted
comments.comment_datetime.tail(5)

In [None]:
# Try the candidate format string on a single value (third row from the end)
dt.datetime.strptime(
    comments['comment_datetime'].iloc[-3],
    '%B %d, %Y at %I:%M %p',
)

In [None]:
# comment_datetime values are formatted as: Month DD, YYYY at HH:MM [am|pm]
# which corresponds to the strptime format: %B %d, %Y at %I:%M %p
#
# Parse the whole column in one vectorised call. errors='coerce' turns every value that
# does not match the format (missing values, repeated header rows) into NaT, replacing the
# previous per-value loop whose bare `except: pass` silently swallowed *any* exception.
comments['comment_datetime_clean'] = pd.to_datetime(
    comments['comment_datetime'],
    format='%B %d, %Y at %I:%M %p',
    errors='coerce',
)
# Compare raw and parsed values side by side
comments[['comment_datetime','comment_datetime_clean']].head()

In [None]:
# The raw string column is no longer needed once comment_datetime_clean exists
comments = comments.drop(columns=['comment_datetime'])

### Remove extraneous header rows caused by inconsistent data collection

In [None]:
# Keep only the rows that have a comment_datetime_clean value; print the shape
# before and after to show how many rows were removed
print(comments.shape)
comments = comments.dropna(subset=['comment_datetime_clean'])
print(comments.shape)

### Remove incorrectly collected records

In [None]:
# Only a handful of rows lack a commentor, so drop them; the shape is printed
# before and after to show how many rows were removed
print(comments.shape)
comments = comments.dropna(subset=['commentor'])
print(comments.shape)

### Output cleaned data to CSV

In [None]:
# Write the cleaned comments without the index and without a header row.
# NOTE(review): the original passed header=0, which to_csv treats as False, so the output
# file has no column names. Kept as-is (spelled explicitly) because downstream readers are
# not visible here — confirm whether a header row was actually intended.
comments.to_csv(
    '../data/cleaned/comments.csv',
    header=False,
    index=False,
)