In [None]:
# Import libraries
import pandas as pd
import numpy as np
import re
import os 
import datetime as dt

# Import viz
import plotly.graph_objects as go
from plotly.offline import download_plotlyjs, init_notebook_mode, plot, iplot
init_notebook_mode(connected=False)

# Pandas display options
pd.options.display.max_columns = 50
pd.options.display.max_rows = 10
pd.options.display.max_colwidth = 50
# NOTE: the previous `pd.set_option.precision = 3` merely attached an attribute to
# the `set_option` function object and left the option untouched; assign the real
# option so floats are actually displayed with 3 decimal places.
pd.options.display.precision = 3

### Load the article data

In [None]:
# Load articles w/ comments
# Show what is sitting in the raw-data folder, then read the article export
# (first CSV row is the header) and report its (rows, columns) shape.
raw_dir_listing = os.listdir('../data/raw/')
print(raw_dir_listing)
articles = pd.read_csv(
    '../data/raw/pft_articles_collected_w_comment_counts_20201017.csv',
    header=0,
)
articles.shape

In [None]:
# Inspect the head of the data: preview the first rows of the freshly loaded
# `articles` frame to see which columns were collected.
articles.head()

### Check for null values

In [None]:
# Looks like we're missing some authors and our dates aren't being recognized as dates
# (.info() lists the non-null count and dtype of every column).
articles.info()

In [None]:
# Show the rows whose article_author is missing.
articles[articles['article_author'].isna()]

### Check for uniqueness

In [None]:
# Looks like no columns are entirely unique but article URL is the most unique
# NOTE: the `datetime_is_numeric=False` argument was dropped. It was removed in
# pandas 2.0 (passing it raises TypeError) and False was already the default in the
# versions that accepted it, so the summary produced here is unchanged.
articles.describe()

In [None]:
# Flag every URL that occurs more than once, then show only the flagged ones.
# The flag column is named explicitly and the index name cleared because pandas 2.0
# changed what value_counts() calls its result ("count", with the source column
# name moved onto the index), which broke the old `.article_url` column lookup.
article_url_counts = (
    articles['article_url']
    .value_counts()
    .gt(1)
    .rename('article_url')
    .rename_axis(None)
    .to_frame()
)
article_url_counts[article_url_counts['article_url']]

### Check for correct dtypes

In [None]:
# Both date columns are being treated as `object` dtype, but we really only care
# about the post date, so that is the one converted to datetimes below.
articles.dtypes

In [None]:
# Preview the raw article_post_date values before converting them.
articles['article_post_date'].head()

In [None]:
# post_dates are formatted as: Month DD, YYYY, HH:MM [AM|PM] [EDT|EST]
# %B %d, %Y, %I:%M %p
POST_DATE_FORMAT = '%B %d, %Y, %I:%M %p'
POST_DATE_TZ_LABELS = ('EDT', 'EST')


def parse_post_date(post_date):
    """Convert one raw post-date string into a naive ``datetime``.

    The trailing EDT/EST label is matched literally and then discarded, so the
    returned value carries no tzinfo.

    Parameters
    ----------
    post_date : str
        Raw value such as ``'October 17, 2020, 9:05 AM EDT'``.

    Returns
    -------
    datetime.datetime or pandas.NaT
        ``NaT`` when the value is not a string or carries neither label.
        (Previously such a value either crashed or silently reused the date
        parsed for the preceding row.)

    Raises
    ------
    ValueError
        If a label is present but the string does not match the expected format.
    """
    if not isinstance(post_date, str):
        return pd.NaT
    for tz_label in POST_DATE_TZ_LABELS:
        if tz_label in post_date:
            return dt.datetime.strptime(post_date, POST_DATE_FORMAT + ' ' + tz_label)
    return pd.NaT


# Parse each distinct string once; missing values are skipped here and simply
# come out of .map() as NaT.
post_date_dict = {
    raw_date: parse_post_date(raw_date)
    for raw_date in articles['article_post_date'].dropna().unique()
}

articles['post_datetime'] = articles['article_post_date'].map(post_date_dict)
articles[['article_post_date','post_datetime']].head()

### Clean article author and comment count columns

In [None]:
# Inspect the two columns that need cleaning: the raw `article_author` and
# `comment_count` values as they were collected.
articles[['article_author','comment_count']].head()

In [None]:
# Looks like we have 1 invalid value in the author column
# Build a raw-value -> cleaned-name lookup, then map it onto the column.
author_aliases = {' NBC Sports': 'NBCSports.com', ' nickmensio': 'Nick Mensio'}
author_dict = {}
for raw_author in articles['article_author'].unique():
    if not isinstance(raw_author, str):
        # Non-string entries (e.g. NaN for a missing author) become ''.
        author_dict[raw_author] = ''
        continue
    trimmed = raw_author.strip()
    if not trimmed.startswith('Posted by'):
        # No byline prefix: left out of the lookup, so .map() below yields NaN.
        continue
    byline = trimmed.replace('Posted by', '')
    # Normalise the two known handles, then drop surrounding whitespace.
    author_dict[raw_author] = author_aliases.get(byline, byline).strip()

articles['author'] = articles['article_author'].map(author_dict)
articles['author'].value_counts().head(30)

In [None]:
# Looks like we have at least 1 invalid value in comment count column
def parse_comment_count(raw_count):
    """Return the leading integer of a raw comment-count string, or NaN.

    Thousands separators are removed and only the text before the first space
    is converted (e.g. ``'1,234 Comments'`` -> ``1234``).

    Parameters
    ----------
    raw_count : object
        Raw cell value from the ``comment_count`` column.

    Returns
    -------
    int or float
        The parsed count, or ``np.nan`` when the value is not a string or its
        leading token is not an integer.
    """
    if not isinstance(raw_count, str):
        # Missing cells used to crash on `.replace`; treat them as invalid.
        return np.nan
    leading_token = raw_count.replace(',', '').split(' ')[0]
    try:
        return int(leading_token)
    except ValueError:
        # Only a failed int() conversion is expected here; anything else should
        # surface instead of being swallowed by a bare `except`.
        return np.nan


# Parse each distinct value once; missing cells are skipped and stay NaN after .map().
comment_dict = {
    raw_count: parse_comment_count(raw_count)
    for raw_count in articles['comment_count'].dropna().unique()
}

articles['comment_counts'] = articles['comment_count'].map(comment_dict)
articles['comment_counts'].value_counts(sort=False, dropna=False)

In [None]:
# Inspect the two columns now that they've been cleaned: the derived `author`
# and `comment_counts` columns.
articles[['author','comment_counts']].head()

### Remove incorrectly collected records

In [None]:
# oward Kunreuther and Erwann Michel-Kerjan discusses the importance of the law and the NFL’s efforts to lobby Congress to renew it.  But there’s no indication that they asked the NFL whether the Super Bowl would indeed be in danger of not being played absent renewal of the TRIA.        1