In [None]:
# Import libraries
import os
import boto3
import warnings
warnings.filterwarnings('ignore')
import pandas as pd
import numpy as np
import statistics
import plotly.graph_objects as go
from plotly.offline import download_plotlyjs, init_notebook_mode, plot, iplot
init_notebook_mode(connected=False)
from nltk.tokenize import sent_tokenize, word_tokenize

### Upload data to S3 bucket created for ML Guild project

In [None]:
# Upload articles and comment files to S3
articles = 'data/cleaned/articles.csv'
comments = 'data/cleaned/comments.csv'
files = [articles, comments]
bucket_name = 'ml-guild-project-pft-comments'

# Ensure credentials file is reachable in Windows OS
# https://docs.aws.amazon.com/credref/latest/refdocs/file-location.html
# %USERPROFILE%\.aws\credentials

# NOTE(review): verify=False switches off TLS certificate verification for every
# S3 call made through this client. Kept as-is because it may be required by the
# local network setup -- confirm, and prefer pointing `verify` at a CA bundle.
s3 = boto3.client('s3', verify=False)

# Check if bucket exists. list_buckets() describes each bucket with a dict, so
# the comparison has to be made against the 'Name' entries, not the dicts.
response = s3.list_buckets()
existing_buckets = {bucket['Name'] for bucket in response['Buckets']}

# If bucket doesn't exist, create it
if bucket_name not in existing_buckets:
    print('Creating bucket...')
    s3.create_bucket(Bucket=bucket_name)
    print('Bucket created')

# Check if data is already uploaded to S3 bucket.
# An empty bucket (e.g. one that was just created) has no 'Contents' entry in
# the listing, so fall back to an empty list instead of raising KeyError.
listing = s3.list_objects(Bucket=bucket_name)
keys = [obj['Key'] for obj in listing.get('Contents', [])]
print(sorted(keys))

# Upload data to S3 bucket if keys not found (only the files that are missing)
for file in files:
    if file in keys:
        continue
    with open(file, 'rb') as data:
        s3.upload_fileobj(data, bucket_name, file)
    print(f'{file} has been uploaded')

In [None]:
# Download cleaned data
# Each S3 key doubles as the local destination path (e.g. 'data/cleaned/x.csv').
for key in keys:
    # Make sure the local folder exists before the file is written into it;
    # a key without a folder part is saved in the working directory.
    local_dir = os.path.dirname(key)
    if local_dir:
        os.makedirs(local_dir, exist_ok=True)
    s3.download_file(bucket_name, key, key)
    print(f'{key} has been downloaded')

### Load data

In [None]:
# Load the cleaned comment and article tables, parsing their timestamp columns
comments_all = pd.read_csv(
    'data/cleaned/comments.csv',
    parse_dates=['comment_datetime_clean'],
)
articles = pd.read_csv(
    'data/cleaned/articles.csv',
    header=0,
    parse_dates=['scrape_datetime', 'post_datetime'],
)

In [None]:
# Print (rows, columns) of the comments table first, then the articles table
for frame in (comments_all, articles):
    print(frame.shape)

### Remove outliers and single commenters

###### Single commenters

In [None]:
# What % of commentors only engaged exactly once/year?
comments_all['year'] = comments_all['comment_datetime_clean'].dt.year
percent_single_commentors = list()
# Rows without a timestamp have no year; skip them so no year group is empty.
for year in comments_all['year'].dropna().unique():
    # Index = commentor, value = number of comments that commentor posted in
    # this year. Working on the Series directly avoids relying on the column
    # names reset_index() produces, which differ between pandas versions.
    comments_per_commentor = comments_all.loc[comments_all['year'] == year, 'commentor'].value_counts()
    num_unique_commentors = len(comments_per_commentor)
    num_single_commentors = int((comments_per_commentor == 1).sum())
    share_single = num_single_commentors / num_unique_commentors
    # `.2%` multiplies by 100, so the printed figure really is a percentage.
    print(f'{year}: {share_single:.2%}, {num_single_commentors} / {num_unique_commentors}')
    percent_single_commentors.append(share_single)

# Displayed as the cell result; this mean is a fraction between 0 and 1.
f'mean: {np.mean(percent_single_commentors)}'

In [None]:
# What % of commentors only engaged exactly once across the entire dataset?
comments_per_commentor = comments_all['commentor'].value_counts()
# Build `table` with explicit column names -- `index` holds the commentor and
# `commentor` holds that commentor's comment count -- so code reading
# `table.commentor` as a count gets the same layout on every pandas version.
table = (
    comments_per_commentor
    .rename_axis('index')
    .rename('commentor')
    .reset_index()
)
num_unique_commentors = table.shape[0]
num_single_commentors = int((table['commentor'] == 1).sum())
# `.2%` multiplies by 100, so the printed figure really is a percentage.
print(f'all years: {num_single_commentors / num_unique_commentors:.2%}, {num_single_commentors} / {num_unique_commentors}')

###### Commenters whose comment count exceeds 5 × the standard deviation ("super users")

In [None]:
# Let's determine a cut-off for "outlier"
# Recompute the per-commentor comment counts here rather than reading them from
# a `table` variable left behind by another cell, so this cell gives the same
# answer regardless of which cell assigned `table` last.
comments_per_commentor = comments_all['commentor'].value_counts()
num_nonsingle_commentors = comments_per_commentor[comments_per_commentor != 1].values

# Hand statistics.stdev plain Python ints so the result does not depend on how
# the stdlib coerces numpy integer types.
# NOTE(review): the cut-off is 5 x the sample standard deviation measured from
# zero, not mean + 5 x stdev -- confirm this is the intended definition.
five_stdev_cutoff = statistics.stdev(num_nonsingle_commentors.tolist()) * 5

# Number, then share, of non-single commentors above the cut-off
above_cutoff = num_nonsingle_commentors > five_stdev_cutoff
print(above_cutoff.sum())
print(above_cutoff.sum() / len(num_nonsingle_commentors))

In [None]:
# Remove single commentors and those above cut-off
# Index = commentor, value = total number of comments by that commentor.
comments_per_commentor = comments_all['commentor'].value_counts()
# Keep everyone who commented more than once and is not above the cut-off.
# `<=` makes this the exact complement of the `> five_stdev_cutoff` outlier test.
within_cutoff = (comments_per_commentor != 1) & (comments_per_commentor <= five_stdev_cutoff)
table_outliers_removed = (
    comments_per_commentor[within_cutoff]
    .rename_axis('commentor')
    .rename('num_comments')
    .reset_index()
)
commentors = table_outliers_removed['commentor'].values
print(table_outliers_removed['commentor'].nunique())

# Filter comment df based on cut-offs
# .copy() gives `comments` its own data, so columns added to it later do not
# trigger chained-assignment warnings against `comments_all`.
comments = comments_all[comments_all['commentor'].isin(commentors)].copy()
# The output folder may not exist yet on a fresh checkout.
os.makedirs('data/in_process', exist_ok=True)
comments.to_csv('data/in_process/comments.csv', header=True, index=False)
print(comments_all.shape)
print(comments.shape)

In [None]:
# Visualize remaining commentors distribution
# Histogram input is the number of comments per remaining commentor. Taking the
# values straight from value_counts() avoids depending on the column names
# reset_index() produces, which differ between pandas versions.
comments_per_commentor = comments['commentor'].value_counts()
fig = go.Figure(data=[go.Histogram(x=comments_per_commentor.values, histfunc='sum', histnorm='probability', cumulative_enabled=False)])
# Label the figure so it can be read on its own when the notebook is skimmed.
fig.update_layout(
    title='Comments per commentor (single commentors and outliers removed)',
    xaxis_title='Number of comments',
    yaxis_title='Probability',
)
fig.show()

### Engineer features to use in clustering (all years)

In [None]:
#1 Feature: total number of comments posted by each commenter
# (index = commentor, sorted from most to fewest comments)
num_comments = comments['commentor'].value_counts()
# Show the five least active commenters
num_comments.tail(5)

In [None]:
#2 Feature: number of distinct articles each commenter commented on
article_urls_by_commentor = comments.groupby('commentor')['article_url']
comments_by_articles = article_urls_by_commentor.nunique()
comments_by_articles.head(5)

In [None]:
#3 Calculate the time of day commenters posted
def times_of_date_by_hour(hour):
    """Map an hour of the day to a coarse time-of-day label.

    Returns 'morning' for 5 <= hour < 12, 'afternoon' for 12 <= hour < 17,
    'evening' for 17 <= hour < 21 and 'night' for hour >= 21 or hour < 5.
    A value for which none of these comparisons holds (for example NaN)
    yields None.
    """
    # (first hour included, first hour excluded, label) for the daytime bands
    daytime_bands = (
        (5, 12, 'morning'),
        (12, 17, 'afternoon'),
        (17, 21, 'evening'),
    )
    for start, stop, label in daytime_bands:
        if start <= hour < stop:
            return label
    # Night wraps around midnight, so it is two open-ended ranges.
    if hour >= 21 or hour < 5:
        return 'night'
    return None
    
# Label every comment with the time of day it was posted
posting_hour = comments['comment_datetime_clean'].dt.hour
comments['comment_time_of_day'] = posting_hour.apply(times_of_date_by_hour)

# One row per commentor, one column per time-of-day label; each cell counts
# that commentor's comments in that slot (0 where there are none).
pivot_source_columns = ['article_url', 'comment_time_of_day', 'commentor']
comments_by_time_of_day = comments[pivot_source_columns].pivot_table(
    index=['commentor'],
    columns=['comment_time_of_day'],
    aggfunc='count',
    fill_value=0,
)
comments_by_time_of_day.head()

In [None]:
#4 Feature: number of distinct articles on which a commentor commented more than once
# Comments per (commentor, article) pair
unique_articles_mulitple_comment = comments.groupby(['commentor', 'article_url']).size()
# Keep only the pairs with at least two comments ...
repeat_comment_pairs = unique_articles_mulitple_comment[unique_articles_mulitple_comment > 1]
# ... and count how many such articles each commentor has
unique_articles_mulitple_comment_df = (
    repeat_comment_pairs
    .groupby(level='commentor')
    .size()
    .to_frame('number_of_articles_w_more_than_one_comment')
)
unique_articles_mulitple_comment_df.head()

In [None]:
#5 Feature: whole days between each commentor's first and last comment on pft
first_and_last_comment = comments.groupby('commentor')['comment_datetime_clean'].agg(['min', 'max'])
active_span = first_and_last_comment['max'] - first_and_last_comment['min']
commentor_activity_duration = active_span.dt.days.to_frame('commentor_activity_duration_in_days')
commentor_activity_duration.head()

In [None]:
#6 Feature: size of each comment
# Only the character count is computed so far: the comment body's length with
# newline characters removed.
body_without_newlines = comments['comment_body'].str.replace('\n','')
comments['comment_body_length'] = body_without_newlines.str.len()

# TODO: word and sentence counts (sent_tokenize, word_tokenize) -- not enabled yet
# comments['comment_word_tokens'] = comments['comment_body'].apply(lambda x: len(word_tokenize(x)))
# comments['comment_sent_tokens'] = comments['comment_body'].apply(lambda x: len(sent_tokenize(x)))

# TODO: per-commentor summary of comment length -- not enabled yet
# commentor_comment_body_metrics = comments.groupby(['commentor']).agg({'comment_body_length':['mean','median','min','max','sum']})
# commentor_comment_body_metrics.columns = commentor_comment_body_metrics.columns.droplevel()
# commentor_comment_body_metrics.columns = ['comment_length_mean','comment_length_median','comment_length_min','comment_length_max','comment_length_total']
# commentor_comment_body_metrics

comments.head()

In [None]:
# (rows, columns) of the filtered comments frame, including the columns added above
comments.shape

### Explore features before dimensionality reduction

### Scale and reduce dimensionality using UMAP

### Cluster data using DBSCAN

### Determine feature importance using XGBoost