In [None]:
# Import libraries
import pandas as pd
import numpy as np

### Load cleaned comments and article data 

In [None]:
%%time

# Read the cleaned comment export; both timestamp columns are parsed as datetimes
comments = pd.read_csv(
    '../data/cleaned/comments.csv',
    header=0,
    parse_dates=['scrape_datetime', 'comment_datetime_clean'],
)
# Show (rows, columns) as the cell output
comments.shape

In [None]:
# Preview the first rows of the loaded comments to sanity-check the parsed columns
comments.head()

In [None]:
%%time

# Read the cleaned article export; both timestamp columns are parsed as datetimes
articles = pd.read_csv(
    '../data/cleaned/articles.csv',
    header=0,
    parse_dates=['scrape_datetime', 'post_datetime'],
)
# Show (rows, columns) as the cell output
articles.shape

In [None]:
# Preview the first rows of the loaded articles to sanity-check the parsed columns
articles.head()

### Engineer numerical features

In [None]:
# Total comments per commentor; the commentor name is kept both as the index
# and as a regular column
commentor_features = (
    comments.groupby(['commentor'])
    .size()
    .to_frame('total_number_of_comments')
)
commentor_features['commentor'] = commentor_features.index
# commentor_features.head()

In [None]:
# Number of distinct articles each commentor commented on:
# first count comments per (commentor, article) pair, then count pairs per commentor
unique_articles = comments.groupby(['commentor', 'article_url']).size()
unique_articles_df = (
    unique_articles
    .groupby(level='commentor')
    .size()
    .to_frame('number_of_articles_commented_on')
)
# unique_articles_df.head()

In [None]:
# Number of articles on which each commentor left exactly one comment.
# Commentors with no such article do not appear in the result at all.
unique_articles_single_comment = comments.groupby(['commentor', 'article_url']).size()
unique_articles_single_comment_df = (
    unique_articles_single_comment[unique_articles_single_comment == 1]
    .groupby(level='commentor')
    .size()
    .to_frame('number_of_articles_w_exactly_one_comment')
)
# unique_articles_single_comment_df.head()

In [None]:
# Number of articles on which each commentor left two or more comments.
# Commentors with no such article do not appear in the result at all.
unique_articles_mulitple_comment = comments.groupby(['commentor', 'article_url']).size()
unique_articles_mulitple_comment_df = (
    unique_articles_mulitple_comment[unique_articles_mulitple_comment > 1]
    .groupby(level='commentor')
    .size()
    .to_frame('number_of_articles_w_more_than_one_comment')
)
# unique_articles_mulitple_comment_df.head()

In [None]:
# Earliest ('min') and latest ('max') comment timestamp per commentor, plus the
# number of whole days separating the two
commentor_activity_duration = (
    comments.groupby(['commentor'])['comment_datetime_clean']
    .agg(['min', 'max'])
)
active_span = commentor_activity_duration['max'] - commentor_activity_duration['min']
commentor_activity_duration['commentor_activity_duration_in_days'] = active_span.dt.days
# commentor_activity_duration.head()

In [None]:
# Calculate the length of each commentor's username (one row per commentor).
# The groupby is only used to obtain the distinct commentor names as an index;
# the size column (labelled 0) is discarded once the name has been copied out.
commentor_username_length = comments.groupby(['commentor']).size().to_frame()
commentor_username_length['username'] = commentor_username_length.index
commentor_username_length['username_length'] = commentor_username_length['username'].str.len()
commentor_username_length = commentor_username_length.drop(columns=[0])

# Count the letters, digits and whitespace characters in each username
usernames = commentor_username_length['username']
commentor_username_length['username_alpha_chars'] = usernames.apply(lambda name: sum(map(str.isalpha, name)))
commentor_username_length['username_numeric_chars'] = usernames.apply(lambda name: sum(map(str.isdigit, name)))
commentor_username_length['username_space_chars'] = usernames.apply(lambda name: sum(map(str.isspace, name)))
# commentor_username_length.head()

In [None]:
# Character length of every comment body (stored on `comments`), then the
# mean / median / min / max length per commentor
comments['comment_body_length'] = comments['comment_body'].str.len()
commentor_comment_body_metrics = (
    comments.groupby(['commentor'])['comment_body_length']
    .agg(['mean', 'median', 'min', 'max'])
    .add_prefix('comment_length_')
)
# commentor_comment_body_metrics.head()

In [None]:
# Calculate the mean, median, min and max hours between when an article was
# published and when each comment on it was made, per commentor.

# One row per distinct (article_url, post_datetime) pair.
# NOTE(review): an article_url that appears with more than one post_datetime
# will match several rows in the merge below and repeat its comments — confirm
# whether that can occur in the cleaned article data.
articles_w_dates = articles.drop_duplicates(subset=['article_url','post_datetime'])
comments_between = pd.merge(comments, articles_w_dates[['article_url','post_datetime']], how='left', on='article_url')

# Keep only comments made at or after the article's post time. Rows with no
# matching article (missing post_datetime) fail the comparison and are dropped.
# The explicit .copy() makes the filtered frame independent, so adding a column
# below does not raise SettingWithCopyWarning.
posted_after_article = comments_between.comment_datetime_clean >= comments_between.post_datetime
comments_between = comments_between[posted_after_article].copy()

# Dividing by a one-hour Timedelta converts the gap to fractional hours
comments_between['hours_btween'] = (comments_between.comment_datetime_clean - comments_between.post_datetime) / pd.Timedelta(hours=1)

hours_between_metrics = comments_between.groupby(['commentor'])['hours_btween'].agg(['mean','median','min','max'])
hours_between_metrics.columns = ['hours_between_mean','hours_between_median','hours_between_min','hours_between_max']
# hours_between_metrics.head()

In [None]:
# Per-commentor comment counts by day of week (one column per weekday name
# present in the data). The count is taken over `article_url`.
comments['comment_date_dow'] = comments['comment_datetime_clean'].dt.day_name()
dow_source = comments[['article_url', 'commentor', 'comment_date_dow']]
comments_dow = dow_source.pivot_table(
    index=['commentor'],
    columns=['comment_date_dow'],
    aggfunc='count',
    fill_value=0,
)
# Drop the outer `article_url` column level and prefix each weekday name
comments_dow.columns = ['comments_on_' + day for day in comments_dow.columns.droplevel()]
# comments_dow.head()

In [None]:
# Per-commentor comment counts by hour of day (one column per hour value
# present in the data). The count is taken over `article_url`.
comments['comment_date_hour'] = comments['comment_datetime_clean'].dt.hour
hour_source = comments[['article_url', 'commentor', 'comment_date_hour']]
comments_hour = hour_source.pivot_table(
    index=['commentor'],
    columns=['comment_date_hour'],
    aggfunc='count',
    fill_value=0,
)
# Drop the outer `article_url` column level and prefix each hour value
comments_hour.columns = ['comments_on_hour_{}'.format(hour) for hour in comments_hour.columns.droplevel()]
comments_hour.head()

In [None]:
# Count the number of comments each commentor made "in-season" vs "off-season".
# A comment is flagged in-season when its calendar month is September or later,
# or February or earlier.
# NOTE(review): if the season is meant to end on 2/1, be aware this month-level
# test also flags the rest of February as in-season — confirm the intended cut-off.
comments['comment_date_month'] = comments['comment_datetime_clean'].dt.month
comment_month = comments['comment_date_month']
comments['in_season_flag'] = ((comment_month >= 9) | (comment_month <= 2)).astype(int)

# 'count' is the commentor's total comments, 'sum' the number flagged in-season
in_season_comments = comments.groupby(['commentor'])['in_season_flag'].agg(['count', 'sum'])
in_season_comments.columns = ['total_comments', 'number_in_season_comments']
in_season_comments['number_out_season_comments'] = (
    in_season_comments['total_comments'] - in_season_comments['number_in_season_comments']
)
# in_season_comments.head()

In [None]:
# Calculate the most comments each commentor posted in a single calendar day.
# The timestamp is normalized to midnight so that all comments made on the same
# day fall into one group; grouping on the raw timestamp would only count
# comments sharing the exact same date AND time.
comment_day = comments['comment_datetime_clean'].dt.normalize()

# One row per (commentor, day); the per-day comment count is in the column labelled 0
commentor_max_comments = pd.DataFrame(comments.groupby(['commentor', comment_day]).size()).reset_index()

# Largest daily count per commentor (the column keeps the label 0)
commentor_max_comments_df = pd.DataFrame(commentor_max_comments.groupby(['commentor'])[0].max())
# commentor_max_comments_df.head()


### Engineer categorical features

### Combine all features

In [None]:
# Join the per-commentor feature frames side by side on their shared
# `commentor` index. pd.concat(axis=1) aligns on the index with an outer join,
# so a commentor missing from one frame (e.g. no article with more than one
# comment) gets NaN in that frame's columns.
# The multi-comment frame is referenced by the name it is actually defined
# under (`unique_articles_mulitple_comment_df`); `multiple_comments` is not
# defined anywhere in the visible notebook.
# NOTE(review): several feature frames built above (single-comment articles,
# username metrics, hours-between metrics, in-season counts, max comments per
# day) are not part of this list — confirm whether that is intentional.
commentor_features_all = pd.concat([commentor_features,
                                    unique_articles_df,
                                    unique_articles_mulitple_comment_df,
                                    commentor_activity_duration,
                                    commentor_comment_body_metrics,
                                    comments_dow,
                                    comments_hour,
                                    ], axis=1)
print(commentor_features_all.shape)
commentor_features_all.head()