In [2]:
# Import libraries
import pandas as pd

### Load cleaned comments data 

In [21]:
%%time

# Load cleaned comment data
comments = pd.read_csv('../data/cleaned/comments.csv', header=0, parse_dates=['scrape_datetime','comment_datetime_clean'])
comments.shape

Wall time: 48.7 s


(5689832, 5)

In [22]:
# Preview the first five rows of the loaded comments
comments.head(n=5)

Unnamed: 0,article_url,commentor,comment_body,scrape_datetime,comment_datetime_clean
0,https://profootballtalk.nbcsports.com/2020/10/...,amaf21,\nthe best bet is the cowboys giving up 10 sac...,2020-10-28 19:15:17.148123,2020-10-25 10:57:00
1,https://profootballtalk.nbcsports.com/2020/10/...,amaf21,"\n1 strip sack for a safety down, 9 more to go...",2020-10-28 19:15:17.148123,2020-10-25 13:20:00
2,https://profootballtalk.nbcsports.com/2020/10/...,eagleswin,\nTalent still trumps all in the NFL. In betw...,2020-10-28 19:15:17.169123,2020-10-25 10:46:00
3,https://profootballtalk.nbcsports.com/2020/10/...,cobrala2,\nTom Brady wants this guy.\n,2020-10-28 19:15:17.169123,2020-10-25 10:54:00
4,https://profootballtalk.nbcsports.com/2020/10/...,freakylj8,\nUse AB to let Godwin get healthy I guess\n,2020-10-28 19:15:17.169123,2020-10-25 11:01:00


### Engineer numerical features

In [26]:
# Count how many comments each commentor has posted.
# The commentor name is kept both as the index and as a regular column.
commentor_features = (
    comments
    .groupby(['commentor'])
    .size()
    .to_frame(name='total_number_of_comments')
    .assign(commentor=lambda counts: counts.index)
)
commentor_features.head()

Unnamed: 0_level_0,total_number_of_comments,commentor
commentor,Unnamed: 1_level_1,Unnamed: 2_level_1
\nguitarkevin,1,\nguitarkevin
\npraetorian12,1,\npraetorian12
"""All Eyez On Me"" in theaters NOW!!!",6,"""All Eyez On Me"" in theaters NOW!!!"
"""All Eyez On Me"" in theaters june 16 2017",49,"""All Eyez On Me"" in theaters june 16 2017"
"""Coach""Davis",31,"""Coach""Davis"


In [32]:
# Count the distinct articles each commentor has commented on:
# first the comment count per (commentor, article) pair, then the number of pairs per commentor
unique_articles = comments.groupby(['commentor', 'article_url']).size()
unique_articles_df = (
    unique_articles
    .groupby(['commentor'])
    .size()
    .to_frame(name='number_of_articles_commented_on')
)
# unique_articles_df.head()

Unnamed: 0_level_0,number_of_articles_commented_on
commentor,Unnamed: 1_level_1
\nguitarkevin,1
\npraetorian12,1
"""All Eyez On Me"" in theaters NOW!!!",6
"""All Eyez On Me"" in theaters june 16 2017",49
"""Coach""Davis",30


In [44]:
# Count, per commentor, the articles on which they left exactly one comment
unique_articles_single_comment = comments.groupby(['commentor', 'article_url']).size()
unique_articles_single_comment_df = (
    # keep only the (commentor, article) pairs with a single comment
    unique_articles_single_comment[unique_articles_single_comment == 1]
    .groupby(['commentor'])
    .size()
    .to_frame(name='number_of_articles_w_exactly_one_comment')
)
# unique_articles_single_comment_df.head()

Unnamed: 0_level_0,number_of_articles_w_exactly_one_comment
commentor,Unnamed: 1_level_1
\nguitarkevin,1
\npraetorian12,1
"""All Eyez On Me"" in theaters NOW!!!",6
"""All Eyez On Me"" in theaters june 16 2017",49
"""Coach""Davis",29


In [48]:
# Count, per commentor, the articles on which they left two or more comments.
# Commentors with no such article do not appear in the result at all.
unique_articles_mulitple_comment = comments.groupby(['commentor', 'article_url']).size()
unique_articles_mulitple_comment_df = (
    # keep only the (commentor, article) pairs with more than one comment
    unique_articles_mulitple_comment[unique_articles_mulitple_comment > 1]
    .groupby(['commentor'])
    .size()
    .to_frame(name='number_of_articles_w_more_than_one_comment')
)
# unique_articles_mulitple_comment_df.head()

Unnamed: 0_level_0,number_of_articles_w_more_than_one_comment
commentor,Unnamed: 1_level_1
"""Coach""Davis",1
"""Cue the haters in 3….2…..1……”",1
"""Hate""",1
"""Stats-Are-For-Losers""",1
#1,8


In [51]:
# Measure how long each commentor has been active on pft:
# whole days between their earliest and latest comment timestamps
commentor_activity_duration = (
    comments
    .groupby(['commentor'])['comment_datetime_clean']
    .agg(['min', 'max'])
)
commentor_activity_duration['commentor_activity_duration_in_days'] = (
    commentor_activity_duration['max'] - commentor_activity_duration['min']
).dt.days
# commentor_activity_duration.head()

Unnamed: 0_level_0,min,max,commentor_activity_duration_in_days
commentor,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
\nguitarkevin,2013-07-09 14:05:00,2013-07-09 14:05:00,0
\npraetorian12,2013-07-09 14:02:00,2013-07-09 14:02:00,0
"""All Eyez On Me"" in theaters NOW!!!",2017-06-21 07:43:00,2017-07-01 12:47:00,10
"""All Eyez On Me"" in theaters june 16 2017",2017-04-09 07:14:00,2017-06-20 08:00:00,72
"""Coach""Davis",2010-04-25 13:14:00,2011-10-08 12:48:00,530


In [None]:
# Flag whether the commentor's first comment was posted "in-season" (1) or
# during the "off-season" (0).
# In-season is 9/1 through 2/10, inclusive, in any year: all of September through
# January plus February 1-10. The check looks only at month and day, so it applies
# to every season in the data instead of comparing against one hard-coded date.
first_comment = commentor_activity_duration['min']
first_comment_month = first_comment.dt.month
first_comment_day = first_comment.dt.day
first_comment_in_season = (
    (first_comment_month >= 9)                                    # Sep - Dec
    | (first_comment_month == 1)                                  # Jan
    | ((first_comment_month == 2) & (first_comment_day <= 10))    # Feb 1 - Feb 10
)
commentor_activity_duration['first_comment_was_during_season'] = first_comment_in_season.astype(int)
# Drop the raw first/last timestamps now that the derived columns exist.
# Re-running this cell without re-running the duration cell raises a KeyError on 'min'.
commentor_activity_duration = commentor_activity_duration.drop(labels=['min','max'], axis=1)
# commentor_activity_duration.head()

### Engineer categorical features

### Combine all features

In [None]:
# Join the per-commentor feature tables column-wise. The tables built in the cells
# above are indexed by commentor, so pd.concat(axis=1) aligns rows on that index;
# a commentor missing from a table (e.g. no article with multiple comments) gets NaN
# in that table's columns.
# NOTE(review): commentor_comment_body_metrics, comments_dow and comments_hour are not
# created in the cells visible above - presumably they belong under
# "Engineer categorical features" and must exist before this cell can run.
commentor_features_all = pd.concat([commentor_features,
                                    unique_articles_df,
                                    unique_articles_single_comment_df,
                                    unique_articles_mulitple_comment_df,
                                    commentor_activity_duration,
                                    commentor_comment_body_metrics,
                                    comments_dow,
                                    comments_hour,
                                    ], axis=1)
print(commentor_features_all.shape)
commentor_features_all.head()