## Create DD Author Stats
Using previous posts and comments made by DD authors, pull the stats posted by the VisualMod bot:
- earliest post or comment date 
- links to previous DD posts 
- number of previous posts and comments 

Process:
1. Loop through DD Posts 
2. Pull author posts and comments before DD post date 
3. Collect the earliest post/comment date, the number of posts/comments, and the IDs of past DD posts

### Required packages 

In [1]:
# set up 
import pandas as pd 
import datetime  
from datetime import timedelta
import numpy as np 
import re
import bisect

### Load Data

In [5]:
# Load the three pickled frames in one pass, in this order:
#   1. the sampled DD posts
#   2. every post written by a DD author
#   3. every comment written by a DD author
# NOTE(review): pickle files execute code on load -- only read files you produced yourself.
dd_posts, ddauthor_posts, ddauthor_comments = (
    pd.read_pickle(pickle_path)
    for pickle_path in (
        '/Volumes/Elements/Research/Reddit_Credibility/Data/dd_df_sample.pkl',
        '/Volumes/Elements/Research/Reddit_Credibility/Data/posts_by_dd_authors.pkl',
        '/Volumes/Elements/Research/Reddit_Credibility/Data/comments_by_dd_authors.pkl',
    )
)


In [6]:
# create timestamp variable for ddauthor data frames 
# Fix dates - Reddit dates are in UNIX timestamps 
def get_date(created):
    """Convert a UNIX epoch value (seconds) into a naive, local-time datetime.

    No timezone is passed to ``fromtimestamp``, so the result depends on the
    timezone of the machine running the notebook.
    """
    converted = datetime.datetime.fromtimestamp(created)
    return converted

# Derive a datetime 'timestamp' column from the raw 'created' epoch values.
# `assign` returns a new frame, so each name is rebound to the extended copy.

# author posts
ddauthor_posts = ddauthor_posts.assign(
    timestamp=lambda frame: frame['created'].apply(get_date)
)

# author comments
ddauthor_comments = ddauthor_comments.assign(
    timestamp=lambda frame: frame['created'].apply(get_date)
)

### Create Author ID variable

In [7]:
# Distinct DD authors, in order of first appearance in dd_posts.
authors = list(dd_posts['author'].unique())

# One row per author; the default 0..n-1 row index doubles as the author ID.
authorsdf = pd.DataFrame({'author': authors})
authorsdf['authorID'] = authorsdf.index

### Merge AuthorID and remove missing authors 

In [13]:
# Attach authorID to every frame. The default inner join also drops any row
# whose author is not present in authorsdf.
dd_posts = pd.merge(dd_posts, authorsdf, on='author')
ddauthor_posts = pd.merge(ddauthor_posts, authorsdf, on='author')
ddauthor_comments = pd.merge(ddauthor_comments, authorsdf, on='author')

### Loop through posts, collect stats 

In [17]:
# initialize author stats list 
# One tuple per DD post is appended by the stats loop below.
# NOTE: re-run this cell before re-running the loop, otherwise the list keeps
# the tuples from the previous run and every DD post is recorded twice.
author_stats_list = []

In [15]:
# Re-index all three frames by authorID (in place) so that the stats loop can
# look rows up by author.
for _frame in (ddauthor_comments, ddauthor_posts, dd_posts):
    _frame.set_index('authorID', inplace=True)

In [29]:
posts

Unnamed: 0_level_0,timestamp
authorID,Unnamed: 1_level_1
2,2021-10-13 08:10:05
2,2021-10-08 10:40:07
2,2021-10-07 09:38:44


In [30]:
# Loop through dd_posts and, for each DD post, collect the author's activity
# strictly before the DD post's timestamp:
#   - number of comments and earliest comment date
#   - number of posts, earliest post date, avg/total score and comment count
#   - IDs of earlier DD posts, plus avg/total score and comment count
# One tuple per DD post is appended to author_stats_list.


def _rows_before(frame, author, cutoff):
    """Return the rows of `frame` for `author` with a timestamp strictly before `cutoff`.

    `frame` must be indexed by authorID and have a 'timestamp' column.
    An author that does not appear in the index yields an empty frame rather
    than a KeyError.
    """
    if author not in frame.index:
        return frame.iloc[0:0]
    # A list selector always returns a DataFrame, even when only one row
    # matches, so no special case is needed for single-row authors.
    rows = frame.loc[[author]]
    return rows[rows['timestamp'] < cutoff]


def _score_and_comment_stats(rows):
    """Return (avg score, total score, avg comms_num, total comms_num) for `rows`.

    All four values are NaN when `rows` is empty, so "no history" is never
    reported as a total of 0.
    """
    if rows.empty:
        return np.nan, np.nan, np.nan, np.nan
    return (rows['score'].mean(), rows['score'].sum(),
            rows['comms_num'].mean(), rows['comms_num'].sum())


_total_dd_posts = dd_posts.shape[0]

for count, (_author, row) in enumerate(dd_posts.iterrows(), start=1):

    ### counter ###
    if count % 1000 == 0:
        print("**************************")
        print(f'On dd post {count} out of {_total_dd_posts}')

    # dd_posts is indexed by authorID, so the row label is the author
    _timestamp = row['timestamp']
    _id = row['id']

    ### comments by author before timestamp ###
    comms = _rows_before(ddauthor_comments, _author, _timestamp)
    num_comms = comms.shape[0]
    min_comm = comms['timestamp'].min() if num_comms else np.nan

    ### posts by author before timestamp ###
    # NOTE(review): the strict comparison excludes the DD post itself only if
    # ddauthor_posts['timestamp'] and dd_posts['timestamp'] were derived the
    # same way for the same post -- confirm against how dd_posts was built.
    posts = _rows_before(ddauthor_posts, _author, _timestamp)
    num_posts = posts.shape[0]
    # with no earlier post, the DD post itself is the author's earliest post
    min_post = posts['timestamp'].min() if num_posts else _timestamp
    (avg_post_score, total_post_score,
     avg_post_comms, total_post_comms) = _score_and_comment_stats(posts)

    ### dd posts by author before timestamp ###
    dds = _rows_before(dd_posts, _author, _timestamp)
    dd_ids = dds['id'].unique().tolist()  # empty list when there is no earlier dd
    num_dd = len(dd_ids)
    (avg_dd_score, total_dd_score,
     avg_dd_comms, total_dd_comms) = _score_and_comment_stats(dds)

    ### append ###
    author_stats_list.append((_id, _author, num_comms, num_posts, min_comm, min_post, dd_ids, num_dd,
                              avg_dd_score, total_dd_score, avg_dd_comms, total_dd_comms,
                              avg_post_score, total_post_score, avg_post_comms, total_post_comms))

**************************
On dd post 1000 out of 9197
**************************
On dd post 2000 out of 9197
**************************
On dd post 3000 out of 9197
**************************
On dd post 4000 out of 9197
**************************
On dd post 5000 out of 9197
**************************
On dd post 6000 out of 9197
**************************
On dd post 7000 out of 9197
**************************
On dd post 8000 out of 9197
**************************
On dd post 9000 out of 9197


In [31]:
# Column names, in the same order as the fields of each tuple appended to
# author_stats_list by the stats loop.
_stats_columns = [
    'dd_id', 'authorID', 'num_previous_comments', 'num_previous_posts',
    'min_comment_date', 'min_post_date', 'prev_dd_ids', 'num_prev_dd',
    'avg_dd_score', 'total_dd_score', 'avg_dd_comms', 'total_dd_comms',
    'avg_post_score', 'total_post_score', 'avg_post_comms', 'total_post_comms',
]

ddauthor_stats_df = pd.DataFrame(data=author_stats_list, columns=_stats_columns)

In [32]:
# Attach the per-DD author stats to dd_posts, matching on the DD id and the
# author ID (dd_posts carries authorID as its index at this point).
dd_posts = pd.merge(
    dd_posts,
    ddauthor_stats_df,
    left_on=['id', 'authorID'],
    right_on=['dd_id', 'authorID'],
)

In [33]:
# Write the DD posts, now carrying the author stats, to dd_posts_stats.csv.
dd_posts.to_csv(
    path_or_buf='/Volumes/Elements/Research/Reddit_Credibility/Data/dd_posts_stats.csv'
)

In [34]:
# Write the author <-> authorID lookup table so the IDs can be mapped back to authors.
authorsdf.to_csv(
    path_or_buf='/Volumes/Elements/Research/Reddit_Credibility/Data/authorID_link.csv'
)

#-----------