In [None]:
import numpy as np 
import pandas as pd
import praw
import requests
from datetime import datetime
import matplotlib.pyplot as plt
import pickle
import config
#import networkx as nx


In [None]:
# Create the PRAW client that the rest of the notebook talks to Reddit through.
# All credentials are read from the local `config` module rather than written here.
# NOTE(review): the user agent is taken from config.REDDIT_USERNAME - confirm that
# value is a sufficiently descriptive user-agent string for the Reddit API.
reddit = praw.Reddit(
    user_agent=config.REDDIT_USERNAME,
    client_secret=config.APP_SECRET,
    client_id=config.APP_ID,
)

## Get Recent Posts from Relevant Subreddits

In [None]:
def get_recent_posts(subred_list, n, max_com):
    '''
    Collect the top submissions of the past month from each subreddit in subred_list,
    together with each submission's (top-sorted) comment forest.

    Inputs:
        subred_list - iterable of subreddit names
        n - maximum number of submissions requested per subreddit (the `limit` of subreddit.top)
        max_com - `limit` passed to replace_more on every submission's comment forest

    Output:
        sub_dict - dict mapping each subreddit's id to the name it was requested with
        posts_df - DataFrame with one row per submission and the columns
                   id, author, score, upvote_ratio, num_comments, subreddit, created, comments.
                   `created` is an ISO-8601 string in UTC; `comments` holds the comment forest object.
    '''
    # Local import: the notebook's import cell only brings in `datetime` itself.
    from datetime import timezone

    columns = ['id','author', 'score','upvote_ratio' ,'num_comments','subreddit','created','comments']
    posts_info = []
    sub_dict = {}
    for subred in subred_list:
        subreddit = reddit.subreddit(subred)
        sub_dict[subreddit.id] = subred
        print('gathering submissions from',subred,'subreddit...')

        for subm in subreddit.top(time_filter="month", limit=n):
            # The sort order is set before the comments are touched so the forest is fetched "top" first.
            subm.comment_sort = "top"
            subm.comments.replace_more(limit=max_com)

            # created_utc is a UTC epoch; convert explicitly in UTC so the trailing 'Z'
            # is truthful regardless of the timezone of the machine running the notebook.
            created = datetime.fromtimestamp(subm.created_utc, tz=timezone.utc)

            posts_info.append([
                subm.id,
                str(subm.author),
                subm.score,
                subm.upvote_ratio,
                subm.num_comments,
                subred,
                created.strftime('%Y-%m-%dT%H:%M:%SZ'),
                subm.comments,
            ])
        print(len(posts_info), 'posts gathered so far...')

    # NOTE(review): index 1 is the author column, so rows are ordered by author name
    # (descending), not by score - confirm this is the intended ordering.
    sorted_info = sorted(posts_info, key=lambda row: row[1], reverse=True)
    posts_df = pd.DataFrame(sorted_info, columns=columns)
    print('complete with',posts_df.num_comments.sum(),'comments and',len(posts_df),'submissions')
    return sub_dict, posts_df

## Get Influencers

In [None]:
def _rank_authors(rank_df, min_occur, exclude=()):
    '''
    Aggregate normalized scores per author and rank authors by total score (highest first).

    Inputs:
        rank_df - DataFrame with at least the columns 'author', 'score' and 'id'
        min_occur - an author is kept only when they have MORE than this many rows in rank_df
        exclude - authors dropped before the occurrence filter is applied

    Output:
        DataFrame indexed by author with columns 'count' and 'score', sorted by 'score' descending
    '''
    agg = pd.pivot_table(rank_df, index=['author'], aggfunc={'score': 'sum', 'id': len}).rename(columns={'id': 'count'})
    if len(exclude) > 0:
        agg = agg[~agg.index.isin(exclude)]
    agg = agg[agg['count'] > min_occur]
    return agg.sort_values(by='score', ascending=False)


def get_influencers(subred_list, num_submissions, num_influencers, min_occur, mvp_flag, load_subm, orig_entry, max_com, equal_sub, output_dir='/work/MADS_698_Capstone/output_step3/'):

    '''
    This function retrieves a ranked list of redditors (influencers) within relevant subreddits related
    to the category of the search term.

    Inputs:
        subred_list - list of relevant subreddits
        num_submissions - number of submissions retrieved for each subreddit
        num_influencers - number of additional influencers taken from EACH subreddit when equal_sub is true
                          (it has no effect on the returned list when equal_sub is false)
        min_occur - a redditor must appear MORE than this many times (comments + submissions) to be ranked
        mvp_flag - if true, only load and return the pre-aggregated influencer pickle for orig_entry
        load_subm - if true, load previously pickled submissions instead of querying Reddit
        orig_entry - search term used for naming/finding pickle files (spaces become underscores)
        max_com - forwarded to get_recent_posts as the replace_more limit for each submission
        equal_sub - if true, up to num_influencers not-yet-selected authors are appended from each
                    subreddit in subred_list
        output_dir - directory the pickle files are read from / written to

    Output:
        influencer_lst - list of author names: the 25 top-scoring authors, then the 50 authors with the
                         highest comment karma among the remaining top 250, then any per-subreddit
                         additions. An empty list is returned when a required pickle file cannot be loaded.
    '''
    # Local import: `os` is not part of the notebook's import cell.
    import os

    base = output_dir if output_dir.endswith('/') else output_dir + '/'
    orig_entry_mod = orig_entry.replace(' ','_')
    file_name = base + 'influencers_' + orig_entry_mod + '.pickle'

    if mvp_flag:
        print('MVP mode - loading pickle file...')
        # NOTE: pickle can execute arbitrary code on load - only load files this notebook produced.
        try:
            with open(file_name, 'rb') as f:
                influencer_lst = pickle.load(f)
        except Exception as err:
            # Previously this fell through and crashed on an unbound influencer_lst.
            print('Unable to find pickle file for ', orig_entry)
            print('  reason:', repr(err))
            return []
        print('pickle file loaded.')
        return influencer_lst

    #retrieve top n=num_submissions recent posts from each subreddit (or their cached copy)
    subm_filename = base + 'influencers_submissions_' + orig_entry_mod + '.pickle'
    dict_filename = base + 'influencer_submissions/influencers_dict_' + orig_entry_mod + '.pickle'
    if load_subm:
        try:
            with open(dict_filename, 'rb') as f:
                sub_dict = pickle.load(f)
            subm_df = pd.read_pickle(subm_filename)
        except Exception as err:
            # Previously this fell through and crashed on an unbound subm_df.
            print('Unable to find pickle file for ', orig_entry)
            print('  reason:', repr(err))
            return []
    else:
        sub_dict, subm_df = get_recent_posts(subred_list, num_submissions,max_com)
        # Make sure the cache sub-directory exists so a long crawl is not lost on write.
        os.makedirs(os.path.dirname(dict_filename), exist_ok=True)
        with open (dict_filename, 'wb') as f:
            pickle.dump(sub_dict, f)
        subm_df.to_pickle(subm_filename)

    #normalize submission scoring by total of score for submissions
    rank_subm = subm_df[['id','score','author','subreddit']].assign(type='submission')
    rank_subm['score'] = rank_subm['score']/rank_subm['score'].sum()

    #flatten the comments stored with each submission (iterating a forest yields its direct members)
    com_info = [
        [com.id,com.score,str(com.author),com.body,com.subreddit_id,com.parent_id,'comment']
        for com_list in subm_df['comments']
        for com in com_list
    ]
    com_df = pd.DataFrame(com_info, columns=['id','score','author','body','subreddit_id','parent_id','type'])

    #comments carry a 't5_'-prefixed subreddit id; map it back to the subreddit name
    sub_map = {'t5_' + str(key): val for key, val in sub_dict.items()}
    com_df['subreddit'] = com_df['subreddit_id'].map(sub_map)

    #filter out comments with no author or comments that have been removed or deleted
    com_df = com_df[com_df['author'] != 'None']
    com_df = com_df[com_df['author'] != 'AutoModerator']
    com_df = com_df[~com_df['body'].isin(['[removed]','[deleted]'])]

    #normalize comment scoring by total of score for comments
    rank_com = com_df[['id','score','author','subreddit','type']].copy()
    rank_com['score'] = rank_com['score']/rank_com['score'].sum()

    #combined normalized rankings for submissions and comments
    rank_df = pd.concat([rank_com, rank_subm], axis=0)

    #aggregate submission and comment scores by author
    rank_df_agg = _rank_authors(rank_df, min_occur)

    #take top 25 scoring authors to include in influencer list
    author_lst_top25_score = list(rank_df_agg.index[:25])
    author_df_top250 = rank_df_agg[:250].copy()

    print('collecting comment karma for top 250 authors...')
    karma_list = []
    for author in list(author_df_top250.index):
        try:
            karma_list.append(reddit.redditor(author).comment_karma)
        except Exception:
            # Best effort: an author whose karma cannot be fetched counts as 0.
            # (Exception rather than a bare except, so Ctrl-C still interrupts the loop.)
            karma_list.append(0)

    author_df_top250['karma'] = karma_list
    karma_df = author_df_top250[~author_df_top250.index.isin(author_lst_top25_score)]
    karma_df = karma_df.sort_values(by='karma', ascending=False)

    author_lst_top50_karma = list(karma_df.index[:50])

    influencer_lst = author_lst_top25_score + author_lst_top50_karma
    print(len(influencer_lst),'influencers found so far...')

    # The overall ranking is what gets saved as author_df unless a per-subreddit ranking
    # replaces it below; this also keeps author_df bound when subred_list is empty.
    author_df = rank_df_agg
    if equal_sub:
        for sub in subred_list:
            author_df = rank_df[rank_df['subreddit'] == sub]
            if len(author_df) == 0:
                continue
            #rank this subreddit's authors, skipping anyone already selected
            author_df = _rank_authors(author_df, min_occur, exclude=influencer_lst)
            influencer_lst = influencer_lst + list(author_df.index)[:num_influencers]
            print('Top',num_influencers,'influencers added from',sub,'subreddit.',len(influencer_lst),'found so far.')

    inf_com_df = com_df[com_df['author'].isin(influencer_lst)]

    #Create pickle file from our list of influencers
    with open (file_name, 'wb') as f:
        pickle.dump(influencer_lst, f)

    ## save dataframes for analysis
    df_list = [inf_com_df, author_df, com_df, rank_com, rank_subm, author_df_top250]
    df_names = ['inf_com_df', 'author_df', 'com_df', 'rank_com', 'rank_subm','author_df_top250']
    for df_name, df in zip(df_names, df_list):
        df.to_pickle(base + df_name + '_' + orig_entry_mod + '.pickle')

    print('pickle file of influencers has been created with',len(set(influencer_lst)),'redditors')
    return influencer_lst

In [None]:
#num_submissions = 50
#num_influencers = 5
#min_occur = 2
#max_com = 0
#subred_list = []
#search_term = 'Elon Musk'

In [None]:
#influencer_lst = get_influencers(subred_list, num_submissions, num_influencers, min_occur, True, False,search_term, max_com,True)

<a style='text-decoration:none;line-height:16px;display:flex;color:#5B5B62;padding:10px;justify-content:end;' href='https://deepnote.com?utm_source=created-in-deepnote-cell&projectId=90b052a7-f47d-474e-888f-9345355cfd9a' target="_blank">
 </img>
Created in <span style='font-weight:600;margin-left:4px;'>Deepnote</span></a>