### Step 4: Find Influencer Relevant Posts
This step finds all posts (comments and submissions) made by the influential users in the identified subreddits within a provided timeframe. If more posts are found than the post limit parameter allows, each post's average cosine similarity to the best_wiki_cats is used to rank the posts, and only the top-ranked posts move on to the next step of the process.

In [None]:
import praw 
import pandas as pd
import time
import datetime as dt
import pmaw
from datetime import datetime, timezone, timedelta, date

import pickle

from sentence_transformers import SentenceTransformer, util


In [None]:
## reddit and app credentials
# SECURITY: credentials should not live in source control. Each value can be
# supplied through an environment variable; the literals below are only
# fallbacks kept so existing runs behave as before. Rotate these secrets and
# remove the fallbacks once the environment variables are configured.
import os

# NOTE(review): the fallback username ends with a trailing space; it is kept
# byte-for-byte because it is passed to praw as the user agent string.
REDDIT_USERNAME = os.environ.get('REDDIT_USERNAME', 'nbt_capstone ')
APP_ID = os.environ.get('REDDIT_APP_ID', '38HKH06bLOdSlpZIVtO5-w')
APP_SECRET = os.environ.get('REDDIT_APP_SECRET', 'dmU0nd_1tYIDn9AldwjEDgYU-hj-Lw')
APP_NAME = os.environ.get('REDDIT_APP_NAME', 'nextbigthing')
PASSWORD = os.environ.get('REDDIT_PASSWORD', 'Capstone698!')

In [None]:
# Pushshift client (through the pmaw wrapper); used by the comment and
# submission search functions in this notebook
pmaw_api = pmaw.PushshiftAPI()

# PRAW client for the Reddit API, authenticated with the app id/secret.
# The Reddit username is what gets sent as the user agent.
reddit = praw.Reddit(
    user_agent=REDDIT_USERNAME,
    client_secret=APP_SECRET,
    client_id=APP_ID,
)

In [None]:
#Load Sentence Transformer model
# Loaded once at module level; get_post_cos_sim_score reads this global `model`
# to embed both the posts and the comparison text.
model = SentenceTransformer('all-MiniLM-L12-v2')

In [None]:
def mdy_to_timestamp(value: str):
    '''Turn a calendar date string into whole epoch seconds.

    Input - date string in mm/dd/yyyy format
    Output - integer timestamp for midnight of that date; the parsed datetime
             is naive, so it is interpreted in the machine's local timezone
    '''
    return int(dt.datetime.strptime(value, "%m/%d/%Y").timestamp())

def get_influencer_comments(author, subreddits, start_date, end_date, limit):
    ''' Query pushshift for one author's comments across several subreddits

        Inputs: author - string containing influential user
                subreddits - list of subreddits found for our item
                start_date - first date to include for comments (mm/dd/yyyy)
                end_date - last date to include for comments (mm/dd/yyyy)
                limit - max number of comments requested per subreddit search

        Output: df - one row per comment with a 'created' date string, the
                     author, type 'comment' and the comment body stored in
                     'text_to_process'; an empty dataframe if nothing is found '''

    # Search window as epoch seconds
    before = mdy_to_timestamp(end_date)
    after = mdy_to_timestamp(start_date)

    wanted_fields = ('body', 'id', 'score', 'author', 'subreddit', 'created_utc')

    # Gather the hits from every subreddit into a single flat list
    found_comments = []
    for sub_name in subreddits:
        hits = pmaw_api.search_comments(subreddit=sub_name, author=author, limit=limit,
                                        before=before, after=after,
                                        filter=list(wanted_fields))
        found_comments.extend(hits)

    # Nothing came back for this author
    if not found_comments:
        return pd.DataFrame()

    df = pd.DataFrame(found_comments)

    # Day-level creation date derived from the epoch value
    df['created'] = [dt.datetime.fromtimestamp(int(ts)).strftime('%Y-%m-%d')
                     for ts in df['created_utc']]
    df['author'] = author
    df['type'] = 'comment'
    # The body is carried forward under the common text column name
    df['text_to_process'] = df['body']

    return df.drop(columns=['body'])


In [None]:
def get_influencer_submissions(author, subreddits, start_date, end_date,limit):
    ''' Use pushshift to get submissions based on author and subreddits within a time period

        Inputs: author - string containing influential user
                subreddits - list of subreddits found for our item
                start_date - first date to include for submissions (mm/dd/yyyy)
                end_date - last date to include for submissions (mm/dd/yyyy)
                limit - max number of submissions requested per subreddit search
               
        Output: df - submissions created by the author in the subreddits, with
                     'title' and 'selftext' merged into 'text_to_process';
                     an empty dataframe if nothing is found  '''

    before = mdy_to_timestamp(end_date)
    after = mdy_to_timestamp(start_date)

    submission_list = []
    for subreddit in subreddits:
        submissions = pmaw_api.search_submissions(subreddit=subreddit, author=author, limit=limit, 
                                before=before, after=after,
                                filter=['selftext','id','score','author','title','subreddit',
                                        'created_utc'])

        #Add this subreddit's results directly to the flat list
        submission_list.extend(submissions)

    #If no submissions are returned, exit
    if len(submission_list) == 0:
        return pd.DataFrame()  
    
    df = pd.DataFrame(submission_list)

    #A record without 'title' or 'selftext' would leave NaN in the combined
    #text (or no column at all), which breaks the string concatenation and
    #anything that later expects text. Treat missing text as empty.
    for text_col in ('title', 'selftext'):
        if text_col not in df.columns:
            df[text_col] = ''
        df[text_col] = df[text_col].fillna('')
    
    df['created'] = df['created_utc'].apply(lambda x: dt.datetime.fromtimestamp(int(x)).strftime('%Y-%m-%d'))
    df['author'] = author
    df['type'] = 'submission'
    df['text_to_process'] = df['title'] + '.' + df['selftext']
    df = df.drop(columns=['title','selftext'])

    return df



In [None]:
def get_post_cos_sim_score(list_of_text, posts_df):
    '''Computes cosine similarity for all posts against each item in a list, then calculates
       the average cosine similarity for each post. Returns a dataframe where the posts are 
       sorted by descending cosine similarity

       Inputs:
       list_of_text - list of strings that will be used to compare to each post
       posts_df - dataframe of reddit comments and submissions; the text is read
                  from its 'text_to_process' column

       Output:
       final_df - dataframe with columns 'post_num', 'post' and 'cosine_similarity'
                  (the mean over list_of_text), sorted by descending cosine similarity.
                  Empty if there are no posts or no comparison text.
    '''
    result_columns = ['post_num', 'post', 'cosine_similarity']

    posts = list(posts_df['text_to_process'])

    #Nothing to score: return an empty frame with the expected columns
    #instead of encoding or averaging empty data
    if len(posts) == 0 or len(list_of_text) == 0:
        return pd.DataFrame(columns=result_columns)

    posts_embeddings = model.encode(posts)

    cosine_result = []
    for text in list_of_text: 
        text_embedding = model.encode(text)

        #One call scores this text against every post; row 0 of the result
        #holds one similarity per post, in the same order as `posts`
        similarities = util.pytorch_cos_sim(text_embedding, posts_embeddings).numpy()[0]

        for post_num, (post, cos_similarity) in enumerate(zip(posts, similarities)):
            cosine_result.append([text, post_num, post, cos_similarity])

    cos_df = pd.DataFrame(cosine_result,columns=['text','post_num','post','cosine_similarity'])

    #Average only the numeric score column. Averaging the whole frame would
    #also try to average the string 'text' column, which raises on pandas 2.x.
    final_df = (cos_df.groupby(['post_num', 'post'])['cosine_similarity']
                      .mean()
                      .reset_index()
                      .sort_values(by='cosine_similarity', ascending=False))

    return final_df

In [None]:
def _progress_timestamp():
    '''Returns the current time at a fixed UTC-4 offset as 'YYYY-MM-DD HH:MM:SS'.'''
    #NOTE(review): the zone is named 'EST' although UTC-4 corresponds to EDT;
    #the name is never printed because the format string has no %Z.
    return datetime.now(timezone(timedelta(hours=-4), 'EST')).strftime('%Y-%m-%d %H:%M:%S')


def _collect_author_posts(kind, authors, fetch_for_author):
    '''Calls fetch_for_author for each author and concatenates the non-empty results.

    kind is only used in the progress message ('comments' or 'submissions').
    Returns an empty dataframe when no author has any posts.
    '''
    frames = []
    for author in authors:
        print('Finding {} for {}...'.format(kind, author))
        author_df = fetch_for_author(author)

        #Only keep authors that actually have posts
        if len(author_df) > 0:
            frames.append(author_df)

    if len(frames) > 0:
        return pd.concat(frames)
    return pd.DataFrame()


def get_relevant_posts(orig_entry, authors, subreddits, start_date, end_date, wiki_categories, 
                        per_author_limit, post_limit, mvp_flag):
    '''
    This function retrieves comments and submissions for a group of Reddit authors
    in specific subreddits based on a start date and end date. The number of comments/submissions
    retrieved per author is limited by the per_author_limit parameter. Once all posts for all 
    authors are retrieved, a cosine similarity calculation is performed against the wikipedia
    categories for the original item entered by the user. Then, the top posts are returned
    based on the post_limit parameter. The result is also saved as a pickle file under
    ./output_step4/ so it can be reloaded later in MVP mode.

    Inputs: 
        orig_entry - string containing original item entered by our user for the NBT
        authors - list of reddit users
        subreddits - list of subreddits
        start_date - start date to use for posts (mm/dd/yyyy)
        end_date - end date to use for posts (mm/dd/yyyy)
        wiki_categories - list of wikipedia categories for the original item entered by the user;
                          a leading 'Category:' prefix is removed when present
        per_author_limit - max number of comments/submissions (separately) for each author in 
                           a subreddit
        post_limit - max number of posts to return
        mvp_flag - indicates if we are running a pre-aggregated data set for our minimum viable
                   product; when true the posts are loaded from the pickle file instead of
                   being retrieved

    Output: 
        relevant_posts - a list of relevant comments and submissions (text only); an empty
                         list if nothing was found or the pickle file could not be loaded
    '''
    import os  #local import: only needed to create the output directory

    #Set pickle file name, will either be creating it or loading it
    orig_entry_mod = orig_entry.replace(' ','_')
    file_name = './output_step4/posts_' + orig_entry_mod + '.pickle'

    #If we are using a pre-aggregated comment set, load its pickle file
    if mvp_flag:
        print('MVP mode - loading pickle file...')

        #SECURITY: pickle.load can execute arbitrary code; only load files
        #that this notebook produced itself.
        try:
            with open(file_name, 'rb') as f:
                relevant_posts = pickle.load(f)
        except FileNotFoundError:
            print('Unable to find pickle file for', orig_entry)
            relevant_posts = []
        except Exception as err:
            #Stay best-effort for unreadable/corrupt files, but report the real
            #cause and do not swallow KeyboardInterrupt/SystemExit
            print('Unable to load pickle file for', orig_entry, '-', err)
            relevant_posts = []

        return relevant_posts

    #Remove duplicate users in our influential users list, keeping first-seen
    #order so runs are reproducible
    authors = list(dict.fromkeys(authors))

    print('Start time for retrieving comments: ', _progress_timestamp())

    #Remove 'Category:' from beginning of wikipedia categories, but leave
    #categories without the prefix untouched instead of truncating them
    prefix = 'Category:'
    wiki_categories = [cat[len(prefix):] if cat.startswith(prefix) else cat
                       for cat in wiki_categories]

    #Get comments for our influential users
    comments_df = _collect_author_posts(
        'comments', authors,
        lambda author: get_influencer_comments(author, subreddits, start_date, end_date,
                                               per_author_limit))
    print('Number of influential user comments: ', len(comments_df))

    #Get submissions for our influential users
    print('Start time for retrieving submissions: ', _progress_timestamp())

    submissions_df = _collect_author_posts(
        'submissions', authors,
        lambda author: get_influencer_submissions(author, subreddits, start_date, end_date,
                                                  per_author_limit))
    print('Number of influential user submissions: ', len(submissions_df))

    #Put comments and submissions together
    posts_df = pd.concat([comments_df, submissions_df])

    #If we have too many posts, evaluate relevance and rank
    print('Ranking posts start time: ', _progress_timestamp())

    if len(posts_df) == 0:
        print('No posts for these authors during the time period')
        relevant_posts = []
    elif len(posts_df) > post_limit:
        #Find average cosine similarity between posts and all wikipedia categories
        wiki_categories_avg_score_df = get_post_cos_sim_score(wiki_categories, posts_df)
        relevant_posts = list(wiki_categories_avg_score_df['post'])[:post_limit]
    else:
        #Return all posts
        relevant_posts = list(posts_df['text_to_process'])

    #Create pickle file from our posts; make sure the output directory exists
    #so the retrieval work is not lost to a missing folder
    os.makedirs(os.path.dirname(file_name), exist_ok=True)
    with open(file_name, 'wb') as f:
        pickle.dump(relevant_posts, f)

    print('Ranking posts end time: ', _progress_timestamp())

    return relevant_posts

<a style='text-decoration:none;line-height:16px;display:flex;color:#5B5B62;padding:10px;justify-content:end;' href='https://deepnote.com?utm_source=created-in-deepnote-cell&projectId=90b052a7-f47d-474e-888f-9345355cfd9a' target="_blank">
 </img>
Created in <span style='font-weight:600;margin-left:4px;'>Deepnote</span></a>