### Step 4: Find Influencer Relevant Posts
This step finds all posts (comments and submissions) made by the influential users in the identified subreddits within a provided timeframe. If more posts are found than the `post_limit` parameter allows, the average cosine similarity between each post and the item's best Wikipedia categories (`best_wiki_cats`) is used to rank the posts, and only the top `post_limit` posts move on to the next step of the process.

In [None]:
import praw 
import pandas as pd
import time
import datetime as dt
import pmaw
from datetime import datetime, timezone, timedelta, date

import pickle
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
import numpy as np

from sentence_transformers import SentenceTransformer, util

pd.set_option('display.max_rows', 100)
pd.set_option('display.max_columns', 100)

In [None]:
#Scratch cell: preview the MM/DD/YYYY strings for today and tomorrow, the same
#format that mdy_to_timestamp parses
dt.date.today().strftime("%m/%d/%Y")
(dt.date.today() + dt.timedelta(days=1)).strftime("%m/%d/%Y")  #'3/1/2022'

'04/12/2022'

In [None]:
## reddit and app credentials
#Secrets must never be committed with the notebook, so they are read from
#environment variables instead of being written here as literals.
#NOTE: the values that were previously hard-coded in this cell should be
#      treated as leaked and rotated in the Reddit app settings.
import os
import warnings

REDDIT_USERNAME = os.environ.get('REDDIT_USERNAME', '')
REDDIT_PASSWORD = os.environ.get('REDDIT_PASSWORD', '')
APP_ID = os.environ.get('REDDIT_APP_ID', '')
APP_SECRET = os.environ.get('REDDIT_APP_SECRET', '')
APP_NAME = 'next-big-thing'  #not a secret, safe to keep in source

#Warn (rather than raise) so the remaining cells can still be defined and the
#pre-aggregated MVP mode can run without Reddit access.
_unset_credentials = [env_name for env_name, env_value in (
    ('REDDIT_USERNAME', REDDIT_USERNAME),
    ('REDDIT_PASSWORD', REDDIT_PASSWORD),
    ('REDDIT_APP_ID', APP_ID),
    ('REDDIT_APP_SECRET', APP_SECRET),
) if not env_value]
if _unset_credentials:
    warnings.warn('Missing Reddit credential environment variables: '
                  + ', '.join(_unset_credentials))

In [None]:
#PRAW client for the Reddit API. Only the app id/secret and a user agent are
#supplied (the account password is not passed), and the Reddit username
#doubles as the user-agent string.
reddit = praw.Reddit(
    client_id=APP_ID,
    client_secret=APP_SECRET,
    user_agent=REDDIT_USERNAME,
)

#Pushshift client (through pmaw) used by the comment/submission search helpers
pmaw_api = pmaw.PushshiftAPI()

In [None]:
#Load Sentence Transformer model
#The 'all-MiniLM-L12-v2' model is loaded once into the module-level `model`;
#get_post_cos_sim_score calls model.encode(...) on it to embed the posts and
#the texts they are compared against.
model = SentenceTransformer('all-MiniLM-L12-v2')

In [None]:
def mdy_to_timestamp(value: str):
    ''' Convert a month/day/year date string into whole epoch seconds

        Inputs: value - date string in "%m/%d/%Y" format (e.g. '04/12/2022')

        Output: integer timestamp for midnight of that date; the parsed datetime
                is naive, so it is interpreted in the machine's local timezone  '''
    parsed = dt.datetime.strptime(value, "%m/%d/%Y")
    epoch_seconds = parsed.timestamp()
    return int(epoch_seconds)

def get_influencer_comments_pmaw(author, subreddits, start_date, end_date,limit):
    ''' Collect one author's comments from a set of subreddits through pmaw/pushshift

        Inputs: author - string containing influential user
                subreddits - list of subreddits found for our item; each one is
                             searched separately
                start_date - "%m/%d/%Y" string, sent to pushshift as `after`
                end_date - "%m/%d/%Y" string, sent to pushshift as `before`
                limit - max number of comments requested per subreddit search

        Output: dataframe with one row per comment holding the requested pushshift
                fields plus 'created' (YYYY-MM-DD), 'type' ('comment') and
                'text_to_process' (the comment body); an empty dataframe when
                nothing was found  '''

    #Note: Consider adding search term in here as well using q

    window_end = mdy_to_timestamp(end_date)
    window_start = mdy_to_timestamp(start_date)

    #One search per subreddit; keep each response and flatten afterwards
    responses = [
        pmaw_api.search_comments(subreddit=sub_name, author=author, limit=limit,
                                 before=window_end, after=window_start,
                                 filter=['body','id','score','author','subreddit','created_utc'])
        for sub_name in subreddits
    ]

    rows = []
    for response in responses:
        rows.extend(response)

    #Nothing found for this author - hand back an empty frame
    if not rows:
        return pd.DataFrame()

    def to_day(epoch):
        #Epoch seconds -> local calendar day
        return dt.datetime.fromtimestamp(int(epoch)).strftime('%Y-%m-%d')

    df = pd.DataFrame(rows)
    df['created'] = df['created_utc'].apply(to_day)
    df['author'] = author
    df['type'] = 'comment'
    #Move the comment text into the column shared with submissions
    df['text_to_process'] = df.pop('body')

    return df

def get_influencer_comments_w_submission(author, subreddits, start_date, end_date,limit):
    ''' Return an author's comments in the given subreddits within a time period

        Currently a thin wrapper around get_influencer_comments_pmaw: the praw
        lookup of each comment's original submission title is switched off.

        Inputs: author - string containing influential user
                subreddits - list of subreddits found for our NBT item
                start_date - first date to include for comments
                end_date - last date to include for comments
                limit - forwarded unchanged to get_influencer_comments_pmaw

        Output: comments_df - comments created by the author in the subreddits  '''

    #NOTE: fetching the submission title for every comment, i.e.
    #      reddit.comment(id).submission.title, and building a 'title_and_body'
    #      column from it was taking too long, so that step was dropped. This
    #      function and get_influencer_comments_pmaw can be condensed into one.
    return get_influencer_comments_pmaw(author, subreddits, start_date, end_date, limit)

In [None]:
def get_influencer_submissions(author, subreddits, start_date, end_date,limit):
    ''' Use pushshift to get submissions based on author and subreddits within a time period

        Inputs: author - string containing influential user
                subreddits - list of subreddits found for our item; each one is
                             searched separately
                start_date - "%m/%d/%Y" string, sent to pushshift as `after`
                end_date - "%m/%d/%Y" string, sent to pushshift as `before`
                limit - max number of submissions requested per subreddit search

        Output: df - submissions created by the author in the subreddits, with
                'created' (YYYY-MM-DD), 'type' ('submission') and 'text_to_process'
                (title + '.' + selftext) columns; an empty dataframe when nothing
                was found  '''

    #Note: Consider adding search term in here as well using q

    before = mdy_to_timestamp(end_date)
    after = mdy_to_timestamp(start_date)

    submission_list = []
    for subreddit in subreddits:
        #print('Finding submissions in the {} subreddit...'.format(subreddit))
        submissions = pmaw_api.search_submissions(subreddit=subreddit, author=author, limit=limit, 
                                before=before, after=after,
                                filter=['selftext','id','score','author','title','subreddit',
                                        'created_utc'])
        submission_list.extend(submissions)

    #If no submissions are returned, exit
    if not submission_list:
        return pd.DataFrame()  
    
    df = pd.DataFrame(submission_list)
    
    df['created'] = df['created_utc'].apply(lambda x: dt.datetime.fromtimestamp(int(x)).strftime('%Y-%m-%d'))
    df['author'] = author
    df['type'] = 'submission'

    #A record that lacks 'title' or 'selftext' shows up as NaN (or the whole column
    #is absent), and NaN + str would turn text_to_process into NaN for that post.
    #Treat a missing piece as an empty string so every row keeps usable text.
    for column in ('title', 'selftext'):
        if column not in df.columns:
            df[column] = ''
    df['text_to_process'] = df['title'].fillna('') + '.' + df['selftext'].fillna('')
    df = df.drop(columns=['title','selftext'])

    return df



In [None]:
def get_relevant_posts(orig_entry, authors, subreddits, start_date, end_date, wiki_categories, 
                        per_author_limit, post_limit, mvp_flag):
    '''
    This function retrieves comments and submissions for a group of Reddit authors
    in specific subreddits based on a start date and end date. The number of comments/submissions
    retrieved per author is limited by the per_author_limit parameter. Once all posts for all 
    authors are retrieved, and only if there are more of them than post_limit, a cosine 
    similarity calculation is performed against the wikipedia categories for the original item 
    entered by the user and the top post_limit posts are kept. The result is also written to
    ./output_step4/posts_<orig_entry>.pickle so it can be reloaded later in MVP mode.

    Inputs: 
        orig_entry - string containing original item entered by our user for the NBT
        authors - list of reddit users (duplicates are ignored)
        subreddits - list of subreddits
        start_date - start date to use for posts ("%m/%d/%Y")
        end_date - end date to use for posts ("%m/%d/%Y")
        wiki_categories - list of wikipedia categories for the original item entered by the user,
                          with or without the leading 'Category:'
        per_author_limit - max number of comments/submissions (separately) for each author in 
                           a subreddit
        post_limit - max number of posts to return
        mvp_flag - indicates if we are running a pre-aggregated data set for our minimum viable
                   product; when True nothing is retrieved and the pickle file is loaded instead

    Output: 
        relevant_posts - a list of relevant comments and submissions (text only); an empty list
                         when nothing was found or the MVP pickle file could not be read
    '''
    import os  #function-scope import: only needed to create the output directory

    def now_str():
        #Timestamp for the progress messages (fixed UTC-4 offset, labelled 'EST')
        return datetime.now(timezone(timedelta(hours=-4), 'EST')).strftime('%Y-%m-%d %H:%M:%S')

    def collect(fetch_for_author, label):
        #Run one retrieval per author and stack the non-empty results in one dataframe
        found = []
        for author in authors:
            print('Finding {} for {}...'.format(label, author))
            author_df = fetch_for_author(author)
            if len(author_df) > 0:
                found.append(author_df)
        combined = pd.concat(found) if found else pd.DataFrame()
        print('Number of relevant {}: '.format(label), len(combined))
        return combined

    #Set pickle file name, will either be creating it or loading it
    orig_entry_mod = orig_entry.replace(' ','_')
    file_name = './output_step4/posts_' + orig_entry_mod + '.pickle'

    #If we are using a pre-aggregated comment set, load its pickle file
    if mvp_flag:
        print('MVP mode - loading pickle file...')

        #NOTE: pickle.load can execute arbitrary code - only load files written by this step
        try:
            with open(file_name, 'rb') as f:
                relevant_posts = pickle.load(f)
        except (OSError, EOFError, pickle.UnpicklingError):
            #Missing/unreadable/corrupt file; anything else is a real bug and should surface
            print('Unable to find pickle file for', orig_entry)
            relevant_posts = []

        return relevant_posts

    #Remove duplicate users in our influential users list (first occurrence wins, so the
    #retrieval order is the same on every run)
    authors = list(dict.fromkeys(authors))

    print('Start time for retrieving comments: ', now_str())

    #Remove 'Category:' from beginning of wikipedia categories - only when it is really
    #there, so categories passed without the prefix are not truncated
    prefix = 'Category:'
    wiki_categories = [cat[len(prefix):] if cat.startswith(prefix) else cat
                       for cat in wiki_categories]

    #Get comments for our influential users
    comments_df = collect(
        lambda author: get_influencer_comments_w_submission(
            author, subreddits, start_date, end_date, per_author_limit),
        'comments')

    #Get submissions for our influential users
    print('Start time for retrieving submissions: ', now_str())
    submissions_df = collect(
        lambda author: get_influencer_submissions(
            author, subreddits, start_date, end_date, per_author_limit),
        'submissions')

    #Put comments and submissions together
    frames = [frame for frame in (comments_df, submissions_df) if len(frame) > 0]
    posts_df = pd.concat(frames) if frames else pd.DataFrame()

    #If we have too many posts, evaluate relevance and rank
    print('Ranking posts start time: ', now_str())

    if len(posts_df) == 0:
        print('No posts for these authors during the time period')
        relevant_posts = []
    elif len(posts_df) > post_limit:
        #Find average cosine similarity between posts and all wikipedia categories
        wiki_categories_avg_score_df = get_post_cos_sim_score(wiki_categories, posts_df)
        relevant_posts = list(wiki_categories_avg_score_df['post'])[:post_limit]
    else:
        #Return all posts
        relevant_posts = list(posts_df['text_to_process'])

    #Create pickle file from our posts; make sure the folder exists first so a long
    #retrieval is not lost to a missing directory
    os.makedirs(os.path.dirname(file_name), exist_ok=True)
    with open(file_name, 'wb') as f:
        pickle.dump(relevant_posts, f)

    print('Ranking posts end time: ', now_str())

    return relevant_posts

In [None]:
# def get_post_cos_sim_score(list_of_text, posts_df, text_type):
#     '''Computes cosine similarity for all posts against each item in a list, then calculates
#        the average cosine similarity for each post. Returns a dataframe where the posts are 
#        sorted by descending cosine similiarity

#        Inputs:
#        list_of_text - list of strings that will be used to compare to each post
#        post_df - dataframe of reddit comments and submissions
#        text_type - indicates what the list_of_text represents, at this point we are only using
#                    wiki_categories as this proved to have the best results

#        Output:
#        final_df - dataframe of posts sorted by descending cosine similarity 
#     '''
#     cosine_result = []
    
#     posts = list(posts_df['text_to_process'])
#     posts_embeddings = model.encode(posts)

#     for text in list_of_text: 
#         text_embedding= model.encode(text)

#         for i, post in enumerate(posts):
#             cos_similarity = util.pytorch_cos_sim(text_embedding, posts_embeddings[i]).numpy()[0][0]
#             cosine_result.append([text, post,cos_similarity])

#     #Assign column name based on type of data that was sent in
#     cos_df = pd.DataFrame(cosine_result,columns=[text_type,'post','cosine_similarity'])

#     final_df = cos_df.groupby(['post']).mean().sort_values(by='cosine_similarity',ascending=False)

#     final_df = final_df.rename(columns={'cosine_similarity':text_type + 'cos_sim'})
#     #print(final_df)
    
#     return final_df

In [None]:
def get_post_cos_sim_score(list_of_text, posts_df):
    '''Computes cosine similarity for all posts against each item in a list, then calculates
       the average cosine similarity for each post. Returns a dataframe where the posts are 
       sorted by descending cosine similarity

       Inputs:
       list_of_text - list of strings that will be used to compare to each post
       posts_df - dataframe of reddit comments and submissions; the text is read from its
                  'text_to_process' column

       Output:
       final_df - dataframe with columns 'post_num' (position of the post in posts_df),
                  'post' (its text) and 'cosine_similarity' (average over list_of_text),
                  sorted by descending cosine similarity; empty when there are no posts
                  or no texts to compare against
    '''
    result_columns = ['post_num', 'post', 'cosine_similarity']

    posts = list(posts_df['text_to_process'])
    texts = list(list_of_text)

    #Nothing to score - return the same column layout without touching the model
    if not posts or not texts:
        return pd.DataFrame(columns=result_columns)

    posts_embeddings = model.encode(posts)
    text_embeddings = model.encode(texts)

    #One call gives the full similarity matrix: one row per text, one column per post
    similarity = util.pytorch_cos_sim(text_embeddings, posts_embeddings).numpy()

    #Average over the texts directly. The earlier groupby(...).mean() also tried to average
    #the string 'text' column, which raises a TypeError on pandas 2.x.
    final_df = pd.DataFrame({
        'post_num': range(len(posts)),
        'post': posts,
        'cosine_similarity': similarity.mean(axis=0),
    })

    return final_df.sort_values(by='cosine_similarity', ascending=False)

In [None]:
#No longer using this
# def vectorize_wiki_articles(articles_pickle_file, vectorizer_type):
#     '''This function creates an average vector over all wikipedia articles in a pickle file
    
#     Inputs:
#     articles_pickle_file - contains a dictionary for each wikipedia article related to our item
#     vectorizer_type - indicates which vectorizer should be used
    
#     Outputs:
#     avg_article_vector - a single row containing an average of each feature over all articles
#     vectorizer - trained vectorizer'''
    
#     #Load in pickle file
#     with open(articles_pickle_file, 'rb') as f:
#         articles = pickle.load(f)
    
#     articles_values = [v for k,v in articles.items()]

#     #vectorizer = CountVectorizer(stop_words = 'english', max_df = 0.8)#,ngram_range = (1,2))
#     vectorizer = vectorizer_type(stop_words = 'english', max_df = 0.8)#,ngram_range = (1,2))
#     article_vectors = vectorizer.fit_transform(articles_values)
#     avg_article_vector = np.asarray(article_vectors.mean(axis=0))
    
#     return avg_article_vector, vectorizer


In [None]:
#No longer using this
# def get_articles_post_cos_sim(avg_article_vector, vectorizer, posts_df):

#     vectorized_posts = vectorizer.transform(posts_df['title_and_body'])
#     print(vectorized_posts.shape)

#     article_post_cos_sim_result = []
#     for post in vectorized_posts:
#         article_post_cos_sim_result.append(cosine_similarity(avg_article_vector, post)[0][0])


#     df = posts_df[['title_and_body']]
#     df['article_post_cos_sim_result'] = article_post_cos_sim_result
#     df = df.sort_values('article_post_cos_sim_result', ascending = False)
#     return df

### Main Program

In [None]:
# print('Start time: ',datetime.now(timezone(timedelta(hours=-4), 'EST')).strftime('%Y-%m-%d %H:%M:%S'))

# orig_entry = 'Squid Game'
# #authors = ['ZockerDog619','92sn','avidwatcherz']
# # authors = ['Pizzacakecomic','None','avidwatcherz','ZockerDog619','92sn','palanquin_dva','SQUID_FUCKER',
# #  'Invoke-the-Sunbird','MarvelsGrantMan136','Sprizys','thekyledavid','SolomonCRand','TrickyTalon','reddituser45673',
# #  'Ok_Bite8099','tired_succulent','Alberts_Hat','spencermiddleton','Unknown_User_66','sergiocamposnt']

# authors = ['asilvertintedrose',
#  'AutoLovepon',
#  'WhoiusBarrel',
#  'Lovro26',
#  'AutoShonenpon',
#  'realrimurutempest',
#  'Celized',
#  'elfratar',
#  'DekMelU',
#  'magnwn',
#  'Ok-Okra-5033',
#  'Ragernarate',
#  'zenzen_0',
#  'meh_potato',
#  'SocialLoser739',
#  'Yaggamy',
#  'Mhogen',
#  'Ghoste-Face',
#  'LeonKevlar',
#  'dorkmax_executives',
#  'Kirosh2',
#  'Turbostrider27',
#  'hell-schwarz',
#  'shanks_you',
#  'Hachirumi',
#  'OTPh1l25',
#  'DogusEUW',
#  'Prince-Dizzytoon',
#  'MarvelsGrantMan136',
#  'MD_AM',
#  'Abysswatcherbel',
#  'Mazen141',
#  'Aerodynamic41',
#  'Rulaku',
#  '_non_royal',
#  'Kezja',
#  'Se7en_Sinner',
#  'TurkeyPhat',
#  'Zaibatsu_HQ',
#  'fanime693',
#  'Ni7roM',
#  'xX_Edgyname_Xx',
#  'EdSaPro',
#  'Xanirran',
#  'steven4869',
#  'Aniboy43',
#  'pietya',
#  'Sneakynation',
#  'Fools_Requiem',
#  'Shimmering-Sky']

# subreddits = ['korea',
#  'squidgame',
#  'KDRAMA',
#  'MangaCollectors',
#  'NANIKPosting',
#  'manga',
#  'yourturntodie',
#  'ObscureMedia',
#  'anime',
#  'GlobalOffensive',
#  'WoT',
#  'scifi',
#  'ActionFigures',
#  'television',
#  'Sonsofanarchy',
#  'startrek',
#  'dvdcollection',
#  'DunderMifflin',
#  'XboxSeriesS',
#  'Documentaries']
 
# wiki_categories = ['2021 South Korean television series debuts', 'South Korean action television series', 'South Korean horror fiction television series', 'South Korean thriller television series']
# ['Category:2021 South Korean television series debuts',
#   'Category:South Korean action television series',
#   'Category:South Korean horror fiction television series',
#   'Category:South Korean thriller television series',
#   'Category:Television shows set in Seoul']

# # wiki_summary = ['''Squid Game (Korean: 오징어 게임; RR: Ojing-eo Geim) is a South Korean survival drama television series created by Hwang Dong-hyuk for Netflix. Its cast includes Lee Jung-jae, Park Hae-soo, Wi Ha-joon, HoYeon Jung, O Yeong-su, Heo Sung-tae, Anupam Tripathi, and Kim Joo-ryoung.
# # The series revolves around a contest where 456 players, all of whom are in deep financial debt, risk their lives to play a series of deadly children's games for the chance to win a ₩45.6 billion[a] prize. The title of the series draws from a similarly named Korean children's game. Hwang had conceived of the idea based on his own economic struggles early in life, as well as the class disparity in South Korea. Though he had initially written it in 2009, he was unable to find a production company to fund the idea until Netflix took an interest around 2019 as part of their drive to expand their foreign programming offerings.
# # Squid Game was released worldwide on September 17, 2021, to critical acclaim and international attention. It is Netflix's most-watched series, becoming the top-viewed program in 94 countries and attracting more than 142 million member households and amassing 1.65 billion viewing hours during its first four weeks from launch, surpassing Bridgerton for the title of most watched show. The series has also received numerous accolades, including the Golden Globe Award for Best Supporting Actor – Series, Miniseries or Television Film for O Yeong-su and the Screen Actors Guild Award for Outstanding Performance by a Male Actor in a Drama Series and Outstanding Performance by a Female Actor in a Drama Series for Lee Jung-jae and HoYeon Jung, respectively, with all three making history as the first Korean actors to win in those categories. A second season is in development.''']


# per_author_limit=500
# post_limit = 2000

#Set dates to today and today - 30 days (api uses after start_date and before end_date)
# end_date = (date.today() + timedelta(days=1)).strftime("%m/%d/%Y") 
# start_date = (date.today() - timedelta(days=31)).strftime("%m/%d/%Y") 
# mvp_flag = False #True uses an existing pickle file

# #Get relevant posts for our influential users
# print('Retrieving posts start time: ',datetime.now(timezone(timedelta(hours=-4), 'EST')).strftime('%Y-%m-%d %H:%M:%S'))
# relevant_posts = get_relevant_posts(orig_entry, authors, subreddits, start_date, end_date,wiki_categories, per_author_limit, post_limit,mvp_flag)
# print('Retrieving posts end time: ',datetime.now(timezone(timedelta(hours=-4), 'EST')).strftime('%Y-%m-%d %H:%M:%S'))


# print('End time: ',datetime.now(timezone(timedelta(hours=-4), 'EST')).strftime('%Y-%m-%d %H:%M:%S'))


In [None]:
#relevant_posts


In [None]:
#len(relevant_posts)

In [None]:


# #Evaluate posts relevance
# if len(posts_df) > 0:
    
#     #Find average cosine similarity between posts and all wikipedia categories
#     wiki_categories_avg_score_df = get_post_cos_sim_score(wiki_categories, posts_df, 'wiki_categories')
    
#     # #The evaluation was completed and the best cosine similarity comes from the wikipedia 
#     # # categories
#     # #Find cosine similarity between posts and the wikipedia summary
#     # wiki_summary_score_df = get_post_cos_sim_score(wiki_summary, posts_df, 'wiki_summary')
    
#     # #Find cosine similarity between posts and all wikipedia articles using count vectorizer
#     # avg_article_vector, vectorizer = vectorize_wiki_articles('/work/wiki_100summaries_Squid_Game.pkl',CountVectorizer)
#     # wiki_articles_count_score_df = get_articles_post_cos_sim(avg_article_vector, vectorizer, posts_df)
#     # wiki_articles_count_score_df = wiki_articles_count_score_df.rename(columns={'article_post_cos_sim_result':'article_post_count_cos_sim_result'})
    
#     # #Find cosine similarity between posts and all wikipedia articles using tfidf vectorizer
#     # avg_article_vector, vectorizer = vectorize_wiki_articles('/work/wiki_100summaries_Squid_Game.pkl',TfidfVectorizer)
#     # wiki_articles_tfidf_score_df = get_articles_post_cos_sim(avg_article_vector, vectorizer, posts_df)
#     # wiki_articles_tfidf_score_df = wiki_articles_tfidf_score_df.rename(columns={'article_post_cos_sim_result':'article_post_tfidf_cos_sim_result'})

# else:
#     print('No comments for these authors during the time period')


In [None]:
#Comparison is complete - no longer needed
# comparison_df = wiki_categories_avg_score_df.merge(wiki_summary_score_df, how = 'inner', 
#                         left_index = True, right_index = True)

# comparison_df = comparison_df.merge(wiki_articles_count_score_df, how = 'inner', left_on = comparison_df.index, 
#                         right_on = 'title_and_body')

# comparison_df = comparison_df.merge(wiki_articles_tfidf_score_df, how = 'inner', on = 'title_and_body')

# comparison_df.to_csv('/work/output/compare_post_cosine_similarities.csv')

# comparison_df


<a style='text-decoration:none;line-height:16px;display:flex;color:#5B5B62;padding:10px;justify-content:end;' href='https://deepnote.com?utm_source=created-in-deepnote-cell&projectId=90b052a7-f47d-474e-888f-9345355cfd9a' target="_blank">
Created in <span style='font-weight:600;margin-left:4px;'>Deepnote</span></a>