### Step 2: Find Subreddits
This step runs combinations of the NLP category words, the Wiki term, and the wiki categories through the Reddit subreddit search API to find subreddits that are likely to discuss them. After collecting many candidate subreddits, it uses cosine similarity between each candidate (using a combination of its name, title, and description) and the wiki categories to select the "best" subreddits to interrogate for information on our Next Big Thing.

In [None]:
import os
import pickle
from datetime import datetime, timezone, timedelta, date

import numpy as np
import pandas as pd
import praw

In [None]:
### reddit and app credentials
# Secrets must not live in source control: they are read from the environment
# instead. Set REDDIT_USERNAME, REDDIT_PASSWORD, REDDIT_APP_ID and
# REDDIT_APP_SECRET before running this notebook. A missing variable yields an
# empty string so the cell itself never fails; the Reddit API calls will.
# NOTE(review): the values previously committed here should be treated as
# compromised and rotated.

REDDIT_USERNAME = os.environ.get('REDDIT_USERNAME', '')
REDDIT_PASSWORD = os.environ.get('REDDIT_PASSWORD', '')
APP_ID = os.environ.get('REDDIT_APP_ID', '')
APP_SECRET = os.environ.get('REDDIT_APP_SECRET', '')
APP_NAME = 'next-big-thing'

In [None]:
# Build the shared PRAW client used by every Reddit call in this notebook.
# Only the app id/secret and a user agent are supplied here; no username or
# password is passed to the client.
reddit = praw.Reddit(
    user_agent=REDDIT_USERNAME,
    client_id=APP_ID,
    client_secret=APP_SECRET,
)



In [None]:
# Set up the sentence-embedding model. `model.encode` and `util.pytorch_cos_sim`
# are used further down to score subreddit text against Wikipedia categories.

# Background on sentence similarity and the list of available pretrained models:
# https://stackoverflow.com/questions/65199011/is-there-a-way-to-check-similarity-between-two-full-sentences-in-python
# https://www.sbert.net/docs/pretrained_models.html
from sentence_transformers import SentenceTransformer, util
# Alternative model that was tried: 'all-MiniLM-L12-v2'
model = SentenceTransformer('all-MiniLM-L6-v2')

In [None]:
def get_search_terms(nlp_category):
    '''Build progressively longer search phrases from the tail of a word list.

       Starting with the last word alone, each following phrase prepends the
       preceding word, so ['American','financial','corporation'] yields
       "corporation", "financial corporation", "American financial corporation".

       Input:
       nlp_category - a list of one or more single words

       Output:
       terms - a list of phrases, shortest first; empty if the input is empty'''

    terms = []
    for word in reversed(nlp_category):
        # The first (i.e. last-in-input) word stands alone; every later word
        # is glued onto the front of the phrase built just before it.
        phrase = word + ' ' + terms[-1] if terms else word
        terms.append(phrase)

    return terms

In [None]:
def search_subreddits(search_term):
    '''Query the Reddit subreddit search for a term and collect basic details.

    Input:
    search_term - a string value

    Output:
    subreddit_details - a list with one inner list per subreddit found, holding
                        its display name, title, public description,
                        subscriber count, and over18 flag (in that order)
    '''

    # One row per search hit, in the order the search returns them.
    return [
        [hit.display_name,
         hit.title,
         hit.public_description,
         hit.subscribers,
         hit.over18]
        for hit in reddit.subreddits.search(search_term)
    ]

In [None]:
def run_subreddit_search_options(orig_entry,nlp_category,wiki_categories):
    '''Search reddit with several kinds of search terms and pool the results.

    Inputs:
    orig_entry - Wikipedia term for string entered by user
    nlp_category - list containing nlp category words for orig_entry
    wiki_categories - list of wikipedia category strings, each searched as-is

    Output:
    subreddit_details_df - dataframe containing all subreddits found and their
                            details: display_name, title, description,
                            subscriber_count, and over18. Duplicates are NOT
                            removed here.
    '''

    found = []

    # 1) Phrases derived from the nlp category words
    for term in get_search_terms(nlp_category):
        found.extend(search_subreddits(term))

    # 2) The original entry itself
    found.extend(search_subreddits(orig_entry))

    # 3) Each wikipedia category
    for category in wiki_categories:
        found.extend(search_subreddits(category))

    column_names = ['display_name', 'title', 'description', 'subscriber_count', 'over18']
    return pd.DataFrame(found, columns=column_names)


In [None]:
def get_subreddit_by_cos_sim_categories(subreddits, wiki_categories, subreddit_details):
    '''Creates cosine similarity scores for a list of subreddits with details against a list of
       wikipedia categories

    Inputs:
    subreddits - list of subreddit display names, parallel to subreddit_details
    wiki_categories - wikipedia categories for the NBT original entry
    subreddit_details - concatenation of subreddit display name, title, and public description,
                        one string per subreddit

    Output:
    avg_df - a dataframe with columns subreddit, subreddit_details,
             avg_cosine_similarity (mean over all wikipedia categories) and
             subreddit_details_wiki_categories_rank (1 = most similar).
             Empty (with the same columns) when either input list is empty.'''

    # Nothing to score: return an empty, correctly typed frame so callers can
    # still sort/select on the numeric columns.
    if len(subreddit_details) == 0 or len(wiki_categories) == 0:
        return pd.DataFrame({
            'subreddit': pd.Series(dtype='object'),
            'subreddit_details': pd.Series(dtype='object'),
            'avg_cosine_similarity': pd.Series(dtype='float64'),
            'subreddit_details_wiki_categories_rank': pd.Series(dtype='float64'),
        })

    # Embed every subreddit description once, outside the category loop.
    subreddits_embeddings = model.encode(subreddit_details)

    cosine_result = []
    for wiki_category in wiki_categories:
        category_embedding = model.encode(wiki_category)

        for i, details in enumerate(subreddit_details):
            cos_similarity = util.pytorch_cos_sim(category_embedding, subreddits_embeddings[i]).numpy()[0][0]
            cosine_result.append([wiki_category, subreddits[i], details, cos_similarity])

    cos_df = pd.DataFrame(cosine_result,
                          columns=['wiki_category', 'subreddit', 'subreddit_details', 'cosine_similarity'])

    # Average only the numeric score. Calling .mean() on the whole group would
    # also try to average the string column 'wiki_category', which raises a
    # TypeError on pandas >= 2.0.
    avg_df = (cos_df.groupby(['subreddit', 'subreddit_details'])['cosine_similarity']
              .mean()
              .reset_index())
    avg_df = avg_df.rename(columns={'cosine_similarity': 'avg_cosine_similarity'})

    # Highest similarity gets rank 1; ties share the larger rank number.
    avg_df['subreddit_details_wiki_categories_rank'] = \
        avg_df['avg_cosine_similarity'].rank(method='max', ascending=False)

    return avg_df


In [None]:
def get_subreddits(orig_entry, nlp_category, wiki_categories, num_subreddits, min_subreddit_subscribers, mvp_flag):
    '''Identify subreddits most likely to contain information about siblings of an entered string
       (orig_entry) using subreddit search methods and cosine similarity between the subreddit details
       and wikipedia categories.

        Inputs:
        orig_entry - Wikipedia term for string entered by user
        nlp_category - list containing nlp category words for orig_entry
        wiki_categories - list of filtered wikipedia categories found for the orig_entry;
                          a leading 'Category:' is stripped when present
        num_subreddits - number of subreddits to return
        min_subreddit_subscribers - smallest number of subscribers a subreddit must have in order
                                    to be included for consideration
        mvp_flag - indicates if we are running a pre-aggregated data set for our minimum viable
                   product. When true, the result is loaded from a previously written pickle
                   file and no Reddit calls are made.

        Output:
        subreddits_list - list of display names for selected subreddits (empty if the cached
                          file cannot be read in MVP mode, or if no subreddit survives filtering)'''

    #Set pickle file name, will either be creating it or loading it
    orig_entry_mod = orig_entry.replace(' ', '_')
    file_name = './output_step2/subreddits_' + orig_entry_mod + '.pickle'

    #If we are using a pre-aggregated data set, load its pickle file
    if mvp_flag:
        print('MVP mode - loading pickle file...')
        try:
            with open(file_name, 'rb') as f:
                # NOTE: pickle.load can execute arbitrary code; only load cache
                # files that this program wrote itself.
                return pickle.load(f)
        except (OSError, pickle.UnpicklingError, EOFError):
            # Missing/unreadable/corrupt cache is an expected condition here;
            # anything else (e.g. KeyboardInterrupt) is allowed to propagate.
            print('Unable to find pickle file for', orig_entry)
            return []

    #Remove 'Category:' from beginning of wikipedia categories (only when it is there)
    prefix = 'Category:'
    wiki_categories = [cat[len(prefix):] if cat.startswith(prefix) else cat
                       for cat in wiki_categories]

    #Find subreddits based on api search
    subreddit_details_df = run_subreddit_search_options(orig_entry, nlp_category, wiki_categories)

    #Remove any duplicate subreddits, keep last because it was the most recent query
    subreddit_details_df = subreddit_details_df.drop_duplicates(subset='display_name', keep='last')

    #Remove subreddits that are nsfw and have less than a certain number of subscribers
    keep_mask = ((subreddit_details_df['subscriber_count'] >= min_subreddit_subscribers) &
                 (subreddit_details_df['over18'] == False))
    subreddit_details_filtered_df = subreddit_details_df[keep_mask]

    #Remove subreddits that have not had a recent post.
    #A subreddit is active when last_post + 3 days > today, i.e. last_post > today - 3 days.
    oldest_inactive_date = date.today() - timedelta(days=3)
    active_subreddits = set()
    for subreddit in subreddit_details_filtered_df['display_name']:
        # limit=1 yields at most the newest post; a subreddit with no posts
        # yields nothing and is therefore simply not marked active.
        for post in reddit.subreddit(subreddit).new(limit=1):
            if datetime.fromtimestamp(post.created_utc).date() > oldest_inactive_date:
                active_subreddits.add(subreddit)

    # .copy() so the column added below does not write into a slice of the
    # original frame.
    subreddit_details_filtered_df = subreddit_details_filtered_df[
        subreddit_details_filtered_df['display_name'].isin(active_subreddits)].copy()

    if subreddit_details_filtered_df.empty:
        # Nothing left to rank; skip the embedding step entirely.
        subreddits_list = []
    else:
        #Prep data for cosine similarity by concatenating the subreddit name, title, and description.
        #Missing title/description become '' so the concatenation never turns into NaN.
        subreddit_details_filtered_df['full_details'] = (
            subreddit_details_filtered_df['display_name'] + '.' +
            subreddit_details_filtered_df['title'].fillna('') + '.' +
            subreddit_details_filtered_df['description'].fillna(''))

        subreddit_details = list(subreddit_details_filtered_df['full_details'])
        subreddits = list(subreddit_details_filtered_df['display_name'])

        #Run cosine similarity to rank results
        cos_sim_result_df = get_subreddit_by_cos_sim_categories(subreddits, wiki_categories,
                                                                subreddit_details)

        #Lowest rank number = highest average similarity
        subreddits_result_df = cos_sim_result_df.nsmallest(
            num_subreddits, 'subreddit_details_wiki_categories_rank')

        subreddits_list = list(subreddits_result_df['subreddit'])

    #Create pickle file from our subreddits; make sure the output folder exists
    #so the (expensive) search above is not lost to a missing directory.
    os.makedirs(os.path.dirname(file_name), exist_ok=True)
    with open(file_name, 'wb') as f:
        pickle.dump(subreddits_list, f)

    return subreddits_list


### Test Data to Run the Program

In [None]:

# print('Start time: ',datetime.now(timezone(timedelta(hours=-4), 'EST')).strftime('%Y-%m-%d %H:%M:%S'))

#Set Constants
# num_subreddits = 10
# min_subreddit_subscribers = 20000
# mvp_flag = False


# orig_entry = 'Beastie Boys'
# nlp_category = ['American', 'rap', 'group', 'City']
# wiki_categories = ['Category:Alternative hip hop groups',
#     'Category:Hardcore hip hop groups',
#     'Category:Musical groups from New York City',
#     'Category:Hip hop groups from New York City',
#     'Category:Rap rock groups']

# orig_entry = 'TikTok'
# nlp_category = ['video-focused', 'social', 'networking', 'service']
# wiki_categories = ['Category:Social networking services',
#   'Category:Social media companies',
#   'Category:Video software',
#   'Category:Internet culture',
#   'Category:Video hosting']

# print('Start time: ',datetime.now(timezone(timedelta(hours=-4), 'EST')).strftime('%Y-%m-%d %H:%M:%S'))


# #Get list of subreddits to be used for finding the Next Big Thing
# subreddits_list = get_subreddits(orig_entry, nlp_category, wiki_categories, num_subreddits, min_subreddit_subscribers,mvp_flag)
# print('End time: ',datetime.now(timezone(timedelta(hours=-4), 'EST')).strftime('%Y-%m-%d %H:%M:%S'))
# subreddits_list





In [None]:
#subreddits_list

['socialmedia',
 'youtube',
 'NewTubers',
 'Cisco',
 'TikTokHumor',
 'CorporateFacepalm',
 'videos',
 'privacy',
 'GamerGhazi',
 'cordcutters']

<a style='text-decoration:none;line-height:16px;display:flex;color:#5B5B62;padding:10px;justify-content:end;' href='https://deepnote.com?utm_source=created-in-deepnote-cell&projectId=90b052a7-f47d-474e-888f-9345355cfd9a' target="_blank">
 </img>
Created in <span style='font-weight:600;margin-left:4px;'>Deepnote</span></a>