In [129]:
import os
import pprint as pp
import re, string

import numpy as np
import pandas as pd
import praw
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import linear_kernel

In [104]:
# https://praw.readthedocs.io/en/latest/getting_started/
# https://www.reddit.com/dev/api/

In [105]:
# Reddit API client shared by every helper below.
# Credentials come from the environment so they never have to be pasted into
# (and accidentally committed with) the notebook. When the variables are unset
# the values fall back to empty strings, exactly as before.
r = praw.Reddit(client_id=os.environ.get('REDDIT_CLIENT_ID', ''),
                client_secret=os.environ.get('REDDIT_CLIENT_SECRET', ''),
                user_agent='reddit_matchmaker')

# Sanity check: show the client's read_only flag.
print(r.read_only)

True


In [106]:
def getThreads(subreddit, limit = 3):
    """Return the ids of the current "hot" submissions of a subreddit.

    Parameters
    ----------
    subreddit : object exposing ``hot(limit=...)``
        Whatever ``hot`` yields must expose an ``id`` attribute.
    limit : int, default 3
        Forwarded unchanged to ``subreddit.hot``.

    Returns
    -------
    list
        The ``id`` of every yielded submission, in listing order.
    """
    return [submission.id for submission in subreddit.hot(limit = limit)]

In [117]:
def getUsersFromSubreddit(subredditIds):
    """Return the distinct author names of the given submissions.

    Parameters
    ----------
    subredditIds : iterable of str
        Submission ids (despite the name), e.g. the output of ``getThreads``.

    Returns
    -------
    list of str
        Author usernames in first-seen order, without duplicates. An empty
        input yields an empty list.

    Notes
    -----
    Submissions whose ``author`` is ``None`` (which is how a deleted account
    is reported) are skipped instead of raising ``AttributeError``.
    """
    usernames = []
    seen = set()  # O(1) membership test; ``usernames`` keeps the order

    for thread_id in subredditIds:
        author = r.submission(id = thread_id).author
        if author is None:
            # No account left to match against.
            continue

        username = author.name
        if username not in seen:
            seen.add(username)
            usernames.append(username)

    return usernames

In [108]:
def cleanText(text):
    """Normalise a comment body into space-separated word tokens.

    Steps, in order:
      1. common punctuation, ellipses and newlines become a single space;
      2. carriage returns are deleted (not replaced by a space);
      3. every remaining character that is neither whitespace nor a word
         character -- plus underscores -- is deleted;
      4. runs of spaces collapse to one space.

    Leading/trailing spaces are not stripped.
    """
    # Tokens that mark a word boundary and therefore turn into a space.
    for separator in (',', ':', '...', '?', '!', ';', '\n'):
        text = text.replace(separator, ' ')
    text = text.replace('\r', '')  # dropped outright, so "\r\n" yields one space

    # Strip everything that is not whitespace or alphanumeric (underscore included).
    without_symbols = re.sub(r'([^\s\w]|_)+', '', text)
    # Squeeze repeated spaces produced by the substitutions above.
    return re.sub(' +',' ', without_symbols)

In [109]:
def getUserComments(username, limit = 3):
    """Collect a sample of a redditor's comments, cleaned and de-duplicated.

    The user's "new", "hot" and "controversial" comment listings are read in
    that order; each body is passed through ``cleanText`` and kept only the
    first time its cleaned form is seen.

    Parameters
    ----------
    username : str
        Reddit account name.
    limit : int, default 3
        Maximum number of comments requested from *each* listing.

    Returns
    -------
    list of str
        Cleaned comment texts in first-seen order.
    """
    user = r.redditor(username)

    userComments = []
    seen = set()  # cleaned texts already collected

    # Same handling for every sort order; only the listing differs.
    for sort_order in ("new", "hot", "controversial"):
        listing = getattr(user.comments, sort_order)
        for c in listing(limit = limit):
            comment = cleanText(c.body)
            if comment not in seen:
                seen.add(comment)
                userComments.append(comment)

    return userComments

In [140]:
def scrapeCommentsFromSubreddit(subreddit):
    """Build a Username/Comments table for the authors of a subreddit's hot threads.

    Parameters
    ----------
    subreddit : str
        Subreddit name passed to ``r.subreddit``.

    Returns
    -------
    pandas.DataFrame
        One row per author with columns ``Username`` and ``Comments``; the
        latter holds that author's sampled comments joined by ".".

    Side effects
    ------------
    Prints the number of rows before returning.
    """
    # hot thread ids -> distinct thread authors
    authors = getUsersFromSubreddit(getThreads(r.subreddit(subreddit)))

    names = []
    documents = []
    for author in authors:
        # One "document" per author: all sampled comments glued together.
        documents.append(".".join(getUserComments(author)))
        names.append(author)

    df = pd.DataFrame(names, columns = ["Username"])
    df["Comments"] = documents

    print(len(df))
    return df

In [141]:
# Scrape the "consulting" subreddit into a Username/Comments frame (the call prints the row count).
consulting = scrapeCommentsFromSubreddit("consulting")

2


In [142]:
# Preview the first rows of the scraped frame.
consulting.head()

Unnamed: 0,Username,Comments
0,QiuYiDio,Dark well fitting suit Doesnt need to be espec...
1,totezenguy,Yeah man Im 6 months in already and have a yea...


In [143]:
# One document per user: the "Comments" column as a plain list of strings.
# Built directly rather than via Series.apply with an appending lambda, which
# would also materialise a throwaway Series of None values.
corpus = [str(comments) for comments in consulting["Comments"]]
len(corpus)

2

In [145]:
def find_similar(matrix, index, top_n = 10):
    """Rank the rows of ``matrix`` by similarity to row ``index``.

    Similarity is the linear kernel (dot product) between the query row and
    every row of ``matrix``.

    Parameters
    ----------
    matrix : 2-D array-like supporting row slicing
        Document-term matrix, one row per document.
    index : int
        Row used as the query; it is excluded from the result.
    top_n : int, default 10
        Slice bound applied to the ranked list.

    Returns
    -------
    list of (row_index, score)
        Best matches first.
    """
    scores = linear_kernel(matrix[index: index + 1], matrix).flatten()

    ranked = []
    # argsort is ascending, so walk it backwards for best-first order.
    for doc_idx in scores.argsort()[::-1]:
        if doc_idx == index:
            continue  # never report the query row as its own match
        ranked.append((doc_idx, scores[doc_idx]))

    return ranked[0:top_n]

In [152]:
def findMatches(your_username, subreddit):
    """Print the subreddit authors whose comments are most similar to yours.

    The comments of the subreddit's hot-thread authors form the corpus; your
    own comments are inserted at position 0. All documents are vectorised with
    TF-IDF (word 1- to 3-grams, English stop words removed) and the other
    documents are ranked against document 0 with ``find_similar``.

    Parameters
    ----------
    your_username : str
        Reddit account to match against.
    subreddit : str
        Subreddit whose hot-thread authors are the candidates.

    Returns
    -------
    None
        Results are printed: the first 500 characters of your own document,
        then one ``index score text`` line per match, where ``index`` is the
        position in the corpus (candidate row + 1, because of the insert).
    """
    corpusDf = scrapeCommentsFromSubreddit(subreddit)
    corpus = [str(comments) for comments in corpusDf["Comments"]]

    your_comments = ".".join(getUserComments(your_username))

    # Document 0 is always the query user.
    corpus.insert(0, your_comments)
    print(corpus[0][:500])

    print("=====")

    # min_df must be >= 1 when given as an integer; 0 is rejected by
    # scikit-learn's parameter validation. 1 keeps every term, as intended.
    tf = TfidfVectorizer(analyzer = "word",
                         ngram_range = (1, 3),
                         min_df = 1,
                         stop_words = "english")

    matrix = tf.fit_transform(corpus)
    for index, score in find_similar(matrix, 0):
        print(index, score, corpus[index][:2000])
        print("===================")

In [153]:
# Rank r/consulting hot-thread authors against this account's comment history.
findMatches("ohai123456789", "consulting")

2
So basicallyadd value to the companys bottom line see the bigger picture and go beyond your job description Did that understand that correctly .Would you say the MBA accelerated your path or would you have gone into a leadership role anyway .Sounds like a constraint optimization problem Look up Operations Research on nursing capacity.RIGHT I thought I was the only one I kept telling myself that itll get better and then I realized were only 1 ep from season finale Im like uh that was it .This que
=====
2 0.0325739025266 Yeah man Im 6 months in already and have a year to go I got my PMP took a course on BI and liberally use my jet pack to drive around and maintain VPN Im focused on health and mindfulness but Im on diminishing marginal returns Ive hit some epic audiobooks while driving studied systems science beefed up accounting knowledge read a bunch of strategy books became a charity board member started a PMO mentorship programmer non profits Still fucking bored.I am a consultant I 