In [None]:
import os
import pprint as pp
import re, string
import time

import numpy as np
import pandas as pd
import praw
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import linear_kernel

In [None]:
# PRAW getting-started guide: https://praw.readthedocs.io/en/latest/getting_started/
# Reddit API reference: https://www.reddit.com/dev/api/

In [None]:
# Reddit API credentials. The REDDIT_CLIENT_ID / REDDIT_CLIENT_SECRET
# environment variables take precedence so real secrets never have to be saved
# inside the notebook; the 'INSERT_YOURS' placeholders are only the fallback
# used when those variables are unset.
CLIENT_ID = os.environ.get('REDDIT_CLIENT_ID', 'INSERT_YOURS')
CLIENT_SECRET_KEY = os.environ.get('REDDIT_CLIENT_SECRET', 'INSERT_YOURS')

# Module-level PRAW client; the helper functions in this notebook read it as
# the global `r`. Only a client id/secret and a user agent are supplied.
r = praw.Reddit(client_id = CLIENT_ID,
                client_secret = CLIENT_SECRET_KEY,
                user_agent = 'RedditorMatch')

# Show the client's read_only flag as a quick sanity check of the setup.
print(r.read_only)

In [None]:
def getThreads(subreddit, limit = 50):
    """Collect the ids of the items yielded by ``subreddit.hot``.

    Parameters
    ----------
    subreddit : object exposing ``hot(limit=...)``
        Presumably a PRAW subreddit (verify against caller); every yielded
        item must have an ``id`` attribute.
    limit : int, default 50
        Forwarded unchanged to ``subreddit.hot``.

    Returns
    -------
    list
        The ``id`` of each yielded item, in iteration order.
    """
    hot_listing = subreddit.hot(limit = limit)
    return [submission.id for submission in hot_listing]

In [None]:
def getUsersFromSubreddit(subredditIds, delay = 2):
    """Return the distinct author names of the given submissions.

    Parameters
    ----------
    subredditIds : sequence of str
        Submission (thread) ids; each is looked up through the module-level
        client ``r`` via ``r.submission(id=...)``.
    delay : int or float, default 2
        Seconds to sleep before each lookup. ``0`` disables the pause.

    Returns
    -------
    list of str or None
        Author names in first-seen order without duplicates, or ``None`` when
        ``subredditIds`` is empty (kept for backward compatibility).
    """
    if len(subredditIds) < 1:
        return None

    usernames = []
    seen = set()  # O(1) membership test; `usernames` keeps first-seen order

    for thread_id in subredditIds:
        if delay:
            time.sleep(delay)
        thread = r.submission(id = thread_id)
        author = thread.author
        # A submission can have no author (e.g. the account was deleted);
        # calling `.name` on None would raise AttributeError, so skip it.
        if author is None:
            continue
        username = author.name
        if username not in seen:
            seen.add(username)
            usernames.append(username)
    return usernames

In [None]:
def cleanText(text):
    """Strip punctuation from ``text`` and collapse runs of spaces.

    Steps, in order:
    1. Replace ``, : ... ? ! ;`` and newlines with a space; delete ``\\r``.
    2. Delete every remaining run of characters that are neither whitespace
       nor word characters, plus underscores.
    3. Collapse consecutive spaces into one.

    Returns the cleaned string.
    """
    # Applied strictly in this order: the '...' replacement must run before
    # '\r' is deleted, otherwise dots separated by '\r' would be treated
    # differently.
    substitutions = (
        (',', ' '),
        (':', ' '),
        ('...', ' '),
        ('?', ' '),
        ('!', ' '),
        (';', ' '),
        ('\n', ' '),
        ('\r', ''),
    )
    for token, substitute in substitutions:
        text = text.replace(token, substitute)

    # drop non-alphanumeric characters (and underscores) but keep whitespace
    without_punctuation = re.sub(r'([^\s\w]|_)+', '', text)
    # squeeze repeated spaces
    return re.sub(' +',' ', without_punctuation)

In [None]:
def getUserComments(username, commentsLimit = 50, delay = 2):
    """Gather a redditor's cleaned, de-duplicated comment texts.

    The user's comment listings are read in the order ``new``, ``hot``,
    ``controversial``; every comment body is passed through ``cleanText`` and
    kept only the first time that cleaned text is seen.

    Parameters
    ----------
    username : str
        Name handed to ``r.redditor`` (``r`` is the module-level client).
    commentsLimit : int, default 50
        ``limit`` passed to each of the three listings.
    delay : int or float, default 2
        Seconds to sleep before processing each comment. ``0`` disables it.

    Returns
    -------
    list of str
        Unique cleaned comments in first-seen order.
    """
    user = r.redditor(username)

    userComments = []
    seen = set()  # O(1) duplicate check; the list preserves first-seen order

    # Same processing for each sort order, so loop instead of repeating it.
    for sort_order in ("new", "hot", "controversial"):
        listing = getattr(user.comments, sort_order)
        for c in listing(limit = commentsLimit):
            if delay:
                time.sleep(delay)
            comment = cleanText(c.body)
            if comment not in seen:
                seen.add(comment)
                userComments.append(comment)

    return userComments

In [None]:
def scrapeCommentsFromSubreddit(subreddit):
    """Build a DataFrame of thread authors and their joined comments.

    Parameters
    ----------
    subreddit : str
        Name passed to ``r.subreddit`` (``r`` is the module-level client).

    Returns
    -------
    pandas.DataFrame
        Columns ``"Username"`` and ``"Comments"``; one row per author returned
        by ``getUsersFromSubreddit``, where ``"Comments"`` is that user's
        comments from ``getUserComments`` joined with ``"."``. The frame is
        empty when no authors are found.
    """
    sub = r.subreddit(subreddit)
    threads = getThreads(sub)

    # getUsersFromSubreddit returns None when there are no threads; iterating
    # over None would raise TypeError, so treat it as "no users".
    users = getUsersFromSubreddit(threads) or []

    usernames = list(users)
    comments = [".".join(getUserComments(user)) for user in usernames]

    return pd.DataFrame({"Username": usernames, "Comments": comments},
                        columns = ["Username", "Comments"])

In [None]:
def find_similar(matrix, index, top_n = 5):
    """Rank the other rows of ``matrix`` by similarity to row ``index``.

    Scores come from ``linear_kernel`` (a dot product) between the query row
    and every row; they equal cosine similarities only when the rows are
    L2-normalised — presumably the case for the caller's TF-IDF matrix, TODO
    confirm.

    Parameters
    ----------
    matrix : 2-D array-like accepted by ``linear_kernel``
    index : int
        Row used as the query; it is excluded from the result.
    top_n : int, default 5
        Maximum number of results.

    Returns
    -------
    list of (row_index, score)
        Highest score first, at most ``top_n`` entries.
    """
    query_row = matrix[index: index + 1]
    scores = linear_kernel(query_row, matrix).flatten()

    # descending order of score, with the query row itself left out
    ranked = scores.argsort()[::-1]
    others = [candidate for candidate in ranked if candidate != index]

    return [(candidate, scores[candidate]) for candidate in others[0:top_n]]

In [None]:
def findMatches(your_username, subreddit):
    """Find the subreddit authors whose comments are most similar to yours.

    Scrapes authors and their comments with ``scrapeCommentsFromSubreddit``,
    prepends your own joined comments as document 0, vectorises everything
    with TF-IDF (word 1- to 3-grams, English stop words) and ranks the other
    documents against document 0 with ``find_similar``. Each match is printed
    with its score.

    Parameters
    ----------
    your_username : str
        Account whose comments form the query document.
    subreddit : str
        Subreddit name to scrape candidate users from.

    Returns
    -------
    list
        Usernames of the matches, best first (length bounded by
        ``find_similar``'s default ``top_n``).
    """
    corpusDf = scrapeCommentsFromSubreddit(subreddit)

    # If you authored one of the scraped threads you would be in the corpus
    # and trivially match yourself; drop that row (case-insensitively).
    is_self = corpusDf["Username"].astype(str).str.lower() == str(your_username).lower()
    corpusDf = corpusDf[~is_self]

    your_comments = ".".join(getUserComments(your_username))

    # Document 0 is the query; documents 1..n line up with corpusDf rows 0..n-1.
    corpus = [your_comments] + [str(comment) for comment in corpusDf["Comments"]]

    # min_df must be an int >= 1 (or a float in [0, 1]); an integer 0 is
    # rejected by scikit-learn's parameter validation. 1 keeps every term.
    tf = TfidfVectorizer(analyzer = "word",
                         ngram_range = (1, 3),
                         min_df = 1,
                         stop_words = "english")

    matrix = tf.fit_transform(corpus)
    results = []

    for index, score in find_similar(matrix, 0):
        # because we prepended our comments onto the corpus, the index number was shifted by 1.
        user = corpusDf.iloc[index - 1, 0]
        results.append(user)
        print("Score:", score, "| Username:", user)
        print("=========================================================")

    return results

In [None]:
# Run the matcher against the "datascience" subreddit; replace the placeholder
# username with a real account name before executing this cell.
matches = findMatches(your_username = "INSERT_YOUR_USERNAMES", subreddit = "datascience")

In [None]:
# Number of matches returned by findMatches.
len(matches)

In [None]:
# Preview at most the first ten matched usernames.
matches[0:10]

In [None]:
def whyMatch(your_username, their_usernames):
    """Placeholder: not implemented yet, always returns ``None``.

    Planned work (TODO):
    - find subreddit subscriptions
    - find common words
    """
    return None