In [28]:
import pandas as pd
import numpy as np
import pprint as pp
import re, string
import time
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import linear_kernel
import praw
from datetime import datetime

import credentials as creds

## import custom helper functions
import helpers as h

In [12]:
# https://praw.readthedocs.io/en/latest/getting_started/
# https://www.reddit.com/dev/api/

In [13]:
# API credentials come from the local `credentials` module, so no secret is written in the notebook.
CLIENT_ID = creds.client_id()
CLIENT_SECRET_KEY = creds.client_secret_key()

# Only application credentials are supplied (no username/password).
r = praw.Reddit(
    client_id = CLIENT_ID,
    client_secret = CLIENT_SECRET_KEY,
    user_agent = 'RedditorMatch',
)

# Show whether the client is operating in read-only mode.
print(r.read_only)

True


In [14]:
# Subreddit names that can be passed to getDf, which reads data/<name>.csv for each.
scraped_subreddits = [
    "mizzou",
    "umich",
]

In [15]:
def find_similar(matrix, index, top_n = 5):
    """Return the rows of ``matrix`` most similar to the row at ``index``.

    Similarity is the ``linear_kernel`` (dot product) between the query row
    and every row of ``matrix``; this equals cosine similarity when the rows
    are L2-normalised.

    Parameters
    ----------
    matrix : 2-D array or sparse matrix
        One document per row.
    index : int
        Row number of the query document; it is left out of the result.
    top_n : int, default 5
        Maximum number of matches to return.

    Returns
    -------
    list of (row_index, score) tuples, highest score first.
    """
    query_row = matrix[index: index + 1]
    scores = linear_kernel(query_row, matrix).flatten()

    # argsort is ascending, so reverse it to walk from best to worst match.
    ranked_rows = scores.argsort()[::-1]

    neighbours = []
    for row in ranked_rows:
        if row == index:
            # The query document always matches itself; skip it.
            continue
        neighbours.append((row, scores[row]))

    return neighbours[0:top_n]

In [16]:
def getDf(subreddit_name):
    """Load the scraped CSV for one subreddit from ``data/<subreddit_name>.csv``.

    The file is decoded as ISO-8859-1. The number of rows read is printed.

    Parameters
    ----------
    subreddit_name : str
        File stem of the CSV inside the ``data/`` directory.

    Returns
    -------
    pandas.DataFrame
        The parsed CSV.
    """
    csv_location = "".join(["data/", subreddit_name, ".csv"])
    frame = pd.read_csv(csv_location, encoding = "ISO-8859-1")

    row_count = len(frame)
    print("--- Retrieved", row_count, "corpuses/corpi(?) for", subreddit_name)

    return frame

In [17]:
def findMatches(your_username, subreddit_name, redditInstance):
    """Rank the users of a scraped subreddit by textual similarity to a redditor.

    The redditor's comments are joined into one document and placed at the
    front of the subreddit corpus. The whole corpus is TF-IDF vectorised and
    ``find_similar`` scores every other document against that first row.

    Parameters
    ----------
    your_username : str
        Redditor whose comments form the query document.
    subreddit_name : str
        Scraped subreddit to search; passed to ``getDf``.
    redditInstance
        Reddit client handed to ``h.getUserComments``.

    Returns
    -------
    list
        Value of the first column of the subreddit frame for each match
        (printed as the username), best match first. The number of matches
        is whatever ``find_similar`` returns by default.
    """
    corpusDf = getDf(subreddit_name)

    # One document per row of the "Comments" column; str() turns missing values into text.
    corpus = [str(comments) for comments in corpusDf["Comments"]]

    your_comments = h.getUserComments(your_username, redditInstance)
    your_comments = ".".join(your_comments)

    # The query document goes first so that it is row 0 of the matrix.
    corpus.insert(0, your_comments)

    print("--- Creating Tfidf vector...")

    # min_df = 1 (the library default) keeps every term, exactly as the previous
    # integer 0 did, but stays inside the documented range for integer values;
    # recent scikit-learn releases reject an integer 0 during parameter validation.
    tf = TfidfVectorizer(analyzer = "word",
                         ngram_range = (1, 3),
                         min_df = 1,
                         stop_words = "english")

    print("--- Fitting the matrix...")
    matrix = tf.fit_transform(corpus)
    results = []

    for index, score in find_similar(matrix, 0):
        # The query document was prepended, so matrix row i is frame row i - 1.
        index = index - 1
        user = corpusDf.iloc[index, 0]  # assumes the first CSV column holds the username
        results.append(user)
        print("...")
        print("...")
        print("...")
        print("Score:", score, "| Username:", user)
        print("=========================================================")

    return(results)

In [38]:
# Rank the users of the first scraped subreddit against one redditor's comments.
matches = findMatches(
    your_username = "wingzeromkii",
    subreddit_name = scraped_subreddits[0],
    redditInstance = r,
)

--- Retrieved 288 corpuses/corpi(?) for mizzou
------ Retrieved 1079 comments for: wingzeromkii
--- Creating Tfidf vector...
--- Fitting the matrix...
...
...
...
Score: 0.309977958799 | Username: PrancingPeach
...
...
...
Score: 0.295643024075 | Username: SexyMcBeast
...
...
...
Score: 0.292215385742 | Username: BrettGilpin
...
...
...
Score: 0.281451263154 | Username: KCTigerGrad
...
...
...
Score: 0.276196507847 | Username: Volum3


In [29]:
def getRedditorInfo(redditor_name, r, limit = 1000):
    """Collect a redditor's top comments into a DataFrame, one row per comment.

    Parameters
    ----------
    redditor_name : str
        Username to look up.
    r
        Reddit client; must provide ``redditor(name).comments.top(limit=...)``.
    limit : int, default 1000
        Passed straight to ``comments.top``; the default matches the value
        that used to be hard-coded.

    Returns
    -------
    pandas.DataFrame
        Columns, in order: subreddit, comment, created_utc, score, ups, downs,
        controversiality, flair, gilded, over_18, link. ``comment`` is the
        body after ``h.cleanText``; ``created_utc`` is a naive ``datetime``
        built with ``datetime.utcfromtimestamp``. The frame is empty, but
        still has these columns, when no comments are returned.
    """
    column_names = ["subreddit", "comment", "created_utc", "score", "ups", "downs",
                    "controversiality", "flair", "gilded", "over_18", "link"]

    user = r.redditor(redditor_name)

    # Build one record per comment and create the frame once, instead of
    # keeping eleven parallel lists in step and attaching them column by column.
    rows = []
    for c in user.comments.top(limit = limit):
        rows.append({
            "subreddit": c.subreddit_name_prefixed,
            "comment": h.cleanText(c.body),
            "created_utc": datetime.utcfromtimestamp(c.created_utc),
            "score": c.score,
            "ups": c.ups,
            "downs": c.downs,
            "controversiality": c.controversiality,
            "flair": c.author_flair_text,
            "gilded": c.gilded,
            "over_18": c.over_18,
            "link": c.link_permalink,
        })

    # Passing the column list explicitly fixes the order and keeps the schema for an empty result.
    df = pd.DataFrame(rows, columns = column_names)

    print("Retrieved", len(df), "comments for user:", redditor_name)
    return(df)

In [36]:
# Pull the redditor's top comments into a DataFrame for inspection.
u = getRedditorInfo(redditor_name = "wingzeromkii", r = r)

Retrieved 1000 comments for user: wingzeromkii


In [37]:
# Preview the first five rows of the comment frame.
u.head(n = 5)

Unnamed: 0,subreddit,comment,created_utc,score,ups,downs,controversiality,flair,gilded,over_18,link
0,r/WTF,He also promoted his current girlfriend to the...,2016-10-14 16:47:47,467,467,0,0,,0,False,https://www.reddit.com/r/WTF/comments/57fqbw/t...
1,r/AskReddit,Fuck it Were doing it live Bill OReilly on the...,2009-10-20 02:26:32,390,390,0,0,,0,False,https://www.reddit.com/r/AskReddit/comments/9v...
2,r/movies,ukatiepornhub had grown so rich she wanted to ...,2014-12-20 01:25:46,227,227,0,0,,0,False,https://www.reddit.com/r/movies/comments/2ptv6...
3,r/IAmA,Now you have the strength if a grown man and a...,2010-10-03 19:26:13,152,152,0,0,,0,False,https://www.reddit.com/r/IAmA/comments/dma01/i...
4,r/Gunners,I have a theory that Arsenal and Tottenham dra...,2018-03-08 20:01:16,152,152,0,0,,0,False,https://www.reddit.com/r/Gunners/comments/830g...


In [39]:
def whyMatch(your_username, their_username, r):
    """Placeholder for explaining why two redditors were matched.

    For now this only fetches both users' comment frames through
    ``getRedditorInfo`` (your_username first, then their_username) and
    returns ``None``; the comparison steps below are not implemented yet.
    """
    mine = getRedditorInfo(your_username, r)
    theirs = getRedditorInfo(their_username, r)

    # Planned analysis:
    # find top 10 subreddits
    # word cloud the comments (overall)
    # word cloud the comments (common subreddits)
    # flairs for common subreddits

    return None

In [None]:
# sentiment analysis for comments on a particular subreddit over time

In [None]:
# classification of flairs