In [None]:
import pandas as pd
import numpy as np
import pprint as pp
import re, string
import time
import seaborn as sns
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import linear_kernel
import praw
from datetime import datetime
import nltk
from nltk.corpus import stopwords
from sklearn.feature_extraction import text 
from nltk.classify import NaiveBayesClassifier
from nltk.corpus import subjectivity
from nltk.sentiment.vader import SentimentIntensityAnalyzer
import matplotlib.pyplot as plt
from nltk.stem import PorterStemmer
import json 
import math

# set seaborn settings
sns.set()
plt.rcParams["patch.force_edgecolor"] = True # draw an edge line around patches (bars, histogram bins)

# matplotlib 3.6 renamed its bundled seaborn styles to "seaborn-v0_8-<name>" and
# later releases dropped the old names, so use whichever name this install has.
# If neither exists, the defaults already applied by sns.set() stay in effect.
_DARKGRID_STYLES = [s for s in ('seaborn-v0_8-darkgrid', 'seaborn-darkgrid')
                    if s in plt.style.available]
if _DARKGRID_STYLES:
    plt.style.use(_DARKGRID_STYLES[0])


import warnings
# Silence all warnings so library notices do not clutter the cell outputs.
warnings.filterwarnings('ignore')

# import credentials and helper functions
import credentials as creds
import helpers as h

In [None]:
# API AND LIBRARY DOCUMENTATIONS:
# https://praw.readthedocs.io/en/latest/getting_started/
# https://www.reddit.com/dev/api/

## Instantiate Reddit

In [None]:
# Credentials come from the local `credentials` module, so no secret is
# written into the notebook itself.
CLIENT_ID = creds.client_id()
CLIENT_SECRET_KEY = creds.client_secret_key()

# Reddit API client that is passed to the helper functions defined below.
r = praw.Reddit(
    client_id = CLIENT_ID,
    client_secret = CLIENT_SECRET_KEY,
    user_agent = 'RedditorMatch',
)

## Specify the scraped datasets

Unfortunately, it takes quite some time to retrieve user comments from a particular subreddit. Remember, we're going into a specified subreddit, finding a list of users who posted, and then scraping out every single comment that those users made in the past. In order to remove this bottleneck from my demonstration, I have scraped the comments from users who have posted on r/mizzou. 

In [None]:
scraped_subreddits = ["mizzou"]

## Recommendation engine logic

* ***find_similar():*** Takes in the TF-IDF matrix (matrix), the index of the document that you want to match against (index), and the number of results to be displayed (top_n, optional). The query document itself is always excluded from the results. The default for top_n is 6 rather than 5 because, if your username also appears in the scraped dataset, that entry is nearly identical to your own comments and will take the top spot, leaving the 5 matched redditors we are really interested in. The similarity is calculated using the cosine similarity (a linear kernel on the TF-IDF vectors, which TfidfVectorizer L2-normalizes by default).

* ***getDf():*** Takes in the string of the subreddit name and retrieves the scraped comments from the appropriate csv file. 

* ***stem():*** Takes in a corpus and returns a stemmed corpus. It first tokenizes and stems each word before putting it back together as a single document. 

* ***findMatches():*** Takes in your username (string), subreddit name (string), and reddit API instance. It calls getDf() to retrieve the scraped comments and collects them into a corpus of comments. It then calls one of the helper functions that I wrote (getUserComments()) to gather the comments of the input username, joins them into a single document, and prepends that document onto the corpus array. It then builds the TF-IDF vectorizer with an n-gram range of 1 to 3 words. In addition, I'm adding more stopwords into the mix (NLTK's English stopwords on top of scikit-learn's). Once we fit and transform the corpus to get the matrix, we call "find_similar()" to retrieve the top matched usernames (6 by default, of which we care about 5; see find_similar() above).

In [None]:
def find_similar(matrix, index, top_n = 6):
    """Rank all other documents by their linear-kernel score against one document.

    matrix: document-term matrix, one row per document.
    index:  row number of the query document.
    top_n:  maximum number of results to return.

    Returns a list of (row_number, score) tuples ordered from highest to
    lowest score. The query row itself is never part of the result. The
    score is a dot product, which equals the cosine similarity when the rows
    are L2-normalised.
    """
    query_row = matrix[index: index + 1]
    scores = linear_kernel(query_row, matrix).flatten()

    ranked = []
    for doc_id in scores.argsort()[::-1]:
        if doc_id == index:
            continue
        ranked.append((doc_id, scores[doc_id]))

    return ranked[0:top_n]

In [None]:
def getDf(subreddit_name):
    """Load the scraped comments of one subreddit.

    subreddit_name: name used to build the path data/<subreddit_name>.csv.

    Returns the file contents as a DataFrame (read as ISO-8859-1) and prints
    how many rows were loaded.
    """
    fileName = subreddit_name + ".csv"
    scraped = pd.read_csv("data/" + fileName, encoding = "ISO-8859-1")
    print("--- Retrieved", len(scraped), "corpuses/corpi(?) for", subreddit_name)
    return scraped

In [None]:
def stem(corpus):
    """Stem every word of every document in `corpus`.

    Each document is tokenized with nltk.word_tokenize, every token is reduced
    with the Porter stemmer, and the stems are joined back with single spaces.

    corpus: iterable of document strings.

    Returns a list with exactly one entry per input document, in input order.
    A document that cannot be tokenized becomes an empty string instead of
    being dropped, so positions in the result still line up with the input
    (findMatches maps result positions back to DataFrame rows).
    """
    print("------ Stemming the words")

    stemmer = PorterStemmer()
    newCorpus = []

    for c in corpus:
        try:
            tokens = nltk.word_tokenize(c)
        except Exception:
            # `except Exception` rather than a bare `except`, so interrupting
            # the kernel during a long run is not swallowed as a "bad comment".
            print("skipping the comment. something went wrong...")
            newCorpus.append("")  # placeholder keeps the positions aligned
            continue

        new_tokens = []
        for t in tokens:
            try:
                new_tokens.append(stemmer.stem(t))
            except Exception:
                print("can't stem the word. moving on...")

        newCorpus.append(' '.join(new_tokens))

    return(newCorpus)

In [None]:
def findMatches(your_username, subreddit_name, redditInstance):
    """Find the users of a scraped subreddit whose comments are most similar to yours.

    your_username:  Reddit username whose comments are the query document.
    subreddit_name: name of the scraped subreddit (loaded through getDf).
    redditInstance: Reddit API instance handed to h.getUserComments.

    Returns the list of matched usernames, best match first, and prints each
    match together with its similarity score.
    """
    corpusDf = getDf(subreddit_name)
    corpus = stem([str(comment) for comment in corpusDf["Comments"]])

    # Row i of corpusDf must correspond to corpus[i]; otherwise the usernames
    # looked up below belong to the wrong documents.
    if len(corpus) != len(corpusDf):
        print("--- WARNING: some comments were dropped while stemming; matched usernames may be misaligned")

    your_comments = h.getUserComments(your_username, redditInstance)
    your_comments = " ".join(stem(your_comments))

    # The query document goes first, so it is row 0 of the matrix.
    corpus.insert(0, your_comments)

    print("--- Creating Tfidf vector...")

    # scikit-learn's stop words plus NLTK's. Passed as a list because recent
    # scikit-learn versions reject other collection types for `stop_words`.
    myStopWords = list(text.ENGLISH_STOP_WORDS.union(stopwords.words('english')))

    tf = TfidfVectorizer(analyzer = "word", max_df = 0.8, min_df = 0.2,
                            ngram_range = (1, 3),
                            stop_words = myStopWords)

    print("--- Fitting the matrix...")
    matrix = tf.fit_transform(corpus)
    results = []

    for index, score in find_similar(matrix, 0):
        index = index - 1 # because we prepended our comments onto the corpus, the index number was shifted by 1.
        user = corpusDf.iloc[index, 0]  # the first CSV column is read as the username
        results.append(user)
        print("...")
        print("...")
        print("Username:", user, "| Score:", score)
        print("=========================================================")

    return(results)

### Example:

In [None]:
# NOTE: findMatches prints and returns up to 6 similar usernames (the default
# top_n of find_similar). Six are requested because, if the query username is
# also present in the scraped dataset, that entry will show up as the most
# similar one, and we really only care about the 5 other usernames.

matches = findMatches("Max_W_", scraped_subreddits[0], r)

## Gather user information

In [None]:
def getRedditorInfo(redditor_name, r):
    """Collect a user's comments from their top, hot and controversial listings.

    redditor_name: Reddit username.
    r:             Reddit API instance.

    Up to 1000 comments are requested from each of the three listings. The
    comment body is cleaned with h.cleanText and rows whose cleaned text has
    already been seen are dropped (the first occurrence is kept).

    Returns a DataFrame with the columns subreddit, comment, created_utc,
    score, ups, downs, controversiality, flair, gilded, over_18 and link, and
    prints how many comments were kept.
    """
    user = r.redditor(redditor_name)
    listings = (
        user.comments.top(limit = 1000),
        user.comments.hot(limit = 1000),
        user.comments.controversial(limit = 1000),
    )

    columns = ["subreddit", "comment", "created_utc", "score", "ups", "downs",
               "controversiality", "flair", "gilded", "over_18", "link"]

    # One record per comment; the DataFrame is built once at the end instead
    # of maintaining eleven parallel lists across three copies of the loop.
    rows = []
    for listing in listings:
        for c in listing:
            rows.append((
                c.subreddit_name_prefixed,
                h.cleanText(c.body),
                datetime.utcfromtimestamp(c.created_utc),
                c.score,
                c.ups,
                c.downs,
                c.controversiality,
                c.author_flair_text,
                c.gilded,
                c.over_18,
                c.link_permalink,
            ))

    df = pd.DataFrame(rows, columns = columns)

    # The three listings overlap, so the same comment can arrive several times.
    df = df.drop_duplicates(subset = ["comment"], keep = "first")
    print("Retrieved", len(df), "comments for user:", redditor_name)
    return(df)

### Examples

In [None]:
# Each call downloads the user's comments through the Reddit API
# (see getRedditorInfo), so this cell needs network access.
comments1 = getRedditorInfo("Max_W_", r)
comments2 = getRedditorInfo("BrettGilpin", r)

In [None]:
comments1.head()

## Find common subreddits between 2 users

In [None]:
def commonSubreddits(user1, user2, redditInstance):
    """Return the subreddits in which both users have commented.

    user1, user2:   Reddit usernames.
    redditInstance: Reddit API instance handed to getRedditorInfo.

    Returns a numpy array of subreddit names taken from the inner join of the
    two users' per-subreddit comment counts.
    """
    info1 = getRedditorInfo(user1, redditInstance)
    info2 = getRedditorInfo(user2, redditInstance)

    def _commentsPerSubreddit(info):
        # comment count per subreddit, most active subreddit first
        counts = info.groupby(["subreddit"])[['comment']].count().reset_index()
        return counts.sort_values(["comment"], ascending = False)

    counts1 = _commentsPerSubreddit(info1)
    counts2 = _commentsPerSubreddit(info2)

    shared = counts1.merge(counts2, on = "subreddit", how = "inner")
    return np.array(shared["subreddit"])

In [None]:
def _subredditCommentCounts(df):
    """Count comments per subreddit and return an id/value frame sorted by id."""
    counts = df.groupby(["subreddit"])[['comment']]\
                .count().reset_index()\
                .sort_values(["comment"], ascending = False)\
                .reset_index(drop = True)

    counts.columns = ["id", "value"]
    # lower-case the name and strip the "r/" prefix
    counts["id"] = counts["id"].str.lower().str.replace("r/", "")
    return counts.sort_values(by = ["id"], ascending = True)


def commonSubredditCounts(user1, user2, redditInstance):
    """Count each user's comments in the subreddits that both users post in.

    user1, user2:   Reddit usernames.
    redditInstance: Reddit API instance handed to getRedditorInfo.

    Returns a tuple (counts1, counts2) of DataFrames with the columns
    id (lower-cased subreddit name without the "r/" prefix) and value (number
    of comments), sorted by id. Also prints how many common subreddits exist.
    """
    # Fetch each user exactly once. Calling commonSubreddits() here would
    # scrape both users a second time and could compare two different
    # snapshots of their comment history.
    df1 = getRedditorInfo(user1, redditInstance)
    df2 = getRedditorInfo(user2, redditInstance)

    common = sorted(set(df1["subreddit"]) & set(df2["subreddit"]))
    print(len(common), "common subreddits found...")

    df1 = df1[df1["subreddit"].isin(common)]
    df2 = df2[df2["subreddit"].isin(common)]

    return(_subredditCommentCounts(df1), _subredditCommentCounts(df2))

In [None]:
def subredditCounts(user, redditInstance):
    """Count a user's comments per subreddit.

    user:           Reddit username.
    redditInstance: Reddit API instance handed to getRedditorInfo.

    Returns a DataFrame with the columns id (lower-cased subreddit name with
    "r/" removed) and value (number of comments), sorted by id.
    """
    comments = getRedditorInfo(user, redditInstance)

    perSubreddit = comments.groupby(["subreddit"])[['comment']].count().reset_index()
    perSubreddit = perSubreddit.sort_values(["comment"], ascending = False)
    perSubreddit = perSubreddit.reset_index(drop = True)

    perSubreddit.columns = ["id", "value"]
    perSubreddit["id"] = perSubreddit["id"].str.lower().str.replace("r/", "")

    return perSubreddit.sort_values(by = ["id"], ascending = True)

In [None]:
# Per-subreddit comment counts (columns: id, value) for two users.
allSubreddits1 = subredditCounts("Max_W_", r)
allSubreddits2 = subredditCounts("PrancingPeach", r)

# Output to CSV files for visualization (written into the frontend/ directory,
# without the DataFrame index).
allSubreddits1.to_csv("frontend/all_sub1.csv", index = False)
allSubreddits2.to_csv("frontend/all_sub2.csv", index = False)

### Examples

In [None]:
# `one` and `two` hold each user's comment counts, restricted to the
# subreddits that both users have commented in.
one, two = commonSubredditCounts("Max_W_", "BrettGilpin", r)

# Output to CSV files for visualization (written into the frontend/ directory,
# without the DataFrame index).
one.to_csv("frontend/common_sub1.csv", index = False)
two.to_csv("frontend/common_sub2.csv", index = False)

In [None]:
one.head(20)

In [None]:
two.head(20)

## Get user flairs

In [None]:
def getFlairs(username, redditInstance):
    """Return the set of distinct, non-empty flairs found on a user's comments.

    username:       Reddit username.
    redditInstance: Reddit API instance handed to getRedditorInfo.

    Returns a set of flair values, or None (after printing a message) when
    the user has no flair on any retrieved comment.
    """
    comments = getRedditorInfo(username, redditInstance)

    flairColumn = comments["flair"]
    hasFlair = (flairColumn.isnull() == False) & (flairColumn != "")
    flairs = set(np.unique(np.array(comments[hasFlair]["flair"])))

    if len(flairs) >= 1:
        return flairs

    print("No flairs for this user")
    return None

In [None]:
getFlairs("Max_W_", r)

In [None]:
getFlairs("BrettGilpin", r)

## Find top features

In [None]:
matches

In [None]:
def top_tfidf_feats(row, features, top_n = 50):
    ''' Get top n tfidf values in row and return them with their corresponding feature names.

    row:      1-D array of tf-idf values, one per feature.
    features: sequence of feature names, indexed like `row`.
    top_n:    maximum number of features to return.

    Returns a DataFrame with the columns 'feature' and 'tfidf', ordered from
    the highest to the lowest value. An empty `row` gives an empty DataFrame
    that still has both columns.
    '''
    topn_ids = np.argsort(row)[::-1][:top_n]
    top_feats = [(features[i], row[i]) for i in topn_ids]
    # Naming the columns in the constructor (rather than assigning df.columns
    # afterwards) also works when top_feats is empty.
    return pd.DataFrame(top_feats, columns = ['feature', 'tfidf'])

In [None]:
def top_feats_in_doc(Xtr, features, row_id, top_n = 50):
    ''' Top tfidf features of one document.

    Xtr:      sparse tf-idf matrix, one row per document.
    features: feature names matching the columns of Xtr.
    row_id:   row number of the document to inspect.
    top_n:    maximum number of features to return.

    Returns the DataFrame produced by top_tfidf_feats for that row.
    '''
    dense_row = Xtr[row_id].toarray()
    flat_row = np.squeeze(dense_row)
    return top_tfidf_feats(flat_row, features, top_n)

In [None]:
def top_mean_feats(Xtr, features, grp_ids=None, min_tfidf=0.1, top_n=50):
    ''' Return the top n features that on average are most important amongst documents in rows
        identified by indices in grp_ids.

    Xtr:       sparse tf-idf matrix, one row per document.
    features:  feature names matching the columns of Xtr.
    grp_ids:   optional list or array of row indices; None or an empty
               collection means "use every row".
    min_tfidf: values below this threshold are set to 0 before averaging.
    top_n:     maximum number of features to return.

    Returns the DataFrame produced by top_tfidf_feats for the column means.
    '''
    # Test against None and the length explicitly: the truth value of a numpy
    # index array with more than one element is ambiguous and raises.
    if grp_ids is not None and len(grp_ids) > 0:
        D = Xtr[grp_ids].toarray()
    else:
        D = Xtr.toarray()

    D[D < min_tfidf] = 0
    tfidf_means = np.mean(D, axis=0)
    return top_tfidf_feats(tfidf_means, features, top_n)

In [None]:
def getTopFeatures(matches, redditInstance):
    """Print and return the top mean TF-IDF features of each matched user.

    matches:        list of Reddit usernames; a single username string is
                    also accepted and treated as a one-element list.
    redditInstance: Reddit API instance handed to h.getUserComments.

    For every user the comments are stemmed, vectorized with 1- to 3-word
    n-grams, and ranked with top_mean_feats. The first 20 rows are printed.

    Returns a dict mapping each username to its DataFrame (columns
    "features" and "mean score"). Users for whom no comments are returned
    are skipped.
    """
    # A bare string would otherwise be iterated character by character.
    if isinstance(matches, str):
        matches = [matches]

    # scikit-learn's stop words plus NLTK's. Passed as a list because recent
    # scikit-learn versions reject other collection types for `stop_words`.
    myStopWords = list(text.ENGLISH_STOP_WORDS.union(stopwords.words('english')))

    stemmer = PorterStemmer()
    results = {}

    for matched in matches:
        print("------ Working on", matched)
        other_comments = h.getUserComments(matched, redditInstance)

        allCorpuses = [' '.join(stemmer.stem(t) for t in nltk.word_tokenize(oc))
                       for oc in other_comments]

        if not allCorpuses:
            # fitting a vectorizer on zero documents would raise
            print("------ No comments found for", matched)
            continue

        tf1 = TfidfVectorizer(analyzer = "word",
                        ngram_range = (1, 3),
                        stop_words = myStopWords)

        print("------ Fitting and tranforming the TFIDF vectorizer")
        matrix1 = tf1.fit_transform(allCorpuses)

        # get_feature_names() was replaced by get_feature_names_out() in newer
        # scikit-learn releases; support both.
        if hasattr(tf1, "get_feature_names_out"):
            features1 = tf1.get_feature_names_out()
        else:
            features1 = tf1.get_feature_names()

        topFeatures = top_mean_feats(matrix1, features1)
        topFeatures.columns = ["features", "mean score"]
        print("")
        print(topFeatures.head(20))
        print("-----------------------------------------------------------------------")

        results[matched] = topFeatures

    return results

### Example

In [None]:
top = getTopFeatures(matches, r)

In [None]:
def getFeatures(your_username, redditInstance):
    """Map every TF-IDF feature of a user's comments to its IDF weight.

    your_username:  Reddit username.
    redditInstance: Reddit API instance handed to h.getUserComments.

    The comments are stemmed and vectorized with 1- to 3-word n-grams.

    Returns a dict {feature name: idf value}; an empty dict when no comments
    are returned for the user.
    """
    # scikit-learn's stop words plus NLTK's. Passed as a list because recent
    # scikit-learn versions reject other collection types for `stop_words`.
    myStopWords = list(text.ENGLISH_STOP_WORDS.union(stopwords.words('english')))

    stemmer = PorterStemmer()

    your_comments = h.getUserComments(your_username, redditInstance)
    your_corpus = [' '.join(stemmer.stem(t) for t in nltk.word_tokenize(yc))
                   for yc in your_comments]

    if not your_corpus:
        # fitting a vectorizer on zero documents would raise
        return {}

    tf = TfidfVectorizer(analyzer = "word",
                        ngram_range = (1, 3),
                        stop_words = myStopWords)

    # Only the vocabulary and the idf weights are needed, so fit() is enough;
    # the transformed matrix was never used.
    tf.fit(your_corpus)

    # get_feature_names() was replaced by get_feature_names_out() in newer
    # scikit-learn releases; support both.
    if hasattr(tf, "get_feature_names_out"):
        features = tf.get_feature_names_out()
    else:
        features = tf.get_feature_names()

    # str() keeps the keys plain Python strings in both cases
    result = dict(zip((str(f) for f in features), tf.idf_))

    return(result)

In [None]:
def getCommonFeatures(user1, user2, redditInstance):
    """Return the features that two users share among their top-ranked features.

    user1, user2:   Reddit usernames.
    redditInstance: Reddit API instance handed to getFeatures.

    For each user the features returned by getFeatures are ranked by value
    (ties broken by feature name), highest first, and the first 5000 are
    kept. Returns the set of feature names present in both selections.
    """
    def _topFeatureNames(featureValues):
        ranked = sorted(featureValues.items(),
                        key = lambda item: (item[1], item[0]),
                        reverse = True)
        return {name for name, _ in ranked[0:5000]}

    f1 = getFeatures(user1, redditInstance)
    f2 = getFeatures(user2, redditInstance)

    return _topFeatureNames(f1) & _topFeatureNames(f2)

In [None]:
getCommonFeatures("Max_W_", "BrettGilpin", r)

## Find subreddits that a user posts in

In [None]:
def getSubredditsPosted(username, redditInstance):
    """Return the distinct subreddits found in a user's retrieved comments.

    username:       Reddit username.
    redditInstance: Reddit API instance handed to getRedditorInfo.

    Returns a sorted numpy array of unique subreddit names.
    """
    subredditColumn = getRedditorInfo(username, redditInstance)["subreddit"]
    return np.unique(np.array(subredditColumn))

In [None]:
getSubredditsPosted("Max_W_", r)

## Analyze sentiment of user comments

In [None]:
def getSentiment(username, r):
    """Score the sentiment of every retrieved comment of a user.

    username: Reddit username.
    r:        Reddit API instance handed to getRedditorInfo.

    Returns the DataFrame from getRedditorInfo, sorted by created_utc, with
    four extra columns (negative, neutral, positive, compound) holding the
    SentimentIntensityAnalyzer scores of each comment. Returns None (after
    printing a message) when the user has no comments.
    """
    comments = getRedditorInfo(username, r)

    if (len(comments) < 1):
        print("No comments for that user")
        return None

    comments = comments.sort_values("created_utc", ascending = True)
    sid = SentimentIntensityAnalyzer()

    # Score each comment once and split the result into columns, instead of
    # running the analyzer four times per comment.
    scores = [sid.polarity_scores(body) for body in comments["comment"]]

    for column, key in (("negative", "neg"), ("neutral", "neu"),
                        ("positive", "pos"), ("compound", "compound")):
        comments[column] = [s[key] for s in scores]

    return(comments)

### Examples

In [None]:
sents = getSentiment("Max_W_", r)
sents.head()

## Plot sentiment of comments over time

In [None]:
def plotSentiment(username, subreddit_name, redditInstance):
    """Plot the compound sentiment of a user's comments in one subreddit.

    username:       Reddit username.
    subreddit_name: subreddit name without the "r/" prefix (case-insensitive).
    redditInstance: Reddit API instance handed to getSentiment.

    Comments are plotted in chronological order. Nothing is plotted when the
    user has no comments at all or fewer than 3 in the subreddit. Returns None.
    """
    sents = getSentiment(username, redditInstance)

    # getSentiment returns None (and prints why) when there are no comments.
    if sents is None:
        return

    subreddit_name = "r/" + subreddit_name
    sents = sents[sents["subreddit"].str.lower() == subreddit_name.lower()]
    sents = sents.sort_values("created_utc", ascending = True)
    sents = sents.reset_index()
    sents["id"] = sents.index  # 0..n-1 position in chronological order

    if (len(sents) < 3):
        print("User has not posted on this subreddit")
        return

    _ = plt.plot(sents["id"], sents["compound"], marker = "", linewidth = 1.9, alpha = 0.9)
    _ = plt.xlabel("comment (chronological order)")
    _ = plt.ylabel("compound sentiment")
    title = "Sentiment analysis on " + subreddit_name + " for user: " + username
    _ = plt.suptitle(title)
    _ = plt.show()

    return

In [None]:
def plotTopSentiments(username, redditInstance):
    """Plot compound sentiment over time for a user's most active subreddits.

    username:       Reddit username.
    redditInstance: Reddit API instance handed to getSentiment.

    The (up to) 4 subreddits with the most comments are drawn as small
    multiples on a 2x2 grid; subreddits with fewer than 2 comments are
    skipped. Nothing is plotted when the user has no comments. Returns None.
    """
    sents = getSentiment(username, redditInstance)

    # getSentiment returns None (and prints why) when there are no comments.
    if sents is None:
        return

    subreddits = np.array(sents.groupby(["subreddit"]).\
                                                  count().\
                                                  reset_index().\
                                                  sort_values("comment", ascending = False).\
                                                  head(4)["subreddit"])

    # create a color palette
    palette = plt.get_cmap('Set1')
    num = 0

    for s in subreddits:

        subreddit_name = s
        df = sents[sents["subreddit"].str.lower() == subreddit_name.lower()]

        if (len(df) >= 2):
            num = num + 1

            # Find the right spot on the plot
            _ = plt.subplot(2, 2, num)

            df = df.sort_values("created_utc", ascending = True)
            df = df.reset_index()
            df["id"] = df.index  # 0..n-1 position in chronological order

            _ = plt.plot(df["id"], df["compound"], marker = "", linewidth = 1.9, alpha = 0.9, color = palette(num))

            # VADER compound scores lie in [-1, 1]; a common y-range keeps the
            # panels comparable once some of their tick labels are hidden.
            _ = plt.ylim(-1.05, 1.05)

            # Not ticks everywhere. tick_params expects booleans: the string
            # 'off' is not treated as False by current matplotlib.
            # The x axis is only the comment's ordinal position, so its labels
            # are hidden on every panel.
            _ = plt.tick_params(labelbottom=False)
            # On a 2x2 grid the left column is panels 1 and 3; hide the y
            # labels on the right column (even panel numbers).
            if num % 2 == 0:
                _ = plt.tick_params(labelleft=False)

            # Add title
            _ = plt.title(subreddit_name, loc='left', fontsize=12, fontweight=0, color=palette(num))


    title = "Sentiment analysis " + "for user: " + username
    _ = plt.suptitle(title, fontsize=13, fontweight=0, color='black', style='italic', y=1.02)
    _ = plt.show()

    return

### Examples

In [None]:
plotSentiment("Max_W_", "mizzou", r)

In [None]:
plotTopSentiments("Max_W_", r)

In [None]:
plotTopSentiments("BrettGilpin", r)

In [None]:
# NOTE: plot a cumulative chart of sentiment

## Build collapsible index visualization

In [None]:
def formatToDict(name, size):
    """Build one {'name', 'size'} node for the collapsible visualization.

    name: label of the node.
    size: numeric weight; it is multiplied by 100000 and rounded.
    """
    scaled_size = round(size * 100000)
    node = dict(name = name, size = scaled_size)
    return node

In [None]:
def _topFeaturesForUser(username, redditInstance):
    """Return one user's top mean TF-IDF features (columns name/size), or None if nothing can be computed."""
    stemmer = PorterStemmer()
    documents = [' '.join(stemmer.stem(t) for t in nltk.word_tokenize(comment))
                 for comment in h.getUserComments(username, redditInstance)]
    if not documents:
        return None

    # Passed as a list because recent scikit-learn versions reject other
    # collection types for `stop_words`.
    stopWords = list(text.ENGLISH_STOP_WORDS.union(stopwords.words('english')))
    vectorizer = TfidfVectorizer(analyzer = "word",
                                 ngram_range = (1, 3),
                                 stop_words = stopWords)
    try:
        matrix = vectorizer.fit_transform(documents)
    except ValueError:
        # raised when no term is left after tokenization / stop-word removal
        return None

    # get_feature_names() was replaced by get_feature_names_out() in newer
    # scikit-learn releases; support both.
    if hasattr(vectorizer, "get_feature_names_out"):
        featureNames = vectorizer.get_feature_names_out()
    else:
        featureNames = vectorizer.get_feature_names()

    features = top_mean_feats(matrix, featureNames)
    features.columns = ["name", "size"]
    return features


def buildCollapsible(your_username, subreddit_name, redditInstance):
    """Build the nested dict behind the collapsible "Top Features" visualization.

    your_username:  Reddit username to find matches for.
    subreddit_name: name of the scraped subreddit handed to findMatches.
    redditInstance: Reddit API instance.

    Returns {"name": "Top Features", "children": [...]} with one child per
    matched user; each child has the user's name and up to 15 feature nodes
    built by formatToDict. Returns None when findMatches yields no matches.
    """
    matches = list(findMatches(your_username, subreddit_name, redditInstance))

    if (len(matches) < 1):
        return None

    objList = []

    for username in matches:
        # The features are computed here per user: getTopFeatures expects a
        # list of usernames and was not returning a DataFrame to work with.
        features = _topFeaturesForUser(username, redditInstance)

        if features is None:
            children = []
        else:
            top = features.head(15)
            # float() turns the numpy scalar into a plain Python number
            children = [formatToDict(name, float(size))
                        for name, size in zip(top["name"], top["size"])]

        objList.append({"name": username, "children": children})

    return {"name": "Top Features", "children": objList}

In [None]:
jsonObj = buildCollapsible("Max_W_", "mizzou", r)

## K-means clustering