In [2]:
import numpy as np
import pandas as pd
import math as mt
import csv
from pandas import DataFrame,Series,read_csv
import scipy
import scipy.sparse as sp
from sparsesvd import sparsesvd        #used for matrix factorization
from scipy.sparse import csc_matrix    #used for sparse matrix
from scipy.sparse.linalg import *      #used for matrix multiplication

In [3]:
reddit_df = read_csv('/Users/jenniferwu/Documents/SVD_for_Subreddit_Recommendation/reddit_praw.csv')
reddit_df.drop(columns=['Unnamed: 0'], inplace=True)

In [6]:
reddit_df.head()

Unnamed: 0,username,subreddit,utc
0,kabanossi,photoshopbattles,1482748000.0
1,kabanossi,GetMotivated,1482748000.0
2,kabanossi,vmware,1482748000.0
3,kabanossi,carporn,1482748000.0
4,kabanossi,DIY,1482747000.0


In [111]:
user = reddit_df.username.unique()
subreddit = reddit_df.subreddit.unique()
print('unique reddittor:',len(user)+1)
print('unique subreddit:',len(subreddit)+1)
print('total data entry:',reddit_df.shape)

unique reddittor: 15000
unique subreddit: 29281
total data entry: (9391244, 3)


In [12]:
print("Are there null values from our API dataset?  \n" + str(reddit_df.isnull().any()))

Are there null values from our API dataset?  
username     False
subreddit    False
utc          False
dtype: bool


In [40]:
top_subreddit = reddit_df.groupby(['subreddit']
                                      , as_index=False).agg({'username': 'count'}).sort_values(by=['username']
                                      , ascending=False).reset_index(drop=True).rename(columns={'username':'username_count'})
top_subreddit['cummulative_pct']  = top_subreddit.username_count.cumsum()/top_subreddit.username_count.sum()*100
top_subreddit.head()

Unnamed: 0,subreddit,username_count,cummulative_pct
0,AskReddit,683932,7.282656
1,politics,260215,10.053482
2,The_Donald,146480,11.613232
3,nfl,122088,12.913252
4,worldnews,109187,14.075899


# Evaluating our SVD Model

In [16]:
user_subred_df = reddit_df.groupby(['username','subreddit']).agg({'subreddit':'count',
                                                                 'utc':'max'}).\
              rename(columns={'subreddit':'submission_freq','utc':'most_recent_timestamp'}).reset_index()
user_subred_df.tail(10)

Unnamed: 0,username,subreddit,submission_freq,most_recent_timestamp
603462,zwingo,technology,3,1479072000.0
603463,zwingo,television,6,1481714000.0
603464,zwingo,todayilearned,5,1480154000.0
603465,zwingo,tumblr,3,1477503000.0
603466,zwingo,videos,31,1477933000.0
603467,zwingo,weed,10,1481396000.0
603468,zwingo,worldnews,10,1481361000.0
603469,zzjm,getdisciplined,3,1482980000.0
603470,zzvilzz,FFBraveExvius,26,1482995000.0
603471,zzvilzz,ffbe,8,1482940000.0


In [25]:
doc_df = reddit_df.groupby('username')['subreddit'].apply(lambda x: "%s" % ' '.join(x)).reset_index()
doc_df.head()

Unnamed: 0,username,subreddit
0,--ANUSTART-,news news AdviceAnimals AskReddit explainlikei...
1,--Sko--,DestinyTheGame DestinyTheGame DestinyTheGame C...
2,--UNKN0WN--,AceAttorney AceAttorney AceAttorney AceAttorne...
3,--harley--quinn--,AskReddit RoastMe tifu AskReddit funny Patriot...
4,-A-p-r-i-l-,AskReddit AskReddit tdi tdi tdi tdi tdi tdi As...


In [27]:
from nltk.tokenize import TreebankWordTokenizer
tokenizer = TreebankWordTokenizer()
document = doc_df.iloc[:, 1]
document = document.apply(lambda row: tokenizer.tokenize(row))
document.head()

0    [news, news, AdviceAnimals, AskReddit, explain...
1    [DestinyTheGame, DestinyTheGame, DestinyTheGam...
2    [AceAttorney, AceAttorney, AceAttorney, AceAtt...
3    [AskReddit, RoastMe, tifu, AskReddit, funny, P...
4    [AskReddit, AskReddit, tdi, tdi, tdi, tdi, tdi...
Name: subreddit, dtype: object

## Creating User-Subreddit Matrix
Using CSC Matrix to Handle highly sparse matrix. To view normally, use : user_subreddit_matrix.todense()

In [30]:
corpus_of_subs = []
for subreddits in subreddit:
    corpus_of_subs.append(subreddits)


voc2id = dict(zip(corpus_of_subs, range(len(corpus_of_subs))))
rows, cols, vals = [], [], []
for r, d in enumerate(document):
    for e in d:
        if voc2id.get(e) is not None:
            rows.append(r)
            cols.append(voc2id[e])
            vals.append(1)
user_subreddit_matrix = csc_matrix((vals, (rows, cols)), dtype=np.float32)
print((user_subreddit_matrix.shape))

(14999, 29280)


In [100]:
def computeSVD(user_subreddit_matrix, no_of_latent_factors):
    
    """Compute the SVD of the given matrix.
    :user_subreddit_matrix: a numeric matrix
    :no_of_latent_factors : numeric scalar value
    
    :U  : User to concept matrix 
    :S  : Strength of the concepts matrix
    :Vt : Subreddit to concept matrix
    """
    U, s, Vt = sparsesvd(user_subreddit_matrix, no_of_latent_factors)
    
    dim = (len(s), len(s))
    S = np.zeros(dim, dtype=np.float32)
    for i in range(0, len(s)):
        S[i,i] = mt.sqrt(s[i])

    U = csc_matrix(np.transpose(U), dtype=np.float32)
    S = csc_matrix(S, dtype=np.float32)
    Vt = csc_matrix(Vt, dtype=np.float32)

    return U, S, Vt

In [118]:
#Compute estimated recommendations for the given user
def computeEstimatedRecommendation(U, S, Vt, uTest):
    """Compute the recommendation for the given user.
    
    :U     : User to concept matrix 
    :S     : Strength of the concepts matrix
    :Vt    : Subreddit to concept matrix
    :uTest : Index of the user for which the recommendation has to be made
    
    :recom : List of recommendations made to the user
    """
 
    #constants defining the dimensions of the estimated rating matrix
    MAX_PID = len(subreddit)
    MAX_UID = len(user)
    
    rightTerm = S*Vt 

    EstimatedRecommendation = np.zeros(shape=(MAX_UID, MAX_PID), dtype=np.float16)
    for userTest in uTest:
        prod = U[userTest, :]*rightTerm
        # Converting the vector to dense format in order to get the indices 
        # of the movies with the best estimated ratings 
        
        EstimatedRecommendation[userTest, :] = prod.todense()
        recom = (-EstimatedRecommendation[userTest, :]).argsort()[:293]
    return recom

In [113]:
n_latent_fact = top_subreddit.subreddit[top_subreddit.cummulative_pct <= 65].count()
contribution_pct = round(top_subreddit.cummulative_pct[len(subreddit[top_subreddit.cummulative_pct <= 65])-1],1)


print("Top", n_latent_fact ,"subreddits contribute a total of"
      , contribution_pct,"%", "to the total subreddits in the dataset")


Top 293 subreddits contribute a total of 65.0 % to the total subreddits in the dataset


In [114]:
no_of_latent_factors = top_subreddit.subreddit[top_subreddit.cummulative_pct <= 65].count() #293
no_of_recommendations_for_each_user = 5
uTest = [np.where(user == 'zwingo')[0][0]]
U, S, Vt = computeSVD(user_subreddit_matrix, no_of_latent_factors)

In [115]:
print("------------------------------------------------------------------------------------\n")
print("Recommendation for Redditor: %s\n" % user[uTest[0]])
print("------------------------------------------------------------------------------------\n")
print("User Subreddit History - \n")

##Getting users subs history where the vals in the matrix != 0
previous_subredit_history = subreddit[np.where(user_subreddit_matrix[uTest[0],:].todense().T != 0)[0]]
previous_subredit_history
for previous_subredits in previous_subredit_history:
     print(previous_subredits)
print("\n------------------------------------------------------------------------------------\n")

------------------------------------------------------------------------------------

Recommendation for Redditor: zwingo

------------------------------------------------------------------------------------

User Subreddit History - 

pics
CombatFootage
buildapcsales
space
movies
DestinyTheGame
xboxone
AceAttorney
TheSimpsons
SuicideSquad
thatHappened
history
books
Whatcouldgowrong
TheLastAirbender
SRSsucks
Conservative
natureismetal
botsrights
cringe
Tinder
PS4
Frugal
blackpeoplegifs
FanTheories
askgaybros
Glitch_in_the_Matrix
IPTV
Puscifer
VirginiaTech
eldertrees
litecoinmining
europeanmalefashion
SULeaks
DesignatedSurvivor
Toriko
PublicMobile

------------------------------------------------------------------------------------



In [119]:
#Get the top 5 subreddit recommendations for test user
recommended_items = computeEstimatedRecommendation(U, S, Vt, uTest)
final_recommendation = []
for r in subreddit[recommended_items]:
    ##Making sure the subreddits aren't from what they already viewed before (for Novelty)
    if r not in previous_subredit_history:
        final_recommendation.append(r)
        if len(final_recommendation) == no_of_recommendations_for_each_user:
            break

print("------------------------------------------------------------------------------------\n")
print("Recommendation for %s are as follows - \n" % user[uTest[0]])
print("------------------------------------------------------------------------------------\n")

for recommendation in final_recommendation:
    print(recommendation)
print("------------------------------------------------------------------------------------\n")


------------------------------------------------------------------------------------

Recommendation for zwingo are as follows - 

------------------------------------------------------------------------------------

modnews
HomeServer
absolutelynotme_irl
trees
bootcamp
------------------------------------------------------------------------------------

