# Subreddit Recommendation System using SVD

<img src='https://assets.ifttt.com/images/channels/1352860597/icons/on_color_large.png'/>

In [50]:
import numpy as np
import pandas as pd
import math as mt
import csv
from pandas import DataFrame,Series,read_csv
import scipy
import scipy.sparse as sp
from sparsesvd import sparsesvd        #used for matrix factorization
from scipy.sparse import csc_matrix    #used for sparse matrix
from scipy.sparse.linalg import *      #used for matrix multiplication
from scipy.linalg import sqrtm
from nltk.tokenize import TreebankWordTokenizer

In [51]:
reddit_df = read_csv('/Users/jenniferwu/Documents/SVD_for_Subreddit_Recommendation/reddit_praw.csv')
reddit_df.drop(columns=['Unnamed: 0'], inplace=True)

In [52]:
reddit_df.head()

Unnamed: 0,username,subreddit,utc
0,kabanossi,photoshopbattles,1482748000.0
1,kabanossi,GetMotivated,1482748000.0
2,kabanossi,vmware,1482748000.0
3,kabanossi,carporn,1482748000.0
4,kabanossi,DIY,1482747000.0


In [53]:
user = reddit_df.username.unique()
subreddit = reddit_df.subreddit.unique()
print('unique reddittor:',len(user)+1)
print('unique subreddit:',len(subreddit)+1)
print('total data entry:',reddit_df.shape)

unique reddittor: 15000
unique subreddit: 29281
total data entry: (9391244, 3)


In [54]:
print("Are there null values from our API dataset?  \n" + str(reddit_df.isnull().any()))

Are there null values from our API dataset?  
username     False
subreddit    False
utc          False
dtype: bool


# Evaluating our SVD Model - with Test & Train by Sampling 500 users

In [7]:
sample_username = list(reddit_df.username.unique())[300:800]
sample_df = reddit_df[reddit_df.username.isin(sample_username)]

users = list(sample_df.username.unique())
subreddits = list(sample_df.subreddit.unique())

In [8]:
subs_freq = sample_df.groupby(['subreddit']
                                      , as_index=False).agg({'username': 'count'}).sort_values(by=['username']
                                      , ascending=False).reset_index(drop=True).rename(columns={'username':'username_count'})
subs_freq['cummulative_pct'] = subs_freq.username_count.cumsum()/subs_freq.username_count.sum()*100

latent_fac = subs_freq.subreddit[subs_freq.cummulative_pct <= 65].count()
contribution_pcts = round(subs_freq.cummulative_pct[len(subs_freq.subreddit[subs_freq.cummulative_pct <= 65])-1],1)

print("Top", latent_fac ,"subreddits contribute a total of"
      , contribution_pcts,"%", "to the total subreddits in the dataset")



Top 134 subreddits contribute a total of 64.9 % to the total subreddits in the dataset


In [9]:
data =sample_df.groupby(['username','subreddit']).agg({'subreddit':'count',
                                                                 'utc':'max'}).\
              rename(columns={'subreddit':'submission_freq','utc':'most_recent_timestamp'}).reset_index()
data.head(10)

Unnamed: 0,username,subreddit,submission_freq,most_recent_timestamp
0,-SA-HatfulOfHollow,news,1,1482761000.0
1,-SA-HatfulOfHollow,reddevils,1,1482742000.0
2,-SA-HatfulOfHollow,soccer,1,1482771000.0
3,-SA-HatfulOfHollow,worldnews,11,1476293000.0
4,-_-_-_-otalp-_-_-_-,Android,3,1475605000.0
5,-_-_-_-otalp-_-_-_-,AskAnthropology,2,1480134000.0
6,-_-_-_-otalp-_-_-_-,AskReddit,2,1482744000.0
7,-_-_-_-otalp-_-_-_-,BlackPeopleTwitter,6,1482560000.0
8,-_-_-_-otalp-_-_-_-,CrazyIdeas,1,1480079000.0
9,-_-_-_-otalp-_-_-_-,DC_Cinematic,2,1476638000.0


In [10]:
user_sum = data.groupby(['username'], as_index=False).agg({'submission_freq':'sum'})
temp = pd.merge(left = data, right = user_sum, how='left', left_on='username',right_on='username').\
                rename(columns={'submission_freq_y':'user_sum',
                               'submission_freq_x':'submission_freq'})
data['user_implicit_rating'] = temp.submission_freq/temp.user_sum
data.drop(columns=['submission_freq'], inplace=True)
data = pd.concat([data.iloc[:,:2],data.iloc[:,-1:],data['most_recent_timestamp']], axis=1)
data.dropna(inplace = True)

In [11]:
data.shape

(32018, 4)

# Splitting Train and Test Dataset based on Timestamp (utc)

- Calculating implicit rating using number of submissions per subreddit.
- Ideally need data for upvotes.

In [12]:
users = data['username'].unique() #list of all users
subs = data['subreddit'].unique() #list of all movies

test = pd.DataFrame(columns=data.columns)
train = pd.DataFrame(columns=data.columns)
test_ratio = 0.2 #fraction of data to be used as test set.
temp1 = data[data.username.isin(users)]
for u in users:
    n = len(temp1)
    test_size = int(test_ratio*n)

temp1 = temp1.sort_values('most_recent_timestamp').reset_index()
temp1.drop('index', axis=1, inplace=True)
    
dummy_test = temp1.ix[n-1-test_size :]
dummy_train = temp1.ix[: n-2-test_size]
    
test = pd.concat([test, dummy_test])
train = pd.concat([train, dummy_train])

.ix is deprecated. Please use
.loc for label based indexing or
.iloc for positional indexing

See the documentation here:
http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#ix-indexer-is-deprecated
  from ipykernel import kernelapp as app
.ix is deprecated. Please use
.loc for label based indexing or
.iloc for positional indexing

See the documentation here:
http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#ix-indexer-is-deprecated
  app.launch_new_instance()


In [13]:
print("""Train Data for User "-_-_-_-otalp-_-_-_-"        :""")
print(train[train.username == '-_-_-_-otalp-_-_-_-'].iloc[:,:3])
print(" ")
print("""Test Data for User "-_-_-_-otalp-_-_-_-"        :""")
print(test[test.username == '-_-_-_-otalp-_-_-_-'].iloc[:,:3])

Train Data for User "-_-_-_-otalp-_-_-_-"        :
                  username             subreddit  user_implicit_rating
11220  -_-_-_-otalp-_-_-_-             worldnews              0.001017
11222  -_-_-_-otalp-_-_-_-            OCOCTATIAT              0.001017
11365  -_-_-_-otalp-_-_-_-           nottheonion              0.001017
11405  -_-_-_-otalp-_-_-_-     millionairemakers              0.001017
11600  -_-_-_-otalp-_-_-_-          changemyview              0.001017
11650  -_-_-_-otalp-_-_-_-                  news              0.001017
12684  -_-_-_-otalp-_-_-_-               Android              0.003052
13535  -_-_-_-otalp-_-_-_-               chomsky              0.002035
13634  -_-_-_-otalp-_-_-_-          DC_Cinematic              0.002035
14440  -_-_-_-otalp-_-_-_-        TheoryOfReddit              0.002035
14453  -_-_-_-otalp-_-_-_-         UpliftingNews              0.001017
14629  -_-_-_-otalp-_-_-_-          the_meltdown              0.001017
15198  -_-_-_-otalp-_-_-_-

### Transforming the Dataframe into Utility Matrix for SVD Computation Later

In [22]:
userList = data.ix[:,0].tolist()
itemList = data.ix[:,1].tolist()
valueList = data.ix[:,2].tolist()
users = list(set(data.ix[:,0]))
items = list(set(data.ix[:,1]))
users_index = {users[i]: i for i in range(len(users))}
pd_dict = {item: [np.nan for i in range(len(users))] for item in items}
for i in range(0,len(data)):
    item = itemList[i]
    user = userList[i]
    value = valueList[i]
    pd_dict[item][users_index[user]] = value
X = pd.DataFrame(pd_dict)
X.index = users

.ix is deprecated. Please use
.loc for label based indexing or
.iloc for positional indexing

See the documentation here:
http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#ix-indexer-is-deprecated
  """Entry point for launching an IPython kernel.
.ix is deprecated. Please use
.loc for label based indexing or
.iloc for positional indexing

See the documentation here:
http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#ix-indexer-is-deprecated
  
.ix is deprecated. Please use
.loc for label based indexing or
.iloc for positional indexing

See the documentation here:
http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#ix-indexer-is-deprecated
  This is separate from the ipykernel package so we can avoid doing imports until
.ix is deprecated. Please use
.loc for label based indexing or
.iloc for positional indexing

See the documentation here:
http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#ix-indexer-is-deprecated
 

In [23]:
X[X.index == '-_-_-_-otalp-_-_-_-']['soccer']

-_-_-_-otalp-_-_-_-    0.348932
Name: soccer, dtype: float64

In [27]:
def svd(train, k):
    utilMat = np.array(train)
    # the nan or unavailable entries are masked
    mask = np.isnan(utilMat)
    masked_arr = np.ma.masked_array(utilMat, mask)
    item_means = np.mean(masked_arr, axis=0)
    # nan entries will replaced by the average rating for each item
    utilMat = masked_arr.filled(item_means)
    x = np.tile(item_means, (utilMat.shape[0],1))
    # we remove the per item average from all entries.
    # the above mentioned nan entries will be essentially zero now
    utilMat = utilMat - x
    # The magic happens here. U and V are user and item features
    U, s, V=np.linalg.svd(utilMat, full_matrices=False)
    s=np.diag(s)
    # we take only the k most significant features
    s=s[0:k,0:k]
    U=U[:,0:k]
    V=V[0:k,:]
    s_root=sqrtm(s)
    Usk=np.dot(U,s_root)
    skV=np.dot(s_root,V)
    UsV = np.dot(Usk, skV)
    UsV = UsV + x
    print("svd done")
    return UsV

In [32]:
def mse(true, pred):
    # this will be used towards the end
    x = true - pred
    return sum([xi*xi for xi in x])/len(x)

def mae(true, pred):
    # this will be used towards the end
    x = abs(true - pred)
    return sum([xi for xi in x])/len(x)


# to test the performance over a different number of features
no_of_features = [134]

svdout = svd(X, k=134)
pred = [] #to store the predicted ratings
    
for _,row in test.iterrows():
        user = row['username']
        item = row['subreddit']
        u_index = users_index[user]
        if item in items_index:
            i_index = items_index[item]
            pred_rating = svdout[u_index, i_index]
        else:
            pred_rating = np.mean(svdout[u_index, :])
        pred.append(pred_rating)
print(mse(test['user_implicit_rating'], pred))
print(mae(test['user_implicit_rating'], pred))


svd done
0.0001907806366327251
0.005219784277697061


## Preparing the dataset for our SVD Recommendation Demo

In [63]:
top_subreddit = reddit_df.groupby(['subreddit']
                                      , as_index=False).agg({'username': 'count'}).sort_values(by=['username']
                                      , ascending=False).reset_index(drop=True).rename(columns={'username':'username_count'})
top_subreddit['cummulative_pct'] = top_subreddit.username_count.cumsum()/top_subreddit.username_count.sum()*100
top_subreddit.head()

Unnamed: 0,subreddit,username_count,cummulative_pct
0,AskReddit,683932,7.282656
1,politics,260215,10.053482
2,The_Donald,146480,11.613232
3,nfl,122088,12.913252
4,worldnews,109187,14.075899


In [64]:
user = reddit_df.username.unique()
subreddit = reddit_df.subreddit.unique()
doc_df = reddit_df.groupby('username')['subreddit'].apply(lambda x: "%s" % ' '.join(x)).reset_index()
doc_df.head()

Unnamed: 0,username,subreddit
0,--ANUSTART-,Testosterone Testosterone Testosterone Testost...
1,--Sko--,DestinyTheGame DestinyTheGame DestinyTheGame D...
2,--UNKN0WN--,AceAttorney AceAttorney AceAttorney AceAttorne...
3,--harley--quinn--,LGBTeens Patriots asktransgender Patriots Patr...
4,-A-p-r-i-l-,tdi tdi tdi AskReddit tdi tdi tdi tdi tdi tdi ...


In [65]:
tokenizer = TreebankWordTokenizer()
document = doc_df.iloc[:, 1]
document = document.apply(lambda row: tokenizer.tokenize(row))
document.head()

0    [Testosterone, Testosterone, Testosterone, Tes...
1    [DestinyTheGame, DestinyTheGame, DestinyTheGam...
2    [AceAttorney, AceAttorney, AceAttorney, AceAtt...
3    [LGBTeens, Patriots, asktransgender, Patriots,...
4    [tdi, tdi, tdi, AskReddit, tdi, tdi, tdi, tdi,...
Name: subreddit, dtype: object

## Creating User-Subreddit Matrix
Using CSC Matrix to Handle highly sparse matrix. To view normally, use : user_subreddit_matrix.todense()

In [66]:
corpus_of_subs = []
for subreddits in subreddit:
    corpus_of_subs.append(subreddits)


voc2id = dict(zip(corpus_of_subs, range(len(corpus_of_subs))))
rows, cols, vals = [], [], []
for r, d in enumerate(document):
    for e in d:
        if voc2id.get(e) is not None:
            rows.append(r)
            cols.append(voc2id[e])
            vals.append(1)
user_subreddit_matrix = csc_matrix((vals, (rows, cols)), dtype=np.float32)
print((user_subreddit_matrix.shape))

(14999, 29280)


In [67]:
def computeSVD(user_subreddit_matrix, no_of_latent_factors):
    
    """Compute the SVD of the given matrix.
    :user_subreddit_matrix: a numeric matrix
    :no_of_latent_factors : numeric scalar value
    
    :U  : User to concept matrix 
    :S  : Strength of the concepts matrix
    :Vt : Subreddit to concept matrix
    """
    U, s, Vt = sparsesvd(user_subreddit_matrix, no_of_latent_factors)
    
    dim = (len(s), len(s))
    S = np.zeros(dim, dtype=np.float32)
    for i in range(0, len(s)):
        S[i,i] = mt.sqrt(s[i])

    U = csc_matrix(np.transpose(U), dtype=np.float32)
    S = csc_matrix(S, dtype=np.float32)
    Vt = csc_matrix(Vt, dtype=np.float32)

    return U, S, Vt

In [68]:
#Compute estimated recommendations for the given user
def computeEstimatedRecommendation(U, S, Vt, uTest):
    """Compute the recommendation for the given user.
    
    :U     : User to concept matrix 
    :S     : Strength of the concepts matrix
    :Vt    : Subreddit to concept matrix
    :uTest : Index of the user for which the recommendation has to be made
    
    :recom : List of recommendations made to the user
    """
 
    #constants defining the dimensions of the estimated rating matrix
    MAX_PID = len(subreddit)
    MAX_UID = len(user)
    
    rightTerm = S*Vt 

    EstimatedRecommendation = np.zeros(shape=(MAX_UID, MAX_PID), dtype=np.float16)
    for userTest in uTest:
        prod = U[userTest, :]*rightTerm
        # Converting the vector to dense format in order to get the indices 
        # of the movies with the best estimated ratings 
        
        EstimatedRecommendation[userTest, :] = prod.todense()
        recom = (-EstimatedRecommendation[userTest, :]).argsort()[:293]
    return recom

In [69]:
n_latent_fact = top_subreddit.subreddit[top_subreddit.cummulative_pct <= 65].count()
contribution_pct = round(top_subreddit.cummulative_pct[len(subreddit[top_subreddit.cummulative_pct <= 65])-1],1)

print("Top", n_latent_fact ,"subreddits contribute a total of"
      , contribution_pct,"%", "to the total subreddits in the dataset")


Top 293 subreddits contribute a total of 65.0 % to the total subreddits in the dataset


## Recommendation Demo 1

In [127]:
no_of_latent_factors = top_subreddit.subreddit[top_subreddit.cummulative_pct <= 65].count() #293
no_of_recommendations_for_each_user = 5
uTest = [np.where(user == 'CarnationsPls')[0][0]]
U, S, Vt = computeSVD(user_subreddit_matrix, no_of_latent_factors)

In [128]:
print("------------------------------------------------------------------------------------\n")
print("Redditor: %s\n" % user[uTest[0]])
print("------------------------------------------------------------------------------------\n")
print("User Subreddit History - \n")

##Getting users subs history where the vals in the matrix != 0
previous_subredit_history = subreddit[np.where(user_subreddit_matrix[uTest[0],:].todense().T != 0)[0]]
previous_subredit_history
for previous_subredits in previous_subredit_history:
     print(previous_subredits)
print("\n------------------------------------------------------------------------------------\n")

------------------------------------------------------------------------------------

Redditor: CarnationsPls

------------------------------------------------------------------------------------

User Subreddit History - 

sports
gaming
gifs
AskReddit
fo4
todayilearned
TheLastAirbender
realrule34

------------------------------------------------------------------------------------



In [129]:
#Get the top 5 subreddit recommendations for test user
recommended_items = computeEstimatedRecommendation(U, S, Vt, uTest)
final_recommendation = []
for r in subreddit[recommended_items]:
    ##Making sure the subreddits aren't from what they already viewed before (for Novelty)
    if r not in previous_subredit_history:
        final_recommendation.append(r)
        if len(final_recommendation) == no_of_recommendations_for_each_user:
            break

print("------------------------------------------------------------------------------------\n")
print("Recommendation for %s : \n" % user[uTest[0]])
print("------------------------------------------------------------------------------------\n")

for recommendation in final_recommendation:
    print(recommendation)
print("------------------------------------------------------------------------------------\n")


------------------------------------------------------------------------------------

Recommendation for CarnationsPls : 

------------------------------------------------------------------------------------

Calgary
Amd
pcgaming
techsupport
NoMansSkyTheGame
------------------------------------------------------------------------------------



## Recommendation Demo 2

In [141]:
no_of_latent_factors = top_subreddit.subreddit[top_subreddit.cummulative_pct <= 65].count() #293
no_of_recommendations_for_each_user = 5
uTest = [np.where(user == 'comicfan815')[0][0]]
U, S, Vt = computeSVD(user_subreddit_matrix, no_of_latent_factors)

In [142]:
print("------------------------------------------------------------------------------------\n")
print("Redditor: %s\n" % user[uTest[0]])
print("------------------------------------------------------------------------------------\n")
print("User Subreddit History - \n")

##Getting users subs history where the vals in the matrix != 0
previous_subredit_history = subreddit[np.where(user_subreddit_matrix[uTest[0],:].todense().T != 0)[0]]
previous_subredit_history
for previous_subredits in previous_subredit_history:
     print(previous_subredits)
print("\n------------------------------------------------------------------------------------\n")

------------------------------------------------------------------------------------

Redditor: comicfan815

------------------------------------------------------------------------------------

User Subreddit History - 

reactiongifs
cringepics
nba
lakers
NBA2k

------------------------------------------------------------------------------------



In [143]:
#Get the top 5 subreddit recommendations for test user
recommended_items = computeEstimatedRecommendation(U, S, Vt, uTest)
final_recommendation = []
for r in subreddit[recommended_items]:
    ##Making sure the subreddits aren't from what they already viewed before (for Novelty)
    if r not in previous_subredit_history:
        final_recommendation.append(r)
        if len(final_recommendation) == no_of_recommendations_for_each_user:
            break

print("------------------------------------------------------------------------------------\n")
print("Recommendation for %s : \n" % user[uTest[0]])
print("------------------------------------------------------------------------------------\n")

for recommendation in final_recommendation:
    print(recommendation)
print("------------------------------------------------------------------------------------\n")

------------------------------------------------------------------------------------

Recommendation for comicfan815 : 

------------------------------------------------------------------------------------

rockets
warriors
bostonceltics
sixers
torontoraptors
------------------------------------------------------------------------------------



## The End