In [1]:
from io import StringIO
from io import BytesIO
import requests
import json
import pandas as pd
import numpy as np
import itertools
from datetime import datetime as dt
from sklearn.metrics.pairwise import cosine_similarity

In [2]:
# @hidden_cell
# This function accesses a file in your Object Storage. The definition contains your credentials.
# You might want to remove those credentials before you share your notebook.
*****Credentials*****
data = pd.read_csv(get_object_storage_file_with_credentials_9c051c366b95483f8eb41bf30eec97a7('DefaultProjectsaicolosseumtechcoin', 'reddit_data.csv'))
data.head()

Unnamed: 0,username,subreddit,utc
0,kabanossi,photoshopbattles,1482747899
1,kabanossi,GetMotivated,1482747794
2,kabanossi,vmware,1482747676
3,kabanossi,carporn,1482747618
4,kabanossi,DIY,1482747479


In [3]:
data['date'] = data['utc'].apply(lambda x: dt.fromtimestamp(x).date()) # Extracting date from utc

In [4]:
data.head()

Unnamed: 0,username,subreddit,utc,date
0,kabanossi,photoshopbattles,1482747899,2016-12-26
1,kabanossi,GetMotivated,1482747794,2016-12-26
2,kabanossi,vmware,1482747676,2016-12-26
3,kabanossi,carporn,1482747618,2016-12-26
4,kabanossi,DIY,1482747479,2016-12-26


In [5]:
data.tail()

Unnamed: 0,username,subreddit,utc,date
13999995,RushNY,iOSthemes,1405371369,2014-07-14
13999996,RushNY,iOSthemes,1404528135,2014-07-04
13999997,RushNY,cars,1404229927,2014-07-01
13999998,RushNY,Toyota,1404143090,2014-06-30
13999999,RushNY,jailbreak,1404094741,2014-06-29


In [6]:
data['username'].nunique()

22610

In [7]:
data['subreddit'].nunique()

34967

users = set(data['username'])
subreddits = set(data['subreddit'])

# Outlier
Users who've visited the site lesser times can be outliers. But they can very well also be new users. We thus decide to treat those users as outliers who haven't visited the site one year prior to the current date.

In [8]:
data['year'] = data['date'].apply(lambda x: x.year)
data_old = data[data['year'] != 2016]

In [9]:
outliers = data_old['username'].value_counts().sort_values()[:1109].index

In [10]:
data.drop('year', axis  = 1, inplace = True)

In [11]:
outliers[:5]

Index([u'AH-17_', u'dumbkid12', u'Northgates', u'Starlord_26', u'curious6789'], dtype='object')

# Extracting 'weight' feature from utc.
The reasoning here is that subreddits which were searched for years back probably would not have as much relevance as recently searched subreddits do. Thus we normalise the utc value to use it as a weight column

In [12]:
data['utc'] = data['utc'] - data['utc'].min() + 1

In [13]:
data['utc'] = data['utc'] / data['utc'].max()
data.rename(columns = {'utc': 'weight'}, inplace = True)

# Extracting user wise information
This enables us to easily extract information about a user by indexing the username. 

In [14]:
users_subreddits = pd.DataFrame(data.groupby(['username', 'subreddit'])['weight'].sum())

In [15]:
users_subreddits.head()

Unnamed: 0_level_0,Unnamed: 1_level_0,weight
username,subreddit,Unnamed: 2_level_1
--ANUSTART-,AOImmortals,1.957625
--ANUSTART-,Addons4Kodi,0.904965
--ANUSTART-,AdviceAnimals,6.469323
--ANUSTART-,AskReddit,13.022203
--ANUSTART-,Assistance,8.923811


# Flattening the multi-index dataframe extracted above
This gives us a clear comparision between every user

In [16]:
user_similarity_vectors = users_subreddits.unstack(level = -1)

In [17]:
user_similarity_vectors.fillna(0, inplace = True)
user_similarity_vectors.head()

Unnamed: 0_level_0,weight,weight,weight,weight,weight,weight,weight,weight,weight,weight,weight,weight,weight,weight,weight,weight,weight,weight,weight,weight,weight
subreddit,007,065_082_071,0ad,0x10c,0x3642,1000degreeknife,1000thworldproblems,100DayComicChallenge,100DaysofKeto,100kin,...,zorinos,zsh,zumba,zurich,zwave,zweiteliga,zxspectrum,zyramains,zyzz,zzt
username,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2,Unnamed: 8_level_2,Unnamed: 9_level_2,Unnamed: 10_level_2,Unnamed: 11_level_2,Unnamed: 12_level_2,Unnamed: 13_level_2,Unnamed: 14_level_2,Unnamed: 15_level_2,Unnamed: 16_level_2,Unnamed: 17_level_2,Unnamed: 18_level_2,Unnamed: 19_level_2,Unnamed: 20_level_2,Unnamed: 21_level_2
--ANUSTART-,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
--Sko--,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
--UNKN0WN--,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
--harley--quinn--,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
-A-p-r-i-l-,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


# Creating a dataframe to get the similarities between every users.
Cosine similarity is used to compute the similarities between every user

In [19]:
users = data['username'].unique()

In [20]:
similarity = pd.DataFrame(cosine_similarity(user_similarity_vectors), index = users, columns = users)

In [21]:
similarity.head()

Unnamed: 0,kabanossi,MahatmaGuru,Drums_And_Dreams,Dockhead,bob-leblaw,Very_High_IQ_Yes,CarnationsPls,414D59,Rankscar,thelonghauls,...,anz_cheer_up,SaiyanOfDarkness,Konzti,need_a_nick,sk3pt1c,theseconddennis,Insolent_redneck,Least_ValuablePlayer,applebrush,RushNY
kabanossi,1.0,0.000184,0,0.014229,0.03278,0.00518,0.016461,0.096339,0.001305,0.024663,...,0.003963,0.040275,0.096222,0.001477,0.000192,0.116619,0.030743,0,0,0.041362
MahatmaGuru,0.000184,1.0,0,0.000513,0.0,0.0,0.00022,6.5e-05,8.9e-05,0.0,...,0.000314,0.000104,0.000261,0.000134,0.0,0.000268,0.00022,0,0,0.000134
Drums_And_Dreams,0.0,0.0,1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.000138,0.0,0.0,0.0,0.0,0,0,0.0
Dockhead,0.014229,0.000513,0,1.0,0.030783,0.004601,0.01746,0.0909,0.003564,0.002529,...,0.004708,0.031422,0.087542,0.001025,0.0,0.103833,0.015314,0,0,0.03976
bob-leblaw,0.03278,0.0,0,0.030783,1.0,0.014086,0.015896,0.286398,0.0,0.006027,...,0.0,0.093193,0.253691,0.001096,0.0,0.30824,0.045567,0,0,0.121497


In [22]:
np.fill_diagonal(similarity.values,0)

In [23]:
similarity.head()

Unnamed: 0,kabanossi,MahatmaGuru,Drums_And_Dreams,Dockhead,bob-leblaw,Very_High_IQ_Yes,CarnationsPls,414D59,Rankscar,thelonghauls,...,anz_cheer_up,SaiyanOfDarkness,Konzti,need_a_nick,sk3pt1c,theseconddennis,Insolent_redneck,Least_ValuablePlayer,applebrush,RushNY
kabanossi,0.0,0.000184,0,0.014229,0.03278,0.00518,0.016461,0.096339,0.001305,0.024663,...,0.003963,0.040275,0.096222,0.001477,0.000192,0.116619,0.030743,0,0,0.041362
MahatmaGuru,0.000184,0.0,0,0.000513,0.0,0.0,0.00022,6.5e-05,8.9e-05,0.0,...,0.000314,0.000104,0.000261,0.000134,0.0,0.000268,0.00022,0,0,0.000134
Drums_And_Dreams,0.0,0.0,0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.000138,0.0,0.0,0.0,0.0,0,0,0.0
Dockhead,0.014229,0.000513,0,0.0,0.030783,0.004601,0.01746,0.0909,0.003564,0.002529,...,0.004708,0.031422,0.087542,0.001025,0.0,0.103833,0.015314,0,0,0.03976
bob-leblaw,0.03278,0.0,0,0.030783,0.0,0.014086,0.015896,0.286398,0.0,0.006027,...,0.0,0.093193,0.253691,0.001096,0.0,0.30824,0.045567,0,0,0.121497


# Creating a dataframe having the most similar user for each user to avoid computing it repetitively
This enables us to easily access the most similar user for any particular user.
This makes it computationally cheap as the similarity function and the most similar user matrix can be updated periodically.

In [24]:
user_user = pd.DataFrame(similarity.idxmax(axis = 1), index = similarity.index)

In [25]:
user_user.head()

Unnamed: 0,0
kabanossi,alexyct
MahatmaGuru,SlapDa
Drums_And_Dreams,nitotheblue
Dockhead,torahking
bob-leblaw,SandraSimmons001


# We create a dataframe consisting of the most searched subreddits from the previous month
We get the most recent trends through this.

In [26]:
data['year'] = data['date'].apply(lambda x: x.year)
data['month'] = data['date'].apply(lambda x: x.month)
last_month_data = data[(data['year'] == 2016) & (data['month'] == 12)]
data.drop(['year', 'month'], axis = 1, inplace = True)
last_month_data.drop(['date', 'year', 'month'], axis = 1, inplace = True)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy


In [27]:
recent_top_10 = last_month_data.groupby('subreddit')['weight'].sum().sort_values()[:10]

# Function used for recommendation
We use three information domains from the available data
1. User history
2. Similar user's history
3. Recent trends

In [28]:
def recommend(username):
    if username in users:
        if users_subreddits.xs((username,)).shape[0] >= 4:
            recommendations = set(users_subreddits.xs((username,)).sort_values(by= 'weight', ascending = False)[:4].index)
            similar_top_5 = set(users_subreddits.xs((user_user.loc[username][0], )).sort_values(by= 'weight', ascending = False)[:5].index)
            recommendations = recommendations.union(similar_top_5)
            recommendations = recommendations.union(recent_top_10[:10 - len(recommendations)].index)
            return pd.Series(list(recommendations))
        else:
            recommendations = set(users_subreddits.xs((username,)).sort_values(by= 'weight', ascending = False).index)
            similar_top_5 = set(users_subreddits.xs((user_user.loc[username][0], )).sort_values(by= 'weight', ascending = False)[:5].index)
            recommendations = recommendations.union(similar_top_5)
            recommendations = recommendations.union(recent_top_10[:10 - len(recommendations)].index)
            return pd.Series(list(recommendations))
    else:
        return pd.Series(recent_top_10.index)

In [29]:
solution_df = pd.DataFrame(index = users, columns = range(0,10))

In [30]:
solution_df = pd.Series(list(users)).apply(lambda x : recommend(x))
solution_df.index = list(users)

In [31]:
solution_df.head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9
kabanossi,Chasers,photoshopbattles,cratedigging,uniformporn,Deformed,paraballs,sysadmin,homelab,techsupport,GetMotivated
MahatmaGuru,cratedigging,todayilearned,self,RecruitCS,guns,Jokes,AskReddit,news,OverwatchLFT,GetMotivated
Drums_And_Dreams,funny,brasil,cratedigging,AskReddit,pics,PoliticalDiscussion,DIY,TwoXChromosomes,politics,The_Donald
Dockhead,Chasers,food,AskReddit,Deformed,cratedigging,worldnews,news,explainlikeimfive,paraballs,trees
bob-leblaw,Chasers,cratedigging,DoesAnybodyElse,todayilearned,AskReddit,Showerthoughts,acting,DaniellaMonet,shockwaveporn,canucks


In [32]:
solution_df.shape

(22610, 10)