In [2]:
import pandas as pd
import numpy as np

In [3]:
following = pd.read_json("json_data/lucid_table_following.json")
notifications = pd.read_json("json_data/lucid_table_notifications.json")
posts = pd.read_json("json_data/lucid_table_posts.json")
users = pd.read_json("json_data/lucid_table_users.json")

In [4]:
#Dropping irrevelant columns and renaming some columns
#column are renamed based on context for easier computing

following.drop('status', axis=1, inplace=True)
following.rename(columns={'my_id': 'followed_id', 'follower_id': 'user_id'}, inplace=True)
following.head()

Unnamed: 0,user_id,followed_id
0,6,3
1,3,6
2,2,3
3,7,3
4,2,7


In [5]:
cols = ['name', 'username', 'email', 'image', 'provider', 'provider_id', 'password', 'remember_token', 'created_at', 'updated_at', 'short_bio']

users_edited = users.drop(cols, axis=1)
users_edited.rename(columns={'id': 'user_id'}, inplace=True)
users_edited.head()

Unnamed: 0,user_id
0,1
1,2
2,3
3,4
4,5


In [6]:
col = ['slug', 'created_at', 'updated_at', 'image', 'status_id', 'action', 'post_id']

posts_edited = posts.drop(col, axis=1)
posts_edited.rename(columns={'id': 'post_id'}, inplace = True)
posts_edited.head()

Unnamed: 0,content,post_id,tags,title,user_id
0,I learnt how to use the table tag as i have us...,1,,What i have learnt so far on HTML,2077
1,"Â I am on this journey with start.ng, and here...",2,Technology,HTML BEGINS HERE,1719
2,I have not been attending classes on the HNG c...,4,,My Laziness In The Open,1310
3,My journey on **StartNG** pre-internship progr...,6,,MY TASK 2,1787
4,"Â A Summary on The â€œidongesit.htmlâ€� CV, It...",7,,Task 2,167


In [7]:
notif_col = ['id', 'parent_comment_id', 'status', 'type', 'created_at', 'updated_at']
notif_edited = notifications.drop(notif_col, axis=1)
notif_edited.head()

Unnamed: 0,action,comment,post_id,sender_id,user_id
0,Followed,,,4,7
1,Followed,,,1,4
2,Followed,,,3,45
3,Followed,,,1,4
4,Commented,Hey bro!,278.0,1,4


In [8]:
#Data for action between each sender, the post interacted with and the number of interactions
#Note that groupby will ignore the null values in post_id
#And also sender_id is renamed user_id as is in this context

inter = notif_edited.groupby(['sender_id', 'post_id'])['action'].apply(lambda x: x.notnull().sum()).reset_index()
inter.rename(columns={'sender_id': 'user_id'}, inplace=True)
inter

Unnamed: 0,user_id,post_id,action
0,1,278.0,1
1,1,353.0,1
2,1,355.0,2
3,1,985.0,1
4,1,996.0,1
5,1,999.0,1
6,2,985.0,2
7,2,992.0,3
8,2,993.0,1
9,2,998.0,3


#Notice that the data for interactions/actions is quite small compared to the number of users in the database

#The actions('like', 'comment', 'replied', 'love') by one user per post is only 35 rows 

In [9]:
# Number of posts and users in database

n_items = posts_edited.post_id.max()
n_users = users_edited.user_id.max()

In [10]:
#Merging tables to compute user-post matrix and user-follow matrix

df_inter = users_edited.merge(inter, on='user_id')
df_follow = users_edited.merge(following, on='user_id')
df_inter.shape, df_follow.shape

((36, 3), (5296, 2))

In [11]:
df_inter

Unnamed: 0,user_id,post_id,action
0,1,278.0,1
1,1,353.0,1
2,1,355.0,2
3,1,985.0,1
4,1,996.0,1
5,1,999.0,1
6,2,985.0,2
7,2,992.0,3
8,2,993.0,1
9,2,998.0,3


In [12]:
df_follow.head()

Unnamed: 0,user_id,followed_id
0,1,4
1,1,3
2,1,3
3,1,164
4,1,767


In [13]:
#data matrix for users and items
data_mat1 = np.zeros((n_users, n_items))

for line in df_inter.itertuples():
    data_mat1[line[1]-1, int(line[2])-1] = line[3]

In [14]:
#data matrix for users and people they follow
data_mat2 = np.zeros((n_users, n_users))

for line in df_follow.itertuples():
    data_mat2[line[1]-1, line[2]-1] = 1

The matrix product below between data_mat2 and data_mat1 is to get a matrix which is a linear function of the people a user  is following(follow_id(s)) and each post with respect to each user.

The new matrix will capture the interactions of the followed_id(s) and each post for each user

In [15]:
data_mat3 = data_mat2.dot(data_mat1)

In [16]:
#Combination of matrix 3 and 1
data_mat4 = data_mat3 + data_mat1

In [17]:
#Using pairwise_distances to compute similarity

from sklearn.metrics.pairwise import pairwise_distances
user_similarity = pairwise_distances(data_mat4,metric='cosine')

In [18]:
#function to get user predictions
#formula used is P = sum(r*s)/sum(s), where P is prediction
#r is rating and s is similarity

def predict(ratings, similarity):
    mean_user = ratings.mean(axis=1)
    ratings_dif = (ratings - mean_user[:, np.newaxis])
    pred = mean_user[:, np.newaxis] + similarity.dot(ratings) / np.array([np.abs(similarity).sum(axis=1)]).T
    return pd.DataFrame(np.abs(pred))

In [19]:
#function to get predictions and index for first five posts given user id
def get_pred(user_id, pred):
    token = dict(pred.iloc[user_id])
    token2 = dict(sorted(token.items(), key=lambda x: x[1], reverse=True))
    token3 = list(token2.keys())[:5]
    return token3

In [20]:
#function to return post title given index
def get_post(index, posts):
    post = pd.Series(posts_edited.title.values, index=posts_edited.post_id.values)
    return post.iloc[index]

In [21]:
#Here you can play around with all the matrices
user_pred = predict(data_mat4, user_similarity)

In [22]:
test_pred = get_pred(40, user_pred)

for i in test_pred:
    try:
        print(get_post(i, posts_edited))
    except:
        pass

My startNg Expectations
A Trip To Iceland


The matrix used in computing computing the predictions is sparse, as such the memory based collaborative model suffers a problem known as cold start which is very common with collaborative filters.