# Video to Video Ranker
### A content-filtering kNN model

In order to promote content diversity, content delivery platforms usually employ models that connect users with content that is similar to what they have been exposed to already. 

These models are trained only to examine the similarities between the content available.

In this notebook, you'll see that I've used the kNNBaseline model from the Python Surprise package to start and have left my best performing iteration from there.

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline
import seaborn as sns

import os
from surprise import Dataset
from surprise import Reader
from surprise import accuracy
from surprise import KNNBasic
from surprise import KNNBaseline
from surprise import KNNWithMeans

In [2]:
#notify me when a long running cell is complete
%load_ext jupyternotify

<IPython.core.display.Javascript object>

# Import Data and Data Split

If you haven't already, you can see how I split the data in the 1M_PVR Notebook.

In [3]:
data = pd.read_csv('data/1m_useratt_minreq.csv')
minorityrec = pd.read_csv('data/minreq.csv')

train = pd.read_csv('data/train_1M.csv')
test = pd.read_csv('data/test_1M.csv')
holdout = pd.read_csv('data/ho_1M.csv')

print('holdout shape: ', holdout.shape)
print('test shape: ', test.shape)
print('train shape: ', train.shape)

holdout shape:  (100000, 11)
test shape:  (200000, 11)
train shape:  (700000, 11)


In [8]:
data['m_decade'].replace(0, 6, inplace=True)
train['m_decade'].replace(0, 6, inplace=True)
test['m_decade'].replace(0, 6, inplace=True)
holdout['m_decade'].replace(0, 6, inplace=True)

## Preparing Data in Surprise Format

In [9]:
reader = Reader(rating_scale=(1,5))

train_data = Dataset.load_from_df(train[['cust_id','mid','rating']], reader)
test_data = Dataset.load_from_df(test[['cust_id','mid','rating']], reader)
ho_data = Dataset.load_from_df(holdout[['cust_id','mid','rating']], reader)
all_data = Dataset.load_from_df(data[['cust_id','mid','rating']], reader)

#correct surprise dataset format
train_sr = train_data.build_full_trainset()

test_sr1 = test_data.build_full_trainset()
test_sr = test_sr1.build_testset()

ho_sr1 = ho_data.build_full_trainset()
ho_sr = ho_sr1.build_testset()

all_sr = all_data.build_full_trainset()

# kNN Baseline Model

In [10]:
sim_dict = {'name': 'cosine', 'user_based': False}
knn_bsl1 = KNNBaseline(sim_options=sim_dict)
knn_bsl1.fit(train_sr)
knn_bsl1_preds = knn_bsl1.test(test_sr)

accuracy.rmse(knn_bsl1_preds)
accuracy.mae(knn_bsl1_preds)

Estimating biases using als...
Computing the cosine similarity matrix...
Done computing similarity matrix.
RMSE: 1.1160
MAE:  0.8709


0.8709091973688263

# Best Performing kNN Model

In [None]:
sim_dict = {'name': 'cosine', 'user_based': False}
knn_bsl = KNNBaseline(min_k=5, sim_options=sim_dict)
knn_bsl.fit(train_sr)
knn_bsl_preds = knn_bsl.test(test_sr)

accuracy.rmse(knn_bsl_preds)
accuracy.mae(knn_bsl_preds)

In [None]:
knn_bsl_preds_ho = knn_bsl.test(ho_sr)

accuracy.rmse(knn_bsl_preds_ho)
accuracy.mae(knn_bsl_preds_ho)

## DECADE

In [11]:
reader_decade = Reader(rating_scale=(1,6))

train_data_dec = Dataset.load_from_df(train[['cust_id','mid','m_decade']], reader_decade)
test_data_dec = Dataset.load_from_df(test[['cust_id','mid','m_decade']], reader_decade)
ho_data_dec = Dataset.load_from_df(holdout[['cust_id','mid','m_decade']], reader_decade)
all_data_dec = Dataset.load_from_df(data[['cust_id','mid','m_decade']], reader_decade)

#correct surprise dataset format
train_sr_dec = train_data_dec.build_full_trainset()

test_sr1_dec = test_data_dec.build_full_trainset()
test_sr_dec = test_sr1_dec.build_testset()

ho_sr1_dec = ho_data_dec.build_full_trainset()
ho_sr_dec = ho_sr1_dec.build_testset()

all_sr_dec = all_data_dec.build_full_trainset()

In [13]:
sim_dict = {'name': 'cosine', 'user_based': False}
knn_bsl_dec = KNNBaseline(sim_options=sim_dict)
knn_bsl_dec.fit(train_sr_dec)
knn_bsl_dec_preds = knn_bsl_dec.test(test_sr_dec)

accuracy.rmse(knn_bsl_dec_preds)
accuracy.mae(knn_bsl_dec_preds)

Estimating biases using als...
Computing the cosine similarity matrix...
Done computing similarity matrix.
RMSE: 0.2507
MAE:  0.1017


0.10170018278665467

In [18]:
def dec_rec(cust_id):
        
    # last movie they rated
    rated = data[data['cust_id'] == cust_id]['mid'].to_list()
    mid = rated[-1]

    # neighbors
    neighbors = knn_bsl_dec.get_neighbors(all_sr.to_inner_iid(mid), k=10)

    #get raw item ids
    item_ids = [all_sr.to_raw_iid(inner_id) for inner_id in neighbors]

    #returns list of cust_id, iid, neighbor for each neighbor
    neighbors_list = [(cust_id, mid, iid) for iid in item_ids]

    #new df of recs for analysis
    neighbors_df = pd.DataFrame(neighbors_list, columns=["cust_id", "mid", 'recs'])
    return neighbors_df

In [19]:
test = dec_rec(2407458)
test

Unnamed: 0,cust_id,mid,recs
0,2407458,4157,15436
1,2407458,4157,9645
2,2407458,4157,7048
3,2407458,4157,8627
4,2407458,4157,8333
5,2407458,4157,12677
6,2407458,4157,16913
7,2407458,4157,8507
8,2407458,4157,15813
9,2407458,4157,443


In [23]:
ruser = data.sample(1, random_state=2)
ruser

Unnamed: 0,mid,cust_id,rating,r_date,m_decade,m_avg_rating,user_engagement,cust_act_activity_rank,adopters,m_minreq
309190,607,2200332,3.0,2005-05-14,4,3.521739,6,4,4,0.0


In [25]:
test2 = dec_rec(2200332)
test2

Unnamed: 0,cust_id,mid,recs
0,2200332,16563,16912
1,2200332,16563,17251
2,2200332,16563,4147
3,2200332,16563,6574
4,2200332,16563,1123
5,2200332,16563,12303
6,2200332,16563,551
7,2200332,16563,7242
8,2200332,16563,3332
9,2200332,16563,4996


In [27]:
data[(data['mid'] == 5938)]

Unnamed: 0,mid,cust_id,rating,r_date,m_decade,m_avg_rating,user_engagement,cust_act_activity_rank,adopters,m_minreq
971572,5938,289710,3.0,2001-03-10,6,2.0,18,5,2,0.0
971573,5938,1272379,1.0,2002-04-06,6,2.0,36,5,2,0.0


In [28]:
data[(data['cust_id'] == 1272379)]

Unnamed: 0,mid,cust_id,rating,r_date,m_decade,m_avg_rating,user_engagement,cust_act_activity_rank,adopters,m_minreq
60694,6510,1272379,4.0,2002-01-05,4,3.927431,36,5,2,0.0
145042,7381,1272379,1.0,2002-01-05,4,3.192029,36,5,2,0.0
196211,5287,1272379,3.0,2002-03-16,1,3.801418,36,5,2,0.0
310752,15748,1272379,1.0,2002-01-25,4,3.05298,36,5,2,1.0
312644,17622,1272379,1.0,2005-02-09,5,3.101498,36,5,2,0.0
482706,11042,1272379,1.0,2002-04-28,2,3.734177,36,5,2,0.0
488405,9442,1272379,2.0,2005-01-06,5,3.590998,36,5,2,0.0
513723,8845,1272379,3.0,2002-03-28,5,3.37931,36,5,2,0.0
517526,5112,1272379,4.0,2002-01-05,5,3.534946,36,5,2,0.0
692779,5474,1272379,2.0,2002-03-06,5,2.636364,36,5,2,0.0


In [30]:
test3 = dec_rec(1272379)
test3

Unnamed: 0,cust_id,mid,recs
0,1272379,11566,11337
1,1272379,11566,15578
2,1272379,11566,2286
3,1272379,11566,16835
4,1272379,11566,3113
5,1272379,11566,2953
6,1272379,11566,13882
7,1272379,11566,16912
8,1272379,11566,10785
9,1272379,11566,7533


# V2V Ranker

I retrain the best model using all the data and then generate a V2V Ranker based only on the last video that a user rated.

In [None]:
#retrain on all data
knn_bsl.fit(all_sr)

In [None]:
def v2v_recs(user_list):
    
    '''
    this is an unpersonalized item based recommendation
    returns a movie that is similar to one that has been watched 
    but doesn't take into account any other user behavior
    '''
    
    v2v = []
    
    def v2v_mod(cust_id):
        
        # last movie they rated
        rated = data[data['cust_id'] == cust_id]['mid'].to_list()
        mid = rated[-1]
        
        # neighbors
        neighbors = knn_bsl.get_neighbors(all_sr.to_inner_iid(mid), k=10)
        
        #get raw item ids
        item_ids = [all_sr.to_raw_iid(inner_id) for inner_id in neighbors]

        #returns list of cust_id, iid, neighbor for each neighbork
        return [(cust_id, iid, neighbor) for iid in item_ids]
    
    #for all users get the nearest neighbors to the last movie they rated
    for cust_id in user_list:
        user_recs = v2v_mod(cust_id)
        v2v.extend(user_recs)
    
    #new df of recs for analysis
    neighbors_df = pd.DataFrame(v2v, columns=["cust_id", "mid", 'recs'])
    return neighbors_df

# V2V Ranker Results

In [None]:
#list of all users
all_users = data['cust_id'].unique()
len(all_users)

In [None]:
v2v_df = v2v_recs(all_users)
v2v_df.head()

In [None]:
v2v_df.to_csv('data/v2vrecs.csv', index=False)

In [None]:
v2v_df = v2v_df.merge(minorityrec)
v2v_df.head()

In [None]:
v2v_df['mid'].value_counts()

In [None]:
v2v_df['m_minreq'].value_counts()

In [None]:
v2v_df['m_minreq'].value_counts(normalize=True)

Unsurprisingly, this model performed the best in terms of content diversirty. It recommends almost 14k distinct videos and contains almost 18% instances of minority driven content.

# User #2407458

Our example user was given the following results for this ranker:

In [None]:
example = data[data['cust_id'] == 2407458]
example

In [None]:
v2v_example = v2v_df[(v2v_df['cust_id'] == 2407458)]
v2v_example