In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

from datetime import date
import random
import re

from sklearn.naive_bayes import MultinomialNB
from sklearn.model_selection import cross_val_score
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.model_selection import train_test_split
from sklearn.svm import SVC
from sklearn.svm import LinearSVC
from sklearn import linear_model
from sklearn.linear_model import LinearRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.model_selection import GridSearchCV
from sklearn.pipeline import Pipeline
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.multiclass import OneVsRestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.preprocessing import OneHotEncoder
from sklearn.cluster import KMeans

import nltk
from nltk.corpus import stopwords

import warnings
warnings.filterwarnings('ignore')

In [5]:
video_space = pd.read_csv("video_space.csv")
video_space = video_space.set_index('video_id')
patient_features = pd.read_csv("patient_features.csv")
watch_events = pd.read_csv("video_watched.csv")

In [6]:
del patient_features['Unnamed: 0']

In [358]:
#patient_features

In [359]:
#video_space

In [360]:
#watch_events

In [26]:
# kmeans models
kmeanModel = KMeans(n_clusters=100)
kmeanModel.fit(video_space)
kmeanModel.inertia_

440373.4693958165

In [27]:
# build watch list for each patient
watch_list = {}

for ind in watch_events.index:
    
    patient_id = watch_events.at[ind, 'patient_id']
    video_id = watch_events.at[ind, 'video_id']
    
    if patient_id in watch_list.keys():
        temp_list = watch_list.get(patient_id)
        if video_id not in temp_list:
            temp_list.append(video_id)
            watch_list[patient_id] = temp_list
    else:
        temp_list = []
        temp_list.append(video_id)
        watch_list[patient_id] = temp_list
        
    

In [28]:
# to test, get patients with more than 3 watch events
test_watched_list = {}

# for each patient
for patient in watch_list.keys():
    
    watched = watch_list.get(patient)

    # if list of watched videos less than 3, don't use in test (cold start concerns)
    if(len(watched) > 10):
        test_watched_list[patient] = watched
    

# Run from here

In [31]:
# pull two videos from each patient to try and guess with recommender
df = pd.DataFrame(index = test_watched_list.keys(), columns = ['watch_list', 'test_in', 'test_out', 'recommendation'])

for patient in test_watched_list.keys():
    
    patient_watched_list = test_watched_list.get(patient)
    df.at[patient, 'watch_list'] = patient_watched_list
    
    test_out = []
    test_in = []
    
    for i in range(0,2):
        rand = random.randint(0, len(patient_watched_list) - 1)
        test_out.append(patient_watched_list[rand])
        
    for el in patient_watched_list:
        if el not in test_out:
            test_in.append(el)
    
    df.at[patient, 'test_in'] = test_in
    df.at[patient, 'test_out'] = test_out
    

    
video_to_cat = pd.read_excel("video_id_to_cat.xlsx")
video_to_cat = video_to_cat.set_index('video_id')


video_to_category = {}
category_to_videos = {}
i = 0;

# build map from video to it's category
for video in video_space.index:
    
    video_to_category[video] = kmeanModel.labels_[i]
    i = i + 1

# build map from category to it's videos
for video in video_to_category.keys():

    cat = video_to_category.get(video)
    temp_list = []
    
    if cat in category_to_videos.keys():
        temp_list = category_to_videos.get(cat)
        temp_list.append(video)
        if video not in temp_list:
            category_to_videos[cat] = temp_list
    else:
        temp_list.append(video)
        category_to_videos[cat] = temp_list


In [32]:
df_ = df

# try to predcit for 10 patients
for i in range(0, 3000):
    
    # get random patient
    rand = random.randint(0, len(df.index) - 1)
    patient = df_.index[rand]
    
    #get their watched list for this test
    watched = df_.at[patient, 'test_in']
    #vars needed for bookkeeping
    rec_list = []
    rec_dict = {}
    
    # for each video in the patients watched list
    for video in watched:
        # get the primary category and predicted cluster
        primary = video_to_cat.at[video, 'category']
        cat_pred = kmeanModel.predict(video_space.loc[video].to_numpy().reshape(1,-1))
        
        # for each video in that cluster
        for rec_video in category_to_videos.get(cat_pred[0]):
            #if the video isn't already watched keep track of the count
            if rec_video not in df_.at[patient, 'test_in']:     
                # if the video is not in the count dict, add it. else get the current value and increment by 1
                if rec_video not in rec_dict.keys():
                    rec_dict[rec_video] = 1
                else:
                    value = rec_dict.get(rec_video)
                    rec_dict[rec_video] = value + 1
                
                    if len(rec_list) < 10:
                        rec_category = video_to_cat.at[rec_video, 'category']
                        if rec_category is primary:
                            rec_list.append(rec_video)
            # endif
        # end for
    #end for
    
    recommendation = pd.DataFrame.from_dict(rec_dict, orient='index')
    recommendation = recommendation.sort_values(0, ascending = False)
    recommendation = recommendation.reset_index()
    
    for i in range(0,20):
        video = recommendation.at[i, 'index']
        rec_list.append(int(video))
        
    df_.at[patient, 'recommendation'] = rec_list
    
#end for


df_ = df_.dropna()

good = 0
total = 0

for patient in df_.index:
    
    recs = df_.at[patient, 'recommendation']

    
    test_out = df_.at[patient, 'test_out']
    
    for video in test_out:
        total = total + 1
        if video in recs:
            good = good + 1
        #end if
    #end for
    
    
#end for

print("Good is " + str(good))
print("Total is "+ str(total))
df_

Good is 181
Total is 3492


Unnamed: 0,watch_list,test_in,test_out,recommendation
26208,"[2595, 3021, 3023, 3020, 2600, 2596, 609, 2587...","[2595, 3021, 3023, 3020, 2600, 609, 2587, 2599...","[2596, 3569]","[741, 780, 742, 3407, 4036, 4901, 4260, 4995, ..."
26209,"[3007, 678, 648, 2595, 615, 2596, 2587, 647, 2...","[3007, 678, 648, 615, 2596, 2587, 647, 2979, 701]","[2983, 2595]","[2989, 2991, 653, 3445, 775, 652, 776, 785, 77..."
26211,"[710, 2599, 734, 644, 646, 2600, 634, 3569, 30...","[710, 2599, 734, 644, 646, 634, 3569, 3023, 70...","[2600, 3022]","[741, 780, 742, 3407, 4036, 4901, 4260, 4995, ..."
26238,"[1179, 627, 735, 707, 3021, 653, 696, 3567, 30...","[1179, 627, 707, 3021, 653, 696, 3567, 3022, 2...","[3539, 735]","[2600, 3007, 3354, 3359, 622, 3012, 611, 3138,..."
26239,"[3021, 2602, 2599, 2979, 2601, 2600, 633, 774,...","[3021, 2602, 2599, 2979, 2600, 774, 631, 3354,...","[633, 2601]","[3007, 3359, 622, 3012, 611, 3138, 3139, 781, ..."
...,...,...,...,...
83061,"[707, 1706, 677, 2979, 4092, 3386, 4093, 3502,...","[707, 1706, 2979, 4092, 4093, 3502, 940, 4094,...","[677, 3386]","[3007, 3354, 3359, 2602, 622, 3012, 611, 3138,..."
83273,"[694, 646, 3510, 774, 643, 736, 626, 770, 3551...","[694, 646, 3510, 643, 736, 626, 770, 3551, 480...","[774, 661]","[2979, 710, 623, 696, 774, 695, 708, 706, 3505..."
83451,"[3356, 648, 3645, 3354, 620, 3644, 3139, 3646,...","[3356, 3645, 3354, 620, 3139, 3646, 3355, 676,...","[648, 3644]","[709, 2982, 619, 712, 3509, 3511, 3050, 698, 3..."
83911,"[3700, 3701, 3702, 706, 4094, 3509, 638, 637, ...","[3700, 3701, 3702, 706, 4094, 3509, 638, 637, ...","[760, 765]","[3401, 3396, 636, 2422, 2423, 2615, 999, 1000,..."


In [24]:
#distortions = []
#K = range(10,20)
#for k in K:
   # kmeanModel = KMeans(n_clusters=k)
   # kmeanModel.fit(video_space)
  #  distortions.append(kmeanModel.inertia_)
    #len(kmeansModel.labels_)


In [25]:
#plt.figure(figsize = (16,8))
#plt.plot(K, distortions, 'bx-')
#plt.xlabel('k')
#plt.ylabel('distortion')
#plt.title('Elbow Method Showing Optimal k')
#plt.show()