In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

from datetime import date
import random
import re

from sklearn.naive_bayes import MultinomialNB
from sklearn.model_selection import cross_val_score
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.model_selection import train_test_split
from sklearn.svm import SVC
from sklearn.svm import LinearSVC
from sklearn import linear_model
from sklearn.linear_model import LinearRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.model_selection import GridSearchCV
from sklearn.pipeline import Pipeline
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.multiclass import OneVsRestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.preprocessing import OneHotEncoder
from sklearn.cluster import KMeans

import nltk
from nltk.corpus import stopwords

import warnings
warnings.filterwarnings('ignore')

In [2]:
video_space = pd.read_csv("video_space.csv")
video_space = video_space.set_index('video_id')
patient_features = pd.read_csv("patient_features.csv")
watch_events = pd.read_csv("video_watched.csv")

In [3]:
del patient_features['Unnamed: 0']

In [4]:
#patient_features

In [5]:
#video_space

In [6]:
#watch_events

In [28]:
# kmeans models
kmeanModel = KMeans(n_clusters=20)
kmeanModel.fit(video_space)
kmeanModel.inertia_

945609.2026405078

In [29]:
# build watch list for each patient
watch_list = {}

for ind in watch_events.index:
    
    patient_id = watch_events.at[ind, 'patient_id']
    video_id = watch_events.at[ind, 'video_id']
    
    if patient_id in watch_list.keys():
        temp_list = watch_list.get(patient_id)
        if video_id not in temp_list:
            temp_list.append(video_id)
            watch_list[patient_id] = temp_list
    else:
        temp_list = []
        temp_list.append(video_id)
        watch_list[patient_id] = temp_list
        
    

In [30]:
# to test, get patients with more than 3 watch events
test_watched_list = {}

# for each patient
for patient in watch_list.keys():
    
    watched = watch_list.get(patient)

    # if list of watched videos less than 3, don't use in test (cold start concerns)
    if(len(watched) > 5):
        test_watched_list[patient] = watched
    

# Run from here

In [31]:
# pull two videos from each patient to try and guess with recommender
df = pd.DataFrame(index = test_watched_list.keys(), columns = ['watch_list', 'test_in', 'test_out', 'recommendation'])

for patient in test_watched_list.keys():
    
    patient_watched_list = test_watched_list.get(patient)
    df.at[patient, 'watch_list'] = patient_watched_list
    
    test_out = []
    test_in = []
    
    for i in range(0,2):
        rand = random.randint(0, len(patient_watched_list) - 1)
        test_out.append(patient_watched_list[rand])
        
    for el in patient_watched_list:
        if el not in test_out:
            test_in.append(el)
    
    df.at[patient, 'test_in'] = test_in
    df.at[patient, 'test_out'] = test_out
    

    
video_to_cat = pd.read_excel("video_id_to_cat.xlsx")
video_to_cat = video_to_cat.set_index('video_id')


video_to_category = {}
category_to_videos = {}
i = 0;

# build map from video to it's category
for video in video_space.index:
    
    video_to_category[video] = kmeanModel.labels_[i]
    i = i + 1

# build map from category to it's videos
for video in video_to_category.keys():

    cat = video_to_category.get(video)
    temp_list = []
    
    if cat in category_to_videos.keys():
        temp_list = category_to_videos.get(cat)
        temp_list.append(video)
        if video not in temp_list:
            category_to_videos[cat] = temp_list
    else:
        temp_list.append(video)
        category_to_videos[cat] = temp_list


In [32]:
df_ = df

# try to predcit for 100 patients
for i in range(0, 3000):
    
    # get random patient
    rand = random.randint(0, len(df.index))
    patient = df_.index[rand]
    
    #get their watched list for this test
    watched = df_.at[patient, 'test_in']
    #vars needed for bookkeeping
    rec_list = []
    rec_dict = {}
    
    # for each video in the patients watched list
    for video in watched:
        # get the primary category and predicted cluster
        primary = video_to_cat.at[video, 'category']
        cat_pred = kmeanModel.predict(video_space.loc[video].to_numpy().reshape(1,-1))

        
        for rec_video in category_to_videos.get(cat_pred[0]):
            # if video not already watched
            if rec_video not in df_.at[patient, 'test_in']:
                #if list isn't too big
                if len(rec_list) < 10:
                    #if video wasn't already added
                    if rec_video not in rec_list:
                        #rec_category = video_to_cat.at[rec_video, 'category']
                        #if rec_category is primary:
                        rec_list.append(rec_video)
            # endif
        # end for
    #end for

        
    df_.at[patient, 'recommendation'] = rec_list
    
#end for


df_ = df_.dropna()

good = 0
total = 0

for patient in df_.index:
    
    recs = df_.at[patient, 'recommendation']
    
    test_out = df_.at[patient, 'test_out']
    
    for video in test_out:
        total = total + 1
        if video in recs:
            good = good + 1
        #end if
    #end for
    
    
#end for

print("Good is " + str(good))
print("Total is "+ str(total))

Good is 304
Total is 4260


In [33]:
df_

Unnamed: 0,watch_list,test_in,test_out,recommendation
26221,"[654, 3021, 2599, 2981, 3000, 2990, 2602, 705,...","[654, 3021, 2599, 2981, 3000, 2990, 2602, 705,...","[2984, 2598]","[787, 651, 655, 778, 3544, 937, 3828, 3944, 34..."
26238,"[1179, 627, 735, 707, 3021, 653, 696, 3567, 30...","[1179, 627, 735, 707, 3021, 653, 696, 3567, 26...","[3539, 3022]","[708, 704, 760, 629, 733, 628, 3498, 3410, 365..."
26259,"[2597, 3023, 609, 686, 697, 2602, 648, 3022, 3...","[2597, 3023, 686, 697, 2602, 648, 3022, 3567, ...","[635, 609]","[3007, 678, 2979, 701, 709, 3354, 3359, 2990, ..."
26268,"[781, 643, 646, 626, 2587, 676]","[643, 646, 626, 676]","[781, 2587]","[620, 2587, 707, 2595, 3023, 3020, 2600, 2596,..."
26269,"[621, 2979, 3023, 3020, 3021, 2595, 2596, 3009...","[621, 2979, 3023, 3020, 3021, 2595, 2596, 3009...","[708, 3010]","[609, 634, 627, 633, 632, 730, 662, 729, 717, ..."
...,...,...,...,...
83887,"[626, 643, 612, 655, 676, 682, 3701, 3700]","[626, 643, 612, 655, 676, 3701, 3700]","[682, 682]","[645, 764, 768, 755, 767, 757, 3527, 4199, 336..."
83911,"[3700, 3701, 3702, 706, 4094, 3509, 638, 637, ...","[3701, 3702, 4094, 3509, 638, 637, 4093, 620, ...","[706, 3700]","[624, 771, 615, 2983, 710, 646, 3000, 3008, 29..."
84126,"[3701, 3700, 742, 3702, 3443, 3444]","[3701, 742, 3702, 3444]","[3700, 3443]","[624, 771, 615, 2983, 710, 646, 3000, 3008, 29..."
84135,"[644, 4092, 626, 4094, 3702, 4093, 3701, 3700]","[644, 4092, 4094, 3702, 3701, 3700]","[4093, 626]","[620, 2587, 707, 2595, 3023, 3020, 2600, 2596,..."


In [13]:
#distortions = []
#K = range(10,20)
#for k in K:
   # kmeanModel = KMeans(n_clusters=k)
   # kmeanModel.fit(video_space)
  #  distortions.append(kmeanModel.inertia_)
    #len(kmeansModel.labels_)


In [14]:
#plt.figure(figsize = (16,8))
#plt.plot(K, distortions, 'bx-')
#plt.xlabel('k')
#plt.ylabel('distortion')
#plt.title('Elbow Method Showing Optimal k')
#plt.show()