In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.neighbors import NearestNeighbors

%matplotlib inline

pd.set_option("display.max_columns", None)

path = "C:/Users/Admin/Documents/ironhack/streaming_service_recommender/"

## Streaming Service Recommender

#### Goals

- Create a streaming service recommender model based on similarity

### 1. Import data

We will start by joining the Netflix data and then we will use Amazon and HBO for more testing.

In [2]:
genres_recommender = pd.read_pickle(path + "Data/genres_recommender.pkl")

In [3]:
genres_recommender

genre,Action,Adventure,Animation,Biography,Comedy,Crime,Documentary,Drama,Family,Fantasy,Game-Show,History,Horror,Music,Musical,Mystery,News,Reality-TV,Romance,Sci-Fi,Short,Sport,Talk-Show,Thriller,War,Western
streaming_service,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1,Unnamed: 23_level_1,Unnamed: 24_level_1,Unnamed: 25_level_1,Unnamed: 26_level_1
Amazon,0.1212,0.1228,0.1619,0.0191,0.2432,0.1252,0.2018,0.3349,0.1045,0.0574,0.0152,0.0606,0.0263,0.0096,0.0032,0.0582,0.0048,0.1021,0.055,0.0327,0.0144,0.0152,0.0072,0.0287,0.0112,0.0199
HBO,0.0769,0.0533,0.0473,0.0355,0.3905,0.1538,0.1006,0.5089,0.0414,0.0473,0.0059,0.0769,0.0118,0.0237,0.0059,0.0828,0.0296,0.0237,0.0769,0.0237,0.0,0.0533,0.0414,0.0355,0.0059,0.0
Netflix,0.1399,0.119,0.1675,0.0221,0.2736,0.1466,0.1718,0.3767,0.062,0.0571,0.0123,0.0387,0.0344,0.0178,0.0061,0.0571,0.0018,0.0883,0.0785,0.0344,0.0025,0.016,0.011,0.0479,0.0098,0.0031


### 2. Modelling

In [25]:
# we will train NearestNeighbors using cosine metric

model_knn = NearestNeighbors(metric="cosine",
                             algorithm="brute",
                             n_jobs=-1)
model_knn.fit(genres_recommender)

NearestNeighbors(algorithm='brute', leaf_size=30, metric='cosine',
                 metric_params=None, n_jobs=-1, n_neighbors=5, p=2, radius=1.0)

Now that we have our model we will test it with the user survey tests created on 06.1_user_df

### 3. Test

In [5]:
user_netflix = pd.read_pickle(path + ("Data/netflix_test.pkl"))

In [6]:
user_recommender_netflix = genres_recommender.append(user_netflix)

We will first append our user row to genres_recommender data frame.

In [7]:
user_recommender_netflix

genre,Action,Adventure,Animation,Biography,Comedy,Crime,Documentary,Drama,Family,Fantasy,Game-Show,History,Horror,Music,Musical,Mystery,News,Reality-TV,Romance,Sci-Fi,Short,Sport,Talk-Show,Thriller,War,Western
Amazon,0.1212,0.1228,0.1619,0.0191,0.2432,0.1252,0.2018,0.3349,0.1045,0.0574,0.0152,0.0606,0.0263,0.0096,0.0032,0.0582,0.0048,0.1021,0.055,0.0327,0.0144,0.0152,0.0072,0.0287,0.0112,0.0199
HBO,0.0769,0.0533,0.0473,0.0355,0.3905,0.1538,0.1006,0.5089,0.0414,0.0473,0.0059,0.0769,0.0118,0.0237,0.0059,0.0828,0.0296,0.0237,0.0769,0.0237,0.0,0.0533,0.0414,0.0355,0.0059,0.0
Netflix,0.1399,0.119,0.1675,0.0221,0.2736,0.1466,0.1718,0.3767,0.062,0.0571,0.0123,0.0387,0.0344,0.0178,0.0061,0.0571,0.0018,0.0883,0.0785,0.0344,0.0025,0.016,0.011,0.0479,0.0098,0.0031
User_Netflix,0.2,0.0,0.2,0.0,0.3,0.0,0.0,0.8,0.0,0.3,0.0,0.0,0.5,0.1,0.1,0.0,0.0,0.0,0.2,0.2,0.0,0.0,0.0,0.4,0.0,0.0


Now we will test the model, asking for two closest neighbors, since one of them will be the user himself, this will return two arrays, one for distances and one for the index of the nearest neighboor.

In [8]:
distances, index = model_knn.kneighbors(user_recommender_netflix.loc["User_Netflix"].values.reshape(1,-1), n_neighbors=2)

In [9]:
distances

array([[0.26694147, 0.26942758]])

In [10]:
index

array([[2, 1]], dtype=int64)

In [11]:
user_recommender_netflix.iloc[2]

genre
Action         0.1399
Adventure      0.1190
Animation      0.1675
Biography      0.0221
Comedy         0.2736
Crime          0.1466
Documentary    0.1718
Drama          0.3767
Family         0.0620
Fantasy        0.0571
Game-Show      0.0123
History        0.0387
Horror         0.0344
Music          0.0178
Musical        0.0061
Mystery        0.0571
News           0.0018
Reality-TV     0.0883
Romance        0.0785
Sci-Fi         0.0344
Short          0.0025
Sport          0.0160
Talk-Show      0.0110
Thriller       0.0479
War            0.0098
Western        0.0031
Name: Netflix, dtype: float64

In [12]:
user_recommender_netflix.index[index.flatten()[0]]

'Netflix'

It works, we will now try it for Amazon and HBO

In [13]:
user_amazon = pd.read_pickle(path + "Data/amazon_test.pkl")

user_hbo = pd.read_pickle(path + "Data/hbo_test.pkl")

In [14]:
user_recommender_amazon = genres_recommender.append(user_amazon)

In [15]:
user_recommender_amazon

genre,Action,Adventure,Animation,Biography,Comedy,Crime,Documentary,Drama,Family,Fantasy,Game-Show,History,Horror,Music,Musical,Mystery,News,Reality-TV,Romance,Sci-Fi,Short,Sport,Talk-Show,Thriller,War,Western
Amazon,0.1212,0.1228,0.1619,0.0191,0.2432,0.1252,0.2018,0.3349,0.1045,0.0574,0.0152,0.0606,0.0263,0.0096,0.0032,0.0582,0.0048,0.1021,0.055,0.0327,0.0144,0.0152,0.0072,0.0287,0.0112,0.0199
HBO,0.0769,0.0533,0.0473,0.0355,0.3905,0.1538,0.1006,0.5089,0.0414,0.0473,0.0059,0.0769,0.0118,0.0237,0.0059,0.0828,0.0296,0.0237,0.0769,0.0237,0.0,0.0533,0.0414,0.0355,0.0059,0.0
Netflix,0.1399,0.119,0.1675,0.0221,0.2736,0.1466,0.1718,0.3767,0.062,0.0571,0.0123,0.0387,0.0344,0.0178,0.0061,0.0571,0.0018,0.0883,0.0785,0.0344,0.0025,0.016,0.011,0.0479,0.0098,0.0031
User,0.0,0.3,0.0,0.0,0.0,0.0,0.4,0.3,0.2,0.0,0.2,0.0,0.0,0.0,0.0,0.0,0.0,0.2,0.0,0.0,0.1,0.0,0.0,0.0,0.2,0.2


In [16]:
distances, index = model_knn.kneighbors(user_recommender_amazon.loc["User"].values.reshape(1,-1), n_neighbors=2)

In [17]:
user_recommender_amazon.index[index.flatten()[0]]

'Amazon'

In [18]:
user_recommender_hbo = genres_recommender.append(user_hbo)

In [19]:
user_recommender_hbo

genre,Action,Adventure,Animation,Biography,Comedy,Crime,Documentary,Drama,Family,Fantasy,Game-Show,History,Horror,Music,Musical,Mystery,News,Reality-TV,Romance,Sci-Fi,Short,Sport,Talk-Show,Thriller,War,Western
Amazon,0.1212,0.1228,0.1619,0.0191,0.2432,0.1252,0.2018,0.3349,0.1045,0.0574,0.0152,0.0606,0.0263,0.0096,0.0032,0.0582,0.0048,0.1021,0.055,0.0327,0.0144,0.0152,0.0072,0.0287,0.0112,0.0199
HBO,0.0769,0.0533,0.0473,0.0355,0.3905,0.1538,0.1006,0.5089,0.0414,0.0473,0.0059,0.0769,0.0118,0.0237,0.0059,0.0828,0.0296,0.0237,0.0769,0.0237,0.0,0.0533,0.0414,0.0355,0.0059,0.0
Netflix,0.1399,0.119,0.1675,0.0221,0.2736,0.1466,0.1718,0.3767,0.062,0.0571,0.0123,0.0387,0.0344,0.0178,0.0061,0.0571,0.0018,0.0883,0.0785,0.0344,0.0025,0.016,0.011,0.0479,0.0098,0.0031
User,0.0,0.0,0.0,0.2,0.3,0.4,0.0,0.8,0.0,0.0,0.0,0.2,0.0,0.1,0.0,0.2,0.2,0.0,0.0,0.0,0.0,0.2,0.2,0.0,0.0,0.0


In [20]:
distances, index = model_knn.kneighbors(user_recommender_hbo.loc["User"].values.reshape(1,-1), n_neighbors=2)

In [21]:
user_recommender_hbo.index[index.flatten()[0]]

'HBO'

It works for he three streaming services. Now we will create a function with all of these steps.

### 4. Create Function

In [29]:
def get_streaming_service(genres_recommender, user):
    
    user_recommender = genres_recommender.append(user)
    
    model_knn = NearestNeighbors(metric="cosine",
                             algorithm="brute",
                             n_jobs=-1)
    model_knn.fit(user_recommender)
    
    distances, streaming = model_knn.kneighbors(user_recommender.loc["User"].values.reshape(1,-1), n_neighbors=2)

    return (user_recommender.index[streaming.flatten()[1]])


In [31]:
get_streaming_service(genres_recommender, user_hbo)

'HBO'

In [32]:
user_recommender_netflix

genre,Action,Adventure,Animation,Biography,Comedy,Crime,Documentary,Drama,Family,Fantasy,Game-Show,History,Horror,Music,Musical,Mystery,News,Reality-TV,Romance,Sci-Fi,Short,Sport,Talk-Show,Thriller,War,Western
Amazon,0.1212,0.1228,0.1619,0.0191,0.2432,0.1252,0.2018,0.3349,0.1045,0.0574,0.0152,0.0606,0.0263,0.0096,0.0032,0.0582,0.0048,0.1021,0.055,0.0327,0.0144,0.0152,0.0072,0.0287,0.0112,0.0199
HBO,0.0769,0.0533,0.0473,0.0355,0.3905,0.1538,0.1006,0.5089,0.0414,0.0473,0.0059,0.0769,0.0118,0.0237,0.0059,0.0828,0.0296,0.0237,0.0769,0.0237,0.0,0.0533,0.0414,0.0355,0.0059,0.0
Netflix,0.1399,0.119,0.1675,0.0221,0.2736,0.1466,0.1718,0.3767,0.062,0.0571,0.0123,0.0387,0.0344,0.0178,0.0061,0.0571,0.0018,0.0883,0.0785,0.0344,0.0025,0.016,0.011,0.0479,0.0098,0.0031
User_Netflix,0.2,0.0,0.2,0.0,0.3,0.0,0.0,0.8,0.0,0.3,0.0,0.0,0.5,0.1,0.1,0.0,0.0,0.0,0.2,0.2,0.0,0.0,0.0,0.4,0.0,0.0


In [36]:
netflix_values = user_recommender_netflix.loc["Netflix"].values

In [37]:
user_values = user_recommender_netflix.loc["User_Netflix"].values

In [43]:
for i in range(26):
    distance += (netflix_values[i] - user_values[i])**2

distance = distance**(1/2)

In [44]:
distance

0.8531497992732577

In [40]:
model_knn = NearestNeighbors(metric="euclidean",
                             algorithm="brute",
                             n_jobs=-1)
model_knn.fit(user_recommender_netflix)

model_knn.kneighbors(user_recommender_netflix.loc["User_Netflix"].values.reshape(1,-1), n_neighbors=4)

(array([[0.        , 0.82812476, 0.85133106, 0.90354031]]),
 array([[3, 1, 2, 0]], dtype=int64))

In [50]:
product = 0
netflix = 0
user = 0

for i in range(26):
    product += netflix_values[i] * user_values[i]
    netflix += netflix_values[i]**2
    user += user_values[i]**2

similarity = product / ((netflix**(1/2)) * (user**(1/2)))

similarity

0.7330585296050363

In [62]:
from math import acos
from math import pi

In [63]:
(acos(similarity) / pi)

0.2380921223472376

In [51]:
model_knn = NearestNeighbors(metric="cosine",
                             algorithm="brute",
                             n_jobs=-1)
model_knn.fit(user_recommender_netflix)

model_knn.kneighbors(user_recommender_netflix.loc["User_Netflix"].values.reshape(1,-1), n_neighbors=4)

(array([[0.        , 0.26694147, 0.26942758, 0.32079523]]),
 array([[3, 2, 1, 0]], dtype=int64))

In [65]:
netflix = pd.read_pickle(path + "Data/netflix_genres.pkl")

In [125]:
netflix[netflix["show"] == "Blue Mountain State"].head(20)

Unnamed: 0,show,genres,Crime,Drama,Thriller,Fantasy,Horror,Mystery,Comedy,Sci-Fi,Biography,Action,Adventure,Romance,History,Documentary,Animation,War,Sport,Family,Western,Short,Reality-TV,Musical,Music,Game-Show,Talk-Show,News
197,Blue Mountain State,"Comedy,Sport",0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0


In [74]:
user_recommender_netflix.sum(axis=1)

Amazon          1.9563
HBO             1.9525
Netflix         1.9960
User_Netflix    3.3000
dtype: float64

In [115]:
test = pd.read_pickle(path + "Data/user_test.pkl")

In [116]:
user_recommender2 = genres_recommender.append(test)

In [117]:
user_recommender2

genre,Action,Adventure,Animation,Biography,Comedy,Crime,Documentary,Drama,Family,Fantasy,Game-Show,History,Horror,Music,Musical,Mystery,News,Reality-TV,Romance,Sci-Fi,Short,Sport,Talk-Show,Thriller,War,Western
Amazon,0.1212,0.1228,0.1619,0.0191,0.2432,0.1252,0.2018,0.3349,0.1045,0.0574,0.0152,0.0606,0.0263,0.0096,0.0032,0.0582,0.0048,0.1021,0.055,0.0327,0.0144,0.0152,0.0072,0.0287,0.0112,0.0199
HBO,0.0769,0.0533,0.0473,0.0355,0.3905,0.1538,0.1006,0.5089,0.0414,0.0473,0.0059,0.0769,0.0118,0.0237,0.0059,0.0828,0.0296,0.0237,0.0769,0.0237,0.0,0.0533,0.0414,0.0355,0.0059,0.0
Netflix,0.1399,0.119,0.1675,0.0221,0.2736,0.1466,0.1718,0.3767,0.062,0.0571,0.0123,0.0387,0.0344,0.0178,0.0061,0.0571,0.0018,0.0883,0.0785,0.0344,0.0025,0.016,0.011,0.0479,0.0098,0.0031
User,0.1333,0.1333,0.1333,0.0333,0.3333,0.1,0.1333,0.5,0.0333,0.1667,0.0333,0.0667,0.2,0.0333,0.0333,0.0333,0.0333,0.0667,0.1,0.1,0.0333,0.0333,0.0333,0.2,0.0333,0.0333


In [121]:
model_knn = NearestNeighbors(metric="euclidean",
                             algorithm="brute",
                             n_jobs=-1)
model_knn.fit(user_recommender2)

model_knn.kneighbors(user_recommender2.loc["User"].values.reshape(1,-1), n_neighbors=4)

(array([[2.98023224e-08, 3.15536052e-01, 3.41903715e-01, 3.62790725e-01]]),
 array([[3, 2, 1, 0]], dtype=int64))

In [119]:
netflix_values = user_recommender2.loc["Netflix"].values

user_values = user_recommender2.loc["User"].values

for i in range(26):
    distance += (netflix_values[i] - user_values[i])**2

distance = distance**(0.5)

distance


1.569438269602229

In [111]:
hbo_values = user_recommender2.loc["HBO"].values

user_values = user_recommender2.loc["User"].values

for i in range(26):
    distance += (hbo_values[i] - user_values[i])**2

distance = distance**(0.5)

distance

2.2888927261381644

In [113]:
amazon_values = user_recommender2.loc["Amazon"].values

user_values = user_recommender2.loc["User"].values

for i in range(26):
    distance += (amazon_values[i] - user_values[i])**2

distance = distance**(0.5)

distance

2.3635734820920398

In [122]:
user_recommender2.sum(axis=1)

Amazon     1.9563
HBO        1.9525
Netflix    1.9960
User       2.7662
dtype: float64