In [27]:
import pandas as pd
import numpy as np
import re
from fuzzywuzzy import fuzz

# Render every column when a DataFrame is displayed (wide frames are not truncated).
pd.options.display.max_columns = None

# Project root. File locations are built by plain string concatenation
# (path + "Data/..."), so the trailing slash is required.
# NOTE(review): machine-specific absolute path; consider making it configurable.
path = "C:/Users/Admin/Documents/ironhack/streaming_service_recommender/"

In [28]:
# CSV export endpoint of the Google Sheet that holds the survey responses.
url = "https://docs.google.com/spreadsheets/d/1VsL_x7WAcEm-pLNmCqNwrDlFjTR_PUDpiadyUYJM-AQ/export?format=csv&gid=1360685521"

survey = pd.read_csv(url)

# NOTE: unpickling can execute arbitrary code; only load pickles produced by this project.
all_shows = pd.read_pickle(path + "Data/all_shows_genres.pkl")

# Every column other than "show" and "genres" is treated as a genre column.
# (Previously read from `all_shows_test`, which is not defined anywhere in this
# notebook and therefore failed on a fresh kernel.)
genre_columns = [col for col in all_shows.columns if col not in ["show", "genres"]]

In [55]:
def get_recommender_df(survey_df, df_all_shows, genre_columns, row, n_choices=30):
    """Build a one-row genre-ratio profile for a single survey respondent.

    The respondent's answers are reduced to bare show titles, matched
    case-insensitively against the show catalogue, and the genre columns of
    the matched shows are summed and divided by ``n_choices``.

    Parameters
    ----------
    survey_df : pandas.DataFrame
        Survey responses. A "Timestamp" column, if present, is ignored.
    df_all_shows : pandas.DataFrame
        Catalogue with a "show" column (title) and the columns listed in
        ``genre_columns``.
    genre_columns : list of str
        Names of the numeric genre columns in ``df_all_shows`` to aggregate.
    row : label
        Index label of the respondent in ``survey_df`` (looked up with ``.loc``).
    n_choices : int, default 30
        Number of leading answer columns to read; also the denominator of the
        ratio. Answers beyond the available columns are simply absent.

    Returns
    -------
    pandas.DataFrame
        One row (index "User", index name "user") with one column per genre
        (columns name "genre") holding the ratio rounded to 4 decimals.

    Raises
    ------
    ValueError
        If ``n_choices`` is not positive.
    KeyError
        If ``row`` is not in the survey index or a required column is missing.
    """
    if n_choices <= 0:
        raise ValueError("n_choices must be a positive integer")

    # Positional slice of the respondent's answers; .iloc avoids the deprecated
    # integer-key fallback of Series.__getitem__ on a label index.
    answers = (
        survey_df.drop(columns="Timestamp", errors="ignore")
        .loc[row]
        .iloc[:n_choices]
    )

    # Keep only the text before " (YYYY)": drops the year and any description.
    title_pattern = re.compile(r"(.*)\s\(\d{4}\).*$")

    wanted_titles = set()
    for answer in answers:
        if pd.isna(answer):
            # Unanswered question: nothing to match.
            continue
        text = str(answer)
        found = title_pattern.search(text)
        # Answers without a "(YYYY)" part are used as-is instead of crashing.
        title = found.group(1) if found else text.strip()
        if title:
            wanted_titles.add(title.lower())

    # Case-insensitive exact match, done as a single vectorised set lookup
    # rather than comparing every title against every show.
    show_keys = df_all_shows["show"].str.lower()
    user_df = df_all_shows[show_keys.isin(wanted_titles)].reset_index(drop=True)

    # rename_axis/reset_index(name=...) name the columns explicitly, so the
    # result does not depend on whether the column index carries a name.
    user_genre_ratio = (
        user_df[genre_columns]
        .sum()
        .rename_axis("genre")
        .reset_index(name="frequency")
    )

    user_genre_ratio["ratio"] = (user_genre_ratio["frequency"] / n_choices).round(4)

    user_genre_ratio["user"] = "User"

    user_recommender = user_genre_ratio.pivot_table(index="user",
                                                    columns="genre",
                                                    values="ratio")

    return user_recommender

In [56]:
# Build the genre profile for the respondent at survey index label 4.
user_test = get_recommender_df(
    survey_df=survey,
    df_all_shows=all_shows,
    genre_columns=genre_columns,
    row=4,
)

In [57]:
# Persist the respondent's profile next to the other project data files.
pd.to_pickle(user_test, path + "Data/user_test.pkl")

-----

# Test: build the per-service genre ratio table

In [33]:
# Load the pre-computed genre-ratio frame of each streaming service,
# in the order Netflix, Amazon, HBO.
netflix, amazon, hbo = (
    pd.read_pickle(path + ratio_file)
    for ratio_file in (
        "Data/netflix_genres_ratio2.pkl",
        "Data/amazon_genres_ratio2.pkl",
        "Data/hbo_genres_ratio2.pkl",
    )
)

In [48]:
# Label each frame with the service it describes (adds a column in place).
netflix["streaming_service"] = "Netflix"
amazon["streaming_service"] = "Amazon"
hbo["streaming_service"] = "HBO"

# Keep only the columns needed for the pivot. The ratio column is selected as
# "netflix_genres" from all three frames; the pivoted output further down shows
# populated Amazon and HBO rows, so that column exists in each of them.
netflix_genres = netflix.loc[:, ["genre", "netflix_genres", "streaming_service"]]
amazon_genres = amazon.loc[:, ["genre", "netflix_genres", "streaming_service"]]
hbo_genres = hbo.loc[:, ["genre", "netflix_genres", "streaming_service"]]

In [49]:
# Stack the three per-service frames. DataFrame.append was removed in
# pandas 2.0; pd.concat with default arguments yields the same result.
genres_features = pd.concat([netflix_genres, amazon_genres, hbo_genres])

In [50]:
# Stack the three per-service frames and renumber the rows. DataFrame.append was
# removed in pandas 2.0; pd.concat with default arguments yields the same result.
# reset_index() keeps the old row labels as an "index" column, as before.
genres_features = pd.concat([netflix_genres, amazon_genres, hbo_genres]).reset_index()

In [52]:
# One row per streaming service, one column per genre; genres a service does
# not have are filled with 0.
genres_recommender = pd.pivot_table(
    genres_features,
    values="netflix_genres",
    index="streaming_service",
    columns="genre",
).fillna(0)

In [53]:
# Display the service-by-genre ratio table.
genres_recommender

genre,Action,Adventure,Animation,Biography,Comedy,Crime,Documentary,Drama,Family,Fantasy,Game-Show,History,Horror,Music,Musical,Mystery,News,Reality-TV,Romance,Sci-Fi,Short,Sport,Talk-Show,Thriller,War,Western
streaming_service,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1,Unnamed: 23_level_1,Unnamed: 24_level_1,Unnamed: 25_level_1,Unnamed: 26_level_1
Amazon,0.061954,0.062772,0.082758,0.009763,0.124316,0.063998,0.103154,0.171191,0.053417,0.029341,0.00777,0.030977,0.013444,0.004907,0.001636,0.02975,0.002454,0.05219,0.028114,0.016715,0.007361,0.00777,0.00368,0.014671,0.005725,0.010172
HBO,0.039385,0.027298,0.024225,0.018182,0.2,0.078771,0.051524,0.26064,0.021204,0.024225,0.003022,0.039385,0.006044,0.012138,0.003022,0.042407,0.01516,0.012138,0.039385,0.012138,0.0,0.027298,0.021204,0.018182,0.003022,0.0
Netflix,0.07009,0.059619,0.083918,0.011072,0.137074,0.073447,0.086072,0.188727,0.031062,0.028607,0.006162,0.019389,0.017234,0.008918,0.003056,0.028607,0.000902,0.044238,0.039329,0.017234,0.001253,0.008016,0.005511,0.023998,0.00491,0.001553


In [54]:
# Sanity check: total of the genre ratios for each streaming service.
genres_recommender.sum(axis="columns")

streaming_service
Amazon     1.0
HBO        1.0
Netflix    1.0
dtype: float64