<a href="https://colab.research.google.com/github/igoekce/netflixrecommender/blob/master/Master_Netflix_recommendation.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

#Data Preprocessing

## Import

In [None]:
# import librairies
import networkx as nx
import pandas as pd
import numpy as np
import math as math
import time 

filepath = 'https://raw.githubusercontent.com/igoekce/netflixrecommender/master/netflix_titles.csv'

df = pd.read_csv(filepath)

##Variables

In [None]:
selected_movie = 'The Ridiculous 6'  #Input movie title
num_rec = 10 #Input max number of recommendations
# possible cols w/o title: 'type','director', 'cast', 'country', 'date_added', 'release_year', 'rating', 'duration', 'listed_in', 'description'
features = ['rating', 'cast', 'release_year', 'country', 'listed_in'] 
k=0.1

## Separation

In [None]:
df = df.dropna()

In [None]:
df = df.drop_duplicates()

In [None]:
df['title'].sample

<bound method NDFrame.sample of 0       Norm of the North: King Sized Adventure
4                                  #realityhigh
6                                      Automata
7            Fabrizio Copano: Solo pienso en mi
9                                   Good People
                         ...                   
6142              The Great British Baking Show
6158    Miraculous: Tales of Ladybug & Cat Noir
6167                               Sacred Games
6182                           Men on a Mission
6213                           Leyla and Mecnun
Name: title, Length: 3774, dtype: object>

In [None]:
df = df.sample(int(k*df.shape[0]), random_state=42)

In [None]:
def splitDataFrameList(df,target_column,separator):
    ''' df = dataframe to split,
    target_column = the column containing the values to split
    separator = the symbol used to perform the split
    returns: a dataframe with each entry for the target column separated, with each element moved into a new row. 
    The values in the other columns are duplicated across the newly divided rows.
    '''
    def splitListToRows(row,row_accumulator,target_column,separator):
        split_row = row[target_column].split(separator)
        for s in split_row:
            new_row = row.to_dict()
            new_row[target_column] = s
            row_accumulator.append(new_row)
    new_rows = []
    df.apply(splitListToRows,axis=1,args = (new_rows,target_column,separator))
    new_df = pd.DataFrame(new_rows)
    return new_df

In [None]:
df = splitDataFrameList(df,'cast',',')
df = splitDataFrameList(df,'country',',')
df = splitDataFrameList(df,'listed_in',',')

##Function generating Edge List

In [None]:
def eg_generator(features):
  df1 = pd.DataFrame(columns = ['title', 'value']) 
  for feature in features:
    print(feature)
    df_temp = df[['title', feature]].copy()
    df_temp.columns = ['title', 'value']
    df1 = pd.concat([df1, df_temp], axis=0)
  return df1

In [None]:
print('possible cols w/o title:', df.columns)

possible cols w/o title: Index(['show_id', 'type', 'title', 'director', 'cast', 'country', 'date_added',
       'release_year', 'rating', 'duration', 'listed_in', 'description'],
      dtype='object')


In [None]:
df_model = eg_generator(features)

rating
cast
release_year
country
listed_in


# Modelling

##Generating G

In [None]:
# Graphen erstellen
import networkx as nx
G = nx.from_pandas_edgelist(df_model, source='title', target='value')

## Calculation of Adar and Jaccard

In [None]:
# Ermittlung Jaccard Coeffizient
jacc_gen = nx.jaccard_coefficient(G) #Generator für Jaccard Koeffizient

In [None]:
# Ermittlung Adamic Adar Coeffizient
adam_ad = nx.adamic_adar_index(G)  #Generator für Adamic_Adar_Index

In [None]:
jacc_dict = {}
for u,v,p in jacc_gen:
  jacc_dict[(u, v)] = p

In [None]:
adam_ad_dict = {}
for u,v,p in adam_ad:
  adam_ad_dict[(u, v)] = p

In [None]:
results_jacc = {}
for key in jacc_dict:
  if selected_movie in key:
    results_jacc[key] = jacc_dict[key]

In [None]:
results_adam = {}
for key in adam_ad_dict:
  if selected_movie in key:
    results_adam[key] = adam_ad_dict[key]

##Preparing and sorting final recommendation

In [None]:
import operator
results_jacc_sorted = sorted(results_jacc.items(), key=operator.itemgetter(1), reverse=True)
results_adam_sorted = sorted(results_adam.items(), key=operator.itemgetter(1), reverse=True)

In [None]:
import numpy as np
erg_jacc = results_jacc_sorted[:num_rec]
erg_jacc_only = [tup[1] for tup in erg_jacc]
erg_jacc_recom = [tup[0] for tup in erg_jacc ]
erg_jacc_rec =[tup[0] if tup[0]!=selected_movie else tup[1] for tup in erg_jacc_recom ]
erg_jacc_array = np.column_stack((erg_jacc_rec, erg_jacc_only))
set_jacc = set(erg_jacc_rec)
erg_jacc_top = [tup[1] for tup in erg_jacc]

In [None]:
erg_adam = results_adam_sorted[:num_rec]
erg_adam_only = [tup[1] for tup in erg_adam]
erg_adam_recom = [tup[0] for tup in erg_adam ]
erg_adam_rec =[tup[0] if tup[0]!=selected_movie else tup[1] for tup in erg_adam_recom ]
erg_adam_array = np.column_stack((erg_adam_rec, erg_adam_only))
set_adam = set(erg_adam_rec)

In [None]:
erg_jacc_array

array([['Rembat', '0.12121212121212122'],
       ['The Other One: The Long Strange Trip of Bob Weir',
        '0.10344827586206896'],
       ['The Amazing Praybeyt Benjamin', '0.09090909090909091'],
       ['National Treasure', '0.08571428571428572'],
       ['Death Race: Beyond Anarchy', '0.08333333333333333'],
       ['Stuart Little 2', '0.08333333333333333'],
       ['Austin Powers in Goldmember', '0.08108108108108109'],
       ['Money Talks', '0.08108108108108109'],
       ['Expo', '0.08108108108108109'],
       ['Judwaa 2', '0.08108108108108109']], dtype='<U48')

In [None]:
intersec = set_adam.intersection(set_jacc)
len(intersec)

5

In [None]:
intersec

{'Death Race: Beyond Anarchy',
 'National Treasure',
 'Rembat',
 'Stuart Little 2',
 'The Amazing Praybeyt Benjamin'}

In [None]:
#Proportion of common top recommendations (jacc and adam) #with respect to chosen number of recommendations (num_rec)
prop_com_rec = len(intersec)/(num_rec)*100
print(f'Proportion of common top recommendations {prop_com_rec} %')

Proportion of common top recommendations 50.0 %


# User Recommendation

In [None]:
from tabulate import tabulate

In [None]:
def returnrecommendations(title, movielist, verbose=False):
  print('For the movie:' + '\n')
  print('*'*40)
  print(title)
  print('*'*40 + '\n')
  print('We can recommend the following movies:' + '\n')

  table = []

  if verbose == False:
    for movie, value in movielist:
      value = round(float(value), 2) * 100
      table.append([movie, value])

    return print(tabulate(table, headers=["Movie Name", ' Score'], tablefmt='rst'))
  
  if verbose == True:
    for movie, value in movielist:
      value = round(float(value), 2) * 100
      # check if description is available
      try:
        description = df.loc[df['title'] == movie, 'description'].values[0]
      except:
        description = 'No description available'      
      table.append([movie, value, description])

    return print(tabulate(table, headers=["Movie Name", ' Score', 'Description'], tablefmt='rst'))

In [None]:
returnrecommendations(selected_movie, erg_jacc_array, verbose=True)

For the movie:

****************************************
The Ridiculous 6
****************************************

We can recommend the following movies:

Movie Name                                           Score  Description
Rembat                                                  12  Two huge soccer fans must go on a rambunctious road trip to fix a big match between Malaysia and Thailand, or they'll draw the ire of a loan shark.
The Other One: The Long Strange Trip of Bob Weir        10  This chronicle of Bob Weir highlights his brotherhood with Jerry Garcia and his success as a member of one of the world's most influential bands.
The Amazing Praybeyt Benjamin                            9  Now a celebrity, Benjie is tasked with taking care of a General's young son, who has information about a looming terrorist threat.
National Treasure                                        9  Modern treasure hunters search for a chest of riches rumored to have been stashed away by George Washington

In [None]:
returnrecommendations(selected_movie, erg_adam_array, verbose=True)

For the movie:

****************************************
The Ridiculous 6
****************************************

We can recommend the following movies:

Movie Name                        Score  Description
The Rugrats Movie                   193  When his baby brother Dil is born, Tommy Pickles and his pals decide that he's too much responsibility and try to return him to the hospital.
Stuart Little 2                     193  Zany misadventures are in store as lovable city mouse Stuart and his human brother, George, raise the roof in this sequel to the 1999 blockbuster.
National Treasure                   188  Modern treasure hunters search for a chest of riches rumored to have been stashed away by George Washington, Thomas Jefferson and Benjamin Franklin.
The Spiderwick Chronicles           164  When city-raised twins move to a tumbledown country mansion, it's quite an adjustment – especially after they uncover a magical fantasy world.
Death Race: Beyond Anarchy          135  An ex