**`Recommendation model`**

In [4]:
import etl_mainfile as etl
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
import pickle

In [5]:
#import datasets
df_platform= etl.df_platform
df_platform= df_platform.reset_index()
df_platform.shape

(22998, 15)

In [6]:
df_platform.head()

Unnamed: 0,index,show_id,type,title,director,cast,country,date_added,release_year,rating,duration_int,duration_type,listed_in,description,platform
0,0,as1,movie,the grand seduction,don mckellar,"brendan gleeson, taylor kitsch, gordon pinsent",canada,2021-03-30,2014,g,113,min,"comedy, drama",a small fishing village must procure a local d...,amazon
1,1,as2,movie,take care good night,girish joshi,"mahesh manjrekar, abhay mahajan, sachin khedekar",india,2021-03-30,2018,13+,110,min,"drama, international",a metro family decides to fight a cyber crimin...,amazon
2,2,as3,movie,secrets of deception,josh webber,"tom sizemore, lorenzo lamas, robert lasardo, r...",united states,2021-03-30,2017,g,74,min,"action, drama, suspense",after a man discovers his wife is cheating on ...,amazon
3,3,as4,movie,pink: staying true,sonia anderson,"interviews with: pink, adele, beyoncé, britney...",united states,2021-03-30,2014,g,69,min,documentary,"pink breaks the mold once again, bringing her ...",amazon
4,4,as5,movie,monster maker,giles foster,"harry dean stanton, kieran o'brien, george cos...",united kingdom,2021-03-30,1989,g,45,min,"drama, fantasy",teenage matt banting wants to work with a famo...,amazon


In [7]:
df_platform= df_platform.sample(n=2000, random_state=42)
df_platform.shape

(2000, 15)

In [8]:
df_platform= df_platform.fillna(0)
df_platform= df_platform.astype(str)
df_platform['title_description']= df_platform['title']+df_platform['description']
df_platform= df_platform.reset_index()

In [9]:
df_platform.head()

Unnamed: 0,level_0,index,show_id,type,title,director,cast,country,date_added,release_year,rating,duration_int,duration_type,listed_in,description,platform,title_description
0,12626,1508,hs1509,movie,the assistant,0,0,united states,2020-07-20 00:00:00,2019,r,87,min,drama,"jane, a recent college graduate and aspiring f...",hulu,"the assistantjane, a recent college graduate a..."
1,2004,2004,as2005,movie,joni,james f. collier,"joni eareckson tada, bert remsen, katherine de...",united states,0,1979,pg,112,min,drama,"she was young...vital...just 17, when a diving...",amazon,"jonishe was young...vital...just 17, when a di..."
2,15062,871,ns872,movie,sardar ka grandson,kaashvie nair,"arjun kapoor, neena gupta, rakul preet singh, ...",india,2021-05-18 00:00:00,2021,tv-14,140,min,"comedies, dramas, international movies",a devoted grandson’s mission to reunite his ai...,netflix,sardar ka grandsona devoted grandson’s mission...
3,259,259,as260,movie,truck tunes 2,jim gardner,0,0,0,2014,all,27,min,"kids, special interest",from the creators of the original truck tunes ...,amazon,truck tunes 2from the creators of the original...
4,2195,2195,as2196,tv show,homecoming,0,"janelle monáe, hong chau, chris cooper, stepha...",united states,0,2020,16+,2,season,"drama, suspense",start with season two and experience a new mys...,amazon,homecomingstart with season two and experience...


In [10]:
#Data processing
vectorized= TfidfVectorizer(analyzer='word', ngram_range=(1, 2), stop_words=['english','0'])
X= vectorized.fit_transform(df_platform['title_description'])
#Create similarity matrix
similarity_matrix= cosine_similarity(X)

In [11]:
#Export to pickle file
with open('similarity_matrix.pickle', 'wb') as f:
    pickle.dump(similarity_matrix, f)

In [12]:
def get_similar_movies(title: str):
    #Obtain movie index
    idx = df_platform[df_platform['title'] == title].index[0]
    #Obtain similar movies related to "title"
    similar_movies = list(enumerate(similarity_matrix[idx]))
    #Order movies by cosine similarity
    similar_movies = sorted(similar_movies, key=lambda x: x[1], reverse=True)
    #5 similiar movies
    top_movies = [df_platform.iloc[i[0]].title for i in similar_movies[1:6]]
    return top_movies

In [19]:
#Obtain 5 similar movies
get_similar_movies('truck tunes 2')

['the gigglebellies: monster trucks',
 "gecko's garage - learn at home with gecko",
 "morphle non-dialogue - mila and morphle's magical life",
 'tiny trucks',
 'go buster - buster the wizard and more original kids songs']