In [2]:
import pandas as pd
import numpy as np
import warnings 

warnings.filterwarnings('ignore')

In [3]:
import ast

In [4]:
amazon = pd.read_csv('./amazon/titles.csv')
hbo = pd.read_csv('./hbo/titles.csv')
netflix = pd.read_csv('./netflix/titles.csv')

df = pd.concat([amazon, hbo, netflix], axis = 0)

In [9]:
df.shape

(19015, 15)

In [11]:
df.title.dropna(inplace=True)

In [15]:
df.drop_duplicates(inplace=True)

In [16]:
df.shape

(18980, 15)

In [19]:
df['description'].fillna('', inplace = True)

In [26]:
df['description'] = df['description'].apply(lambda x : x.split()) 

In [32]:
df['genres']

0       ['comedy', 'family', 'animation', 'action', 'f...
1       ['action', 'drama', 'war', 'western', 'comedy'...
2                             ['romance', 'war', 'drama']
3                          ['comedy', 'drama', 'romance']
4                        ['thriller', 'drama', 'romance']
                              ...                        
5845                                 ['romance', 'drama']
5846                                            ['drama']
5847                                           ['comedy']
5848                                                   []
5849                    ['family', 'animation', 'comedy']
Name: genres, Length: 18980, dtype: object

In [36]:
df['genres'] = df['genres'].apply(lambda x : ast.literal_eval(x))

In [39]:
df['tags'] = df['title'].str.split() + df['description'] + df['genres']

In [41]:
df_new = df[['title', 'tags']]

In [42]:
df_new

Unnamed: 0,title,tags
0,The Three Stooges,"[The, Three, Stooges, The, Three, Stooges, wer..."
1,The General,"[The, General, During, America’s, Civil, War,,..."
2,The Best Years of Our Lives,"[The, Best, Years, of, Our, Lives, It's, the, ..."
3,His Girl Friday,"[His, Girl, Friday, Hildy,, the, journalist, f..."
4,In a Lonely Place,"[In, a, Lonely, Place, An, aspiring, actress, ..."
...,...,...
5845,Fine Wine,"[Fine, Wine, A, beautiful, love, story, that, ..."
5846,C/O Kaadhal,"[C/O, Kaadhal, A, heart, warming, film, that, ..."
5847,Lokillo,"[Lokillo, A, controversial, TV, host, and, com..."
5848,Dad Stop Embarrassing Me - The Afterparty,"[Dad, Stop, Embarrassing, Me, -, The, Afterpar..."


In [45]:
df_new['tags'] = df_new['tags'].apply(lambda x : str(x).split() if type(x) != list else x)

In [47]:
df_new['tags'] = df_new['tags'].apply(lambda x : ' '.join(x))

In [48]:
df_new

Unnamed: 0,title,tags
0,The Three Stooges,The Three Stooges The Three Stooges were an Am...
1,The General,"The General During America’s Civil War, Union ..."
2,The Best Years of Our Lives,The Best Years of Our Lives It's the hope that...
3,His Girl Friday,"His Girl Friday Hildy, the journalist former w..."
4,In a Lonely Place,In a Lonely Place An aspiring actress begins t...
...,...,...
5845,Fine Wine,Fine Wine A beautiful love story that can happ...
5846,C/O Kaadhal,C/O Kaadhal A heart warming film that explores...
5847,Lokillo,Lokillo A controversial TV host and comedian w...
5848,Dad Stop Embarrassing Me - The Afterparty,Dad Stop Embarrassing Me - The Afterparty Jami...


In [49]:
from sklearn.feature_extraction.text import CountVectorizer
cv = CountVectorizer(max_features=2000,stop_words='english')

In [52]:
vector = cv.fit_transform(df_new['tags']).toarray()

In [53]:
vector.shape

(18980, 2000)

In [54]:
from sklearn.metrics.pairwise import cosine_similarity

In [55]:
similarity  = cosine_similarity(vector)

In [78]:
def recommend(media):
    index = df_new[df_new['title'] == media].index[0]
    distances = sorted(list(enumerate(similarity[index])),reverse=True,key = lambda x: x[1])
    for i in distances[1:6]:
        print(df_new.iloc[i[0]]['title'])

In [79]:
recommend('Malaal')

Udhayam NH4
Shankar Guru
Dynamite
Silence
Anandha Jodhi


In [85]:
import nltk
from nltk.stem.porter import PorterStemmer
ps = PorterStemmer()

In [86]:
def stem(text):
    flst = []

    for i in text.split():
        flst.append(ps.stem(i))
    
    return " ".join(flst)

In [87]:
df_new['tags'] = df_new['tags'].apply(stem)

In [88]:
pickle.dump(similarity, open('similarity.pkl', 'wb'))

In [89]:
pickle.dump(df_new, open('tvshows.pkl', 'wb'))

In [91]:
pickle.dump(df[['title', 'imdb_id']], open('imdbid.pkl', 'wb'))

In [5]:
df

Unnamed: 0,id,title,type,description,release_year,age_certification,runtime,genres,production_countries,seasons,imdb_id,imdb_score,imdb_votes,tmdb_popularity,tmdb_score
0,ts20945,The Three Stooges,SHOW,The Three Stooges were an American vaudeville ...,1934,TV-PG,19,"['comedy', 'family', 'animation', 'action', 'f...",['US'],26.0,tt0850645,8.6,1092.0,15.424,7.6
1,tm19248,The General,MOVIE,"During America’s Civil War, Union spies steal ...",1926,,78,"['action', 'drama', 'war', 'western', 'comedy'...",['US'],,tt0017925,8.2,89766.0,8.647,8.0
2,tm82253,The Best Years of Our Lives,MOVIE,It's the hope that sustains the spirit of ever...,1946,,171,"['romance', 'war', 'drama']",['US'],,tt0036868,8.1,63026.0,8.435,7.8
3,tm83884,His Girl Friday,MOVIE,"Hildy, the journalist former wife of newspaper...",1940,,92,"['comedy', 'drama', 'romance']",['US'],,tt0032599,7.8,57835.0,11.270,7.4
4,tm56584,In a Lonely Place,MOVIE,An aspiring actress begins to suspect that her...,1950,,94,"['thriller', 'drama', 'romance']",['US'],,tt0042593,7.9,30924.0,8.273,7.6
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
5845,tm1014599,Fine Wine,MOVIE,A beautiful love story that can happen between...,2021,,100,"['romance', 'drama']",['NG'],,tt13857480,6.8,45.0,1.466,
5846,tm898842,C/O Kaadhal,MOVIE,A heart warming film that explores the concept...,2021,,134,['drama'],[],,tt11803618,7.7,348.0,,
5847,tm1059008,Lokillo,MOVIE,A controversial TV host and comedian who has b...,2021,,90,['comedy'],['CO'],,tt14585902,3.8,68.0,26.005,6.3
5848,tm1035612,Dad Stop Embarrassing Me - The Afterparty,MOVIE,"Jamie Foxx, David Alan Grier and more from the...",2021,PG-13,37,[],['US'],,,,,1.296,10.0


In [9]:
id = df[df['title'] == 'The Three Stooges']['imdb_id'].str.split()

In [10]:
id

0       [tt0850645]
2406    [tt0214698]
Name: imdb_id, dtype: object