In [1]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
import neattext.functions as nfx
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from sklearn.metrics.pairwise import cosine_similarity

import warnings
warnings.filterwarnings("ignore")

In [2]:
df = pd.read_csv('netflix_titles_cleaned.csv')
df.head()

Unnamed: 0,show_id,type,title,director,cast,country,date_added,release_year,rating,duration,listed_in,description
0,s1,Movie,Dick Johnson Is Dead,Kirsten Johnson,Unknown,United States,"September 25, 2021",2020,PG-13,90 min,Documentaries,"As her father nears the end of his life, filmm..."
1,s2,TV Show,Blood & Water,Unknown,"Ama Qamata, Khosi Ngema, Gail Mabalane, Thaban...",South Africa,"September 24, 2021",2021,TV-MA,2 Seasons,"International TV Shows, TV Dramas, TV Mysteries","After crossing paths at a party, a Cape Town t..."
2,s3,TV Show,Ganglands,Julien Leclercq,"Sami Bouajila, Tracy Gotoas, Samuel Jouy, Nabi...",Unknown,"September 24, 2021",2021,TV-MA,1 Season,"Crime TV Shows, International TV Shows, TV Act...",To protect his family from a powerful drug lor...
3,s4,TV Show,Jailbirds New Orleans,Unknown,Unknown,Unknown,"September 24, 2021",2021,TV-MA,1 Season,"Docuseries, Reality TV","Feuds, flirtations and toilet talk go down amo..."
4,s5,TV Show,Kota Factory,Unknown,"Mayur More, Jitendra Kumar, Ranjan Raj, Alam K...",India,"September 24, 2021",2021,TV-MA,2 Seasons,"International TV Shows, Romantic TV Shows, TV ...",In a city of coaching centers known to train I...


## Create Dummy or Indicator Features for Categorical Variables

In [3]:
categorical_cols = ['type', 'country', 'rating', 'listed_in']

df_dummies = pd.get_dummies(df, columns=categorical_cols, drop_first=True)


## Standardize the Magnitude of Numeric Features Using a Scaler

In [4]:
numeric_cols = ['release_year']

df[numeric_cols] = df[numeric_cols].apply(pd.to_numeric, errors='coerce')

scaler = StandardScaler()

df[numeric_cols] = scaler.fit_transform(df[numeric_cols])

df.head()


Unnamed: 0,show_id,type,title,director,cast,country,date_added,release_year,rating,duration,listed_in,description
0,s1,Movie,Dick Johnson Is Dead,Kirsten Johnson,Unknown,United States,"September 25, 2021",0.65993,PG-13,90 min,Documentaries,"As her father nears the end of his life, filmm..."
1,s2,TV Show,Blood & Water,Unknown,"Ama Qamata, Khosi Ngema, Gail Mabalane, Thaban...",South Africa,"September 24, 2021",0.773324,TV-MA,2 Seasons,"International TV Shows, TV Dramas, TV Mysteries","After crossing paths at a party, a Cape Town t..."
2,s3,TV Show,Ganglands,Julien Leclercq,"Sami Bouajila, Tracy Gotoas, Samuel Jouy, Nabi...",Unknown,"September 24, 2021",0.773324,TV-MA,1 Season,"Crime TV Shows, International TV Shows, TV Act...",To protect his family from a powerful drug lor...
3,s4,TV Show,Jailbirds New Orleans,Unknown,Unknown,Unknown,"September 24, 2021",0.773324,TV-MA,1 Season,"Docuseries, Reality TV","Feuds, flirtations and toilet talk go down amo..."
4,s5,TV Show,Kota Factory,Unknown,"Mayur More, Jitendra Kumar, Ranjan Raj, Alam K...",India,"September 24, 2021",0.773324,TV-MA,2 Seasons,"International TV Shows, Romantic TV Shows, TV ...",In a city of coaching centers known to train I...


## Split into Testing and Training Datasets

In [5]:
X = df.drop(columns=['show_id', 'title', 'description'])  
y = df['type']  

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

X_train.reset_index(drop=True, inplace=True)
X_test.reset_index(drop=True, inplace=True)
y_train.reset_index(drop=True, inplace=True)
y_test.reset_index(drop=True, inplace=True)

print(X_train.shape)
print(X_test.shape)
print(y_train.shape)
print(y_test.shape)

(7045, 9)
(1762, 9)
(7045,)
(1762,)


# Modeling

In [6]:
movies_df = df[df['type'] == 'Movie'].reset_index(drop= True)
movies_df.head()

Unnamed: 0,show_id,type,title,director,cast,country,date_added,release_year,rating,duration,listed_in,description
0,s1,Movie,Dick Johnson Is Dead,Kirsten Johnson,Unknown,United States,"September 25, 2021",0.65993,PG-13,90 min,Documentaries,"As her father nears the end of his life, filmm..."
1,s7,Movie,My Little Pony: A New Generation,"Robert Cullen, José Luis Ucha","Vanessa Hudgens, Kimiko Glenn, James Marsden, ...",Unknown,"September 24, 2021",0.773324,PG,91 min,Children & Family Movies,Equestria's divided. But a bright-eyed hero be...
2,s8,Movie,Sankofa,Haile Gerima,"Kofi Ghanaba, Oyafunmike Ogunlano, Alexandra D...","United States, Ghana, Burkina Faso, United Kin...","September 24, 2021",-2.401707,TV-MA,125 min,"Dramas, Independent Movies, International Movies","On a photo shoot in Ghana, an American model s..."
3,s10,Movie,The Starling,Theodore Melfi,"Melissa McCarthy, Chris O'Dowd, Kevin Kline, T...",United States,"September 24, 2021",0.773324,PG-13,104 min,"Comedies, Dramas",A woman adjusting to life after a loss contend...
4,s13,Movie,Je Suis Karl,Christian Schwochow,"Luna Wedler, Jannis Niewöhner, Milan Peschel, ...","Germany, Czech Republic","September 23, 2021",0.773324,TV-MA,127 min,"Dramas, International Movies",After most of her family is murdered in a terr...


In [7]:
movies = movies_df[['title','director', 'cast', 'country', 'rating', 'listed_in']]
movies

Unnamed: 0,title,director,cast,country,rating,listed_in
0,Dick Johnson Is Dead,Kirsten Johnson,Unknown,United States,PG-13,Documentaries
1,My Little Pony: A New Generation,"Robert Cullen, José Luis Ucha","Vanessa Hudgens, Kimiko Glenn, James Marsden, ...",Unknown,PG,Children & Family Movies
2,Sankofa,Haile Gerima,"Kofi Ghanaba, Oyafunmike Ogunlano, Alexandra D...","United States, Ghana, Burkina Faso, United Kin...",TV-MA,"Dramas, Independent Movies, International Movies"
3,The Starling,Theodore Melfi,"Melissa McCarthy, Chris O'Dowd, Kevin Kline, T...",United States,PG-13,"Comedies, Dramas"
4,Je Suis Karl,Christian Schwochow,"Luna Wedler, Jannis Niewöhner, Milan Peschel, ...","Germany, Czech Republic",TV-MA,"Dramas, International Movies"
...,...,...,...,...,...,...
6126,Zinzana,Majid Al Ansari,"Ali Suliman, Saleh Bakri, Yasa, Ali Al-Jabri, ...","United Arab Emirates, Jordan",TV-MA,"Dramas, International Movies, Thrillers"
6127,Zodiac,David Fincher,"Mark Ruffalo, Jake Gyllenhaal, Robert Downey J...",United States,R,"Cult Movies, Dramas, Thrillers"
6128,Zombieland,Ruben Fleischer,"Jesse Eisenberg, Woody Harrelson, Emma Stone, ...",United States,R,"Comedies, Horror Movies"
6129,Zoom,Peter Hewitt,"Tim Allen, Courteney Cox, Chevy Chase, Kate Ma...",United States,PG,"Children & Family Movies, Comedies"


In [8]:
movies.describe().T

Unnamed: 0,count,unique,top,freq
title,6131,6131,Dick Johnson Is Dead,1
director,6131,4355,Unknown,188
cast,6131,5446,Unknown,475
country,6131,652,United States,2058
rating,6131,18,TV-MA,2062
listed_in,6131,278,"Dramas, International Movies",362


In [9]:
tv_show = df[df['type'] == 'TV Show'].reset_index(drop= True)
tv_show.head()

Unnamed: 0,show_id,type,title,director,cast,country,date_added,release_year,rating,duration,listed_in,description
0,s2,TV Show,Blood & Water,Unknown,"Ama Qamata, Khosi Ngema, Gail Mabalane, Thaban...",South Africa,"September 24, 2021",0.773324,TV-MA,2 Seasons,"International TV Shows, TV Dramas, TV Mysteries","After crossing paths at a party, a Cape Town t..."
1,s3,TV Show,Ganglands,Julien Leclercq,"Sami Bouajila, Tracy Gotoas, Samuel Jouy, Nabi...",Unknown,"September 24, 2021",0.773324,TV-MA,1 Season,"Crime TV Shows, International TV Shows, TV Act...",To protect his family from a powerful drug lor...
2,s4,TV Show,Jailbirds New Orleans,Unknown,Unknown,Unknown,"September 24, 2021",0.773324,TV-MA,1 Season,"Docuseries, Reality TV","Feuds, flirtations and toilet talk go down amo..."
3,s5,TV Show,Kota Factory,Unknown,"Mayur More, Jitendra Kumar, Ranjan Raj, Alam K...",India,"September 24, 2021",0.773324,TV-MA,2 Seasons,"International TV Shows, Romantic TV Shows, TV ...",In a city of coaching centers known to train I...
4,s6,TV Show,Midnight Mass,Mike Flanagan,"Kate Siegel, Zach Gilford, Hamish Linklater, H...",Unknown,"September 24, 2021",0.773324,TV-MA,1 Season,"TV Dramas, TV Horror, TV Mysteries",The arrival of a charismatic young priest brin...


In [10]:
tv_df = tv_show[['title','director', 'cast', 'country', 'rating', 'listed_in']]
tv_df.head()

Unnamed: 0,title,director,cast,country,rating,listed_in
0,Blood & Water,Unknown,"Ama Qamata, Khosi Ngema, Gail Mabalane, Thaban...",South Africa,TV-MA,"International TV Shows, TV Dramas, TV Mysteries"
1,Ganglands,Julien Leclercq,"Sami Bouajila, Tracy Gotoas, Samuel Jouy, Nabi...",Unknown,TV-MA,"Crime TV Shows, International TV Shows, TV Act..."
2,Jailbirds New Orleans,Unknown,Unknown,Unknown,TV-MA,"Docuseries, Reality TV"
3,Kota Factory,Unknown,"Mayur More, Jitendra Kumar, Ranjan Raj, Alam K...",India,TV-MA,"International TV Shows, Romantic TV Shows, TV ..."
4,Midnight Mass,Mike Flanagan,"Kate Siegel, Zach Gilford, Hamish Linklater, H...",Unknown,TV-MA,"TV Dramas, TV Horror, TV Mysteries"


In [11]:
tv_df.describe().T

Unnamed: 0,count,unique,top,freq
title,2676,2676,Blood & Water,1
director,2676,223,Unknown,2446
cast,2676,2284,Unknown,350
country,2676,197,United States,760
rating,2676,10,TV-MA,1145
listed_in,2676,236,Kids' TV,220


### Text Data and Cleaning for Movies
Cleans the text data in the  DataFrame by removing stopwords from several columns and removing special characters from the country column. This preprocessing step ensures that the text data is cleaner and more suitable for further analysis or modeling.

In [12]:
movies['director'] = movies['director'].apply(nfx.remove_stopwords)
movies['cast'] = movies['cast'].apply(nfx.remove_stopwords)
movies['country'] = movies['country'].apply(nfx.remove_stopwords)
movies['listed_in'] = movies['listed_in'].apply(nfx.remove_stopwords)

movies['country'] = movies['country'].apply(nfx.remove_special_characters)

movies.head()

Unnamed: 0,title,director,cast,country,rating,listed_in
0,Dick Johnson Is Dead,Kirsten Johnson,Unknown,United States,PG-13,Documentaries
1,My Little Pony: A New Generation,"Robert Cullen, José Luis Ucha","Vanessa Hudgens, Kimiko Glenn, James Marsden, ...",Unknown,PG,Children & Family Movies
2,Sankofa,Haile Gerima,"Kofi Ghanaba, Oyafunmike Ogunlano, Alexandra D...",United States Ghana Burkina Faso United Kingdo...,TV-MA,"Dramas, Independent Movies, International Movies"
3,The Starling,Theodore Melfi,"Melissa McCarthy, Chris O'Dowd, Kevin Kline, T...",United States,PG-13,"Comedies, Dramas"
4,Je Suis Karl,Christian Schwochow,"Luna Wedler, Jannis Niewöhner, Milan Peschel, ...",Germany Czech Republic,TV-MA,"Dramas, International Movies"


Remove Stopwords: Cleans the director, cast, country, and listed_in columns by removing common stopwords.

Remove Special Characters: Cleans the country column by removing special characters.

### Feature Extration using CountVectorizer for Movies
We perform feature extraction on several categorical text columns from the movies DataFrame using the CountVectorizer from the scikit-learn library. The goal is to convert these text columns into binary token matrices that can be used in machine learning models. 

In [13]:
countVector = CountVectorizer(binary= True)
country = countVector.fit_transform(movies['country']).toarray()

countVector = CountVectorizer(binary= True,
                             tokenizer=lambda x:x.split(','))
director = countVector.fit_transform(movies['director']).toarray()
cast = countVector.fit_transform(movies['cast']).toarray()
genres = countVector.fit_transform(movies['listed_in']).toarray()

- Binary Token Matrix: Each unique word or token in the text columns is represented as a binary feature (1 if present, 0 if not).
- Custom Tokenizer for Comma-Separated Values: Handles cases where multiple values are separated by commas within a single cell.

### Binary Encoding of Categorical Features for Movies
We convert the dictionaries containing binary-encoded information for the director, cast, country, and genres columns into DataFrames and transpose them. This process ensures that each row in the resulting DataFrames represents a movie and each column represents a binary feature, such as the presence of a specific director, actor, country, or genre. This transformation is essential for integrating these features into our recommendation system.

In [14]:
binary_director = pd.DataFrame(director).transpose()
binary_cast = pd.DataFrame(cast).transpose()
binary_country = pd.DataFrame(country).transpose()
binary_genres = pd.DataFrame(genres).transpose()

In [15]:
movies_binary = pd.concat([binary_director, binary_cast,  binary_country, binary_genres], axis=0,ignore_index=True)
movies_binary.T

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,32916,32917,32918,32919,32920,32921,32922,32923,32924,32925
0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
6126,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
6127,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
6128,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
6129,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


### Calculating Feature Similiarity for Movies
We are calculating the cosine similarity matrix for the binary-encoded features of the movies and storing the result in movies_sim.

In [16]:
movies_sim = cosine_similarity(movies_binary.T)
movies_sim

array([[1.        , 0.        , 0.2       , ..., 0.25819889, 0.23904572,
        0.        ],
       [0.        , 1.        , 0.        , ..., 0.        , 0.07142857,
        0.        ],
       [0.2       , 0.        , 1.        , ..., 0.12909944, 0.11952286,
        0.12403473],
       ...,
       [0.25819889, 0.        , 0.12909944, ..., 1.        , 0.15430335,
        0.        ],
       [0.23904572, 0.07142857, 0.11952286, ..., 0.15430335, 1.        ,
        0.        ],
       [0.        , 0.        , 0.12403473, ..., 0.        , 0.        ,
        1.        ]])

In [17]:
movies_sim.shape

(6131, 6131)

## Movie Recommendations
This function generates a list of recommended movies based on a given title. It first checks if the title is in the movies DataFrame. If found, it calculates similarity scores with other titles using a precomputed similarity matrix. The function then sorts these scores in descending order, selects the most similar titles, and appends their similarity scores. Finally, it returns the top 5 similar titles (excluding the queried title itself). If the title is not found, it prints a message indicating that the title is not in the dataset.

In [18]:
def movie_recommendation(title):
    if title in movies_df['title'].values:
        movies_index = movies_df[movies_df['title'] == title].index.item()
        scores = dict(enumerate(movies_sim[movies_index]))
        sorted_scores = dict(sorted(scores.items(), key=lambda x: x[1], reverse=True))

        selected_movies_index = [id for id, scores in sorted_scores.items()]
        selected_movies_score = [scores for id, scores in sorted_scores.items()]
        
        rec_movies = movies_df.iloc[selected_movies_index]
        rec_movies['similiarity'] = selected_movies_score

        movie_recommendation = rec_movies.reset_index(drop=True)
        return movie_recommendation[1:6] 
    else:
        print("Title not in dataset. Please check spelling.")

In [19]:
movie_recommendation('Dear John')

Unnamed: 0,show_id,type,title,director,cast,country,date_added,release_year,rating,duration,listed_in,description,similiarity
1,s1241,Movie,Safe Haven,Lasse Hallström,"Josh Duhamel, Julianne Hough, Cobie Smulders, ...",United States,"March 4, 2021",-0.133827,PG-13,116 min,"Dramas, Romantic Movies",When a mysterious woman arrives in a small Nor...,0.389249
2,s5512,Movie,Rodney King,Spike Lee,Roger Guenveur Smith,United States,"April 28, 2017",0.319749,TV-MA,53 min,Dramas,Roger Guenveur Smith gives voice to the man at...,0.34641
3,s1331,Movie,The World We Make,Brian Baugh,"Caleb Castille, Rose Reid, Kevin Sizemore, Gre...",United States,"February 10, 2021",0.546537,PG,108 min,"Dramas, Romantic Movies",A teenage equestrian and a local football play...,0.344265
4,s5688,Movie,Blue Jay,Alex Lehmann,"Sarah Paulson, Mark Duplass, Clu Gulager",United States,"December 6, 2016",0.206355,TV-MA,81 min,"Dramas, Independent Movies, Romantic Movies",Two former high school sweethearts unexpectedl...,0.344265
5,s6370,Movie,Brampton's Own,Michael Doneger,"Alex Russell, Rose McIver, Jean Smart, Scott P...",United States,"April 19, 2019",0.433143,TV-MA,90 min,"Dramas, Romantic Movies, Sports Movies","After 12 years in the minors, a struggling bas...",0.333333


In [20]:
movie_recommendation('Level 16')

Unnamed: 0,show_id,type,title,director,cast,country,date_added,release_year,rating,duration,listed_in,description,similiarity
1,s1748,Movie,Prospect,"Zeek Earl, Christopher Caldwell","Sophie Thatcher, Pedro Pascal, Jay Duplass, An...","Canada, United States","November 2, 2020",0.433143,R,100 min,"Sci-Fi & Fantasy, Thrillers",A father and daughter travel to an alien moon ...,0.240192
2,s805,Movie,Population 436,Michelle MacLaren,"Jeremy Sisto, Fred Durst, Charlotte Sullivan, ...","Canada, United States","June 2, 2021",-0.927585,R,93 min,"Horror Movies, Thrillers",A census taker sent to investigate why Rockwel...,0.210042
3,s4813,Movie,TAU,Federico D'Alessandro,"Maika Monroe, Ed Skrein, Gary Oldman",United States,"June 29, 2018",0.433143,R,98 min,"Sci-Fi & Fantasy, Thrillers",Kidnapped by an inventor who uses her as a tes...,0.204124
4,s2184,Movie,What Keeps You Alive,Colin Minihan,"Hannah Emily Anderson, Brittany Allen, Martha ...",Canada,"August 1, 2020",0.433143,R,99 min,"LGBTQ Movies, Thrillers",A couple's romantic anniversary retreat to a r...,0.19245
5,s5029,Movie,Mute,Duncan Jones,"Alexander Skarsgård, Paul Rudd, Justin Theroux","United Kingdom, Germany","February 23, 2018",0.433143,TV-MA,127 min,"Sci-Fi & Fantasy, Thrillers","When his girlfriend vanishes, a mute man ventu...",0.19245


In [21]:
movie_recommendation('The Interview')

Unnamed: 0,show_id,type,title,director,cast,country,date_added,release_year,rating,duration,listed_in,description,similiarity
1,s2710,Movie,Coffee & Kareem,Michael Dowse,"Ed Helms, Taraji P. Henson, Terrence Little Ga...",United States,"April 3, 2020",0.65993,TV-MA,88 min,"Action & Adventure, Comedies",An inept Detroit cop must team up with his gir...,0.356348
2,s4783,Movie,The Legacy of a Whitetail Deer Hunter,Jody Hill,"Josh Brolin, Danny McBride, Montana Jordan, Sc...",United States,"July 6, 2018",0.433143,TV-14,83 min,"Action & Adventure, Comedies, Dramas",A star of hunting videos strives to bond with ...,0.322329
3,s296,Movie,The Paper Tigers,Quoc Bao Tran,"Alain Uy, Ron Yuan, Mykel Shannon Jenkins, Jae...",United States,"August 7, 2021",0.773324,PG-13,111 min,"Action & Adventure, Comedies","After reuniting as middle-aged men, three kung...",0.308607
4,s2837,Movie,Spenser Confidential,Peter Berg,"Mark Wahlberg, Winston Duke, Alan Arkin, Bokee...",United States,"March 6, 2020",0.65993,R,111 min,"Action & Adventure, Comedies","Spenser, an ex-cop and ex-con, teams up with a...",0.308607
5,s4171,Movie,Polar,Unknown,Unknown,"United States, Germany","January 25, 2019",0.546537,TV-MA,119 min,"Action & Adventure, International Movies",An assassin on the verge of retirement must pu...,0.303046


### Text Data and Cleaning for TV Shows
Cleans the text data in the  DataFrame by removing stopwords from the 'country' and 'listed_in' columns and removing special characters from the country column. This preprocessing step ensures that the text data is cleaner and more suitable for further analysis or modeling.

In [22]:
tv_df['country'] = tv_df['country'].apply(nfx.remove_stopwords)
tv_df['listed_in'] = tv_df['listed_in'].apply(nfx.remove_stopwords)

tv_df['country'] = tv_df['country'].apply(nfx.remove_special_characters)

tv_df.head()

Unnamed: 0,title,director,cast,country,rating,listed_in
0,Blood & Water,Unknown,"Ama Qamata, Khosi Ngema, Gail Mabalane, Thaban...",South Africa,TV-MA,"International TV Shows, TV Dramas, TV Mysteries"
1,Ganglands,Julien Leclercq,"Sami Bouajila, Tracy Gotoas, Samuel Jouy, Nabi...",Unknown,TV-MA,"Crime TV Shows, International TV Shows, TV Act..."
2,Jailbirds New Orleans,Unknown,Unknown,Unknown,TV-MA,"Docuseries, Reality TV"
3,Kota Factory,Unknown,"Mayur More, Jitendra Kumar, Ranjan Raj, Alam K...",India,TV-MA,"International TV Shows, Romantic TV Shows, TV ..."
4,Midnight Mass,Mike Flanagan,"Kate Siegel, Zach Gilford, Hamish Linklater, H...",Unknown,TV-MA,"TV Dramas, TV Horror, TV Mysteries"


### Feature Extration using CountVectorizer for TV Shows
We perform feature extraction on several categorical text columns from the TV Show DataFrame using the CountVectorizer from the scikit-learn library. The goal is to convert these text columns into binary token matrices that can be used in machine learning models. 

In [23]:
countVector = CountVectorizer(binary= True)
country = countVector.fit_transform(tv_df['country']).toarray()

countVector = CountVectorizer(binary= True,
                             tokenizer=lambda x:x.split(','))
cast = countVector.fit_transform(tv_df['cast']).toarray()
genres = countVector.fit_transform(tv_df['listed_in']).toarray()

### Binary Encoding of Categorical Features for TV Shows

In [24]:
tv_binary_cast = pd.DataFrame(cast).transpose()
tv_binary_country = pd.DataFrame(country).transpose()
tv_binary_genres = pd.DataFrame(genres).transpose()

In [25]:
tv_binary = pd.concat([tv_binary_cast,  tv_binary_country, tv_binary_genres], axis=0,ignore_index=True)
tv_binary.T

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,15601,15602,15603,15604,15605,15606,15607,15608,15609,15610
0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,1,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2671,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2672,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2673,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2674,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


### Calculating Feature Similiarity for TV Shows

In [26]:
tv_sim = cosine_similarity(tv_binary.T)
tv_sim

array([[1.        , 0.        , 0.        , ..., 0.05455447, 0.1132277 ,
        0.        ],
       [0.        , 1.        , 0.13867505, ..., 0.        , 0.        ,
        0.12403473],
       [0.        , 0.13867505, 1.        , ..., 0.        , 0.        ,
        0.4472136 ],
       ...,
       [0.05455447, 0.        , 0.        , ..., 1.        , 0.        ,
        0.11952286],
       [0.1132277 , 0.        , 0.        , ..., 0.        , 1.        ,
        0.        ],
       [0.        , 0.12403473, 0.4472136 , ..., 0.11952286, 0.        ,
        1.        ]])

In [27]:
tv_sim.shape

(2676, 2676)

# TV Show Reccomendations
This function generates a list of recommended TV Shows based on a given title. It first checks if the title is in the TV Shows DataFrame. If found, it calculates similarity scores with other titles using a precomputed similarity matrix. The function then sorts these scores in descending order, selects the most similar titles, and appends their similarity scores. Finally, it returns the top 5 similar titles (excluding the queried title itself). If the title is not found, it prints a message indicating that the title is not in the dataset.

In [28]:
def TV_show_recommendation(title):
    if title in tv_show['title'].values:
        tv_index = tv_show[tv_show['title'] == title].index.item()
        scores = dict(enumerate(tv_sim[tv_index]))
        sorted_scores = dict(sorted(scores.items(), key=lambda x: x[1], reverse=True))

        selected_tv_index = [id for id, scores in sorted_scores.items()]
        selected_tv_score = [scores for id, scores in sorted_scores.items()]

        rec_tv = tv_show.iloc[selected_tv_index]
        rec_tv['similiarity'] = selected_tv_score

        tv_recommendation = rec_tv.reset_index(drop=True)
        return tv_recommendation[1:6]

    else:
        print("Title not in dataset. Please check spelling.")

In [29]:
TV_show_recommendation('Bridgerton')

Unnamed: 0,show_id,type,title,director,cast,country,date_added,release_year,rating,duration,listed_in,description,similiarity
1,s639,TV Show,Sex/Life,Unknown,"Sarah Shahi, Mike Vogel, Adam Demos, Margaret ...",United States,"June 25, 2021",0.773324,TV-MA,1 Season,"Romantic TV Shows, TV Dramas",A woman's daring sexual past collides with her...,0.288675
2,s1723,TV Show,DASH & LILY,Unknown,"Midori Francis, Austin Abrams, Dante Brown, Tr...",United States,"November 10, 2020",0.65993,TV-14,1 Season,"Romantic TV Shows, TV Comedies, TV Dramas",Opposites attract at Christmas as cynical Dash...,0.272166
3,s4571,TV Show,Hot Date,Unknown,"Emily Axford, Brian Murphy",United States,"October 1, 2018",0.433143,TV-MA,1 Season,"Romantic TV Shows, TV Comedies",Interconnected sketches and performances skewe...,0.25
4,s489,TV Show,Virgin River,Unknown,"Alexandra Breckenridge, Martin Henderson, Tim ...",United States,"July 9, 2021",0.773324,TV-14,3 Seasons,"Romantic TV Shows, TV Dramas","Searching for a fresh start, a nurse practitio...",0.246183
5,s5285,TV Show,No Tomorrow,Unknown,"Joshua Sasse, Tori Anderson, Jonathan Langdon,...",United States,"September 5, 2017",0.206355,TV-PG,1 Season,"Romantic TV Shows, TV Comedies, TV Dramas",Her straitjacketed life turned topsy-turvy by ...,0.246183


In [30]:
TV_show_recommendation('Midnight Mass')

Unnamed: 0,show_id,type,title,director,cast,country,date_added,release_year,rating,duration,listed_in,description,similiarity
1,s1874,TV Show,The Haunting of Bly Manor,Unknown,"Victoria Pedretti, Oliver Jackson-Cohen, Henry...",United States,"October 9, 2020",0.65993,TV-MA,1 Season,"TV Dramas, TV Horror, TV Mysteries",Dead doesn't mean gone. An au pair plunges int...,0.298807
2,s7343,TV Show,Lost Girl,Unknown,"Anna Silk, Kris Holden-Ried, Ksenia Solo, Rich...",Canada,"April 17, 2016",0.092961,TV-14,5 Seasons,"TV Dramas, TV Horror, TV Mysteries",Discovering she's a succubus who sustains hers...,0.212132
3,s4519,TV Show,The Haunting of Hill House,Unknown,"Michiel Huisman, Carla Gugino, Timothy Hutton,...",United States,"October 12, 2018",0.433143,TV-MA,1 Season,"TV Dramas, TV Horror, TV Mysteries","Flashing between past and present, a fractured...",0.210819
4,s1096,TV Show,This Is a Robbery: The World's Biggest Art Heist,Unknown,Unknown,Unknown,"April 7, 2021",0.773324,TV-MA,1 Season,"Crime TV Shows, Docuseries, TV Mysteries","In 1990, two men dressed as cops con their way...",0.2
5,s272,TV Show,Brand New Cherry Flavor,Unknown,"Rosa Salazar, Eric Lange, Catherine Keener, Je...",United States,"August 13, 2021",0.773324,TV-MA,1 Season,"TV Dramas, TV Horror, TV Mysteries",A filmmaker heads to Hollywood in the early '9...,0.193649


In [31]:
TV_show_recommendation('New Girl')

Unnamed: 0,show_id,type,title,director,cast,country,date_added,release_year,rating,duration,listed_in,description,similiarity
1,s4571,TV Show,Hot Date,Unknown,"Emily Axford, Brian Murphy",United States,"October 1, 2018",0.433143,TV-MA,1 Season,"Romantic TV Shows, TV Comedies",Interconnected sketches and performances skewe...,0.516398
2,s512,TV Show,Chelsea,Unknown,Unknown,United States,"July 6, 2021",0.319749,TV-MA,2 Seasons,"Stand-Up Comedy & Talk Shows, TV Comedies","It's not her first talk show, but it is a firs...",0.424264
3,s1440,TV Show,History of Swear Words,Unknown,Nicolas Cage,United States,"January 5, 2021",0.773324,TV-MA,1 Season,"Docuseries, TV Comedies","Nicolas Cage hosts this proudly profane, funny...",0.424264
4,s1531,TV Show,Schulz Saves America,Alexx Media,Andrew Schulz,United States,"December 17, 2020",0.65993,TV-MA,1 Season,"Stand-Up Comedy & Talk Shows, TV Comedies",Comedian Andrew Schulz takes on the year's mos...,0.424264
5,s2069,TV Show,Felipe Esparza: Bad Decisions,Unknown,Felipe Esparza,United States,"September 1, 2020",0.65993,TV-MA,1 Season,"Stand-Up Comedy & Talk Shows, TV Comedies","Two live performances, one in English and one ...",0.424264
