## Content-Based Filtering with sklearn Vectorization

In [2]:
import pandas as pd

In [3]:
# Load the Netflix catalogue; the file is decoded as ISO-8859-1 (Latin-1)
# rather than pandas' default UTF-8.
df = pd.read_csv(
    'netflix_titles.csv',
    encoding='ISO-8859-1',
)

In [4]:
# Preview the first five rows of the freshly loaded frame.
df.head(n=5)

Unnamed: 0,show_id,type,title,director,cast,country,date_added,release_year,rating,duration,...,Unnamed: 16,Unnamed: 17,Unnamed: 18,Unnamed: 19,Unnamed: 20,Unnamed: 21,Unnamed: 22,Unnamed: 23,Unnamed: 24,Unnamed: 25
0,s1,Movie,Dick Johnson Is Dead,Kirsten Johnson,,United States,"September 25, 2021",2020,PG-13,90 min,...,,,,,,,,,,
1,s2,TV Show,Blood & Water,,"Ama Qamata, Khosi Ngema, Gail Mabalane, Thaban...",South Africa,"September 24, 2021",2021,TV-MA,2 Seasons,...,,,,,,,,,,
2,s3,TV Show,Ganglands,Julien Leclercq,"Sami Bouajila, Tracy Gotoas, Samuel Jouy, Nabi...",,"September 24, 2021",2021,TV-MA,1 Season,...,,,,,,,,,,
3,s4,TV Show,Jailbirds New Orleans,,,,"September 24, 2021",2021,TV-MA,1 Season,...,,,,,,,,,,
4,s5,TV Show,Kota Factory,,"Mayur More, Jitendra Kumar, Ranjan Raj, Alam K...",India,"September 24, 2021",2021,TV-MA,2 Seasons,...,,,,,,,,,,


In [5]:
# List every column name so stray/placeholder columns are easy to spot.
list(df.columns)

['show_id',
 'type',
 'title',
 'director',
 'cast',
 'country',
 'date_added',
 'release_year',
 'rating',
 'duration',
 'listed_in',
 'description',
 'Unnamed: 12',
 'Unnamed: 13',
 'Unnamed: 14',
 'Unnamed: 15',
 'Unnamed: 16',
 'Unnamed: 17',
 'Unnamed: 18',
 'Unnamed: 19',
 'Unnamed: 20',
 'Unnamed: 21',
 'Unnamed: 22',
 'Unnamed: 23',
 'Unnamed: 24',
 'Unnamed: 25']

### Clean the dataset

In [6]:
# Keep only the columns whose name does not start with "Unnamed"
# (the "Unnamed: N" placeholders listed above).
is_unnamed = df.columns.str.startswith('Unnamed')
df = df.loc[:, ~is_unnamed]

In [7]:
# Preview the first five rows after dropping the "Unnamed" columns.
df.head(n=5)

Unnamed: 0,show_id,type,title,director,cast,country,date_added,release_year,rating,duration,listed_in,description
0,s1,Movie,Dick Johnson Is Dead,Kirsten Johnson,,United States,"September 25, 2021",2020,PG-13,90 min,Documentaries,"As her father nears the end of his life, filmm..."
1,s2,TV Show,Blood & Water,,"Ama Qamata, Khosi Ngema, Gail Mabalane, Thaban...",South Africa,"September 24, 2021",2021,TV-MA,2 Seasons,"International TV Shows, TV Dramas, TV Mysteries","After crossing paths at a party, a Cape Town t..."
2,s3,TV Show,Ganglands,Julien Leclercq,"Sami Bouajila, Tracy Gotoas, Samuel Jouy, Nabi...",,"September 24, 2021",2021,TV-MA,1 Season,"Crime TV Shows, International TV Shows, TV Act...",To protect his family from a powerful drug lor...
3,s4,TV Show,Jailbirds New Orleans,,,,"September 24, 2021",2021,TV-MA,1 Season,"Docuseries, Reality TV","Feuds, flirtations and toilet talk go down amo..."
4,s5,TV Show,Kota Factory,,"Mayur More, Jitendra Kumar, Ranjan Raj, Alam K...",India,"September 24, 2021",2021,TV-MA,2 Seasons,"International TV Shows, Romantic TV Shows, TV ...",In a city of coaching centers known to train I...


In [8]:
# Confirm which columns remain after the clean-up.
list(df.columns)

['show_id',
 'type',
 'title',
 'director',
 'cast',
 'country',
 'date_added',
 'release_year',
 'rating',
 'duration',
 'listed_in',
 'description']

Combining the `description`, `listed_in` (genre), and `director` columns into a single text field gives each show a richer, more comprehensive representation than any one column alone. This combined text provides more context for the vectorizer, so the recommendation system can better capture what each show is about, leading to more accurate similarity calculations and better recommendations.

In [9]:
# Replace missing text with empty strings so concatenation never yields NaN.
for text_column in ('description', 'listed_in', 'director'):
    df[text_column] = df[text_column].fillna('')

# Build one space-separated text field per show:
# description, then genres (listed_in), then director.
df['content'] = df['description'].str.cat(
    [df['listed_in'], df['director']],
    sep=' ',
)

### Feature Extraction

In [10]:
from sklearn.feature_extraction.text import TfidfVectorizer

# Vectorizer that drops scikit-learn's built-in English stop-word list.
tfidf = TfidfVectorizer(
    stop_words='english',
)

# Learn the vocabulary from the combined text and encode every show
# as one row of TF-IDF weights.
tfidf_matrix = tfidf.fit_transform(
    df['content'],
)

### Compute Similarity

In [11]:
from sklearn.metrics.pairwise import cosine_similarity

# Pairwise cosine similarity between every pair of shows; with a single
# argument the matrix is compared against itself.
cosine_sim = cosine_similarity(tfidf_matrix)

### Recommendation Function

In [16]:
def recommend(show_title, cosine_sim=cosine_sim, top_n=8):
    """Return the titles most similar to the show matching ``show_title``.

    The query is compared case-insensitively against ``df['title']``. An
    exact title match is preferred; if there is none, the first row whose
    title contains ``show_title`` as a literal substring is used.

    Parameters
    ----------
    show_title : str
        Full or partial title of the show to base the recommendations on.
    cosine_sim : 2-D array-like, optional
        Pairwise similarity matrix whose row ``i`` holds the scores of the
        ``i``-th row of ``df`` against every row. The default is the
        module-level ``cosine_sim`` as it existed when this function was
        defined, so re-run this cell after recomputing the matrix.
    top_n : int, default 8
        Maximum number of recommendations to return.

    Returns
    -------
    pandas.Series
        Titles of up to ``top_n`` shows, most similar first, carrying
        their labels from ``df``'s index. The queried show itself is
        never included.

    Raises
    ------
    ValueError
        If ``top_n`` is negative.
    IndexError
        If no title matches ``show_title``.
    """
    if top_n < 0:
        raise ValueError("top_n must be non-negative")

    titles = df['title']

    # Prefer an exact title so a short query is not hijacked by an earlier
    # row that merely contains it. Missing titles compare as False.
    is_match = titles.str.lower().eq(show_title.lower()).fillna(False).astype(bool)
    if not is_match.any():
        # regex=False: titles may contain regex metacharacters such as
        # "?" or "(". na=False: rows with a missing title never match.
        is_match = titles.str.contains(
            show_title, case=False, regex=False, na=False
        ).astype(bool)
    if not is_match.any():
        raise IndexError(f"No title matching {show_title!r} was found")

    # cosine_sim is addressed by row position, so use the position of the
    # first match rather than its index label.
    query_pos = int(is_match.to_numpy().argmax())

    # Rank every other show by similarity. The queried show is excluded
    # explicitly instead of assuming it sorts first (ties are possible).
    ranked = sorted(
        (
            (pos, score)
            for pos, score in enumerate(cosine_sim[query_pos])
            if pos != query_pos
        ),
        key=lambda pair: pair[1],
        reverse=True,
    )
    top_positions = [pos for pos, _ in ranked[:top_n]]

    return df['title'].iloc[top_positions]


In [17]:
# Example query: shows most similar to "Breaking Bad".
print(recommend(show_title='Breaking Bad'))

3428              El Camino: A Breaking Bad Movie
3561           13 Reasons Why: Beyond the Reasons
470                   Bridgerton - The Afterparty
3756          Oprah Presents When They See Us Now
4546                  Monty Python: Live at Aspen
2778                Bethany Hamilton: Unstoppable
1932     The Boys in the Band: Something Personal
8282    The Drunk and on Drugs Happy Funtime Hour
Name: title, dtype: object
