# 🎯 Movie Recommendation using MiniBatchKMeans Clustering

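This notebook builds a movie recommendation system: titles are clustered on TF-IDF features of their metadata, and recommendations are drawn from the same cluster as the query title, from its nearest neighbours by cosine similarity, or from a hybrid of the two.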

In [52]:
import pandas as pd
import numpy as np
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.cluster import MiniBatchKMeans
from sklearn.metrics.pairwise import linear_kernel

#### Load Data

In [55]:
# Read the catalogue and replace every missing value with an empty string so
# the text columns below can be concatenated without producing NaN.
df = pd.read_csv('../data/netflix_titles.csv').fillna('')

# Build one free-text document per row: title, description, director, cast,
# listed_in and country, joined by single spaces (empty fields stay empty).
df['combined_features'] = df['title'].str.cat(
    df[['description', 'director', 'cast', 'listed_in', 'country']],
    sep=' ',
)
df.head()

Unnamed: 0,show_id,type,title,director,cast,country,date_added,release_year,rating,duration,listed_in,description,combined_features
0,s1,Movie,Dick Johnson Is Dead,Kirsten Johnson,,United States,"September 25, 2021",2020,PG-13,90 min,Documentaries,"As her father nears the end of his life, filmm...",Dick Johnson Is Dead As her father nears the e...
1,s2,TV Show,Blood & Water,,"Ama Qamata, Khosi Ngema, Gail Mabalane, Thaban...",South Africa,"September 24, 2021",2021,TV-MA,2 Seasons,"International TV Shows, TV Dramas, TV Mysteries","After crossing paths at a party, a Cape Town t...","Blood & Water After crossing paths at a party,..."
2,s3,TV Show,Ganglands,Julien Leclercq,"Sami Bouajila, Tracy Gotoas, Samuel Jouy, Nabi...",,"September 24, 2021",2021,TV-MA,1 Season,"Crime TV Shows, International TV Shows, TV Act...",To protect his family from a powerful drug lor...,Ganglands To protect his family from a powerfu...
3,s4,TV Show,Jailbirds New Orleans,,,,"September 24, 2021",2021,TV-MA,1 Season,"Docuseries, Reality TV","Feuds, flirtations and toilet talk go down amo...","Jailbirds New Orleans Feuds, flirtations and t..."
4,s5,TV Show,Kota Factory,,"Mayur More, Jitendra Kumar, Ranjan Raj, Alam K...",India,"September 24, 2021",2021,TV-MA,2 Seasons,"International TV Shows, Romantic TV Shows, TV ...",In a city of coaching centers known to train I...,Kota Factory In a city of coaching centers kno...


#### Create TF-IDF Features

In [58]:
# Vectorise each row's combined text with TF-IDF, using scikit-learn's
# built-in English stop-word list. `tfidf_matrix` has one row per row of `df`
# and one column per vocabulary term.
tfidf = TfidfVectorizer(stop_words='english')
tfidf_matrix = tfidf.fit_transform(df['combined_features'])
# Report (number of rows, vocabulary size) as a quick sanity check.
print(f"TF-IDF matrix shape: {tfidf_matrix.shape}")

TF-IDF matrix shape: (8807, 52965)


#### MiniBatchKMeans Clustering

In [60]:
# Partition the TF-IDF rows into 100 clusters, training on mini-batches of
# 500 rows. The fixed random_state seeds the initialisation and sampling.
kmeans = MiniBatchKMeans(
    n_clusters=100,
    init='k-means++',
    batch_size=500,
    random_state=42,
)
# Fit once, then store each row's cluster label alongside its metadata.
kmeans.fit(tfidf_matrix)
df['cluster'] = kmeans.labels_

#### Recommendation Function Based on Cluster

In [63]:
def recommend_by_cluster(movie_title, n_recommendations=10):
    """Return titles that share a cluster with ``movie_title``.

    Parameters
    ----------
    movie_title : str
        Exact title to look up in ``df['title']``.
    n_recommendations : int, default 10
        Maximum number of titles to return.

    Returns
    -------
    list of str
        Up to ``n_recommendations`` titles from the same cluster, in the row
        order of ``df`` (they are not ranked by similarity). Rows whose title
        equals ``movie_title`` are left out. An unknown title yields ``[]``.
    """
    title_matches = df['title'] == movie_title
    if not title_matches.any():
        return []

    # When a title occurs more than once, the first matching row decides
    # which cluster is used.
    target_cluster = df.loc[title_matches, 'cluster'].iloc[0]
    in_cluster = df['cluster'] == target_cluster
    candidate_titles = df.loc[in_cluster & ~title_matches, 'title']
    return candidate_titles.tolist()[:n_recommendations]

#### Recommendation Function Based on Cosine Similarity

In [65]:
def recommend_by_similarity(movie_title, n_recommendations=10):
    """Return the titles whose TF-IDF vectors are closest to ``movie_title``.

    Similarity is the dot product of TF-IDF rows computed by ``linear_kernel``;
    this equals cosine similarity provided the rows of ``tfidf_matrix`` are
    L2-normalised (assumed here -- confirm against the vectoriser settings).

    Parameters
    ----------
    movie_title : str
        Exact title to look up in ``df['title']``.
    n_recommendations : int, default 10
        Maximum number of titles to return; values <= 0 yield ``[]``.

    Returns
    -------
    list of str
        Titles ordered from most to least similar. Every row carrying the
        query title is excluded, so a duplicate catalogue entry is never
        recommended for itself. An unknown title yields ``[]``.
    """
    title_matches = (df['title'] == movie_title).to_numpy()
    if n_recommendations <= 0 or not title_matches.any():
        return []

    # Use the *positional* row of the first match: tfidf_matrix and .iloc are
    # position-based, whereas df.index holds labels that need not be 0..n-1.
    movie_pos = int(np.flatnonzero(title_matches)[0])
    cosine_similarities = linear_kernel(
        tfidf_matrix[movie_pos:movie_pos + 1],
        tfidf_matrix
    ).flatten()

    # Rank all rows from most to least similar, then drop the query rows.
    ranked_positions = cosine_similarities.argsort()[::-1]
    ranked_positions = ranked_positions[~title_matches[ranked_positions]]
    return df['title'].iloc[ranked_positions[:n_recommendations]].tolist()

#### Hybrid Recommendation

In [67]:
def hybrid_recommend(movie_title, n_recommendations=10):
    """Hybrid recommendation combining cluster and similarity methods.

    Each method is asked for ``2 * n_recommendations`` candidates. Titles
    proposed by both methods come first, in similarity order. Any remaining
    slots are filled by alternating between the similarity-only and
    cluster-only candidates (similarity first), falling back to whichever
    list still has entries once the other is exhausted.

    Candidate order is preserved throughout, so the result is deterministic
    for a given ``df`` / ``tfidf_matrix`` and does not depend on set iteration
    order.

    Parameters
    ----------
    movie_title : str
        Exact title to look up.
    n_recommendations : int, default 10
        Maximum number of titles to return; values <= 0 yield ``[]``.

    Returns
    -------
    list of str
        At most ``n_recommendations`` distinct titles; ``[]`` when neither
        method returns a candidate (e.g. unknown title).
    """
    if n_recommendations <= 0:
        return []

    pool_size = n_recommendations * 2
    # dict.fromkeys removes repeated titles while keeping first-seen order.
    similarity_ranked = list(dict.fromkeys(
        recommend_by_similarity(movie_title, pool_size)))
    cluster_ranked = list(dict.fromkeys(
        recommend_by_cluster(movie_title, pool_size)))

    cluster_titles = set(cluster_ranked)
    common_recs = [t for t in similarity_ranked if t in cluster_titles]
    if len(common_recs) >= n_recommendations:
        return common_recs[:n_recommendations]

    common_titles = set(common_recs)
    similarity_only = [t for t in similarity_ranked if t not in common_titles]
    cluster_only = [t for t in cluster_ranked if t not in common_titles]

    remaining = n_recommendations - len(common_recs)
    additional_recs = []
    sim_pos = cluster_pos = 0
    prefer_similarity = True
    while len(additional_recs) < remaining:
        sim_left = sim_pos < len(similarity_only)
        cluster_left = cluster_pos < len(cluster_only)
        if not (sim_left or cluster_left):
            break
        # Take from the preferred list; if it is empty, use the other one.
        if sim_left and (prefer_similarity or not cluster_left):
            additional_recs.append(similarity_only[sim_pos])
            sim_pos += 1
        else:
            additional_recs.append(cluster_only[cluster_pos])
            cluster_pos += 1
        prefer_similarity = not prefer_similarity

    return common_recs + additional_recs

#### Test

In [73]:
# Smoke test: ask the hybrid recommender for five suggestions for one title
# and print them as a numbered list.
sample_movie = 'Twilight'
recommendations = hybrid_recommend(sample_movie, n_recommendations=5)

print(f"\nTop recommendations for '{sample_movie}':")
for rank, movie in enumerate(recommendations, start=1):
    print(f"{rank}. {movie}")


Top recommendations for 'Twilight':
1. The Twilight Saga: Breaking Dawn: Part 1
2. Sankofa
3. The Twilight Saga: Breaking Dawn: Part 2
4. Confessions of an Invisible Girl
5. The Vanished
