# 🎬 Netflix Show Clustering
This project uses K-Means clustering to group Netflix shows based on genre, rating, and duration.  
It is a beginner-friendly project to learn about unsupervised machine learning!



In [None]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt

from sklearn.cluster import KMeans
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA


In [None]:
# Read the Netflix titles CSV from the working directory into a DataFrame
# and preview the first rows.
csv_path = "netflix_titles.csv"
df = pd.read_csv(csv_path)
df.head()


In [None]:
# Drop rows with missing values in the columns the clustering relies on
df.dropna(subset=['rating', 'duration', 'listed_in'], inplace=True)

# Extract numeric duration (e.g., "90 min" → 90).
# A raw string keeps `\d` from being treated as an (invalid) string escape,
# and expand=False returns a Series instead of a one-column DataFrame.
# NOTE(review): only the digits are kept, so if the column mixes units
# (e.g. minutes vs. number of seasons — confirm against the CSV) the
# resulting numbers are not directly comparable.
df['duration'] = df['duration'].str.extract(r'(\d+)', expand=False).astype(float)

# Values without any digits become NaN above; drop them so the later
# clustering step never receives missing values.
df.dropna(subset=['duration'], inplace=True)

# Create a simplified 'genre' column (first genre listed)
df['genre'] = df['listed_in'].apply(lambda x: x.split(',')[0].strip())

# Preview the cleaned columns
df[['title', 'genre', 'rating', 'duration']].head()


In [None]:
# Columns fed to the clustering; the two categorical ones are one-hot encoded,
# while 'duration' is passed through as a number.
feature_columns = ['genre', 'rating', 'duration']
categorical_columns = ['genre', 'rating']

features = df[feature_columns]
features_encoded = pd.get_dummies(features, columns=categorical_columns)

# Standardize every encoded column (StandardScaler defaults: zero mean, unit variance)
scaler = StandardScaler()
scaler.fit(features_encoded)
scaled_features = scaler.transform(features_encoded)


In [None]:
# Elbow method: fit K-Means for a range of cluster counts and record the
# inertia (within-cluster sum of squared distances) of each fit.
k_values = range(1, 11)  # single source of truth for the loop and the x-axis
inertia = []

for k in k_values:
    # n_init is pinned explicitly: scikit-learn's default changed between
    # releases (10 -> 'auto'), which would otherwise alter the curve and
    # emit a FutureWarning on some versions.
    kmeans = KMeans(n_clusters=k, n_init=10, random_state=42)
    kmeans.fit(scaled_features)
    inertia.append(kmeans.inertia_)

# Plot inertia against K; the "elbow" suggests a reasonable cluster count
plt.plot(k_values, inertia, marker='o')
plt.title("Elbow Method for Optimal K")
plt.xlabel("Number of Clusters")
plt.ylabel("Inertia")
plt.grid(True)
plt.show()


In [None]:
k = 4  # Based on elbow method

# Fit the final model and store each row's cluster label on the DataFrame.
# n_init is pinned explicitly so the result does not depend on which
# scikit-learn release is installed (the default changed from 10 to 'auto').
kmeans = KMeans(n_clusters=k, n_init=10, random_state=42)
df['cluster'] = kmeans.fit_predict(scaled_features)

# Preview the assignments
df[['title', 'genre', 'rating', 'duration', 'cluster']].head()


In [None]:
# Project the scaled feature matrix onto its first two principal components
# so the clusters can be drawn on a 2-D scatter plot.
# random_state is fixed (same seed as the KMeans cells) so the projection is
# reproducible when PCA selects a randomized solver.
pca = PCA(n_components=2, random_state=42)
pca_features = pca.fit_transform(scaled_features)

# One point per row of df, colored by its assigned cluster label
plt.figure(figsize=(10, 6))
sns.scatterplot(x=pca_features[:, 0], y=pca_features[:, 1], hue=df['cluster'], palette="Set2")
plt.title("Netflix Show Clusters (PCA)")
plt.xlabel("PCA Component 1")
plt.ylabel("PCA Component 2")
plt.show()


In [None]:
# See how shows are grouped: print a few randomly chosen titles per cluster
samples_per_cluster = 5
for i in range(k):
    cluster_titles = df.loc[df['cluster'] == i, 'title']
    # Cap the sample size: Series.sample raises ValueError when asked for
    # more rows than the cluster contains (replace=False is the default).
    n_shown = min(samples_per_cluster, len(cluster_titles))
    print(f"\n📺 Cluster {i} Sample Shows:")
    print(cluster_titles.sample(n_shown).values)
