<a href="https://colab.research.google.com/github/iiTsSUgar/UAS-ARTFICIAL-INTELEGENCE/blob/main/UAS_AI.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
# Import necessary libraries
import pandas as pd
import numpy as np
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.cluster import KMeans
from sklearn.decomposition import PCA
from sklearn.metrics import silhouette_score
import matplotlib.pyplot as plt
import seaborn as sns
from datetime import datetime

# Load the dataset
df = pd.read_csv('netflix_titles.csv')

# Step 1: Data Preprocessing (25 points)
# Report how many values are missing in each column before any cleaning.
print("Missing values:\n", df.isnull().sum())

# Fill missing categorical values with explicit placeholders.
# The result is assigned back rather than calling fillna(inplace=True) on a
# column selection: that chained form emits a FutureWarning on pandas 2.x and
# leaves df untouched once copy-on-write is enabled.
df = df.fillna({
    'director': 'Unknown',
    'cast': 'Unknown',
    'country': 'Unknown',
    'rating': 'Not Rated',
})

# Make duration numeric before taking a median or scaling it.
# NOTE(review): in the public Netflix titles dataset this column is text such
# as "90 min" or "2 Seasons"; confirm against the CSV actually used. Pulling
# out the leading number also works if the column is already numeric.
duration_num = pd.to_numeric(
    df['duration'].astype(str).str.extract(r'(\d+(?:\.\d+)?)', expand=False),
    errors='coerce',
)
# Prefer the median of the same content type (the two types presumably use
# different units), then fall back to the overall median.
type_median = duration_num.groupby(df['type']).transform('median')
df['duration'] = duration_num.fillna(type_median).fillna(duration_num.median())

# Convert date_added to datetime.
# Surrounding whitespace is stripped first: pandas 2.x infers one format from
# the first value and applies it strictly, so a padded value would raise.
df['date_added'] = pd.to_datetime(df['date_added'].str.strip())
# release_year stays a datetime column because later steps read it via .dt.year.
df['release_year'] = pd.to_datetime(df['release_year'].astype(str), format='%Y')

# Extract features for clustering
# Convert categorical variables to integer codes. The encoder is re-fitted for
# each column, so after this cell `le` holds the mapping for 'rating' only.
le = LabelEncoder()
df['type_encoded'] = le.fit_transform(df['type'])
df['rating_encoded'] = le.fit_transform(df['rating'])

# Create feature matrix
features = ['type_encoded', 'rating_encoded', 'release_year', 'duration']
X = df[features].copy()
# StandardScaler needs plain numbers, so the datetime column is replaced by
# its calendar year in the feature matrix only (df itself is left unchanged).
X['release_year'] = X['release_year'].dt.year

# Normalize features to zero mean and unit variance so that no single feature
# dominates the Euclidean distances used by KMeans.
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)

# Step 2: Clustering (25 points)
# Sweep candidate cluster counts and record two diagnostics for each one:
# inertia (for the elbow plot) and the silhouette score.
K = range(2, 11)
inertias, silhouette_scores = [], []

for k in K:
    # fit() returns the estimator itself, so construction and fitting chain.
    kmeans = KMeans(n_clusters=k, random_state=42).fit(X_scaled)
    inertias.append(kmeans.inertia_)
    silhouette_scores.append(silhouette_score(X_scaled, kmeans.labels_))

# Draw both diagnostics side by side in a single figure.
fig, (ax_elbow, ax_silhouette) = plt.subplots(1, 2, figsize=(12, 5))

ax_elbow.plot(K, inertias, 'bx-')
ax_elbow.set_xlabel('k')
ax_elbow.set_ylabel('Inertia')
ax_elbow.set_title('Elbow Method for Optimal k')

ax_silhouette.plot(K, silhouette_scores, 'rx-')
ax_silhouette.set_xlabel('k')
ax_silhouette.set_ylabel('Silhouette Score')
ax_silhouette.set_title('Silhouette Score for Optimal k')

fig.tight_layout()
plt.show()

# Fit the final model with the chosen k=4 and store each title's cluster label.
optimal_k = 4
kmeans_final = KMeans(n_clusters=optimal_k, random_state=42)
kmeans_final.fit(X_scaled)
df['Cluster'] = kmeans_final.labels_

# Step 3: Analysis and Machine Learning Integration (25 points)
def _most_common(values):
    """Return the most frequent value of a Series (first entry of value_counts)."""
    return values.value_counts().index[0]


# Summarise every cluster: dominant type and rating, average release year,
# average duration and number of titles.
# release_year is a datetime column, so the calendar year is averaged rather
# than the timestamps themselves; this matches the figure reported in Step 4
# and lets round(2) apply to it.
cluster_analysis = df.groupby('Cluster').agg({
    'type': _most_common,
    'rating': _most_common,
    'release_year': lambda years: years.dt.year.mean(),
    'duration': 'mean',
    'show_id': 'count'
}).round(2)

print("\nCluster Analysis:")
print(cluster_analysis)

# Visualize clusters using PCA: project the scaled features onto their first
# two principal components so the clusters can be drawn in 2D.
pca = PCA(n_components=2)
X_pca = pca.fit_transform(X_scaled)

plt.figure(figsize=(10, 6))
# Each point is coloured by its cluster label.
scatter = plt.scatter(X_pca[:, 0], X_pca[:, 1], c=df['Cluster'], cmap='viridis')
plt.xlabel('First Principal Component')
plt.ylabel('Second Principal Component')
plt.title('Clusters Visualization using PCA')
plt.colorbar(scatter)
plt.show()

# Step 4: Evaluation and Insights (25 points)
# Every report below walks the cluster labels 0 .. optimal_k - 1 in order.
cluster_ids = range(optimal_k)
total_titles = len(df)

# Per-cluster profile: size, dominant type and rating, average year and duration.
print("\nDetailed Cluster Insights:")
for cluster_id in cluster_ids:
    members = df.loc[df['Cluster'] == cluster_id]
    print(f"\nCluster {cluster_id} Analysis:")
    print(f"Size: {len(members)} titles")
    print(f"Most common type: {members['type'].mode()[0]}")
    print(f"Average release year: {members['release_year'].dt.year.mean():.0f}")
    print(f"Most common rating: {members['rating'].mode()[0]}")
    print(f"Average duration: {members['duration'].mean():.2f}")

# Cross-cluster summaries.
print("\nComprehensive Clustering Insights:")
print("1. Content Distribution:")
for cluster_id in cluster_ids:
    # Number of titles in this cluster and its share of the whole catalogue.
    cluster_size = df['Cluster'].eq(cluster_id).sum()
    percentage = cluster_size / total_titles * 100
    print(f"Cluster {cluster_id}: {cluster_size} titles ({percentage:.1f}%)")

print("\n2. Temporal Trends:")
for cluster_id in cluster_ids:
    years = df.loc[df['Cluster'] == cluster_id, 'release_year'].dt.year
    avg_year = years.mean()
    print(f"Cluster {cluster_id} average release year: {avg_year:.0f}")

print("\n3. Content Type Distribution:")
for cluster_id in cluster_ids:
    type_counts = df.loc[df['Cluster'] == cluster_id, 'type'].value_counts()
    type_dist = type_counts.to_dict()
    print(f"Cluster {cluster_id} content types:", type_dist)