# COGS 118B Final Project (Group RILS): Clustering Code #

## Imports & Helper Functions ##

In [5]:
import pandas as pd
from sklearn.cluster import KMeans
from sklearn.metrics import silhouette_score
import matplotlib.pyplot as plt
import numpy as np

## Step 1: Load Preprocessed Data ##

In [None]:
def load_data(filename):
    """Read the movie data stored in a CSV file.

    Parameters
    ----------
    filename : str, path, or file-like object
        Anything ``pd.read_csv`` accepts as its first argument.

    Returns
    -------
    pandas.DataFrame
        The table parsed by ``pd.read_csv`` using its default options.
    """
    movie_table = pd.read_csv(filename)
    return movie_table

## Step 2: Choosing the Right K ##

- We will use the **Elbow Method** to choose the best K.

In [None]:
# Fit one K-Means model per candidate k and keep each model's inertia_,
# which is what the plot below labels "Distortion".
# NOTE(review): `scaled_features` is not defined in any cell visible here —
# confirm the preprocessing step creates it before this cell runs.
K = range(1, 11)  # Assuming we test for k values from 1 to 10
distortions = [
    KMeans(n_clusters=n_clusters, random_state=0).fit(scaled_features).inertia_
    for n_clusters in K
]

# Plot the elbow curve
elbow_fig, elbow_ax = plt.subplots(figsize=(16, 8))
elbow_ax.plot(K, distortions, 'bx-')
elbow_ax.set_xlabel('k')
elbow_ax.set_ylabel('Distortion')
elbow_ax.set_title('The Elbow Method showing the optimal k')
plt.show()

- Based on the plot, **k = [CHOOSE BEST k]** is the optimal value.

In [None]:
# TODO: template placeholder — replace [CHOSEN VALUE] with the integer k read
# off the elbow plot above. As written this line is not valid Python, and the
# cells below that use `optimal_k` cannot run until it is filled in.
optimal_k = [CHOSEN VALUE]

## Step 3: Implement K-Means ##

- We will run K-means clustering using sklearn

In [None]:
# Fit K-Means with the chosen k, then assign every sample to a cluster.
# NOTE(review): `scaled_features` and `dataset` are not defined in any cell
# visible here — confirm both exist (and are row-aligned) before running.
kmeans = KMeans(n_clusters=optimal_k, random_state=0).fit(scaled_features)
clusters = kmeans.predict(scaled_features)
# Add clusters to your dataset
dataset['cluster'] = clusters
centers = kmeans.cluster_centers_

# Scatter the first two feature columns coloured by cluster label, with the
# cluster centres overlaid as larger, semi-transparent black markers.
cluster_fig, cluster_ax = plt.subplots()
cluster_ax.scatter(
    scaled_features[:, 0],
    scaled_features[:, 1],
    c=clusters,
    s=50,
    cmap='viridis',
)
cluster_ax.scatter(centers[:, 0], centers[:, 1], c='black', s=200, alpha=0.5)
cluster_ax.set_title('Visualization of Clusters')
plt.show()

## Step 4: Silhouette Score Evaluation ##

- We will calculate the **Silhouette Score** to see how well the clusters are separated.
- A higher score (the range is -1 to 1) indicates better-defined, better-separated clusters.

In [None]:
# Score the clustering produced above: silhouette_score is given the same
# feature matrix that was clustered and the per-sample cluster labels.
silhouette_avg = silhouette_score(X=scaled_features, labels=clusters)
print(f"The silhouette score for k={optimal_k}: {silhouette_avg:.2f}")

- ***HOW DID THE CLUSTERING DO? EXPLAIN BRIEFLY***

## Step 5: Recommendation System ##

- Now, we will create the recommendation system based on the clusters we've created.
- This will be done in 2 steps:
    1) User preference profiling
    2) Recommending movies based on the user profile

### Step 5a: User Preference Profiling ###

- We will create a user profile based on the features of movies they like
- This will be done by aggregating the cluster labels of movies a user likes

In [None]:
def user_preference_profile(user_likes, movie_clusters):
    """Rank clusters by how many of the user's liked movies fall in each one.

    Parameters
    ----------
    user_likes : iterable
        Movie titles the user likes.
    movie_clusters : mapping
        Maps a movie title to its cluster label.

    Returns
    -------
    list
        Cluster labels ordered from most to least frequent among the liked
        movies. Titles missing from ``movie_clusters`` are skipped, and labels
        with equal counts keep the order in which they were first encountered.
    """
    # Tally how often each cluster label occurs among the liked titles we know.
    tally = {}
    known_labels = (
        movie_clusters[title] for title in user_likes if title in movie_clusters
    )
    for label in known_labels:
        if label in tally:
            tally[label] += 1
        else:
            tally[label] = 1

    # Stable sort on the count, highest first; ties keep insertion order.
    ranked = sorted(tally.items(), key=lambda pair: pair[1], reverse=True)
    return [label for label, _count in ranked]

# Example usage
# NOTE(review): `movie_clusters` is not defined in any cell visible here —
# confirm a title -> cluster-label mapping is built before this cell runs.
user_likes = [  # List of movie titles the user likes
    'Movie Title 1',
    'Movie Title 2',
]
user_pref_clusters = user_preference_profile(
    user_likes=user_likes,
    movie_clusters=movie_clusters,
)

### Step 5b: Recommend Movies ###

- Some features we can base recommendations on are: recommendations (collaborative filtering), genres (content-based filtering), and credits.

In [None]:
def recommend_movies(preferred_clusters, movies, n_recommendations,
                     user_likes=None, random_state=None):
    """Pick up to ``n_recommendations`` unseen movie titles from the preferred clusters.

    Clusters are visited in the order given. From each one, titles the user
    has not already liked are sampled at random until enough recommendations
    have been collected.

    Parameters
    ----------
    preferred_clusters : iterable
        Cluster labels, most preferred first.
    movies : pandas.DataFrame
        Must contain a ``'title'`` column and a ``'cluster'`` column.
    n_recommendations : int
        Maximum number of titles to return. Values <= 0 yield an empty list.
    user_likes : iterable, optional
        Titles to exclude from the recommendations. When omitted, the function
        falls back to a module/notebook-level variable named ``user_likes`` if
        one exists (this keeps existing three-argument calls working);
        otherwise nothing is excluded.
    random_state : optional
        Forwarded to ``Series.sample`` so results can be made reproducible.

    Returns
    -------
    list
        Movie titles (values of the ``'title'`` column), at most
        ``n_recommendations`` of them. Fewer are returned when the preferred
        clusters do not contain enough unseen movies.
    """
    if user_likes is None:
        # Backward compatibility: earlier versions read the notebook-level
        # `user_likes` directly instead of taking it as an argument.
        user_likes = globals().get('user_likes')
    seen_titles = list(user_likes) if user_likes is not None else []

    recommendations = []
    for cluster in preferred_clusters:
        # Only ask for as many titles as are still missing.
        remaining = n_recommendations - len(recommendations)
        if remaining <= 0:
            break

        in_cluster = movies['cluster'] == cluster
        unseen = ~movies['title'].isin(seen_titles)
        candidates = movies.loc[in_cluster & unseen, 'title']
        if candidates.empty:
            continue

        picked = candidates.sample(
            n=min(remaining, len(candidates)),
            random_state=random_state,
        )
        # Extend with the title values. Extending a list with a DataFrame
        # would iterate over its column labels instead of its rows.
        recommendations.extend(picked.tolist())

    return recommendations

## Step 6: Testing ##

- We will now test the performance of the recommendation system

In [None]:
# Example usage: ask for five recommendations for the profiled user.
# NOTE(review): `movies_df` is not defined in any cell visible here —
# presumably the DataFrame holding the 'title' and 'cluster' columns; confirm.
recommended = recommend_movies(
    preferred_clusters=user_pref_clusters,
    movies=movies_df,
    n_recommendations=5,
)
print(recommended)