<div style="text-align: center; font-family: 'Lexend', sans-serif; font-size: 36px;">
    <strong>Task 2 for KDAG 2025 Selections by Neel Gupta 24BT10040 </strong>
</div>

<div style="text-align: center; font-family: 'Lexend', sans-serif; font-size: 30px;">
    <strong>Music Genre Clustering Analysis</strong>
</div>
<div style="text-align: center; font-family: 'Lexend', sans-serif; font-size: 24px;">
    <strong>Interactive Exploration of Song Keywords</strong>
</div>


<div style="font-family: 'Lexend', sans-serif; font-size: 24px;">
    <strong>1. Environment Setup</strong>
</div>


<div style="font-family: 'Lexend', sans-serif; font-size: 20px;">
    <strong>Install Requirements if not already present :</strong>
</div>
<div style="font-family: 'Lexend', sans-serif; font-size: 13px;">
    <strong>Breakdown of Dependencies and Installation guide:</strong><br>
    pandas → For data manipulation and analysis. → <code>pip install pandas</code> <button onclick="navigator.clipboard.writeText('pip install pandas')">Copy</button> <br>
    numpy → For numerical operations and arrays. → <code>pip install numpy</code> <button onclick="navigator.clipboard.writeText('pip install numpy')">Copy</button> <br>
    matplotlib → For plotting and data visualization. → <code>pip install matplotlib</code> <button onclick="navigator.clipboard.writeText('pip install matplotlib')">Copy</button> <br>
    seaborn → For statistical data visualization. → <code>pip install seaborn</code> <button onclick="navigator.clipboard.writeText('pip install seaborn')">Copy</button> <br>
    collections (built-in) → No need to install separately. <br>
    tqdm → For progress bars in loops. → <code>pip install tqdm</code> <button onclick="navigator.clipboard.writeText('pip install tqdm')">Copy</button> <br>
    ipywidgets → For interactive widgets in Jupyter. → <code>pip install ipywidgets</code> <button onclick="navigator.clipboard.writeText('pip install ipywidgets')">Copy</button> <br>
    <br>
    <strong></strong><br>
</strong>
</div>

In [11]:
%%capture 

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from collections import defaultdict
from tqdm.notebook import tqdm
from ipywidgets import interact, widgets, interact_manual

<div style="font-family: 'Lexend', sans-serif; font-size: 24px;">
    <strong>2. Data Preparation</strong>
</div>

In [12]:
from ipywidgets import interact
import pandas as pd

# A function to load and preprocess the dataset
# Combine the keyword columns into a single column called 'document' for text analysis 
def load_data(file_path):

    df = pd.read_csv(file_path)
    df['document'] = df[['keyword_1', 'keyword_2', 'keyword_3']].apply(
        lambda x: ' '.join(x.dropna()), axis=1
    )
    return df

# Load dataset
file_path = "TASK2_dataset.csv"  # Update with your path
df = load_data(file_path)

# Display interactive data explorer to give a slider to select the number of rows to display
@interact
def show_data(rows=(5, 50, 5)):
    return df.head(rows)


interactive(children=(IntSlider(value=25, description='rows', max=50, min=5, step=5), Output()), _dom_classes=…

<div style="font-family: 'Lexend', sans-serif; font-size: 24px;">
    <strong>3a. BOW Vectorizer Implementation</strong>
</div>

In [13]:
#not used in implementation
import numpy as np
class BOWVectorizer:
    def __init__(self):
        self.vocab = [] # Stores unique words from all documents
    
    def fit(self, documents):
        # Split documents into individual words
        tokenized = [doc.split() for doc in documents]

        # Create vocabulary using set comprehension to get unique words
        self.vocab = list({word for doc in tokenized for word in doc})
        
    def transform(self, documents):
        # Create count vectors using nested list comprehensions
        return np.array([
            [doc.split().count(word) for word in self.vocab] # Word counts
            for doc in documents
        ])


<div style="font-family: 'Lexend', sans-serif; font-size: 24px;">
    <strong>3b. TF-IDF Implementation</strong>
</div>

In [14]:
from tqdm.notebook import tqdm
import numpy as np
from collections import defaultdict

class TFIDFVectorizer:
    def __init__(self):
        self.vocab = []
        self.idf = {}
        
    def fit(self, documents):
        df_counter = defaultdict(int)
        tokenized_docs = [doc.split() for doc in tqdm(documents, desc="Processing documents")]
        unique_words = set(word for doc in tokenized_docs for word in doc)
        self.vocab = list(unique_words)
        
        for doc in tqdm(tokenized_docs, desc="Calculating DF"):
            for word in set(doc):
                df_counter[word] += 1
                
        total_docs = len(documents)
        self.idf = {word: np.log((total_docs + 1) / (df_counter[word] + 1)) + 1 
                   for word in self.vocab}
    
    def transform(self, documents):
        vectors = []
        for doc in tqdm(documents, desc="Transforming documents"):
            words = doc.split()
            word_counts = {word: words.count(word) for word in self.vocab}
            tf = {word: count/len(words) for word, count in word_counts.items()}
            vector = [tf[word] * self.idf[word] for word in self.vocab]
            vectors.append(vector)
        return np.array(vectors)

# Initialize and fit TF-IDF
tfidf = TFIDFVectorizer()
tfidf.fit(df['document'])
tfidf_vectors = tfidf.transform(df['document'])

    


Processing documents:   0%|          | 0/147 [00:00<?, ?it/s]

Calculating DF:   0%|          | 0/147 [00:00<?, ?it/s]

Transforming documents:   0%|          | 0/147 [00:00<?, ?it/s]

<div style="font-family: 'Lexend', sans-serif; font-size: 24px;">
    <strong>4. Dimensionality Reduction</strong>
</div>

In [15]:
class ManualPCA:
    def __init__(self, n_components=2):
        self.n_components = n_components
        self.components = None
        self.mean = None
        
    def fit(self, X):
        self.mean = np.mean(X, axis=0)
        X_centered = X - self.mean
        _, _, Vt = np.linalg.svd(X_centered, full_matrices=False)
        self.components = Vt[:self.n_components]
    
    def transform(self, X):
        return (X - self.mean) @ self.components.T

# Apply PCA
pca = ManualPCA(n_components=2)
pca.fit(tfidf_vectors)
pca_vectors = pca.transform(tfidf_vectors)

<div style="font-family: 'Lexend', sans-serif; font-size: 24px;">
    <strong>5. Clustering Analysis</strong>
</div>

In [16]:
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np
from ipywidgets import interact

class KMeansClustering:
    def __init__(self, k=6, max_iters=100, random_state=42):
        self.k = k
        self.max_iters = max_iters
        self.random_state = random_state
        self.centroids = None
        self.labels_ = None
        
    def _initialize_centroids(self, X):
        np.random.seed(self.random_state)
        centroids = [X[np.random.randint(X.shape[0])]]
        for _ in range(1, self.k):
            dist_sq = np.array([min(np.linalg.norm(x - c)**2 for c in centroids) for x in X])
            probs = dist_sq / dist_sq.sum()
            cumulative_probs = probs.cumsum()
            r = np.random.rand()
            centroids.append(X[np.where(cumulative_probs >= r)[0][0]])
        return np.array(centroids)
    
    def fit(self, X):
        self.centroids = self._initialize_centroids(X)
        for _ in tqdm(range(self.max_iters), desc="Clustering Iterations"):
            distances = np.linalg.norm(X[:, np.newaxis] - self.centroids, axis=2)
            labels = np.argmin(distances, axis=1)
            new_centroids = np.array([
                X[labels == i].mean(axis=0) if np.any(labels == i) 
                else self.centroids[i] 
                for i in range(self.k)
            ])
            if np.allclose(self.centroids, new_centroids):
                break
            self.centroids = new_centroids
        self.labels_ = labels
        return self

# Interactive clustering control
@interact(n_clusters=(2, 10, 1), random_seed=(0, 100, 1))
def run_clustering(n_clusters=6, random_seed=42):
    global df, pca_vectors
    kmeans = KMeansClustering(k=n_clusters, random_state=random_seed).fit(pca_vectors)
    df['cluster'] = kmeans.labels_

    # Manual Silhouette Score Calculation
    def manual_silhouette_score(X, labels):
        n_samples = X.shape[0]
        unique_labels = np.unique(labels)
        n_clusters = len(unique_labels)

        if n_clusters <= 1:
            return 0.0

        distances = np.sqrt((X[:, np.newaxis, :] - X) ** 2).sum(axis=2)
        silhouette_scores = []

        for i in range(n_samples):
            current_label = labels[i]
            same_cluster = np.where(labels == current_label)[0]
            same_cluster = same_cluster[same_cluster != i]
            
            a_i = np.mean(distances[i, same_cluster]) if len(same_cluster) > 0 else 0
            b_i = np.min([np.mean(distances[i, np.where(labels == label)[0]]) 
                        for label in unique_labels if label != current_label])
            
            silhouette_scores.append((b_i - a_i) / max(a_i, b_i))

        return np.nanmean(silhouette_scores)

    # Manual NMI Calculation
    def manual_nmi(true_labels, pred_labels):
        contingency = pd.crosstab(pred_labels, true_labels)
        pi_j = contingency.sum(axis=0) / len(true_labels)
        pi_k = contingency.sum(axis=1) / len(true_labels)
        
        mi = 0
        for k in contingency.index:
            for j in contingency.columns:
                p_jk = contingency.loc[k,j] / len(true_labels)
                if p_jk > 0:
                    mi += p_jk * np.log(p_jk / (pi_j[j] * pi_k[k]))
        
        h_true = -np.sum(pi_j * np.log(pi_j + 1e-12))
        h_pred = -np.sum(pi_k * np.log(pi_k + 1e-12))
        
        return mi / np.sqrt(h_true * h_pred)

    # Calculate both metrics
    sil_score = manual_silhouette_score(pca_vectors, kmeans.labels_)
    
    label_encoder = {genre: idx for idx, genre in enumerate(df['genre'].unique())}
    true_labels = df['genre'].map(label_encoder)
    nmi_score = manual_nmi(true_labels, df['cluster'])

    # Visualization with both metrics
    plt.figure(figsize=(12, 8))
    sns.scatterplot(
        x=pca_vectors[:, 0], 
        y=pca_vectors[:, 1],
        hue=df['genre'],
        style=df['cluster'],
        palette='viridis',
        s=100,
        edgecolor='w'
    )
    plt.title(f"Clustering Result\nSilhouette: {sil_score:.3f}, NMI: {nmi_score:.3f}")
    plt.legend(bbox_to_anchor=(1.05, 1), loc='upper left')
    plt.show()

interactive(children=(IntSlider(value=6, description='n_clusters', max=10, min=2), IntSlider(value=42, descrip…

<div style="font-family: 'Lexend', sans-serif; font-size: 24px;">
    <strong>6. Dynamic Metric Calculations</strong>
</div>

In [17]:
def calculate_metrics(X, labels):
    # Silhouette Score
    def pairwise_dist(X):
        return np.sqrt(((X[:, None] - X) ** 2).sum(axis=2))
    
    dist_matrix = pairwise_dist(X)
    silhouette_scores = []
    
    for i in range(len(X)):
        a = np.mean(dist_matrix[i, labels == labels[i]])
        b = np.min([np.mean(dist_matrix[i, labels == k]) 
                   for k in np.unique(labels) if k != labels[i]])
        silhouette_scores.append((b - a)/max(a, b))
    
    # Intra-cluster distance
    intra_dists = [np.mean(pairwise_dist(X[labels == k])) 
                  for k in np.unique(labels)]
    
    return {
        'silhouette': np.mean(silhouette_scores),
        'intra_cluster': np.mean(intra_dists)
    }

# Add to clustering callback
@interact(n_clusters=(2, 10, 1))
def update_clusters(n_clusters=6):
    kmeans = KMeansClustering(k=n_clusters).fit(pca_vectors)
    metrics = calculate_metrics(pca_vectors, kmeans.labels_)

    plt.figure(figsize=(8, 6))  # Set figure size

    # Scatter plot of clustered points
    plt.scatter(pca_vectors[:, 0], pca_vectors[:, 1], c=kmeans.labels_, cmap='tab10', alpha=0.7)
    
    # Plot cluster centroids
    plt.scatter(kmeans.centroids[:, 0], kmeans.centroids[:, 1], c='red', marker='x', s=200, label='Centroids')

    # Update plot with metrics in title
    plt.title(f"Clusters: {n_clusters}\nSilhouette: {metrics['silhouette']:.2f}"
              f"\nIntra-Cluster Distance: {metrics['intra_cluster']:.2f}")

    plt.xlabel("PCA Component 1")
    plt.ylabel("PCA Component 2")
    plt.legend()
    plt.show()  # Ensure plot renders


interactive(children=(IntSlider(value=6, description='n_clusters', max=10, min=2), Output()), _dom_classes=('w…

<div style="font-family: 'Lexend', sans-serif; font-size: 24px;">
    <strong>7. NMI Implementation</strong>
</div>

In [18]:
def nmi_score(true_labels, pred_labels):
    """Compute Normalized Mutual Information (NMI) without sklearn"""
    unique_true, true_indices = np.unique(true_labels, return_inverse=True)
    unique_pred, pred_indices = np.unique(pred_labels, return_inverse=True)
    
    n = len(true_labels)
    contingency = np.zeros((len(unique_pred), len(unique_true)), dtype=np.float64)
    
    for p, t in zip(pred_indices, true_indices):
        contingency[p, t] += 1.0
    
    P = contingency / n
    P_pred = np.sum(P, axis=1) + 1e-12
    P_true = np.sum(P, axis=0) + 1e-12
    
    mi = 0.0
    for i in range(len(unique_pred)):
        for j in range(len(unique_true)):
            if P[i, j] > 0.0:
                mi += P[i, j] * np.log(P[i, j] / (P_pred[i] * P_true[j]))
    
    h_pred = -np.sum(P_pred * np.log(P_pred))
    h_true = -np.sum(P_true * np.log(P_true))
    return mi / np.sqrt(h_pred * h_true)

<div style="font-family: 'Lexend', sans-serif; font-size: 24px;">
    <strong>8. Genre Prediction System</strong>
</div>

In [19]:
from ipywidgets import widgets
class GenrePredictor:
    def __init__(self, vectorizer, pca, kmeans, df):
        self.vectorizer = vectorizer
        self.pca = pca
        self.kmeans = kmeans
        self.df = df
        self.cluster_genres = df.groupby('cluster')['genre'].agg(pd.Series.mode)
        
    def predict(self, keywords):
        valid_keywords = [word.lower().strip() for word in keywords 
                         if word in self.vectorizer.vocab]
        if not valid_keywords:
            return "Unknown"
        
        tfidf = self.vectorizer.transform([' '.join(valid_keywords)])[0]
        if np.all(tfidf == 0):
            return "Unknown"
        
        pca_proj = self.pca.transform(tfidf.reshape(1, -1))
        distances = np.linalg.norm(self.kmeans.centroids - pca_proj, axis=1)
        cluster = np.argmin(distances)
        return self.cluster_genres.loc[cluster]

# Initialize predictor
predictor = GenrePredictor(tfidf, pca, KMeansClustering(k=6).fit(pca_vectors), df)

# Created interactive widget
keyword_input = widgets.Text(
    placeholder='Enter comma-separated keywords',
    description='Keywords:',
    layout={'width': '500px'}
)

output = widgets.Output()

def on_submit(_):
    output.clear_output()
    with output:
        keywords = [k.strip() for k in keyword_input.value.split(',')]
        prediction = predictor.predict(keywords)
        print(f"Input Keywords: {keywords}")
        print(f"Predicted Genre: {prediction}")

keyword_input.on_submit(on_submit)

display(keyword_input, output)


Clustering Iterations:   0%|          | 0/100 [00:00<?, ?it/s]

  keyword_input.on_submit(on_submit)


Text(value='', description='Keywords:', layout=Layout(width='500px'), placeholder='Enter comma-separated keywo…

Output()

<div style="font-family: 'Lexend', sans-serif; font-size: 24px;">
    <strong>9. Advanced Visualizations</strong>
</div>

In [20]:
def cluster_analysis(n_top):
    plt.figure(figsize=(18, 6))
    
    # --- Genre distribution per cluster (Heatmap for top n_top genres) ---
    plt.subplot(1, 2, 1)
    cluster_genre_counts = df.groupby(['cluster', 'genre']).size().unstack().fillna(0).astype(int)
    total_genre_counts = cluster_genre_counts.sum()
    top_genres = total_genre_counts.nlargest(n_top).index
    cluster_genre_counts = cluster_genre_counts[top_genres]

    sns.heatmap(cluster_genre_counts, annot=True, fmt='d', cmap='Blues')
    plt.title(f'Genre Distribution Across Clusters (Top {n_top})')
    plt.xlabel('Genre')
    plt.ylabel('Cluster')

    # --- Sum of frequencies of the top n_top words per cluster (Bar Chart) ---
    plt.subplot(1, 2, 2)
    from collections import Counter
    top_words_freq_sum = []
    
    for cluster_id in sorted(df['cluster'].unique()):
        cluster_docs = ' '.join(df[df['cluster'] == cluster_id]['document'])
        words = [word for word in cluster_docs.split() if word in tfidf.vocab]
        counter = Counter(words)
        most_common_n = counter.most_common(n_top)
        freq_sum = sum(count for _, count in most_common_n)
        top_words_freq_sum.append(freq_sum)

    df_top_words_freq_sum = pd.DataFrame({
        'Cluster': sorted(df['cluster'].unique()),
        f'Sum of Top {n_top} Word Frequencies': top_words_freq_sum
    }).set_index('Cluster')

    df_top_words_freq_sum.plot(kind='barh', legend=False, ax=plt.gca(), color='skyblue')
    plt.xlabel(f'Sum of Top {n_top} Word Frequencies')
    plt.ylabel('Cluster')
    plt.title(f'Sum of Frequencies of Top {n_top} Words per Cluster')

    plt.tight_layout()
    plt.show()


# Interactive widget to adjust `n_top`
interact(cluster_analysis, n_top=(1, 10, 1))



interactive(children=(IntSlider(value=5, description='n_top', max=10, min=1), Output()), _dom_classes=('widget…

<function __main__.cluster_analysis(n_top)>