 **Task 3.3: Parameter Tuning Using Grid Search**

The grid search systematically explored 8 parameters across three main pipeline stages:
- **TF–IDF Vectorization**
    - `max_features`
    - `ngram_range`
    - `min_df`
    - `max_df`
- **SVD (Latent Semantic Analysis)**
    - `n_components`
    - `svd_random_state`
- **Hierarchical Clustering**
    - `n_clusters`
    - `linkage_method`


In [1]:
from sqlalchemy import create_engine
import pandas as pd
import ast
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.decomposition import TruncatedSVD, PCA
from sklearn.metrics import silhouette_score
from scipy.cluster.hierarchy import linkage, fcluster, dendrogram
from scipy.spatial.distance import pdist, cdist
from scipy.special import softmax
import matplotlib.pyplot as plt
from collections import Counter
import numpy as np


def full_clustering_pipeline(connection_string,
                             table_name='songs',
                             text_column='cleanTokens',
                             # TF-IDF params
                             max_features=9000,
                             ngram_range=(1, 2),
                             min_df=25,
                             max_df=0.9,
                             # SVD params
                             n_components=120,
                             svd_random_state=100,
                             # Hierarchical params
                             n_clusters=6,
                             linkage_method='ward',
                             cut_height=None,
                             # options
                             soft_assign=True,
                             distance_metric='euclidean',
                             visualize=True,
                             table_show=False,
                             save_excel=False):
    """
    Run the full pipeline: load rows from the database, TF-IDF, truncated SVD,
    hierarchical clustering, optional soft assignment, plots and per-cluster summary.

    Parameters
    ----------
    connection_string : SQLAlchemy URL passed to ``create_engine``.
    table_name, text_column : table and token column to read. Both are
        interpolated into the SQL text as identifiers, so they must come from
        trusted code, never from user input.
    max_features, ngram_range, min_df, max_df : forwarded to ``TfidfVectorizer``.
    n_components, svd_random_state : forwarded to ``TruncatedSVD``;
        ``n_components`` is capped at the TF-IDF vocabulary size.
    n_clusters : number of flat clusters requested (``criterion='maxclust'``)
        when ``cut_height`` is not used.
    linkage_method : forwarded to ``scipy.cluster.hierarchy.linkage``.
    cut_height : if truthy, cut the tree at this distance instead of asking for
        ``n_clusters``. ``None`` and ``0`` both fall back to ``n_clusters``.
    soft_assign : if True, compute softmax(-distance to cluster centroid)
        membership probabilities for every row.
    distance_metric : metric passed to ``cdist`` for the soft assignment.
    visualize : if True, show a 2D PCA scatter plot and the dendrogram.
    table_show, save_excel : if either is True, print the top-5 subgenres per
        cluster; ``save_excel`` additionally writes ``cluster_<id>.xlsx`` files
        to the current working directory.

    Returns
    -------
    tuple
        ``(df, X_reduced, prob_df, sil_score, vectorizer, svd, Z)`` where
        ``prob_df`` is ``None`` when ``soft_assign`` is False and ``Z`` is the
        linkage matrix.

    Raises
    ------
    Whatever the underlying libraries raise; in particular
    ``silhouette_score`` rejects a labelling with fewer than two clusters.
    """
    # =====  Load & preprocess =====
    # NOTE(security): identifiers are formatted straight into the statement.
    query = f"""
        SELECT song_id, name, {text_column}, cleanGenre
        FROM {table_name}
        WHERE {text_column} IS NOT NULL
    """
    engine = create_engine(connection_string)
    try:
        df = pd.read_sql(query, engine)
    finally:
        # Release the connection pool; this function is called once per grid
        # point by the grid search, so undisposed engines would pile up.
        engine.dispose()

    # The token column is parsed as a Python literal (presumably a stringified
    # list of tokens) and then re-joined into one whitespace-separated string.
    df[text_column] = df[text_column].apply(ast.literal_eval)
    df['clean_text'] = df[text_column].apply(lambda tokens: ' '.join(map(str, tokens)))

    # =====  TF-IDF =====
    vectorizer = TfidfVectorizer(
        max_features=max_features,
        ngram_range=ngram_range,
        min_df=min_df,
        max_df=max_df,
        lowercase=True,
        strip_accents='unicode',
        token_pattern=r'\b\w+\b'
    )
    X_sparse = vectorizer.fit_transform(df['clean_text'])

    # =====  Truncated SVD (LSA) =====
    svd = TruncatedSVD(n_components=min(n_components, X_sparse.shape[1]),
                       random_state=svd_random_state)
    X_reduced = svd.fit_transform(X_sparse)

    # =====  Hierarchical clustering =====
    Z = linkage(X_reduced, method=linkage_method)

    if cut_height:
        clusters = fcluster(Z, t=cut_height, criterion='distance')
    else:
        clusters = fcluster(Z, t=n_clusters, criterion='maxclust')

    df['hierarchical_label'] = clusters

    # Sorted distinct labels actually produced; used instead of assuming 1..k.
    labels = np.unique(clusters)

    # Silhouette of the hard hierarchical labels in the SVD space
    sil_score = silhouette_score(X_reduced, clusters)

    # =====  Soft assignments =====
    prob_df = None
    assigned_cluster = clusters
    if soft_assign:
        # Centroid of each flat cluster, in the same order as `labels`
        centroids = np.vstack([X_reduced[clusters == k].mean(axis=0) for k in labels])

        dists = cdist(X_reduced, centroids, metric=distance_metric)
        # Closer centroid -> larger probability
        probabilities = softmax(-dists, axis=1)
        # Map the winning column back to its real cluster label
        assigned_cluster = labels[probabilities.argmax(axis=1)]
        prob_df = pd.DataFrame(probabilities,
                               columns=[f'Cluster_{k}' for k in labels],
                               index=df.index)
        prob_df['assigned_cluster'] = assigned_cluster

    # =====  Visualization =====
    if visualize:
        pca = PCA(n_components=2)
        X_pca = pca.fit_transform(X_reduced)
        plt.figure(figsize=(10, 7))
        scatter = plt.scatter(X_pca[:, 0], X_pca[:, 1],
                              c=assigned_cluster,
                              cmap='tab10', alpha=0.8, s=40)
        plt.xlabel('PCA Component 1')
        plt.ylabel('PCA Component 2')
        # Report the number of clusters actually found: with cut_height (or a
        # degenerate tree) it need not equal the requested n_clusters.
        plt.title(f'Hierarchical Clusters (k={len(labels)}) in 2D PCA')
        plt.legend(*scatter.legend_elements(), title="Clusters")
        plt.tight_layout()
        plt.show()

        # Dendrogram of the full tree, with the cut line when a height was given
        plt.figure(figsize=(15, 7))
        dendrogram(Z)
        if cut_height:
            plt.axhline(y=cut_height, c='red', lw=2, linestyle='--')
        plt.title("Hierarchical Clustering Dendrogram")
        plt.show()

    # =====  Top subgenres per cluster & Excel =====
    if table_show or save_excel:
        for cluster_id in sorted(df['hierarchical_label'].unique()):
            cluster_songs = df[df['hierarchical_label'] == cluster_id]
            total_songs = len(cluster_songs)
            # cleanGenre is split on commas, i.e. treated as a comma-separated list
            subgenre_counts = Counter(
                genre.strip()
                for genres in cluster_songs['cleanGenre'].dropna()
                for genre in genres.split(',')
            )
            top5_summary = ", ".join(f"{g} ({c}/{total_songs})"
                                     for g, c in subgenre_counts.most_common(5))
            print(f"\n=== Cluster {cluster_id} ({total_songs} songs) | Top subgenres: {top5_summary} ===")

            if save_excel:
                filename = f"cluster_{cluster_id}.xlsx"
                cluster_songs[['name', 'cleanGenre', 'hierarchical_label']].to_excel(filename, index=False)

    return df, X_reduced, prob_df, sil_score, vectorizer, svd, Z


In [2]:
def clustering_grid_search(connection_string,
                           max_features_list=[8000, 9000, 10000],
                           ngram_range_list=[(1, 1), (1, 2)],
                           min_df_list=[20, 25, 30],
                           max_df_list=[0.8, 0.9, 1.0],
                           n_components_list=[100, 120, 150],
                           svd_random_state_list=[42, 100],
                           n_clusters_list=[5, 6, 7],
                           linkage_method_list=['ward', 'complete', 'average'],
                           verbose=True):
    """
    Grid-search TF-IDF, SVD and hierarchical-clustering parameters.

    Every combination in the Cartesian product of the ``*_list`` arguments is
    passed to ``full_clustering_pipeline`` (with soft assignment, plots,
    tables and Excel export switched off) and scored by silhouette.
    Each combination re-runs the whole pipeline, including the database load.

    Parameters
    ----------
    connection_string : SQLAlchemy URL forwarded to the pipeline.
    max_features_list, ngram_range_list, min_df_list, max_df_list,
    n_components_list, svd_random_state_list, n_clusters_list,
    linkage_method_list : candidate values for the pipeline parameter of the
        same name (without the ``_list`` suffix). The lists are only read.
    verbose : if True, print each combination and its score.

    Returns
    -------
    pandas.DataFrame
        One row per successful combination, with one column per parameter plus
        ``silhouette_score``, sorted by score in descending order. If no
        combination succeeds (or a list is empty) the frame is empty but still
        has all columns.

    Notes
    -----
    A combination that raises ``Exception`` is reported and skipped.
    ``KeyboardInterrupt`` is not caught, so interrupting discards the results
    collected so far.
    """
    import itertools

    # Names double as pipeline keyword arguments and as result columns.
    param_names = ['max_features', 'ngram_range', 'min_df', 'max_df',
                   'n_components', 'svd_random_state', 'n_clusters', 'linkage_method']
    grid = itertools.product(
        max_features_list, ngram_range_list, min_df_list, max_df_list,
        n_components_list, svd_random_state_list,
        n_clusters_list, linkage_method_list)

    results = []
    for combo in grid:
        params = dict(zip(param_names, combo))

        if verbose:
            print("Testing: " + ", ".join(f"{name}={value}" for name, value in params.items()))

        try:
            _, _, _, sil_score, _, _, _ = full_clustering_pipeline(
                connection_string=connection_string,
                soft_assign=False,
                visualize=False,
                table_show=False,
                save_excel=False,
                **params
            )
        except Exception as e:
            # Best-effort search: report the failing combination and move on.
            print(f"Skipped combination due to error: {e}")
            continue

        results.append({**params, 'silhouette_score': sil_score})

        if verbose:
            print(f"→ Silhouette score: {sil_score:.4f}\n")

    # Explicit columns keep the sort valid even when `results` is empty;
    # otherwise sort_values would raise KeyError and hide the real failures.
    results_df = pd.DataFrame(results, columns=param_names + ['silhouette_score'])
    return results_df.sort_values(by='silhouette_score', ascending=False).reset_index(drop=True)


In [3]:
# SQLAlchemy URL for the TextMiningHA database on the IVAN_PC\SQLEXPRESS instance
conn_str = "mssql+pyodbc://IVAN_PC\\SQLEXPRESS/TextMiningHA?driver=ODBC+Driver+17+for+SQL+Server"

# 2 * 2 * 3 * 2 * 3 * 2 * 3 * 2 = 864 combinations, each a full pipeline run
grid_results = clustering_grid_search(
    conn_str,
    # TF-IDF vocabulary size, n-gram span and document-frequency bounds
    max_features_list=[8000, 9000],
    ngram_range_list=[(1, 1), (1, 2)],
    min_df_list=[15, 20, 30],
    max_df_list=[0.8, 0.9],
    # SVD dimensionality and seed
    n_components_list=[30, 40, 100],
    svd_random_state_list=[42, 100],
    # Hierarchical clustering: number of flat clusters and linkage criterion
    n_clusters_list=[5, 6, 7],
    linkage_method_list=['ward', 'average'],
)

display(grid_results)


Testing: max_features=8000, ngram_range=(1, 1), min_df=15, max_df=0.8, n_components=30, svd_random_state=42, n_clusters=5, linkage_method=ward
→ Silhouette score: 0.1200

Testing: max_features=8000, ngram_range=(1, 1), min_df=15, max_df=0.8, n_components=30, svd_random_state=42, n_clusters=5, linkage_method=average
→ Silhouette score: 0.2219

Testing: max_features=8000, ngram_range=(1, 1), min_df=15, max_df=0.8, n_components=30, svd_random_state=42, n_clusters=6, linkage_method=ward
→ Silhouette score: 0.0547

Testing: max_features=8000, ngram_range=(1, 1), min_df=15, max_df=0.8, n_components=30, svd_random_state=42, n_clusters=6, linkage_method=average
→ Silhouette score: 0.1977

Testing: max_features=8000, ngram_range=(1, 1), min_df=15, max_df=0.8, n_components=30, svd_random_state=42, n_clusters=7, linkage_method=ward
→ Silhouette score: 0.0601

Testing: max_features=8000, ngram_range=(1, 1), min_df=15, max_df=0.8, n_components=30, svd_random_state=42, n_clusters=7, linkage_method=a

KeyboardInterrupt: 

## Best Combination Found
| max_features | ngram_range | min_df | max_df | n_components | svd_random_state | n_clusters | linkage_method | silhouette_score |
|--------------|-------------|--------|--------|---------------|------------------|------------|----------------|------------------|
| 8000         | (1, 2)      | 15     | 0.8    | 40            | 42               | 6          | average        | 0.149912         |



Conclusion: Hierarchical clustering provides better separation than KMeans in this setup, though the silhouette scores remain low, so the clusters are still only weakly defined.