 **Task 3.3: Parameter Tuning Using Grid Search**<br>The grid search explored **8 parameters** across three pipeline stages:

- **TF–IDF Vectorization**
  - `max_features`
  - `ngram_range`
  - `min_df`
- **SVD (Latent Semantic Analysis)**
  - `n_components`
  - `svd_random_state`
- **KMeans Clustering**
  - `n_clusters`
  - `kmeans_n_init`
  - `kmeans_random_state`


In [4]:
from sqlalchemy import create_engine
import pandas as pd
import ast
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.decomposition import TruncatedSVD, PCA
from sklearn.cluster import KMeans
from sklearn.metrics import silhouette_score
from scipy.spatial.distance import cdist
from scipy.special import softmax
import matplotlib.pyplot as plt
from collections import Counter
import itertools


def full_clustering_pipeline(connection_string,
                             table_name='songs',
                             text_column='cleanTokens',
                             # TF-IDF params
                             max_features=9000,
                             ngram_range=(1, 2),
                             min_df=25,
                             max_df=0.9,
                             # SVD params
                             n_components=120,
                             svd_random_state=100,
                             # KMeans params
                             n_clusters=6,
                             kmeans_n_init=250,
                             kmeans_random_state=100,
                             # options
                             soft_assign=True,
                             distance_metric='euclidean',
                             visualize=True,
                             table_show=False,
                             save_excel=False):
    """
    Run the full clustering pipeline: load rows, TF-IDF, SVD, KMeans.

    Optionally adds softmax-based soft assignments, a 2D PCA scatter plot,
    and a per-cluster genre summary / Excel export.

    Args:
        connection_string: SQLAlchemy URL passed to ``create_engine``.
        table_name: Table to read from. Interpolated into the SQL text.
        text_column: Column holding the token list as a Python-literal
            string (it is parsed with ``ast.literal_eval``). Interpolated
            into the SQL text.
        max_features, ngram_range, min_df, max_df: Forwarded to
            ``TfidfVectorizer``.
        n_components: Requested SVD dimensionality; capped at the number
            of TF-IDF features actually produced.
        svd_random_state: ``random_state`` for ``TruncatedSVD``.
        n_clusters, kmeans_n_init, kmeans_random_state: Forwarded to
            ``KMeans`` (as ``n_clusters``, ``n_init``, ``random_state``).
        soft_assign: If True, build ``prob_df`` from a softmax over the
            negative distances to each centroid.
        distance_metric: ``metric`` argument for ``cdist`` in the soft
            assignment step.
        visualize: If True, show a 2D PCA scatter coloured by cluster.
        table_show: If True, print the top-5 genre summary per cluster.
        save_excel: If True, write ``cluster_<id>.xlsx`` per cluster into
            the current working directory (the summary is printed too).

    Returns:
        Tuple ``(df, X_reduced, prob_df, sil_score, vectorizer, svd, kmeans)``.
        ``df`` gains the columns ``clean_text`` and ``kmeans_label``.
        ``prob_df`` is ``None`` when ``soft_assign`` is False; otherwise its
        probability columns are named ``Cluster_1`` .. ``Cluster_k``
        (1-based) while its ``assigned_cluster`` column holds the 0-based
        argmax index, matching the KMeans label numbering.
    """
    # ===== 1. Load & preprocess =====
    # NOTE: table_name / text_column are SQL identifiers and cannot be bound
    # as query parameters, so they are interpolated directly. Only pass
    # trusted values here.
    query = f"""
        SELECT song_id, name, {text_column}, cleanGenre
        FROM {table_name}
        WHERE {text_column} IS NOT NULL
    """
    engine = create_engine(connection_string)
    try:
        df = pd.read_sql(query, engine)
    finally:
        # Release the engine's connection pool. This function is called once
        # per grid-search combination, so undisposed engines would pile up.
        engine.dispose()

    # Tokens are stored as the string form of a Python list; parse them and
    # re-join with spaces so the vectorizer can tokenize the text itself.
    df[text_column] = df[text_column].apply(ast.literal_eval)
    df['clean_text'] = df[text_column].apply(lambda tokens: ' '.join(map(str, tokens)))

    # ===== 2. TF-IDF =====
    vectorizer = TfidfVectorizer(
        max_features=max_features,
        ngram_range=ngram_range,
        min_df=min_df,
        max_df=max_df,
        lowercase=True,
        strip_accents='unicode',
        token_pattern=r'\b\w+\b'
    )
    X_sparse = vectorizer.fit_transform(df['clean_text'])

    # ===== 3. Truncated SVD (LSA) =====
    # Cap n_components: min_df / max_features may leave fewer features than
    # the requested number of components.
    svd = TruncatedSVD(n_components=min(n_components, X_sparse.shape[1]),
                       random_state=svd_random_state)
    X_reduced = svd.fit_transform(X_sparse)

    # ===== 4. KMeans clustering =====
    kmeans = KMeans(n_clusters=n_clusters,
                    n_init=kmeans_n_init,
                    random_state=kmeans_random_state)
    kmeans.fit(X_reduced)

    df['kmeans_label'] = kmeans.labels_
    sil_score = silhouette_score(X_reduced, kmeans.labels_)

    # ===== 5. Soft assignments =====
    prob_df = None
    assigned_cluster = kmeans.labels_
    if soft_assign:
        centroids = kmeans.cluster_centers_
        dists = cdist(X_reduced, centroids, metric=distance_metric)
        # Closer centroid -> larger probability.
        probabilities = softmax(-dists, axis=1)
        assigned_cluster = probabilities.argmax(axis=1)
        prob_df = pd.DataFrame(probabilities,
                               columns=[f'Cluster_{i + 1}' for i in range(n_clusters)],
                               index=df.index)
        prob_df['assigned_cluster'] = assigned_cluster

    # ===== 6. Visualization =====
    if visualize:
        pca = PCA(n_components=2)
        X_pca = pca.fit_transform(X_reduced)
        plt.figure(figsize=(10, 7))
        scatter = plt.scatter(X_pca[:, 0], X_pca[:, 1],
                              c=assigned_cluster,
                              cmap='tab10', alpha=0.8, s=40)
        plt.xlabel('PCA Component 1')
        plt.ylabel('PCA Component 2')
        plt.title(f'KMeans Clusters (k={n_clusters}) in 2D PCA')
        plt.legend(*scatter.legend_elements(), title="Clusters")
        plt.tight_layout()
        plt.show()

    # ===== 7. Top subgenres per cluster & Excel =====
    if table_show or save_excel:
        for cluster_id in sorted(df['kmeans_label'].unique()):
            cluster_songs = df[df['kmeans_label'] == cluster_id]
            total_songs = len(cluster_songs)
            # cleanGenre is split on commas into individual genre names.
            all_subgenres = cluster_songs['cleanGenre'].dropna().apply(lambda x: [g.strip() for g in x.split(',')])
            all_subgenres_flat = [g for sublist in all_subgenres for g in sublist]
            subgenre_counts = Counter(all_subgenres_flat)
            top5_subgenres = subgenre_counts.most_common(5)
            top5_summary = ", ".join([f"{g} ({c}/{total_songs})" for g, c in top5_subgenres])
            print(f"\n=== Cluster {cluster_id} ({total_songs} songs) | Top subgenres: {top5_summary} ===")

            if save_excel:
                filename = f"cluster_{cluster_id}.xlsx"
                cluster_songs[['name', 'cleanGenre', 'kmeans_label']].to_excel(filename, index=False)

    return df, X_reduced, prob_df, sil_score, vectorizer, svd, kmeans





In [5]:
def clustering_grid_search(connection_string,
                           max_features_list=(9000,),
                           ngram_range_list=((1, 2),),
                           min_df_list=(25,),
                           n_components_list=(120,),
                           svd_random_state_list=(100,),
                           n_clusters_list=(6,),
                           kmeans_n_init_list=(250,),
                           kmeans_random_state_list=(100,),
                           verbose=True):
    """
    Grid-search TF-IDF, SVD and KMeans parameters by silhouette score.

    Every combination (Cartesian product) of the eight candidate lists is
    run through ``full_clustering_pipeline`` with soft assignment, plotting,
    the summary table and Excel export switched off. A combination that
    raises is reported on stdout and skipped.

    Args:
        connection_string: Forwarded to ``full_clustering_pipeline``.
        max_features_list, ngram_range_list, min_df_list: Candidate values
            for the TF-IDF stage.
        n_components_list, svd_random_state_list: Candidate values for the
            SVD stage.
        n_clusters_list, kmeans_n_init_list, kmeans_random_state_list:
            Candidate values for the KMeans stage.
        verbose: If True, print each combination and its score.

    Returns:
        pandas DataFrame with one row per successful combination, holding
        the eight parameter values plus ``silhouette_score``, sorted by
        score in descending order. If no combination succeeds, an empty
        DataFrame with those same columns is returned.
    """
    result_columns = [
        'max_features', 'ngram_range', 'min_df', 'n_components',
        'svd_random_state', 'n_clusters', 'kmeans_n_init',
        'kmeans_random_state', 'silhouette_score',
    ]
    results = []

    for max_features, ngram_range, min_df, n_components, svd_rs, n_clusters, kmeans_init, kmeans_rs in itertools.product(
        max_features_list, ngram_range_list, min_df_list,
        n_components_list, svd_random_state_list,
        n_clusters_list, kmeans_n_init_list, kmeans_random_state_list):

        if verbose:
            print(f"Testing: max_features={max_features}, ngram_range={ngram_range}, "
                  f"min_df={min_df}, n_components={n_components}, svd_random_state={svd_rs}, "
                  f"n_clusters={n_clusters}, kmeans_n_init={kmeans_init}, kmeans_random_state={kmeans_rs}")

        try:
            _, _, _, sil_score, _, _, _ = full_clustering_pipeline(
                connection_string=connection_string,
                max_features=max_features,
                ngram_range=ngram_range,
                min_df=min_df,
                n_components=n_components,
                svd_random_state=svd_rs,
                n_clusters=n_clusters,
                kmeans_n_init=kmeans_init,
                kmeans_random_state=kmeans_rs,
                soft_assign=False,
                visualize=False,
                table_show=False,
                save_excel=False
            )
        except Exception as e:
            # Deliberate best-effort: one invalid combination must not abort
            # the whole search.
            print(f"Skipped combination due to error: {e}")
        else:
            results.append({
                'max_features': max_features,
                'ngram_range': ngram_range,
                'min_df': min_df,
                'n_components': n_components,
                'svd_random_state': svd_rs,
                'n_clusters': n_clusters,
                'kmeans_n_init': kmeans_init,
                'kmeans_random_state': kmeans_rs,
                'silhouette_score': sil_score
            })
            if verbose:
                print(f"→ Silhouette score: {sil_score:.4f}\n")

    # Explicit columns keep sort_values from raising KeyError when every
    # combination was skipped and `results` is empty.
    results_df = pd.DataFrame(results, columns=result_columns)
    results_df = results_df.sort_values(by='silhouette_score', ascending=False).reset_index(drop=True)
    return results_df



In [6]:
# Connection URL for the local SQL Server Express instance (pyodbc driver).
conn_str = "mssql+pyodbc://IVAN_PC\\SQLEXPRESS/TextMiningHA?driver=ODBC+Driver+17+for+SQL+Server"

# Candidate values per parameter. The search runs the full Cartesian product:
# 2 * 2 * 2 * 3 * 2 * 3 * 2 * 3 = 864 pipeline runs.
param_grid = {
    "max_features_list": [8000, 10000],
    "ngram_range_list": [(1, 1), (1, 2)],
    "min_df_list": [90, 100],
    "n_components_list": [35, 40, 50],
    "svd_random_state_list": [40, 50],
    "n_clusters_list": [5, 6, 7],
    "kmeans_n_init_list": [50, 100],
    "kmeans_random_state_list": [80, 90, 100],
}

grid_results = clustering_grid_search(connection_string=conn_str, **param_grid)

# Show the score-sorted results table as the cell output.
display(grid_results)

Testing: max_features=8000, ngram_range=(1, 1), min_df=90, n_components=35, svd_random_state=40, n_clusters=5, kmeans_n_init=50, kmeans_random_state=80
→ Silhouette score: 0.0969

Testing: max_features=8000, ngram_range=(1, 1), min_df=90, n_components=35, svd_random_state=40, n_clusters=5, kmeans_n_init=50, kmeans_random_state=90
→ Silhouette score: 0.0968

Testing: max_features=8000, ngram_range=(1, 1), min_df=90, n_components=35, svd_random_state=40, n_clusters=5, kmeans_n_init=50, kmeans_random_state=100
→ Silhouette score: 0.0970

Testing: max_features=8000, ngram_range=(1, 1), min_df=90, n_components=35, svd_random_state=40, n_clusters=5, kmeans_n_init=100, kmeans_random_state=80
→ Silhouette score: 0.0970

Testing: max_features=8000, ngram_range=(1, 1), min_df=90, n_components=35, svd_random_state=40, n_clusters=5, kmeans_n_init=100, kmeans_random_state=90
→ Silhouette score: 0.0968

Testing: max_features=8000, ngram_range=(1, 1), min_df=90, n_components=35, svd_random_state=40, 

KeyboardInterrupt: 

## Best Combination Found
| max_features | ngram_range | min_df | n_components | svd_random_state | n_clusters | kmeans_n_init | kmeans_random_state | silhouette_score |
|--------------|-------------|--------|---------------|------------------|------------|----------------|---------------------|------------------|
| 10000        | (1, 2)      | 100    | 40            | 40               | 5          | 50             | 90                  | 0.084666         |


Conclusion: the pipeline runs successfully and systematically explores 8 parameters, but with silhouette scores below 0.1 the data itself still does not lend itself well to clean KMeans partitions under TF–IDF + SVD.

