In [4]:
import itertools
import pandas as pd
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.decomposition import LatentDirichletAllocation
from sklearn.cluster import KMeans
from sklearn.metrics import silhouette_score
from sqlalchemy import create_engine
import ast

def full_lda_pipeline(connection_string,
                      table_name='songs',
                      text_column='cleanTokens',
                      n_topics=5,
                      n_clusters=5,
                      max_features=5000,
                      lda_random_state=42,
                      kmeans_random_state=42):
    """
    Load tokenized songs from SQL, fit LDA on bag-of-words counts and
    cluster the resulting topic vectors with KMeans.

    Parameters
    ----------
    connection_string : str
        SQLAlchemy URL passed to ``create_engine``.
    table_name : str
        Table to read from. Interpolated directly into the SQL text, so it
        must come from trusted code, never from user input.
    text_column : str
        Column holding the tokens. Interpolated into the SQL text like
        ``table_name``. Each value is expected to be either a string
        containing a Python literal (e.g. ``"['a', 'b']"``) or an
        already-parsed sequence of tokens.
    n_topics : int
        Number of LDA components.
    n_clusters : int
        Number of KMeans clusters fitted on the topic vectors.
    max_features : int
        Vocabulary size limit for ``CountVectorizer``.
    lda_random_state, kmeans_random_state : int
        Seeds for the LDA and KMeans estimators respectively.

    Returns
    -------
    (df, topic_vectors)
        ``df`` is the loaded frame with three added columns: ``clean_text``
        (tokens joined by spaces), ``assigned_topic`` (argmax over the topic
        vector) and ``kmeans_label``. ``topic_vectors`` is the LDA
        transform of the count matrix, one row per row of ``df``.

    Raises
    ------
    ValueError
        If the query returns no rows, or if a token string cannot be parsed.
    """
    def _to_tokens(value):
        # Only strings need parsing; values that are already sequences are
        # passed through so the pipeline works with either storage format.
        return ast.literal_eval(value) if isinstance(value, str) else value

    query = f"""
        SELECT song_id, name, {text_column}, cleanGenre
        FROM {table_name}
        WHERE {text_column} IS NOT NULL
    """

    # Load data. The engine is disposed even if the query fails, so repeated
    # calls (e.g. from a grid search) do not accumulate open connection pools.
    engine = create_engine(connection_string)
    try:
        df = pd.read_sql(query, engine)
    finally:
        engine.dispose()

    if df.empty:
        raise ValueError(
            f"No rows with non-null '{text_column}' found in table '{table_name}'."
        )

    df[text_column] = df[text_column].apply(_to_tokens)
    df['clean_text'] = df[text_column].apply(lambda tokens: ' '.join(map(str, tokens)))

    # Bag-of-Words
    vectorizer = CountVectorizer(max_features=max_features)
    X_counts = vectorizer.fit_transform(df['clean_text'])

    # LDA: fit and project the same matrix in one step
    lda = LatentDirichletAllocation(n_components=n_topics, random_state=lda_random_state)
    topic_vectors = lda.fit_transform(X_counts)
    df['assigned_topic'] = topic_vectors.argmax(axis=1)

    # KMeans in topic space
    kmeans = KMeans(n_clusters=n_clusters, random_state=kmeans_random_state)
    df['kmeans_label'] = kmeans.fit_predict(topic_vectors)

    return df, topic_vectors


In [5]:
def lda_grid_search(connection_string,
                    n_topics_list=[5, 6],
                    max_features_list=[5000, 10000],
                    k_clusters_list=[5, 6],
                    lda_random_state_list=[42],
                    kmeans_random_state_list=[42],
                    verbose=True):
    """
    Grid search for LDA + KMeans with separate random states.

    Every combination of the five parameter lists is run through
    ``full_lda_pipeline`` and scored with the silhouette score of the KMeans
    labels in topic space. A combination that raises an exception is
    reported and skipped rather than aborting the whole search.

    Parameters
    ----------
    connection_string : str
        Forwarded unchanged to ``full_lda_pipeline``.
    n_topics_list, max_features_list, k_clusters_list : iterable of int
        Candidate values for the number of LDA topics, the vocabulary size
        and the number of KMeans clusters. The default lists are only
        iterated, never mutated.
    lda_random_state_list, kmeans_random_state_list : iterable of int
        Candidate seeds for LDA and KMeans.
    verbose : bool
        If True, print each combination and its score.

    Returns
    -------
    pandas.DataFrame
        One row per successfully evaluated combination, sorted by
        ``silhouette_score`` in descending order. If no combination
        succeeds (or a parameter list is empty) an empty frame with the
        same columns is returned.
    """
    result_columns = ['n_topics', 'max_features', 'k_clusters',
                      'lda_random_state', 'kmeans_random_state',
                      'silhouette_score']
    results = []

    for n_topics, max_features, k_clusters, lda_rs, km_rs in itertools.product(
            n_topics_list, max_features_list, k_clusters_list,
            lda_random_state_list, kmeans_random_state_list):

        if verbose:
            print(f"Testing: n_topics={n_topics}, max_features={max_features}, "
                  f"k_clusters={k_clusters}, lda_random_state={lda_rs}, kmeans_random_state={km_rs}")

        try:
            df, topic_vectors = full_lda_pipeline(
                connection_string,
                n_topics=n_topics,
                n_clusters=k_clusters,
                max_features=max_features,
                lda_random_state=lda_rs,
                kmeans_random_state=km_rs
            )

            labels = df['kmeans_label']
            sil_score = silhouette_score(topic_vectors, labels)

            results.append({
                'n_topics': n_topics,
                'max_features': max_features,
                'k_clusters': k_clusters,
                'lda_random_state': lda_rs,
                'kmeans_random_state': km_rs,
                'silhouette_score': sil_score
            })

            if verbose:
                print(f"→ Silhouette score: {sil_score:.3f}\n")

        except Exception as e:
            # Deliberate best-effort: one bad combination must not end the search.
            print(f"Skipped combination due to error: {e}")

    # Explicit columns keep the frame well-formed when `results` is empty;
    # without them sort_values would raise KeyError on the missing column.
    results_df = pd.DataFrame(results, columns=result_columns)
    return results_df.sort_values(by='silhouette_score', ascending=False).reset_index(drop=True)


In [6]:

# Example usage: run the LDA + KMeans grid search against the local SQL Server
# database and show the ranked results.
conn_str = "mssql+pyodbc://IVAN_PC\\SQLEXPRESS/TextMiningHA?driver=ODBC+Driver+17+for+SQL+Server"

# 3 topic counts x 1 vocabulary size x 2 cluster counts x 2 LDA seeds x 3 KMeans seeds
# = 36 combinations, each of which reloads the data and refits the models.
search_space = dict(
    n_topics_list=[5, 6, 7],
    max_features_list=[10000],
    k_clusters_list=[5, 6],
    lda_random_state_list=[42, 100],
    kmeans_random_state_list=[20, 30, 90],
)

grid_results = lda_grid_search(connection_string=conn_str, **search_space)

display(grid_results)


Testing: n_topics=5, max_features=10000, k_clusters=5, lda_random_state=42, kmeans_random_state=20
→ Silhouette score: 0.629

Testing: n_topics=5, max_features=10000, k_clusters=5, lda_random_state=42, kmeans_random_state=30


KeyboardInterrupt: 

## **LDA + KMeans Results**

To further improve clustering quality, Latent Dirichlet Allocation (LDA) was combined with KMeans clustering.
This approach learns latent topics from the corpus and then groups the documents based on their topic distributions.
## Best Combination Found
| n_topics | max_features | k_clusters | lda_random_state | kmeans_random_state | silhouette_score |
|----------|--------------|------------|------------------|----------------------|------------------|
| 6        | 10000        | 6          | 42               | 30                   | 0.795476         |

**Discussion**
- The best silhouette score (≈ 0.80, see the table above) is significantly higher than the scores obtained from:
  - KMeans with TF–IDF + SVD (~0.085)
  - Hierarchical clustering (~0.150)
- This indicates that LDA topics provide a much more meaningful representation of the text than raw TF–IDF features reduced with SVD.
- With 6 latent topics and 6 final clusters, the structure of the dataset is captured far more effectively.
- The results are satisfactory and demonstrate that probabilistic topic modeling can uncover coherent groups in the music data that traditional vectorization methods failed to separate.




## Using Multithreading / Multiprocessing

During my assignment process, I ran three notebooks simultaneously, each performing a grid search for a different clustering method (KMeans, Hierarchical, LDA).
As I added more parameters, the time required increased significantly.

Implementing parallel processing could make the experiments more time-efficient:
- Each parameter combination is independent, so multiple evaluations could run at the same time.
- By utilizing multiple CPU cores, running the combinations in parallel would reduce the total runtime.

Conclusion: Since I have a **2-core CPU**, I could not fully implement and evaluate multithreading/multiprocessing.
On a machine with more cores, parallelization could have greatly sped up my grid searches and reduced the overall experimentation time.
