In [0]:
from pyspark.ml.evaluation import ClusteringEvaluator
from pyspark.ml.feature import Tokenizer, HashingTF
from pyspark.ml.clustering import KMeans

# Silhouette sweep: fit KMeans on hashed term-frequency vectors of the
# use-case text for k = 2..15 and tabulate the evaluator score for each k.

# Load the data and select the relevant column, dropping nulls
df = (
    spark.table("users.luis_herrera.pipelineiq_view")
    .select("business_usecase_type")
    .na.drop()
)

# Tokenize the business use case type into words
tokenizer = Tokenizer(inputCol="business_usecase_type", outputCol="words")
words_data = tokenizer.transform(df)

# Vectorize the tokenized words using HashingTF (1000 hash buckets)
vectorizer = HashingTF(inputCol="words", outputCol="features", numFeatures=1000)
vectorized_data = vectorizer.transform(words_data)
# NOTE(review): vectorized_data is re-evaluated by every fit/transform below;
# consider persisting it if the compute environment supports caching.

# The evaluator does not depend on k, so build it once instead of per iteration.
# It reads the "features" column and the default "prediction" column.
evaluator = ClusteringEvaluator(featuresCol="features")

# Evaluate silhouette scores for k in range 2 to 15
scores = []
for k in range(2, 16):
    # Fit KMeans model for current k (fixed seed keeps runs comparable)
    kmeans = KMeans(featuresCol="features", k=k, seed=42)
    kmeans_model = kmeans.fit(vectorized_data)
    # Assign clusters to data
    clusters = kmeans_model.transform(vectorized_data)
    # Evaluate clustering using silhouette score
    score = evaluator.evaluate(clusters)
    scores.append((k, score))

# Create a DataFrame of silhouette scores for each k and display
scores_df = spark.createDataFrame(scores, ["k", "silhouette_score"])
display(scores_df)

In [0]:
from pyspark.ml.feature import Tokenizer, HashingTF
from pyspark.ml.clustering import KMeans
from pyspark.sql import functions as F
from pyspark.sql.window import Window

# Cluster the use-case text with KMeans (k=16), name each cluster after its
# most frequent tokens, and show per-row and per-cluster results.

# Load the data and select the relevant column, dropping nulls
df = spark.table("users.luis_herrera.pipelineiq_view").select("business_usecase_type").na.drop()

# Tokenize the business use case type into words
tokenizer = Tokenizer(inputCol="business_usecase_type", outputCol="words")
words_data = tokenizer.transform(df)

# Vectorize the tokenized words using HashingTF
vectorizer = HashingTF(inputCol="words", outputCol="features", numFeatures=1000)
vectorized_data = vectorizer.transform(words_data)

# Fit KMeans model with k=16 clusters
# NOTE(review): k=16 lies outside the 2-15 range scored in the silhouette
# sweep cell -- confirm this is the intended cluster count.
kmeans = KMeans(featuresCol="features", k=16, seed=42)
kmeans_model = kmeans.fit(vectorized_data)
# Assign clusters to data
clusters = kmeans_model.transform(vectorized_data)

# Rank words inside each cluster by frequency. The secondary sort on the word
# itself breaks ties deterministically, so reruns pick the same top words.
rank_window = Window.partitionBy("prediction").orderBy(F.desc("count"), F.asc("word"))

# Get top 5 words per cluster to use as category names
top_words = (
    clusters
    .select("prediction", F.explode("words").alias("word"))
    .groupBy("prediction", "word")
    .count()
    .withColumn("rank", F.row_number().over(rank_window))
    .filter(F.col("rank") <= 5)
    .groupBy("prediction")
    # collect_list gives no ordering guarantee after a shuffle, so collect
    # (rank, word) structs and sort them: the name then follows rank order.
    .agg(F.sort_array(F.collect_list(F.struct("rank", "word"))).alias("ranked_words"))
    # Selecting a field on an array of structs yields the array of that field.
    .withColumn("category_name", F.concat_ws(" ", F.col("ranked_words.word")))
    .select("prediction", "category_name")
)

# Count the number of use cases per cluster/category
category_counts = (
    clusters.groupBy("prediction")
    .count()
    .withColumnRenamed("count", "usecase_count")
)

# Join top words and counts for each cluster
top_words_with_counts = top_words.join(category_counts, on="prediction", how="left")

# Add category names and counts to the original clustered data
clusters_with_names = clusters.join(top_words_with_counts, on="prediction", how="left")

# Display the clustered data with category names
display(clusters_with_names.select("business_usecase_type", "prediction", "category_name"))

# Display the top words and use case counts for each cluster
display(top_words_with_counts.orderBy("prediction"))

In [0]:
# `category_names` is not assigned anywhere in the visible notebook, so this
# cell would raise NameError on a fresh kernel (Restart & Run All).
# NOTE(review): assuming it was meant to be the per-cluster name table built in
# the previous cell (`top_words`: prediction, category_name) -- confirm.
category_names = top_words.orderBy("prediction")
display(category_names)