## Step 1: Load the dataset and inspect first rows


In [5]:
import pandas as pd

df = pd.read_csv("feedback_data.csv")

print(df.head(6))

  feedback_id                                               text  \
0        F001  "Steering wheel leather peeling after 6 months...   
1        F002  "Leather on Steering Wheel has discolored & fa...   
2        F003  "steering wheel has deep scratches n cracks on...   
3        F004  "Steering wheel buttons not responsive intermi...   

        component    source  
0  Steering Wheel  Customer  
1  Steering Wheel  Customer  
2  Steering Wheel  Customer  
3  Steering Wheel  Workshop  
4      ABS System  Workshop  
5      ABS System  Customer  


## Step 2: Preprocess and clean feedback text


In [6]:
import re

# Remove surrounding quotes and clean text
def clean_text(text):
    text = str(text)
    text = text.strip('"').strip("'")
    text = text.lower()
    text = re.sub(r'[^a-z0-9\s]', '', text)
    text = re.sub(r'\s+', ' ', text).strip()
    return text

df['cleaned_text'] = df['text'].apply(clean_text)

df[['feedback_id', 'text', 'cleaned_text']].head(10)


Unnamed: 0,feedback_id,text,cleaned_text
0,F001,"""Steering wheel leather peeling after 6 months...",steering wheel leather peeling after 6 months ...
1,F002,"""Leather on Steering Wheel has discolored & fa...",leather on steering wheel has discolored faded
2,F003,"""steering wheel has deep scratches n cracks on...",steering wheel has deep scratches n cracks on ...
3,F004,"""Steering wheel buttons not responsive intermi...",steering wheel buttons not responsive intermit...
4,F005,"""ABS Sensor FAILURE triggered warning light on...",abs sensor failure triggered warning light on ...
5,F006,"""ABS warning lght stays on intermittently""",abs warning lght stays on intermittently
6,F007,"""Brake pedal is unusually stiff & hard to press""",brake pedal is unusually stiff hard to press
7,F008,"""Brake pads wear out faster than expected??""",brake pads wear out faster than expected
8,F009,"""Brake caliper making grinding noise!!!""",brake caliper making grinding noise
9,F010,"""Seatbelt fraying near the buckle after 3months""",seatbelt fraying near the buckle after 3months


In [9]:
df.head(10)


Unnamed: 0,feedback_id,text,component,source,cleaned_text
0,F001,"""Steering wheel leather peeling after 6 months...",Steering Wheel,Customer,steering wheel leather peeling after 6 months ...
1,F002,"""Leather on Steering Wheel has discolored & fa...",Steering Wheel,Customer,leather on steering wheel has discolored faded
2,F003,"""steering wheel has deep scratches n cracks on...",Steering Wheel,Customer,steering wheel has deep scratches n cracks on ...
3,F004,"""Steering wheel buttons not responsive intermi...",Steering Wheel,Workshop,steering wheel buttons not responsive intermit...
4,F005,"""ABS Sensor FAILURE triggered warning light on...",ABS System,Workshop,abs sensor failure triggered warning light on ...
5,F006,"""ABS warning lght stays on intermittently""",ABS System,Customer,abs warning lght stays on intermittently
6,F007,"""Brake pedal is unusually stiff & hard to press""",Brake System,Customer,brake pedal is unusually stiff hard to press
7,F008,"""Brake pads wear out faster than expected??""",Brake System,Workshop,brake pads wear out faster than expected
8,F009,"""Brake caliper making grinding noise!!!""",Brake System,Customer,brake caliper making grinding noise
9,F010,"""Seatbelt fraying near the buckle after 3months""",Seatbelt,Workshop,seatbelt fraying near the buckle after 3months


## Step 3: Generate embeddings using Sentence-BERT (all-MiniLM-L6-v2)


In [11]:
import pandas as pd
from sentence_transformers import SentenceTransformer
import numpy as np

model = SentenceTransformer('all-MiniLM-L6-v2')  # Fast and good for clustering

texts = df['cleaned_text'].tolist()
embeddings = model.encode(texts, show_progress_bar=True)

print(f"Number of feedbacks: {len(embeddings)}")
print(f"Embedding dimension: {embeddings[0].shape}")

np.save("feedback_embeddings.npy", embeddings)

df['embedding'] = embeddings.tolist()

df.head()


  from .autonotebook import tqdm as notebook_tqdm
Batches: 100%|██████████| 1/1 [00:05<00:00,  5.54s/it]

Number of feedbacks: 24
Embedding dimension: (384,)





Unnamed: 0,feedback_id,text,component,source,cleaned_text,embedding
0,F001,"""Steering wheel leather peeling after 6 months...",Steering Wheel,Customer,steering wheel leather peeling after 6 months ...,"[-0.014327894896268845, -0.05661596357822418, ..."
1,F002,"""Leather on Steering Wheel has discolored & fa...",Steering Wheel,Customer,leather on steering wheel has discolored faded,"[0.008238011039793491, 0.03961119055747986, 0...."
2,F003,"""steering wheel has deep scratches n cracks on...",Steering Wheel,Customer,steering wheel has deep scratches n cracks on ...,"[-0.050107747316360474, -0.06628596782684326, ..."
3,F004,"""Steering wheel buttons not responsive intermi...",Steering Wheel,Workshop,steering wheel buttons not responsive intermit...,"[0.049614306539297104, -0.0813332200050354, 0...."
4,F005,"""ABS Sensor FAILURE triggered warning light on...",ABS System,Workshop,abs sensor failure triggered warning light on ...,"[0.002642096718773246, -0.012814835645258427, ..."


## Step 4: Cluster feedback texts using HDBSCAN

I use **HDBSCAN** because:  
- Works without predefining number of clusters.  
- Handles noisy/unstructured data (labels outliers as `-1`).  
- Finds clusters of varying shapes and densities.  

This makes it ideal for messy automotive feedback compared to K-Means or DBSCAN.


In [13]:
import hdbscan
import pandas as pd
import numpy as np


clusterer = hdbscan.HDBSCAN(min_cluster_size=2, metric='euclidean')  # min_cluster_size=2 avoids very small clusters

cluster_labels = clusterer.fit_predict(embeddings)

df['cluster_id'] = cluster_labels

print(df[['feedback_id', 'cleaned_text', 'cluster_id']].sort_values('cluster_id'))


   feedback_id                                       cleaned_text  cluster_id
17        F018           windshield wipers leave streaks on glass          -1
15        F016    air conditioning blows warm air instead of cold          -1
14        F015               check engine light comes on randomly          -1
3         F004  steering wheel buttons not responsive intermit...          -1
13        F014                      coolant leakage from radiator          -1
12        F013     engine overheating after 10 minutes of driving          -1
18        F019               wiper motor is making grinding sound           0
8         F009                brake caliper making grinding noise           0
16        F017                    ac compressor making loud noise           0
7         F008           brake pads wear out faster than expected           0
6         F007       brake pedal is unusually stiff hard to press           0
22        F023                      front left door handle broke

## Step 5: Inspect clustering results
- Count how many clusters were detected (excluding noise).  
- View example feedback texts from a selected cluster.  


In [14]:
n_clusters = len(set(cluster_labels)) - (1 if -1 in cluster_labels else 0)
print(f"Number of clusters detected: {n_clusters}")

print(df[df['cluster_id'] == 3]['cleaned_text'].tolist())


Number of clusters detected: 4
['seatbelt fraying near the buckle after 3months', 'seatbelt stitching coming off near anchor point', 'seatbelt retracts slowly gets stuck']


In [15]:
df.head()

Unnamed: 0,feedback_id,text,component,source,cleaned_text,embedding,cluster_id
0,F001,"""Steering wheel leather peeling after 6 months...",Steering Wheel,Customer,steering wheel leather peeling after 6 months ...,"[-0.014327894896268845, -0.05661596357822418, ...",2
1,F002,"""Leather on Steering Wheel has discolored & fa...",Steering Wheel,Customer,leather on steering wheel has discolored faded,"[0.008238011039793491, 0.03961119055747986, 0....",2
2,F003,"""steering wheel has deep scratches n cracks on...",Steering Wheel,Customer,steering wheel has deep scratches n cracks on ...,"[-0.050107747316360474, -0.06628596782684326, ...",2
3,F004,"""Steering wheel buttons not responsive intermi...",Steering Wheel,Workshop,steering wheel buttons not responsive intermit...,"[0.049614306539297104, -0.0813332200050354, 0....",-1
4,F005,"""ABS Sensor FAILURE triggered warning light on...",ABS System,Workshop,abs sensor failure triggered warning light on ...,"[0.002642096718773246, -0.012814835645258427, ...",-1


## Step 6: Label clusters automatically using KeyBERT
I used **KeyBERT** to extract representative keywords/keyphrases for each cluster.  
This gives short, human-readable labels without manual annotation.


In [16]:
!pip install keybert

from keybert import KeyBERT

kw_model = KeyBERT(model='all-MiniLM-L6-v2')  # same embedding model as before

cluster_labels_dict = {}

for cluster_id in set(df['cluster_id']):
    if cluster_id == -1:
        continue  # skip noise

    cluster_texts = df[df['cluster_id'] == cluster_id]['text'].tolist()
    combined_text = " ".join(cluster_texts)

    # Extract top keyphrase
    keywords = kw_model.extract_keywords(combined_text, keyphrase_ngram_range=(1, 2), stop_words='english', top_n=1)

    # Store cluster label
    cluster_labels_dict[cluster_id] = keywords[0][0] if keywords else f"Cluster {cluster_id}"

# Add cluster_label column to DataFrame
df['cluster_label_keybert'] = df['cluster_id'].map(lambda x: cluster_labels_dict.get(x, 'Noise'))

print(df[['feedback_id', 'text', 'cluster_id', 'cluster_label_keybert']].sort_values('cluster_id'))


huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


Defaulting to user installation because normal site-packages is not writeable
Collecting keybert
  Downloading keybert-0.9.0-py3-none-any.whl.metadata (15 kB)
Collecting rich>=10.4.0 (from keybert)
  Downloading rich-14.2.0-py3-none-any.whl.metadata (18 kB)
Collecting markdown-it-py>=2.2.0 (from rich>=10.4.0->keybert)
  Downloading markdown_it_py-3.0.0-py3-none-any.whl.metadata (6.9 kB)
Collecting mdurl~=0.1 (from markdown-it-py>=2.2.0->rich>=10.4.0->keybert)
  Downloading mdurl-0.1.2-py3-none-any.whl.metadata (1.6 kB)
Downloading keybert-0.9.0-py3-none-any.whl (41 kB)
Downloading rich-14.2.0-py3-none-any.whl (243 kB)
Downloading markdown_it_py-3.0.0-py3-none-any.whl (87 kB)
Downloading mdurl-0.1.2-py3-none-any.whl (10.0 kB)
Installing collected packages: mdurl, markdown-it-py, rich, keybert
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m4/4[0m [keybert]m2/4[0m [rich]
[1A[2KSuccessfully installed keybert-0.9.0 markdown-it-py-3.0.0 mdurl-0.1.2 rich-14.2.0
   feedback_

## Step 7: Label clusters automatically using a summarization model (BART)
I used **facebook/bart-large-cnn** to summarize all feedback texts in a cluster.  
This generates a short, descriptive label (e.g., *"Steering wheel leather issues"*).  


In [17]:

from transformers import pipeline

# Load summarization pipeline
summarizer = pipeline("summarization", model="facebook/bart-large-cnn")

cluster_labels_dict = {}

for cluster_id in set(df['cluster_id']):
    if cluster_id == -1:
        continue  # skip noise

    # Get all texts in this cluster
    cluster_texts = df[df['cluster_id'] == cluster_id]['cleaned_text'].tolist()

    # Combine texts into one string
    combined_text = " ".join(cluster_texts)

    # Run summarization (limit input length)
    summary = summarizer(combined_text[:1000], max_length=20, min_length=5, do_sample=False)[0]['summary_text']

    # Save cluster label
    cluster_labels_dict[cluster_id] = summary

# Map cluster IDs to labels
df['cluster_label_transfomer'] = df['cluster_id'].map(lambda x: cluster_labels_dict.get(x, 'Noise'))

print(df[['feedback_id', 'cleaned_text', 'cluster_id', 'cluster_label_transfomer']].sort_values('cluster_id'))


Device set to use mps:0


   feedback_id                                       cleaned_text  cluster_id  \
17        F018           windshield wipers leave streaks on glass          -1   
15        F016    air conditioning blows warm air instead of cold          -1   
14        F015               check engine light comes on randomly          -1   
3         F004  steering wheel buttons not responsive intermit...          -1   
13        F014                      coolant leakage from radiator          -1   
12        F013     engine overheating after 10 minutes of driving          -1   
18        F019               wiper motor is making grinding sound           0   
8         F009                brake caliper making grinding noise           0   
16        F017                    ac compressor making loud noise           0   
7         F008           brake pads wear out faster than expected           0   
6         F007       brake pedal is unusually stiff hard to press           0   
22        F023              

In [18]:
df.head()

Unnamed: 0,feedback_id,text,component,source,cleaned_text,embedding,cluster_id,cluster_label_keybert,cluster_label_transfomer
0,F001,"""Steering wheel leather peeling after 6 months...",Steering Wheel,Customer,steering wheel leather peeling after 6 months ...,"[-0.014327894896268845, -0.05661596357822418, ...",2,wheel discolored,leather peeling after 6 months of use. faded ...
1,F002,"""Leather on Steering Wheel has discolored & fa...",Steering Wheel,Customer,leather on steering wheel has discolored faded,"[0.008238011039793491, 0.03961119055747986, 0....",2,wheel discolored,leather peeling after 6 months of use. faded ...
2,F003,"""steering wheel has deep scratches n cracks on...",Steering Wheel,Customer,steering wheel has deep scratches n cracks on ...,"[-0.050107747316360474, -0.06628596782684326, ...",2,wheel discolored,leather peeling after 6 months of use. faded ...
3,F004,"""Steering wheel buttons not responsive intermi...",Steering Wheel,Workshop,steering wheel buttons not responsive intermit...,"[0.049614306539297104, -0.0813332200050354, 0....",-1,Noise,Noise
4,F005,"""ABS Sensor FAILURE triggered warning light on...",ABS System,Workshop,abs sensor failure triggered warning light on ...,"[0.002642096718773246, -0.012814835645258427, ...",-1,Noise,Noise


## Step 8: Label clusters using W&G-BERT (NER)
I used **W&G-BERT**, a model specific to the automotive domain,  
to extract entities (failure location & type) and form structured cluster labels.


In [19]:
from transformers import AutoTokenizer, AutoModel
import torch

model_name = "lukasweber/WG_BERT"
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModel.from_pretrained(model_name)

print("W&G-BERT loaded successfully")


Some weights of BertModel were not initialized from the model checkpoint at lukasweber/WG_BERT and are newly initialized: ['pooler.dense.bias', 'pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


W&G-BERT loaded successfully


In [20]:
from transformers import AutoModelForTokenClassification, pipeline

# Load W&G-BERT for NER
ner_model = AutoModelForTokenClassification.from_pretrained("lukasweber/WG_BERT")
ner_pipeline = pipeline("ner", model=ner_model, tokenizer=tokenizer, aggregation_strategy="simple")

cluster_labels = {}

for cluster_id in set(df['cluster_id']):
    if cluster_id == -1:
        continue  # skip noise

    cluster_texts = df[df['cluster_id'] == cluster_id]['cleaned_text'].tolist()

    entities = []
    for txt in cluster_texts[:5]:
        result = ner_pipeline(txt)
        for ent in result:
            entities.append(ent['word'])

    # Pick most common entities
    if entities:
        label = ", ".join(list(set(entities)))
    else:
        label = f"Cluster {cluster_id}"

    cluster_labels[cluster_id] = label

# Assign labels to DataFrame
df['cluster_label_WG_Bert'] = df['cluster_id'].map(lambda x: cluster_labels.get(x, "Noise"))

print(df[['feedback_id', 'cleaned_text', 'cluster_id', 'cluster_label_WG_Bert']].head())


Device set to use mps:0


  feedback_id                                       cleaned_text  cluster_id  \
0        F001  steering wheel leather peeling after 6 months ...           2   
1        F002     leather on steering wheel has discolored faded           2   
2        F003  steering wheel has deep scratches n cracks on ...           2   
3        F004  steering wheel buttons not responsive intermit...          -1   

                               cluster_label_WG_Bert  
0  disco, cracks, peeling, leather, steering whee...  
1  disco, cracks, peeling, leather, steering whee...  
2  disco, cracks, peeling, leather, steering whee...  
3                                              Noise  
4                                              Noise  


In [21]:
df.head(10)

Unnamed: 0,feedback_id,text,component,source,cleaned_text,embedding,cluster_id,cluster_label_keybert,cluster_label_transfomer,cluster_label_WG_Bert
0,F001,"""Steering wheel leather peeling after 6 months...",Steering Wheel,Customer,steering wheel leather peeling after 6 months ...,"[-0.014327894896268845, -0.05661596357822418, ...",2,wheel discolored,leather peeling after 6 months of use. faded ...,"disco, cracks, peeling, leather, steering whee..."
1,F002,"""Leather on Steering Wheel has discolored & fa...",Steering Wheel,Customer,leather on steering wheel has discolored faded,"[0.008238011039793491, 0.03961119055747986, 0....",2,wheel discolored,leather peeling after 6 months of use. faded ...,"disco, cracks, peeling, leather, steering whee..."
2,F003,"""steering wheel has deep scratches n cracks on...",Steering Wheel,Customer,steering wheel has deep scratches n cracks on ...,"[-0.050107747316360474, -0.06628596782684326, ...",2,wheel discolored,leather peeling after 6 months of use. faded ...,"disco, cracks, peeling, leather, steering whee..."
3,F004,"""Steering wheel buttons not responsive intermi...",Steering Wheel,Workshop,steering wheel buttons not responsive intermit...,"[0.049614306539297104, -0.0813332200050354, 0....",-1,Noise,Noise,Noise
4,F005,"""ABS Sensor FAILURE triggered warning light on...",ABS System,Workshop,abs sensor failure triggered warning light on ...,"[0.002642096718773246, -0.012814835645258427, ...",-1,Noise,Noise,Noise
5,F006,"""ABS warning lght stays on intermittently""",ABS System,Customer,abs warning lght stays on intermittently,"[0.0338529609143734, -0.058870285749435425, -0...",-1,Noise,Noise,Noise
6,F007,"""Brake pedal is unusually stiff & hard to press""",Brake System,Customer,brake pedal is unusually stiff hard to press,"[-0.02062085084617138, -0.13128358125686646, -...",0,brake pads,brake pedal is unusually stiff hard to press ...,"sound, brake pedal, stiff hard to press, ac co..."
7,F008,"""Brake pads wear out faster than expected??""",Brake System,Workshop,brake pads wear out faster than expected,"[0.02139277011156082, -0.07071702182292938, -0...",0,brake pads,brake pedal is unusually stiff hard to press ...,"sound, brake pedal, stiff hard to press, ac co..."
8,F009,"""Brake caliper making grinding noise!!!""",Brake System,Customer,brake caliper making grinding noise,"[-0.024891823530197144, -0.12650926411151886, ...",0,brake pads,brake pedal is unusually stiff hard to press ...,"sound, brake pedal, stiff hard to press, ac co..."
9,F010,"""Seatbelt fraying near the buckle after 3months""",Seatbelt,Workshop,seatbelt fraying near the buckle after 3months,"[-0.015528365038335323, 0.009660453535616398, ...",3,seatbelt fraying,Seatbelt stitching coming off near anchor poin...,"##cts, fray, buckle, ##tra, ##belt stitching, ..."


## Step 9: Save clustered feedback results to CSV

In [22]:
df.to_csv("clustered_feedback_result.csv", index=False)

print("✅ Clustered feedback saved to clustered_feedback_result.csv")


✅ Clustered feedback saved to clustered_feedback_result.csv
