In [22]:
import pandas as pd

# breinigte Daten aus Schritt 1
csv_path = r"C:\projct nlp 311\data\text_clean.csv"

df_clean = pd.read_csv(csv_path, encoding="utf-8")
texts = df_clean["text_clean"].fillna("").astype(str).tolist()

# Testausgabe, kurzer Einblick

print("Anzahl Texte:", len(texts))
print("Beispiel 1:", texts[0] if len(texts) > 0 else "<leer>")
print("Beispiel 2:", texts[1] if len(texts) > 1 else "<leer>")
print("Beispiel 3:", texts[2] if len(texts) > 2 else "<leer>")


Anzahl Texte: 364558
Beispiel 1: loud music party
Beispiel 2: access
Beispiel 3: access


In [23]:
from sklearn.feature_extraction.text import TfidfVectorizer

#Parameter
vec = TfidfVectorizer(min_df=5, max_df=0.9, max_features=10000)

X_tfidf = vec.fit_transform(texts)   # Matrix Dokumente und Terme
terms = vec.get_feature_names_out()

# Testausgabe, kurzer Einblick

print("TF-IDF-Form:", X_tfidf.shape)       
print("Anzahl Terme:", len(terms))
print("Beispiel-Terme:", list(terms[:15]))


TF-IDF-Form: (364558, 68)
Anzahl Terme: 68
Beispiel-Terme: ['access', 'area', 'banging', 'blocked', 'blocking', 'building', 'bus', 'car', 'chained', 'chronic', 'commercial', 'complaint', 'congestion', 'detached', 'details']


In [24]:
from sklearn.decomposition import TruncatedSVD
import numpy as np

# Reduzieren auf wenige Dimensionen/Themen

lsa = TruncatedSVD(n_components=10, random_state=42)
X_lsa = lsa.fit_transform(X_tfidf)

print("LSA-Form:", X_lsa.shape)

# Begriffe mit höchstem Gewicht je Thema anzeigen

terms = vec.get_feature_names_out()
n_top_terms = 8

for i, comp in enumerate(lsa.components_):
    top_indices = np.argsort(comp)[::-1][:n_top_terms]
    top_terms = [terms[j] for j in top_indices]
    print(f"\n🟢 Thema {i+1}: {', '.join(top_terms)}")


LSA-Form: (364558, 10)

🟢 Thema 1: access, partial, language, complaint, details, parking, layover, unauthorized

🟢 Thema 2: loud, party, music, talking, car, truck, horn, television

🟢 Thema 3: parking, posted, sign, violation, overnight, commercial, storage, route

🟢 Thema 4: blocked, hydrant, sidewalk, route, parking, unauthorized, layover, bus

🟢 Thema 5: plate, license, route, parking, unauthorized, layover, bus, horn

🟢 Thema 6: talking, loud, parking, sign, posted, commercial, overnight, television

🟢 Thema 7: partial, storage, unlicensed, chained, traffic, sign, posted, area

🟢 Thema 8: overnight, commercial, parking, storage, party, loud, television, unlicensed

🟢 Thema 9: car, truck, horn, talking, music, overnight, commercial, route

🟢 Thema 10: blocking, parked, double, traffic, vehicle, storage, unlicensed, complaint


In [25]:
from sklearn.cluster import KMeans

# K-Means Cluster
k = 10 
kmeans = KMeans(n_clusters=k, random_state=42, n_init=10)
clusters = kmeans.fit_predict(X_lsa)

# jedem Text das Cluster zuweisen

df_clean["cluster"] = clusters

# kurze Übersicht

print(df_clean["cluster"].value_counts().sort_index())

# Begriffe, je Cluster

for i in range(k):
    cluster_indices = np.where(clusters == i)[0]
    cluster_texts = [texts[j] for j in cluster_indices]
    joined = " ".join(cluster_texts)
    tfidf_sub = vec.transform([joined])
    top_terms = [terms[t] for t in np.argsort(tfidf_sub.toarray()[0])[::-1][:10]]
    print(f"\n🔹 Cluster {i}: {', '.join(top_terms)}")


cluster
0    33849
1    75888
2    69708
3    27200
4    53633
5    23810
6    24993
7    21661
8    17099
9    16717
Name: count, dtype: int64

🔹 Cluster 0: blocked, hydrant, sidewalk, language, est, gridlock, horn, hours, idling, layover

🔹 Cluster 1: access, language, est, gridlock, horn, hours, hydrant, idling, layover, vehicle

🔹 Cluster 2: party, music, loud, violation, idling, est, gridlock, horn, hours, hydrant

🔹 Cluster 3: posted, sign, violation, parking, music, loud, licensed, license, layover, language

🔹 Cluster 4: parked, double, blocking, traffic, vehicle, idling, engine, neglected, pounding, banging

🔹 Cluster 5: talking, loud, violation, idling, est, gridlock, horn, hours, hydrant, language

🔹 Cluster 6: partial, access, language, est, gridlock, horn, hours, hydrant, idling, layover

🔹 Cluster 7: plate, license, violation, idling, est, gridlock, horn, hours, hydrant, layover

🔹 Cluster 8: overnight, commercial, parking, storage, violation, idling, gridlock, horn, hour

In [26]:
# Ergebnisse speichern 

output_path = "../data/311_clustered_sample.csv"
df_clean[["text_clean", "cluster"]].to_csv(output_path, index=False)
print("Ergebnisse gespeichert unter:", output_path)



Ergebnisse gespeichert unter: ../data/311_clustered_sample.csv
