In [46]:
__author__ =  'Julia Schmid'

# DAMI01 Data Mining
## Erstellung eines Data Mining Projektes unter der Berücksichtigung des CRISP-DM Ansatzes

In [47]:
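# Kommentarzeichen der folgenden Zeile entfernen, um die benötigten Pakete zu installieren
# (math und os gehören zur Standardbibliothek und müssen nicht installiert werden):
#%pip install numpy pandas matplotlib seaborn scikit-learn scipy kneed scikit-fuzzy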
# #X!X 

In [48]:
#BASE_DIR = os.path.dirname(os.path.abspath(""))
#INPUT_CSV_FILE = os.path.join(BASE_DIR, "03_Clustering_Marketing.csv")
#print("Local path to the dataset: %s" % INPUT_CSV_FILE)
#df = pd.read_csv(INPUT_CSV_FILE, sep=',')


### Einstellungen

In [49]:
# Imports
import pandas as pd
import numpy as np
import math
import matplotlib.pyplot as plt
import seaborn as sns
import os
from sklearn.preprocessing import MinMaxScaler
from sklearn.cluster import KMeans
from sklearn.metrics import silhouette_score
from scipy.cluster.hierarchy import dendrogram, linkage
from sklearn.preprocessing import StandardScaler
from kneed import KneeLocator
from sklearn.metrics import davies_bouldin_score
from scipy.cluster.hierarchy import dendrogram, linkage, fcluster
from sklearn.preprocessing import StandardScaler
from sklearn.neighbors import NearestNeighbors


In [50]:
# Alle Spalten eines Datensatzes anzeigen
pd.set_option('display.max_columns', None) 

### Daten einlesen
Datenquelle: Ullah, Z. (2024). Kaggle. Abgerufen am 14. Februar 2024 von https://www.kaggle.com/datasets/zabihullah18/students-social-network-profile-clustering

In [51]:
df = pd.read_csv("03_Clustering_Marketing.csv", sep=',')

### Daten Verstehen 

In [None]:
# Ausgabe der ersten 5 Zeilen
df.head()

In [None]:
# Ausgabe der Anzahl der Zeilen und Spalten
print(f'Anzahl Zeilen: {df.shape[0]}')
print(f'Anzahl Spalten: {df.shape[1]}')

In [None]:
# Ausgabe der Datensatz-Info
df.info()

In [None]:
# Ausgabe der Statistischenkennzahlen der numerischen Variablen
df.describe()

In [None]:
# Split the column names by dtype: columns stored as 'object' are treated as
# categorical, every other dtype as numerical.
is_object_col = df.dtypes == 'object'

# Numerical variables
numerical_var = df.columns[~is_object_col].tolist()
print(numerical_var)

# Categorical variables
categorical_var = df.columns[is_object_col].tolist()
print(categorical_var)

In [57]:
# # Kategorische Variablen plotten 
# for i in categorical_var:
#     plt.figure(figsize=(5, 3)) 
#     df[i].value_counts().plot(kind='bar')
#     plt.show()

In [58]:
# # Numerische Variablen plotten 
# for i in numerical_var:
#     plt.figure(figsize=(5, 3))
#     sns.histplot(data=df, x=i)  
#     plt.xlabel(i)  
#     plt.ylabel('Anzahl')        
#     plt.show()

### Daten aufbereiten

**NaN-Werte**

In [None]:
# Show the variables that contain NaN values together with their number of NaN entries
count_nan = df.isna().sum()
count_nan[count_nan > 0]

In [None]:
# Missing values of the variable 'gender' are filled with "U" (unknown).
# The unique values are printed first, i.e. before the fill is applied.
print(df['gender'].unique())
df['gender']= df['gender'].fillna('U')

In [None]:
# Missing values of 'age' are filled with the mean age of the matching graduation year
print(df['age'].unique())

# Coerce to numeric (unparseable entries become NaN) and round down to whole years
age_years = np.floor(pd.to_numeric(df['age'], errors='coerce'))
# Within each 'gradyear' group, replace NaN by that group's mean age; the final
# integer cast truncates the filled-in means
df['age'] = (
    age_years
    .groupby(df['gradyear'])
    .transform(lambda ages: ages.fillna(ages.mean()))
    .astype(int)
)

df['age'].describe()

**Duplikate**

In [None]:
df.duplicated().sum()

In [63]:
df = df.drop_duplicates()

**Bearbeitung der Ausreißer**

In [None]:
# Age outliers are removed with the IQR rule: only rows whose age lies strictly
# inside (Q1 - 1.5*IQR, Q3 + 1.5*IQR) are kept.
q1 = df['age'].quantile(0.25)
q3 = df['age'].quantile(0.75)
iqr = q3 - q1
lower_bound = q1 - 1.5 * iqr
upper_bound = q3 + 1.5 * iqr
# .copy() detaches the filtered rows from the previous frame, so the column
# assignments in later cells write to an independent DataFrame instead of a
# slice (avoids pandas' SettingWithCopyWarning).
df = df[(df['age'] > lower_bound) & (df['age'] < upper_bound)].copy()
df['age'].describe()

**Datentransformation**

In [None]:
# Convert 'gender' to a numeric dtype
# Female (F) = 1, Male (M) = 2, Unknown (U) = 3
gender_codes = {'F': 1, 'M': 2, 'U': 3}
# map() with a fallback to the original value is used instead of replace():
# replace() on an object column depends on an implicit downcast that newer
# pandas versions deprecate. Values that are not a key of gender_codes are
# passed through unchanged, so re-running the cell on already encoded data
# still works.
df['gender'] = df['gender'].map(lambda g: gender_codes.get(g, g)).astype(int)

In [None]:
df.info()

In [67]:
# Store the names of the frequency (word-mention) variables: every column from
# position 4 onward.
# NOTE(review): assumes the first four columns are the non-topic attributes — confirm against the CSV layout.
mentions = df.columns[4:].tolist()

**Skalierung**

In [68]:
# Rescale every column of df to the range [0, 1]; the scaled frame keeps df's
# column names and row index, df itself is left untouched.
scaler = MinMaxScaler(feature_range=(0, 1))
df_scaled = pd.DataFrame(scaler.fit_transform(df), columns=df.columns, index=df.index)


In [None]:
df_scaled.head()

In [None]:
df_scaled.shape

### Modellierung

**Bestimmung der Clusteranzahl**

1. Ellbogenmethode

In [None]:
# Fit K-Means for 1 to 40 clusters and record the within-cluster sum of squares
# (WCSS, the model's inertia_) of each fit
cluster_counts = range(1, 41)
wcss = []

for i in cluster_counts:
    kmeans = KMeans(n_clusters=i, random_state=42).fit(df_scaled)
    wcss.append(kmeans.inertia_)

# The knee of the decreasing, convex WCSS curve is taken as the cluster count
knee_locator = KneeLocator(cluster_counts, wcss, curve="convex", direction="decreasing")
best_n_clusters = knee_locator.knee

# Plot the WCSS for the different cluster counts
plt.plot(cluster_counts, wcss)
plt.title('Ellbogenmethode')
plt.xlabel('Anzahl an Clustern')
plt.ylabel('WCSS')
plt.show()

print(f"Die optimale Clusteranzahl ist {best_n_clusters}.")

2. Silhouetten-Score

In [None]:
# Silhouette score of K-Means for every cluster count from 2 to 39
k_values = list(range(2, 40))
silScores = []

for i in k_values:
    kmeans = KMeans(n_clusters=i, random_state=42)
    kmeans.fit(df_scaled)
    labels = kmeans.labels_
    score = silhouette_score(df_scaled, labels)
    silScores.append(score)

# Plot the silhouette score for the different cluster counts
plt.plot(k_values, silScores)
plt.title('Silhouetten Score')
plt.xlabel('Anzahl an Clustern')
plt.ylabel('Silhouetten Score')
plt.show()

# Best cluster count = maximum silhouette score, ignoring the first entry (k=2).
# The cluster count is looked up in the identically sliced k_values, so it
# always belongs to the selected score. The previous "index + 2" offset did
# not account for the dropped first entry and reported one cluster too few.
silScores_neu = silScores[1:]
best_n_clusters = k_values[1:][silScores_neu.index(max(silScores_neu))]
print(f"Die optimale Clusteranzahlr ist {best_n_clusters}.")

**Modelle**

In [73]:
df_dbIndex = pd.DataFrame(columns=["clusterName", "db_index"])

1. K-Means

In [74]:
def KMeans_Modell(numbCluster, randomState, dfScaled, dfUnscaled, nameVar, df_dbIndex):
    """Fit K-Means on the scaled data and record its labels and Davies-Bouldin index.

    Args:
        numbCluster: Number of clusters passed to KMeans as n_clusters.
        randomState: Seed passed to KMeans as random_state.
        dfScaled: Data the model is fitted on and the index is computed on.
        dfUnscaled: Frame that receives the cluster labels in column nameVar.
        nameVar: Name of the new label column and of the row added to df_dbIndex.
        df_dbIndex: Frame collecting one [name, Davies-Bouldin index] row per model.

    Returns:
        Tuple (dfUnscaled, df_dbIndex). Both are the objects that were passed
        in; they are modified in place.
    """
    kmeans = KMeans(n_clusters=numbCluster, random_state=randomState).fit(dfScaled)
    # labels_ already holds the assignment of the rows the model was fitted on,
    # so no separate predict() pass over the same data is needed.
    labels = kmeans.labels_
    dfUnscaled[nameVar] = labels
    # Append the score as a new last row of the result table
    df_dbIndex.loc[len(df_dbIndex)] = [nameVar, davies_bouldin_score(dfScaled, labels)]
    return dfUnscaled, df_dbIndex

In [75]:
df, df_dbIndex =  KMeans_Modell(6, 42, df_scaled, df, 'cluster_kmeans (k=6)', df_dbIndex)
df, df_dbIndex =  KMeans_Modell(7, 42, df_scaled, df, 'cluster_kmeans (k=7)', df_dbIndex)

2. Hierarchisches Clustering 

In [None]:
""" def hierarchy_Modell(numbCluster, nameVar, dfScaled, dfUnscaled, df_dbIndex):
    hierarch = linkage(dfScaled, method='complete')
    hierarch_label = fcluster(hierarch, numbCluster, criterion='maxclust')
    dfUnscaled[nameVar] = hierarch_label
    db_index_value = davies_bouldin_score(dfScaled, hierarch_label)
    df_dbIndex.loc[len(df_dbIndex)] = [nameVar, db_index_value]
    return dfUnscaled, df_dbIndex """

In [None]:
""" df, df_dbIndex = hierarchy_Modell(6, 'cluster_hierarch', df_scaled, df, df_dbIndex) """

3. DBSCAN

Bestimmung der optimalen Parameter eps und MinPts

In [78]:
# MinPts≥D+1 oder MinPts=2×D
# eps = ellbogen Methode (am stärksten zunimmt) --> k = MinPts−1

In [None]:
# Dimension des Datensatzes
D = df_scaled.shape[1]
print(f"Die Dimension des Datensatz ist {D}.")

In [None]:
minPts_var = D+1 
minPts_var

In [None]:
# k-distance curve for choosing eps, with k = MinPts - 1. k is derived from
# minPts_var instead of the hard-coded "41 - 1", so it stays consistent when
# the number of columns changes.
k = minPts_var - 1
neighbors = NearestNeighbors(n_neighbors=k)
neighbors_fit = neighbors.fit(df_scaled)
distances, indices = neighbors_fit.kneighbors(df_scaled)
distances = np.sort(distances, axis=0)

distances = distances[:, k-1]  # last returned neighbour distance, in ascending order
plt.figure(figsize=(5, 3))
plt.plot(distances)
plt.show()

# eps is the distance at the knee of the increasing, convex k-distance curve
knee_locator = KneeLocator(range(1, len(distances) + 1), distances, curve="convex", direction="increasing")
optimal_k = knee_locator.knee
if optimal_k is None:
    # Fail with a clear message instead of a TypeError on "None - 1"
    raise ValueError("KneeLocator found no knee in the k-distance curve; eps cannot be derived.")
epsilon_var = distances[optimal_k - 1]
print(epsilon_var)

In [None]:
""" from sklearn.cluster import DBSCAN

dbscan = DBSCAN(eps=0.45, min_samples=(2*D))
dbscan.fit(df_scaled)

df['cluster_dbscans'] = dbscan.labels_

temp = dbscan.labels_
db_index_value = davies_bouldin_score(df_scaled, temp)
df_dbIndex.loc[len(df_dbIndex)] = ['cluster_dbscans', db_index_value] """

In [83]:
from sklearn.cluster import DBSCAN

# Fit DBSCAN on the scaled data with the eps / MinPts values determined in the cells above
dbscan = DBSCAN(eps=epsilon_var, min_samples=(minPts_var))
dbscan.fit(df_scaled)

# Store the resulting label of every row in the unscaled frame
df['cluster_dbscans'] = dbscan.labels_

# Davies-Bouldin index of the labelling, appended as a new row of the result table.
# NOTE(review): the labels are passed on unchanged; DBSCAN presumably marks noise
# points with -1, which would then be scored like a regular cluster — confirm this is intended.
temp = dbscan.labels_
db_index_value = davies_bouldin_score(df_scaled, temp)
df_dbIndex.loc[len(df_dbIndex)] = ['cluster_dbscans', db_index_value]

4. Fuzzy C-Means Clustering

In [84]:
import numpy as np
import skfuzzy as fuzz

n_clusters = 6

# Fuzzy C-Means clustering on the transposed scaled data.
# seed=42 fixes the random initial partition so that the cluster assignment
# and the Davies-Bouldin index are reproducible across runs.
cntr, u, u0, d, jm, p, fpc = fuzz.cluster.cmeans(
    df_scaled.T, c=n_clusters, m=2.0, error=0.005, maxiter=1000, init=None, seed=42
)

# Hard assignment: each row gets the cluster with its highest membership degree
temp = np.argmax(u, axis=0)
df["cluster_fuzzy (k=6)"] = temp

# Davies-Bouldin index of the hard assignment, appended to the result table
db_index_value = davies_bouldin_score(df_scaled, temp)
df_dbIndex.loc[len(df_dbIndex)] = ['cluster_fuzzy (k=6)', db_index_value]

In [None]:
import numpy as np
import skfuzzy as fuzz

n_clusters = 7

# Fuzzy C-Means clustering on the transposed scaled data.
# seed=42 fixes the random initial partition so that the cluster assignment
# and the Davies-Bouldin index are reproducible across runs.
cntr, u, u0, d, jm, p, fpc = fuzz.cluster.cmeans(
    df_scaled.T, c=n_clusters, m=2.0, error=0.005, maxiter=1000, init=None, seed=42
)

# Hard assignment: each row gets the cluster with its highest membership degree
temp = np.argmax(u, axis=0)
df["cluster_fuzzy (k=7)"] = temp

# Davies-Bouldin index of the hard assignment, appended to the result table
db_index_value = davies_bouldin_score(df_scaled, temp)
df_dbIndex.loc[len(df_dbIndex)] = ['cluster_fuzzy (k=7)', db_index_value]

## Evaluierung

In [None]:
# Label columns of all fitted models
columnsCluster = [
    'cluster_kmeans (k=6)',
    'cluster_kmeans (k=7)',
    'cluster_dbscans',
    'cluster_fuzzy (k=6)',
    'cluster_fuzzy (k=7)',
]

# Number of rows per cluster label, separately for every model
value_counts_dict = {column: df[column].value_counts() for column in columnsCluster}

# One row per model, one column per cluster label (sorted ascending);
# labels a model does not use count as 0
df_result = (
    pd.DataFrame(value_counts_dict)
    .fillna(0)
    .astype(int)
    .reset_index()
    .rename(columns={'index': 'Cluster'})
    .sort_values(by='Cluster')
    .reset_index(drop=True)
    .set_index('Cluster')
    .T
)

# Stacked bar per model showing the cluster sizes
df_result.plot(kind='bar', stacked=True, figsize=(10, 6))
plt.legend(title="Cluster", loc='upper left', bbox_to_anchor=(1.05, 1))

In [86]:
import pandas as pd
import matplotlib.pyplot as plt
from matplotlib.gridspec import GridSpec
from sklearn.manifold import TSNE


# Function: determine the top 10 topics per cluster
def top10Mention(df, clusterVar, mentions_var=None):
    """Return the ten most mentioned topic columns for every cluster.

    Args:
        df: Frame containing the cluster label column and the topic columns.
        clusterVar: Name of the cluster label column to group by.
        mentions_var: Optional list of topic column names. When omitted, the
            module-level list `mentions` is used (previous behaviour).

    Returns:
        Dict mapping each cluster label to a list of up to ten column names,
        ordered by descending summed mentions. Ties keep the column order of df.
    """
    if mentions_var is None:
        mentions_var = mentions
    # Sum only the topic columns that exist in df; other columns (age, labels
    # of other models, ...) never contribute to the ranking.
    topic_cols = [col for col in df.columns if col in mentions_var]
    totals = df.groupby(clusterVar)[topic_cols].sum()

    top10MentionDic = {}
    for cluster in totals.index.tolist():
        top10MentionDic[cluster] = totals.loc[cluster].nlargest(10).index.tolist()

    return top10MentionDic


# Function: print the top 10 topics per cluster
def printTop10Mentions(top10Mentions):
    """Print one line per cluster: the cluster key followed by its comma-separated topics."""
    for cluster_id in top10Mentions:
        topics = ', '.join(top10Mentions[cluster_id])
        print(f"Cluster {cluster_id}: " + topics)


# Function: age distribution per cluster
def plotAgePerCluster(ax, df, clusterName):
    """Draw a stacked bar chart of the row counts per age, split by cluster, onto ax."""
    # Rows: age, columns: cluster label, values: number of rows (0 where a combination is absent)
    counts_by_age = df.groupby(["age", clusterName]).size().unstack(fill_value=0)
    counts_by_age.plot(kind="bar", stacked=True, ax=ax)

    ax.set(title="Altersverteilung pro Cluster", xlabel="Alter", ylabel="Anzahl")
    ax.legend(title="Cluster", loc='upper left', bbox_to_anchor=(1.05, 1))


# Function: gender distribution per cluster
def plotGenderPerCluster(ax, df, clusterName):
    """Draw a stacked bar chart of the row counts per gender code, split by cluster, onto ax."""
    # Rows: gender, columns: cluster label, values: number of rows (0 where a combination is absent)
    counts_by_gender = df.groupby(["gender", clusterName]).size().unstack(fill_value=0)
    counts_by_gender.plot(kind="bar", stacked=True, ax=ax)

    ax.set(title="Geschlechtsverteilung pro Cluster", xlabel="Geschlecht", ylabel="Anzahl")
    ax.legend(title="Cluster", loc='upper left', bbox_to_anchor=(1.05, 1))

# Function: word mentions per cluster
def plotWordMentionsperCluster(ax, df, clusterName, mentions_var):
    """Draw a stacked bar chart with one bar per topic column, split by cluster, onto ax.

    The bar segments are the values of a pivot table over the melted topic
    columns, built with pivot_table's default aggregation.
    """
    # Long format: one row per (data row, topic column) pair
    long_format = df.melt(
        id_vars=[clusterName],
        value_vars=mentions_var,
        var_name='Mentions',
        value_name='Anzahl',
    )
    # Rows: topic, columns: cluster label; missing combinations become 0
    per_cluster = pd.pivot_table(
        long_format,
        index='Mentions',
        columns=clusterName,
        values='Anzahl',
        fill_value=0,
    )
    per_cluster.plot(kind='bar', stacked=True, ax=ax)

    ax.set(title="Wortnennungen pro Cluster", xlabel="", ylabel="Anzahl")
    ax.legend(title="Cluster", loc='upper left', bbox_to_anchor=(1.05, 1))

# t-SNE: two-component embedding of the scaled data with a fixed random_state.
# It is computed once at module level; plotClusterAnalysis reads X_tSNE for
# every model it visualises.
tSNE = TSNE(n_components=2, random_state=123) 
X_tSNE = tSNE.fit_transform(df_scaled)

# Function: t-SNE scatter plot
def plotSNE(ax, df, clusterName, X_tSNE):
    """Scatter the first two columns of X_tSNE onto ax, coloured by df[clusterName]."""
    dim1, dim2 = X_tSNE[:, 0], X_tSNE[:, 1]
    points = ax.scatter(dim1, dim2, c=df[clusterName], alpha=0.3)
    ax.set(title="t-SNE " + clusterName, xlabel="Dim 1", ylabel="Dim 2")
    # Colour bar translating point colours back to cluster labels
    plt.colorbar(points, ax=ax)


# Function: complete evaluation output for one clustering
def plotClusterAnalysis(df, clusterName, mentions_var):
    """Show the four evaluation plots and print the Davies-Bouldin index and top-10 topics.

    Reads the module-level objects X_tSNE (embedding for the scatter plot) and
    df_dbIndex (table of Davies-Bouldin indices per model).

    Args:
        df: Frame containing 'age', 'gender', the topic columns and the label column.
        clusterName: Name of the label column; also the key looked up in df_dbIndex.
        mentions_var: Topic columns shown in the word-mention plot.
    """
    fig = plt.figure(figsize=(10, 6))
    grid = GridSpec(2, 3, figure=fig)

    # Top row: age, gender, t-SNE; bottom row: word mentions across the full width
    ax_age = fig.add_subplot(grid[0, 0])
    ax_gender = fig.add_subplot(grid[0, 1])
    ax_words = fig.add_subplot(grid[1, :])
    ax_tsne = fig.add_subplot(grid[0, 2])

    plotAgePerCluster(ax_age, df, clusterName)
    plotGenderPerCluster(ax_gender, df, clusterName)
    plotWordMentionsperCluster(ax_words, df, clusterName, mentions_var)
    plotSNE(ax_tsne, df, clusterName, X_tSNE)
    plt.tight_layout()
    plt.show()

    print('\n\033[1mDavies-Bouldin-Index\033[0m')
    # First recorded index for this model name
    is_model = df_dbIndex['clusterName'] == clusterName
    print(df_dbIndex.loc[is_model, 'db_index'].iloc[0])

    print('\n\033[1mTop 10 Themen\033[0m')
    printTop10Mentions(top10Mention(df, clusterName))


In [None]:
plotClusterAnalysis(df, "cluster_kmeans (k=6)", mentions)

In [None]:
plotClusterAnalysis(df, "cluster_kmeans (k=7)", mentions)

In [None]:
""" plotClusterAnalysis(df, "cluster_hierarch", mentions) """

In [None]:
plotClusterAnalysis(df, "cluster_dbscans", mentions)

In [None]:
plotClusterAnalysis(df, "cluster_fuzzy (k=6)", mentions)

In [None]:
plotClusterAnalysis(df, "cluster_fuzzy (k=7)", mentions)

In [91]:
# def valueCounts(df, var):
#     print(df[var].value_counts())
    
# def evaluateModell(df, var, modelName):
#     valueCounts(df, var)
#     personenBezogendeErgebnisse(df, modelName)

# def durchschnittsAlterProCluster(df, clusterVar):
#     mean_age = df.groupby(clusterVar)['age'].mean()
#     for cluster, age in mean_age.items():
#         print(f"Cluster {cluster}: Durchschnittlich {age:.2f} Jahre")

# def verteilungGeschlechtProCluster(df, clusterVar):
#     gender = df.groupby([clusterVar, 'gender']).size().unstack(fill_value=0)
#     for i in gender.index:
#         print(f"Cluster {i}: {gender.loc[i, 1]} Frauen, {gender.loc[i, 2]} Männer, {gender.loc[i, 3]} Unbekannt")

# def durchschnittsAnzahlFreundeProCluster(df, clusterVar):
#     mean_friends = df.groupby(clusterVar)['NumberOffriends'].mean()
#     for cluster, avg_friends in mean_friends.items():
#         print(f"Cluster {cluster}: Durchschnittlich {avg_friends:.2f} Freunde")

# def personenBezogendeErgebnisse(df, clusterVar):
#     durchschnittsAlterProCluster(df, clusterVar)
#     print('')
#     verteilungGeschlechtProCluster(df, clusterVar)
#     print('')
#     durchschnittsAnzahlFreundeProCluster(df,  clusterVar)

In [92]:
# def evaluateModell(df, modelName):
#     print('\033[1mAnzahl Einträge pro Cluster\033[0m')
#     valueCounts(df,modelName,  )
#     print('\n\033[1mPersonen Bezogene Ergebnisse\033[0m')
#     personenBezogendeErgebnisse(df, modelName)
#     print('\n\033[1mTop 10 Themen\033[0m')
#     top10 = top10Mention(df, modelName)
#     printTop10Mentions(top10)

In [93]:
#evaluateModell(df,'cluster_kmeans' )

In [94]:
# list_sport = ['basketball', 'football', 'soccer', 'softball', 'volleyball', 'swimming', 'cheerleading', 'baseball', 'tennis', 'sports']
# list_emotions = ['cute', 'sex', 'sexy', 'hot', 'kissed']
# list_music = ['dance', 'band', 'marching', 'music', 'rock']
# list_religion = ['god', 'church', 'jesus', 'bible']
# list_fashion = ['hair', 'dress', 'blonde', 'mall', 'shopping', 'clothes', 'hollister', 'abercrombie']
# list_death = ['die', 'death']
# list_drug = ['drunk', 'drugs']

***
***