In [None]:
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
from scipy.stats import chi2_contingency

# Load the training CSV (path is relative to the notebook's working directory).
# NOTE(review): de-duplication via `drop_duplicates()` is currently disabled —
# confirm that duplicate rows are meant to stay in the analysis.
df_train = pd.read_csv(filepath_or_buffer="data/smoker_train.csv")

# Feature groups by measurement scale. The order inside each list is the
# order in which the columns show up on the heatmap axes further down.

# Metric ("cardinal") columns.
# NOTE(review): "smoking" is also listed among the nominal features in a later
# cell — confirm that it is meant to be part of this list as well.
cardinal_features = [
    "waist(cm)", "age", "height(cm)", "weight(kg)", "smoking",
    "systolic", "relaxation",
    "eyesight(left)", "eyesight(right)",
    "Cholesterol", "triglyceride", "HDL", "LDL",
    "AST", "ALT", "Gtp",
    "fasting blood sugar", "hemoglobin", "serum creatinine",
]

# Ordered categories, coded 1, 2, 3, 4, 5, 6.
ordinal_features = ["Urine protein"]

# Unordered categories: hearing is coded 1 and 2, dental caries 0 and 1.
nominal_features = ["hearing(left)", "hearing(right)", "dental caries"]


: 

# New Correlation Analysis

# Deprecated

In [None]:


# Pairwise correlation of the cardinal columns. `DataFrame.corr()` is called
# without `method`, i.e. with the pandas default (Pearson), not a rank-based one.
df_cardinal = df_train.loc[:, cardinal_features]
corr = df_cardinal.corr()

# Colour-only heatmap (no cell annotations) on an 8x6 inch figure.
fig, ax = plt.subplots(figsize=(8, 6))
sns.heatmap(data=corr, annot=False, fmt=".2f", cmap='coolwarm', ax=ax)
ax.set_title('Correlation Heatmap of Kardinal Features')
plt.show()

In [None]:
# Spearman rank correlation restricted to the ordinally scaled columns
# (as classified in the scale-level table).
# NOTE(review): `ordinal_features` holds a single column in the first cell, so
# the matrix below is 1x1 — confirm that this plot is still wanted.
df_ordinal = df_train.loc[:, ordinal_features]
corr_spearman = df_ordinal.corr(method="spearman")

# Render the matrix as a colour-only heatmap on a default-sized figure.
fig, ax = plt.subplots()
sns.heatmap(data=corr_spearman, annot=False, fmt=".2f", cmap="coolwarm", ax=ax)
ax.set_title("Spearman-Korrelation – Ordinal skalierte Merkmale")
fig.tight_layout()
plt.show()

In [None]:
# Bar chart of the age distribution: one bar per distinct value of "age".
# NOTE(review): recent seaborn releases warn when `palette` is passed without
# `hue` — check against the installed version.
fig, ax = plt.subplots(figsize=(10, 6))
sns.countplot(x="age", data=df_train, palette="Blues", ax=ax)

ax.set_title("Altersverteilung (in 5-Jahres-Intervallen)")
ax.set_xlabel("Alter (5-Jahres-Intervalle)")
ax.set_ylabel("Anzahl der Probanden")
# Tilt the tick labels so neighbouring age values stay readable.
plt.xticks(rotation=45)
fig.tight_layout()
plt.show()

In [None]:
# Scatter plot of height against age with a fitted regression line on top.
# The figure is created inside this cell so that its size does not depend on
# a figure opened by a previously executed cell.
fig, ax = plt.subplots(figsize=(10, 6))
sns.scatterplot(x="age", y="height(cm)", data=df_train, alpha=0.5, ax=ax)
# scatter=False: draw only the regression fit; the points come from the call above.
sns.regplot(x="age", y="height(cm)", data=df_train, scatter=False, color='red', ax=ax)

ax.set_title("Zusammenhang zwischen Alter und Körpergröße")
ax.set_xlabel("Alter (5-Jahres-Intervalle)")
ax.set_ylabel("Größe (cm)")
ax.grid(True)
fig.tight_layout()
plt.show()


In [None]:
# Nominally scaled columns for the Cramér's V matrix. This assignment replaces
# the list from the first cell and additionally contains "smoking".
nominal_features = ["hearing(left)", "hearing(right)", "dental caries", "smoking"]

def cramers_v(x, y):
    """Return the bias-corrected Cramér's V between two categorical series.

    The statistic is derived from Pearson's chi-squared statistic of the
    contingency table of ``x`` and ``y``. The bias correction shrinks phi²
    and the effective table dimensions by terms of order ``1 / (n - 1)``.

    Parameters
    ----------
    x, y : array-like of equal length
        Category labels; they are cross-tabulated with ``pd.crosstab``.

    Returns
    -------
    float
        Association strength between 0 and 1, or ``nan`` when the statistic
        is undefined (fewer than two observations, or one of the variables
        has only a single category).
    """
    contingency = pd.crosstab(x, y)
    n = contingency.to_numpy().sum()
    if n < 2:
        # The bias correction divides by (n - 1); nothing to measure here.
        return np.nan

    # SciPy applies Yates' continuity correction to 2x2 tables by default.
    # Cramér's V is defined on the uncorrected statistic; with the correction
    # a perfectly associated binary pair would not reach 1.
    chi2 = chi2_contingency(contingency, correction=False)[0]
    phi2 = chi2 / n
    r, k = contingency.shape
    phi2corr = max(0.0, phi2 - ((k - 1) * (r - 1)) / (n - 1))  # bias correction
    rcorr = r - ((r - 1) ** 2) / (n - 1)
    kcorr = k - ((k - 1) ** 2) / (n - 1)

    denom = min(kcorr - 1, rcorr - 1)
    if denom <= 0:
        # A variable with a single category leaves no degrees of freedom.
        return np.nan
    return np.sqrt(phi2corr / denom)

# Cramér's V for every ordered pair of nominal columns; rows and columns both
# follow the order of `nominal_features`.
cramers_results = pd.DataFrame(
    [
        [cramers_v(df_train[row_name], df_train[col_name]) for col_name in nominal_features]
        for row_name in nominal_features
    ],
    index=nominal_features,
    columns=nominal_features,
    dtype=float,
)

# Colour-only heatmap of the association matrix on an 8x6 inch figure.
fig, ax = plt.subplots(figsize=(8, 6))
sns.heatmap(data=cramers_results, annot=False, fmt=".2f", cmap="coolwarm", ax=ax)
ax.set_title("Cramér’s V – Nominal skalierte Merkmale")
fig.tight_layout()
plt.show()

In [None]:
# Cardinal and nominal columns analysed together; the list order determines
# the order of the heatmap axes.
cardinal_nominal_features = [
    "waist(cm)",
    "eyesight(left)", "eyesight(right)",
    "systolic", "relaxation",
    "fasting blood sugar",
    "Cholesterol", "triglyceride", "HDL", "LDL",
    "hemoglobin", "serum creatinine",
    "AST", "ALT", "Gtp",
    "hearing(left)", "hearing(right)",
    "dental caries",
    "smoking",
]

# `DataFrame.corr()` without `method` uses the pandas default (Pearson),
# not a rank-based coefficient.
df_cardinal_nominal = df_train.loc[:, cardinal_nominal_features]
corr = df_cardinal_nominal.corr()

# Colour-only heatmap on an 8x6 inch figure.
fig, ax = plt.subplots(figsize=(8, 6))
sns.heatmap(data=corr, annot=False, fmt=".2f", cmap='coolwarm', ax=ax)
ax.set_title('Correlation Heatmap of Kardinal/Nominal Features')
plt.show()


In [None]:
# Ordinal and cardinal columns (classification taken from the scale-level
# table); the list order determines the order of the heatmap axes.
ordinal_kardinal_features = [
    "age", "height(cm)", "weight(kg)",
    "Urine protein",
    "waist(cm)",
    "eyesight(left)", "eyesight(right)",
    "systolic", "relaxation",
    "fasting blood sugar",
    "Cholesterol", "triglyceride", "HDL", "LDL",
    "hemoglobin", "serum creatinine",
    "AST", "ALT", "Gtp",
]

# Spearman rank correlation over the selected columns.
df_ordinal_kardinal = df_train.loc[:, ordinal_kardinal_features]
corr_spearman = df_ordinal_kardinal.corr(method="spearman")

# Colour-only heatmap on a default-sized figure.
fig, ax = plt.subplots()
sns.heatmap(data=corr_spearman, annot=False, fmt=".2f", cmap="coolwarm", ax=ax)
ax.set_title("Spearman-Korrelation – Ordinal/Kardinal skalierte Merkmale")
plt.show()