In [1]:
import pandas as pd
import numpy as np
from sklearn.decomposition import PCA
from sklearn.preprocessing import StandardScaler
import matplotlib.pyplot as plt
import seaborn as sns

In [2]:
from sqlalchemy import create_engine, types
from sqlalchemy import text # to be able to pass string
from dotenv import dotenv_values

## Assigning relevant info from the .env File to create a connection 
config = dotenv_values()
pg_user = config['POSTGRES_USER']
pg_host = config['POSTGRES_HOST']
pg_port = config['POSTGRES_PORT']
pg_db = config['POSTGRES_DB']
pg_schema = config['POSTGRES_SCHEMA']
pg_pass = config['POSTGRES_PASS']
url = f'postgresql://{pg_user}:{pg_pass}@{pg_host}:{pg_port}/{pg_db}'
## Creating Engine for Connection
engine = create_engine(url, echo=False)
engine.url

## Setting the search path
with engine.begin() as conn: 
    result = conn.execute(text(f'SET search_path TO {pg_schema};'))
    
## Establishing connection
engine = create_engine(url, echo=False)
## Loading Data
data = pd.read_sql("SELECT * FROM capstone_group_1.all_data_13_to_18", engine)

In [6]:
# Zielvariable definieren (z. B. Bluthochdruck ab systolisch ≥ 130 oder diastolisch ≥ 80)
data['hypertension'] = ((data['systolic_bp'] >= 130) | (data['diastolic_bp'] >= 80)).astype(int)

# Ersetze Sonderwerte durch NaN
missing_vals = [7777, 9999, 8888, 6666, 5555]
data = data.replace(missing_vals, np.nan)

# Lifestyle-Faktoren + Zielvariable
features = [
    'age', 'bmi', 'waist_circumference', 'daily_calories',
    'total_sugar', 'total_fat', 'total_carbohydrates', 'total_alcohol', 'caffeine',
    'sport_days', 'sitting_per_day', 'walk_<10_minutes_p_day'
]

# Zusätzliche dichotome Variablen umwandeln
data['physical_activity_work_num'] = data['physical_activity_work'].map({'Yes': 1, 'No': 0})
data['trouble_sleeping_num'] = data['trouble_sleeping'].map({'Yes': 1, 'No': 0})
features += ['physical_activity_work_num', 'trouble_sleeping_num']

# Nur vollständige Zeilen
df_model = data[features + ['hypertension']].dropna()

# Trainings- & Testdaten splitten
X = df_model[features]
y = df_model['hypertension']
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Modell trainieren
clf = RandomForestClassifier(n_estimators=100, random_state=42)
clf.fit(X_train, y_train)

# Vorhersagen
y_pred = clf.predict(X_test)
y_proba = clf.predict_proba(X_test)[:, 1]

# Bewertung
print(classification_report(y_test, y_pred))
print(f"AUC-ROC: {roc_auc_score(y_test, y_proba):.2f}")



ValueError: With n_samples=0, test_size=0.2 and train_size=None, the resulting train set will be empty. Adjust any of the aforementioned parameters.

In [None]:
# Optional: Interpretierbarkeit mit SHAP
# Wenn du wissen willst, welche Faktoren das Risiko wie stark beeinflussen, kannst du mit SHAP arbeiten.

# python
# Kopieren
# Bearbeiten
import shap
explainer = shap.Explainer(clf, X_test)
shap_values = explainer(X_test)

# Globale Wichtigkeit
shap.plots.bar(shap_values)