In [106]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import LabelEncoder
import seaborn as sns
import matplotlib.pyplot as plt

In [None]:
df = pd.read_csv('titanic.csv')
df.head()

In [None]:
df.tail()

In [None]:
df.info()

In [None]:
df.isnull().sum()

In [None]:
df.isnull().sum()/df.shape[0]*100

In [None]:
#check duplicates
df.duplicated().sum() 

In [None]:
# Löschen der Spalte 'Cabin' aus dem DataFrame weil 77% missing values
df.drop('Cabin', axis=1, inplace=True)
#Age abrunden
df['Age'] = np.floor(df['Age'])
#missing values treatment
df['Age'].fillna(df['Age'].median(), inplace=True) # ist's sinvoll?
df['Age'] = df['Age'].astype(int)
df['Embarked'] = df['Embarked'].fillna(df['Embarked'].mode()[0])


In [None]:
#check missing values
df.isnull().sum()/df.shape[0]*100

In [115]:
#extrahiere new column 'Titel',das könnte wichtig für datenanalyse sein?
df["Title"] = df["Name"].apply(lambda x: x.split(",")[1].split(".")[0].strip())
df["Name"] = df["Name"].apply(lambda x: ' '.join([word for word in x.split() if not word.endswith('.')]))


In [None]:
df.describe().T

In [None]:
df.head()

In [None]:
df.info()

In [None]:
df

In [16]:
# Import libraries
import base64
import io
import pandas as pd
import numpy as np
from sklearn.model_selection import StratifiedKFold, train_test_split, cross_val_score
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier, plot_tree
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import accuracy_score, confusion_matrix, f1_score, precision_score, recall_score
from sklearn.preprocessing import StandardScaler
from sklearn.utils import resample
import plotly.graph_objects as go
import plotly.express as px
from dash import Dash, dcc, html, Input, Output

# 1. DATA PREPARATION
df = pd.read_csv('titanic.csv')

# Clean the data
# These columns (PassengerId, Name, Ticket, Cabin) are not useful for predicting survival. They either contain unique identifiers or non-informative text data that do not contribute to the model's predictive power.
df = df.drop(columns=['PassengerId', 'Name', 'Ticket', 'Cabin'], errors='ignore')
# The 'Age' column has missing values. Using the median to fill these gaps is a common practice because it is robust to outliers and provides a central tendency measure.
df['Age'] = np.floor(df['Age'])
df['Age'].fillna(df['Age'].median(), inplace=True)
# The 'Embarked' column has missing values. Filling them with the most frequent value.
df['Embarked'].fillna(df['Embarked'].mode()[0], inplace=True)
df = pd.get_dummies(df, columns=['Sex', 'Embarked'], drop_first=True)
# Features and target: To prepare the data for modeling, we need to separate the features (input variables) from the target (output variable).
X = df.drop(columns=['Survived'])
y = df['Survived']

# Feature Scaling
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)

# Models
models = {
    "Logistic Regression": LogisticRegression(max_iter=1000),
    "Decision Tree": DecisionTreeClassifier(random_state=42),
    "KNN (k=3)": KNeighborsClassifier(n_neighbors=3)
}

# 2. Methods: Cross-validation and Bootstrap .632
def cross_validate_method(model, X, y):
    """ Perform 10-fold cross-validation and return accuracy scores. """
    cv = StratifiedKFold(n_splits=10, shuffle=True, random_state=42) # or KFold, difference?
    scores = cross_val_score(model, X, y, cv=cv, scoring='accuracy')
    return scores

def bootstrap_632_method(model, X, y):
    """ Perform Bootstrap .632 method and return accuracy score. """
    train_acc, test_acc = [], []

    for _ in range(100):  # 100 bootstrap samples
        # How is This Implemented? The code does not need to explicitly enforce the 63.2% or 36.8% split because it happens naturally due to random sampling with replacement using resample().
        X_boot, y_boot = resample(X, y, replace=True) # Bootstrap sampling: Create the training set
        X_oob = X.loc[~X.index.isin(X_boot.index)] # Out-of-bag (oob) samples: identify the test set
        y_oob = y.loc[~y.index.isin(y_boot.index)]

        model.fit(X_boot, y_boot)
        acc_train = accuracy_score(y_boot, model.predict(X_boot))
        acc_test = accuracy_score(y_oob, model.predict(X_oob)) if len(X_oob) > 0 else 0
        train_acc.append(acc_train)
        test_acc.append(acc_test)

    # .632 Bootstrap formula
    return np.mean(0.368 * np.array(train_acc) + 0.632 * np.array(test_acc))

# Evaluating function: 
def evaluate_model(model, X_train, y_train, X_test, y_test):
    """Train and evaluate model performance."""
    model.fit(X_train, y_train)
    y_pred = model.predict(X_test)
    metrics = {
        'Accuracy': accuracy_score(y_test, y_pred),
        'Precision': precision_score(y_test, y_pred),
        'Recall': recall_score(y_test, y_pred),
        'F1-Score': f1_score(y_test, y_pred),
        'Confusion Matrix': confusion_matrix(y_test, y_pred)
    }
    return metrics

# Prepare results for Dash
results = {}
for model_name, model in models.items():
    results[model_name] = {
        'cross_val': cross_validate_method(model, X, y),
        'bootstrap': bootstrap_632_method(model, X, y)
    }

# 3. Build Dash App
app = Dash(__name__)

app.layout = html.Div([
    html.H1("Titanic Classification Models with Evaluation Methods", style={'textAlign': 'center'}),

    # Dropdown to select model
    html.Div([
        html.Label("Select Model:"),
        dcc.Dropdown(
            id='model-dropdown',
            options=[{'label': name, 'value': name} for name in models.keys()],
            value='Logistic Regression'
        )
    ], style={'width': '50%', 'margin': 'auto'}),

    # Metrics table
    html.Div([
        html.H3("Performance Metrics", style={'textAlign': 'center'}),
        dcc.Graph(id='metrics-table'),
    ]),

    # Cross-validation plot
    html.Div([
        html.H3("Cross-Validation (10-Fold) Results", style={'textAlign': 'center'}),
        dcc.Graph(id='crossval-plot'),
    ]),

    # Bootstrap accuracy gauge
    html.Div([
        html.H3("Bootstrap .632 Method Results", style={'textAlign': 'center'}),
        dcc.Graph(id='bootstrap-plot'),
    ]),

    # Confusion matrix
    html.Div([
        html.H3("Confusion Matrix", style={'textAlign': 'center'}),
        dcc.Graph(id='confusion-matrix'),
    ]),

    # Decision tree plot (for Decision Tree only)
    html.Div([
        html.H3("Decision Tree Visualization", style={'textAlign': 'center'}),
        html.Img(id='tree-plot', style={'display': 'block', 'margin': 'auto'})
    ])
])

# Callbacks
@app.callback(
    [Output('metrics-table', 'figure'),
    Output('crossval-plot', 'figure'),
    Output('bootstrap-plot', 'figure'),
    Output('confusion-matrix', 'figure'),
    Output('tree-plot', 'src')],
    Input('model-dropdown', 'value')
)
def update_plots(selected_model):
    model = models[selected_model]

    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)

    # Evaluate metrics
    metrics = evaluate_model(model, X_train, y_train, X_test, y_test)

    # Metrics Table
    metrics_fig = go.Figure(data=[go.Table(
        header=dict(values=["Metric", "Value"], align="center"),
        cells=dict(values=[
            list(metrics.keys())[:-1],
            [f"{v:.4f}" if isinstance(v, float) else v for v in list(metrics.values())[:-1]]
        ], align="center")
    )])
    
    # Cross-validation results
    cross_val_scores = results[selected_model]['cross_val']
    crossval_fig = px.line(
        y=cross_val_scores, 
        x=list(range(1, 11)),
        title=f"Cross-Validation Accuracy Scores for {selected_model}",
        labels={'x': 'Fold', 'y': 'Accuracy'}
    )

    # Bootstrap accuracy
    bootstrap_accuracy = results[selected_model]['bootstrap']
    bootstrap_fig = go.Figure(go.Indicator(
        mode="gauge+number",
        value=bootstrap_accuracy,
        title={'text': f"Bootstrap .632 Accuracy for {selected_model}"}
    ))

    # Confusion matrix
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)
    model.fit(X_train, y_train)
    y_pred = model.predict(X_test)
    cm = confusion_matrix(y_test, y_pred)
    cm_fig = px.imshow(
        cm, text_auto=True, color_continuous_scale='Blues',
        labels=dict(x="Predicted", y="Actual", color="Count"),
        title=f"Confusion Matrix for {selected_model}"
    )
    cm_fig.update_xaxes(side="top")

    # Decision tree visualization
    tree_src = None
    if selected_model == "Decision Tree":
        buf = io.BytesIO()
        plt.figure(figsize=(12, 8))
        plot_tree(model, filled=True, feature_names=X.columns, class_names=['Not Survived', 'Survived'])
        plt.savefig(buf, format="png")
        buf.seek(0)
        tree_src = "data:image/png;base64," + base64.b64encode(buf.read()).decode("utf-8")
        buf.close()
        plt.close()

    return metrics_fig, crossval_fig, bootstrap_fig, cm_fig, tree_src

# Run the app
if __name__ == '__main__':
    app.run_server(debug=True)


A value is trying to be set on a copy of a DataFrame or Series through chained assignment using an inplace method.
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.




A value is trying to be set on a copy of a DataFrame or Series through chained assignment using an inplace method.
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.





In [20]:
# Import libraries
import base64
import io
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.model_selection import StratifiedKFold, train_test_split, cross_val_score
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier, plot_tree
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import accuracy_score, confusion_matrix, f1_score, precision_score, recall_score
from sklearn.preprocessing import StandardScaler
from sklearn.utils import resample
import plotly.graph_objects as go
import plotly.express as px
from dash import Dash, dcc, html

# 1. DATA PREPARATION
df = pd.read_csv('titanic.csv')

# Clean the data
# These columns (PassengerId, Name, Ticket, Cabin) are not useful for predicting survival. They either contain unique identifiers or non-informative text data that do not contribute to the model's predictive power.
df = df.drop(columns=['PassengerId', 'Name', 'Ticket', 'Cabin'], errors='ignore')
# The 'Age' column has missing values. Using the median to fill these gaps is a common practice because it is robust to outliers and provides a central tendency measure.
df['Age'] = np.floor(df['Age'])
df['Age'].fillna(df['Age'].median(), inplace=True)
# The 'Embarked' column has missing values. Filling them with the most frequent value.
df['Embarked'].fillna(df['Embarked'].mode()[0], inplace=True)
df = pd.get_dummies(df, columns=['Sex', 'Embarked'], drop_first=True)
# Features and target: To prepare the data for modeling, we need to separate the features (input variables) from the target (output variable).
X = df.drop(columns=['Survived'])
y = df['Survived']

# Feature Scaling
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)


# Models
models = {
    "Logistic Regression": LogisticRegression(max_iter=1000),
    "Decision Tree": DecisionTreeClassifier(random_state=42),
    "KNN (k=3)": KNeighborsClassifier(n_neighbors=3)
}

# Cross-validation and Bootstrap methods
def cross_validate_method(model, X, y):
    cv = StratifiedKFold(n_splits=10, shuffle=True, random_state=42)
    return cross_val_score(model, X, y, cv=cv, scoring='accuracy')

def bootstrap_632_method(model, X, y):
    train_acc, test_acc = [], []
    for _ in range(100):
        # How is This Implemented? The code does not need to explicitly enforce the 63.2% or 36.8% split because it happens naturally due to random sampling with replacement using resample().
        X_boot, y_boot = resample(X, y, replace=True) # Bootstrap sampling: Create the training set
        X_oob = X[~np.in1d(np.arange(X.shape[0]), X_boot.index)] # Out-of-bag (oob) samples: identify the test set
        y_oob = y[~np.in1d(np.arange(y.shape[0]), X_boot.index)]
        model.fit(X_boot, y_boot)
        acc_train = accuracy_score(y_boot, model.predict(X_boot))
        acc_test = accuracy_score(y_oob, model.predict(X_oob)) if len(y_oob) > 0 else 0
        train_acc.append(acc_train)
        test_acc.append(acc_test)
    return np.mean(0.368 * np.array(train_acc) + 0.632 * np.array(test_acc))


# Evaluate models
metrics_results = {}
confusion_matrices = {}
crossval_results = {}
bootstrap_results = {}

for model_name, model in models.items():
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)
    model.fit(X_train, y_train)
    y_pred = model.predict(X_test)
    
    # Save metrics
    metrics_results[model_name] = {
        'Accuracy': accuracy_score(y_test, y_pred),
        'Precision': precision_score(y_test, y_pred),
        'Recall': recall_score(y_test, y_pred),
        'F1-Score': f1_score(y_test, y_pred)
    }
    confusion_matrices[model_name] = confusion_matrix(y_test, y_pred)
    crossval_results[model_name] = cross_validate_method(model, X, y)
    bootstrap_results[model_name] = bootstrap_632_method(model, X, y)

# 3. Build Dash App
app = Dash(__name__)

app.layout = html.Div([
    html.H1("Titanic Classification Models", style={'textAlign': 'center'}),

    # Performance Metrics
    html.H3("Performance Metrics"),
    dcc.Graph(figure=px.bar(
        pd.DataFrame(metrics_results).reset_index(),
        x='index', y=list(models.keys()),
        barmode='group',
        title="Performance Metrics for All Models",
        labels={'index': 'Metrics', 'value': 'Score'}
    )),

    html.H3("Confusion Matrices"),
    html.Div([
        dcc.Graph(figure=px.imshow(confusion_matrices[model], text_auto=True, title=f"{model}", width=300, height=300))
        for model in models.keys()
    ], style={'display': 'flex', 'flexDirection': 'row'}),

    # Cross-validation
    html.H3("Cross-Validation Results"),
    dcc.Graph(figure=px.line(
        x=list(range(1, 11)),
        y=[crossval_results[model] for model in models.keys()],
        title="Cross-Validation Accuracy (10-Fold)",
        labels={'x': 'Fold', 'y': 'Accuracy', 'color': 'Models'}
    )),

    # Bootstrap .632 Results
    html.H3("Bootstrap .632 Results"),
    dcc.Graph(figure=px.bar(
        x=list(models.keys()),
        y=[bootstrap_results[model] for model in models.keys()],
        title="Bootstrap .632 Accuracy",
        labels={'x': 'Model', 'y': 'Bootstrap Accuracy', 'color': 'Models'}
    )),
])

# Run the app
if __name__ == '__main__':
    app.run_server(debug=True)


A value is trying to be set on a copy of a DataFrame or Series through chained assignment using an inplace method.
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.




A value is trying to be set on a copy of a DataFrame or Series through chained assignment using an inplace method.
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.



