In [None]:
import numpy as np
import pandas as pd
from matplotlib.pyplot import subplots
import seaborn as sns
import matplotlib.pyplot as plt

from sklearn.linear_model import LinearRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error, r2_score,accuracy_score, confusion_matrix, classification_report, accuracy_score, f1_score
from sklearn.preprocessing import OneHotEncoder
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.tree import DecisionTreeClassifier, plot_tree
from sklearn.svm import SVC, LinearSVC

# Read the training set and discard exact duplicate rows in a single expression.
df_train = pd.read_csv("data/smoker_train.csv").drop_duplicates()

# Allgemeine Analyse

In [None]:
# Preview the first five rows (explicit row count) to sanity-check the loaded columns.
df_train.head(n=5)

In [None]:
# Summarise the frame (columns, non-null counts, dtypes) to check for missing values
# and non-numeric columns before modelling.
df_train.info()

## Feature Correlation

In [None]:
# Pairwise correlation between all columns of the training frame (pandas default method).
correlation_matrix = df_train.corr()

plt.figure(figsize=(8, 6))
# Correlations live in [-1, 1]. Pin the colour scale to that range and centre it at 0 so
# the neutral colour of the diverging 'coolwarm' map always means "no correlation" and
# positive/negative values are visually comparable. `fmt` was dropped because it only
# applies when annot=True.
sns.heatmap(correlation_matrix, annot=False, cmap='coolwarm', vmin=-1, vmax=1, center=0)
plt.title('Correlation Heatmap of Features')
plt.show()

## Class Imbalance

In [None]:
# Rows per class. value_counts() orders by frequency, so sort by class value to get a
# stable wedge order (0 first, then 1) that does not depend on which class is larger.
smoking_counts = df_train['smoking'].value_counts().sort_index()

# Derive each label from the class value it belongs to instead of relying on position;
# a hard-coded label list would be swapped if smokers were the majority class.
class_names = {0: 'Non-Smoker', 1: 'Smoker'}
pie_labels = [class_names.get(value, str(value)) for value in smoking_counts.index]

plt.figure(figsize=(6, 6))
plt.pie(smoking_counts, labels=pie_labels, autopct='%1.1f%%', startangle=90, colors=sns.color_palette('pastel'))
plt.title('Class Distribution: Smokers vs Non-Smokers')
plt.axis('equal')  # equal aspect ratio so the pie is drawn as a circle
plt.show()

# Machine Learning Modelle

In [None]:
# Accumulator for the final model comparison: every model cell appends one row.
# Row layout: [model, test_mse, train_mse, test_accuracy, train_accuracy, test_f1, train_f1]
model_errors = list()

## Naives Modell

In [None]:
# Baseline model that always predicts "Non-Smoker" (0)
y_true = df_train['smoking']
y_pred_naive = np.zeros_like(y_true)

# Evaluate the baseline on an 80/20 split with the same parameters the other models use
# (test_size=0.2, random_state=42), so the "test" and "train" columns of its row mean the
# same thing as for every other model instead of both holding full-data metrics.
y_naive_train, y_naive_test = train_test_split(y_true, test_size=0.2, random_state=42)
pred_naive_train = np.zeros_like(y_naive_train)
pred_naive_test = np.zeros_like(y_naive_test)

# Compute the error metrics on the test part.
# zero_division=0: the baseline never predicts the positive class, so precision is
# undefined; state the resulting F1 of 0 explicitly instead of triggering a warning.
accuracy = accuracy_score(y_naive_test, pred_naive_test)
f1 = f1_score(y_naive_test, pred_naive_test, zero_division=0)
mse = mean_squared_error(y_naive_test, pred_naive_test)

# Same metrics on the training part
train_accuracy = accuracy_score(y_naive_train, pred_naive_train)
train_f1 = f1_score(y_naive_train, pred_naive_train, zero_division=0)
train_mse = mean_squared_error(y_naive_train, pred_naive_train)

print(f"Accuracy (immer Non-Smoker): {accuracy:.4f}")
print(f"F1-Score (immer Non-Smoker): {f1:.4f}")
print(f"Mean Squared Error (immer Non-Smoker): {mse:.4f}")


model_errors.append(['Naive Model', mse, train_mse, accuracy, train_accuracy, f1, train_f1])

## Linear Regression

In [None]:
# Regress 'weight(kg)' on four selected measurements (not on all remaining columns).
feature_cols = ['height(cm)', 'waist(cm)', 'age', 'hemoglobin']
X = df_train[feature_cols]
y = df_train['weight(kg)']

# Two nested 80/20 splits: the outer hold-out (X_test, y_test) is not used in this cell;
# fitting and evaluation both happen on the inner split of the outer training part.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
X_train_1, X_test_1, y_train_1, y_test_1 = train_test_split(
    X_train, y_train, test_size=0.2, random_state=42
)

# Fit the linear model on the inner training part
lr_model = LinearRegression()
lr_model.fit(X_train_1, y_train_1)

# Predict the inner validation part
y_pred = lr_model.predict(X_test_1)

print(f"Intercept: {lr_model.intercept_}")
for idx, column in enumerate(X.columns):
    print(f"Coefficient for {column}: {lr_model.coef_[idx]}")

mse = mean_squared_error(y_test_1, y_pred)
print(f"Test Mean Squared Error: {mse}")

## Decision Tree limited leaves

In [None]:
# Features: three selected measurements; target: the 'smoking' class label
X = df_train[['height(cm)', 'waist(cm)', 'hemoglobin']]
y = df_train['smoking']

# Outer 80/20 split, then an inner 80/20 split of the outer training part.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
X_train_1, X_test_1, y_train_1, y_test_1 = train_test_split(X_train, y_train, test_size=0.2, random_state=42)

# Fit a tree restricted to at most 7 leaves on the inner training part
dtc = DecisionTreeClassifier(random_state=0, max_leaf_nodes=7).fit(X_train_1, y_train_1)

# Train metrics must use exactly the rows the tree was fitted on (X_train_1). Scoring on
# X_train would mix in the inner hold-out rows the model never saw and distort the
# train figure.
y_pred_train = dtc.predict(X_train_1)
y_pred = dtc.predict(X_test)

# Compute each metric once (y_true first, as the sklearn signatures expect) and reuse it.
train_accuracy = accuracy_score(y_train_1, y_pred_train)
test_accuracy = accuracy_score(y_test, y_pred)

print('Accuracy of Decision Tree-Train: ', train_accuracy)
print('Accuracy of Decision Tree-Test: ', test_accuracy)

# Store the real MSE instead of a 0 placeholder, which would read as a perfect score
# in the comparison table.
model_errors.append([
    'Decision Tree',
    mean_squared_error(y_test, y_pred),
    mean_squared_error(y_train_1, y_pred_train),
    test_accuracy,
    train_accuracy,
    f1_score(y_test, y_pred),
    f1_score(y_train_1, y_pred_train),
])

In [None]:
# Draw the tree currently bound to `dtc` with readable node labels.
# Feature names are taken from the fitted estimator itself (available when it was fitted
# on a DataFrame), so the plot does not depend on whatever `X` currently holds.
tree_feature_names = getattr(dtc, "feature_names_in_", None)
plt.figure(figsize=(20, 10))
plot_tree(
    dtc,
    filled=True,
    feature_names=None if tree_feature_names is None else list(tree_feature_names),
    class_names=["Non Smoker", "Smoker"],  # assumes classes 0/1 in ascending order
)
# plt.show() renders the figure and keeps the list of annotation reprs out of the output
plt.show()

## Decision Tree limited depth

In [None]:

# Input variables: every column except the target
X = df_train.drop('smoking', axis=1)
# Output variable
y = df_train['smoking']

# Train/test split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Decision tree classifier with a fixed depth limit.
# NOTE(review): max_depth=4 is hard-coded; it was presumably picked from a manual sweep
# over depths 1..19 comparing train and test accuracy — confirm and document the choice.
dtc = DecisionTreeClassifier(random_state=0, max_depth=4).fit(X_train, y_train)

# Apply the model
y_pred_train = dtc.predict(X_train)
y_pred = dtc.predict(X_test)

# Compute each metric once (y_true first, as the sklearn signatures expect) and reuse it.
train_accuracy = accuracy_score(y_train, y_pred_train)
test_accuracy = accuracy_score(y_test, y_pred)

print('Accuracy of Decision Tree-Train: ', train_accuracy)
print('Accuracy of Decision Tree-Test: ', test_accuracy)


# Store the real MSE instead of a 0 placeholder, which would read as a perfect score
# in the comparison table.
model_errors.append([
    'Decision Tree (max_depth=4)',
    mean_squared_error(y_test, y_pred),
    mean_squared_error(y_train, y_pred_train),
    test_accuracy,
    train_accuracy,
    f1_score(y_test, y_pred),
    f1_score(y_train, y_pred_train),
])


In [None]:
# Draw the tree currently bound to `dtc` with readable node labels.
# Feature names are taken from the fitted estimator itself (available when it was fitted
# on a DataFrame), so the plot does not depend on whatever `X` currently holds.
tree_feature_names = getattr(dtc, "feature_names_in_", None)
plt.figure(figsize=(20, 10))
plot_tree(
    dtc,
    filled=True,
    feature_names=None if tree_feature_names is None else list(tree_feature_names),
    class_names=["Non Smoker", "Smoker"],  # assumes classes 0/1 in ascending order
)
# plt.show() renders the figure and keeps the list of annotation reprs out of the output
plt.show()

In [None]:

# Input variables: every column except the target
X = df_train.drop('smoking', axis=1)
# Output variable
y = df_train['smoking']
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Candidate ccp_alpha values, taken from the pruning path of a depth-4 tree.
# The estimator is constructed here instead of reusing the global `dtc`, so the cell no
# longer depends on which tree cell was executed last; cost_complexity_pruning_path fits
# its own copy on the data passed in. Note that this is a coarse candidate grid from a
# depth-limited tree, not the full pruning path of the unrestricted tree tuned below.
path = DecisionTreeClassifier(random_state=0, max_depth=4).cost_complexity_pruning_path(X_train, y_train)
# Floating-point error can produce tiny negative alphas, which the estimator rejects;
# clip them to 0 and drop duplicates so no cross-validation fit is wasted or fails.
ccp_alphas = np.unique(np.clip(path.ccp_alphas, 0.0, None))

# Use GridSearchCV (5-fold) to select the best ccp_alpha
param_grid = {'ccp_alpha': ccp_alphas}
grid_search = GridSearchCV(DecisionTreeClassifier(random_state=42), param_grid, cv=5)
grid_search.fit(X_train, y_train)

# Retrieve the best ccp_alpha
best_ccp_alpha = grid_search.best_params_['ccp_alpha']
print(f"Bestes ccp_alpha durch Cross-Validation: {best_ccp_alpha}")

# Train the model with the best ccp_alpha
best_dtc = DecisionTreeClassifier(random_state=42, ccp_alpha=best_ccp_alpha)
best_dtc.fit(X_train, y_train)

# Predict once per data part and reuse the predictions for every metric
y_pred = best_dtc.predict(X_test)
y_pred_train = best_dtc.predict(X_train)
accuracy = accuracy_score(y_test, y_pred)
f1 = f1_score(y_test, y_pred)
mse = mean_squared_error(y_test, y_pred)
print(f"Genauigkeit des besten Modells auf dem Testdatensatz: {accuracy:.4f}")
print(f"Genauigkeit des besten Modells auf dem Testdatensatz (f1): {f1:.4f}")
print(f"Mean Squared Error des besten Modells auf dem Testdatensatz: {mse:.4f}")
# Visualise the tree of the best model. feature_names is passed as a plain list because
# some scikit-learn releases reject a pandas Index for this parameter.
plt.figure(figsize=(20, 10))
plot_tree(best_dtc, filled=True, feature_names=list(X.columns), class_names=["Non Smoker", "Smoker"])
plt.show()

# Store the MSE that is printed above instead of a 0 placeholder
model_errors.append([
    'Decision Tree (best ccp_alpha)',
    mse,
    mean_squared_error(y_train, y_pred_train),
    accuracy,
    accuracy_score(y_train, y_pred_train),
    f1,
    f1_score(y_train, y_pred_train),
])

## SVC Modelle

In [None]:
# Define features (every column except the target) and the target variable
X = df_train.drop('smoking', axis=1)
y = df_train['smoking']

# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Train the SVC model with its default settings.
# NOTE(review): the features are passed in unscaled; SVMs are sensitive to feature scale,
# so consider standardising the inputs (e.g. a scaler in a pipeline) — confirm intent.
svc_model = SVC(random_state=42)
svc_model.fit(X_train, y_train)

# Make predictions. The training set is predicted once and reused below; SVC prediction
# is expensive, and it was previously run twice just to fill two table columns.
y_pred = svc_model.predict(X_test)
y_pred_train = svc_model.predict(X_train)

# Evaluate the model
accuracy = accuracy_score(y_test, y_pred)
f1 = f1_score(y_test, y_pred)
print(f"Genauigkeit des besten Modells auf dem Testdatensatz: {accuracy:.4f}")
print(f"Genauigkeit des besten Modells auf dem Testdatensatz (f1): {f1:.4f}")


# Store the real MSE instead of a 0 placeholder, which would read as a perfect score
# in the comparison table.
model_errors.append([
    'SVC',
    mean_squared_error(y_test, y_pred),
    mean_squared_error(y_train, y_pred_train),
    accuracy,
    accuracy_score(y_train, y_pred_train),
    f1,
    f1_score(y_train, y_pred_train),
])

In [None]:
# Define features (every column except the target) and the target variable
X = df_train.drop('smoking', axis=1)
y = df_train['smoking']

# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Train the linear SVC model (this rebinds `svc_model`, replacing the kernel SVC).
# NOTE(review): the features are passed in unscaled; LinearSVC may converge slowly or
# warn about convergence on unscaled inputs — consider standardising them; confirm intent.
svc_model = LinearSVC(random_state=42)
svc_model.fit(X_train, y_train)

# Make predictions; the training set is predicted once and reused for all train metrics
y_pred = svc_model.predict(X_test)
y_pred_train = svc_model.predict(X_train)

# Evaluate the model
accuracy = accuracy_score(y_test, y_pred)
f1 = f1_score(y_test, y_pred)
print(f"Genauigkeit des besten Modells auf dem Testdatensatz: {accuracy:.4f}")
print(f"Genauigkeit des besten Modells auf dem Testdatensatz (f1): {f1:.4f}")


# Store the real MSE instead of a 0 placeholder, which would read as a perfect score
# in the comparison table.
model_errors.append([
    'Linear SVC',
    mean_squared_error(y_test, y_pred),
    mean_squared_error(y_train, y_pred_train),
    accuracy,
    accuracy_score(y_train, y_pred_train),
    f1,
    f1_score(y_train, y_pred_train),
])

In [None]:
# DISABLED CELL: hyperparameter search for SVC, kept for reference only.
# It would run a 5-fold GridSearchCV (scoring='accuracy') over linear, rbf and poly
# kernels with the parameter grids below and report test accuracy / F1.
# NOTE(review): presumably disabled because of its runtime — confirm, then either
# re-enable it behind a cache or remove the cell.
#
# # Define features and target variable
# X = df_train.drop('smoking', axis=1)
# y = df_train['smoking']

# # Split the data into training and testing sets
# X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# svc=SVC()



# # declare parameters for hyperparameter tuning
# parameters = [ {'C':[1, 10, 100, 1000], 'kernel':['linear']},
#                {'C':[1, 10, 100, 1000], 'kernel':['rbf'], 'gamma':[0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9]},
#                {'C':[1, 10, 100, 1000], 'kernel':['poly'], 'degree': [2,3,4] ,'gamma':[0.01,0.02,0.03,0.04,0.05]}
#               ]




# grid_search = GridSearchCV(estimator = svc,
#                            param_grid = parameters,
#                            scoring = 'accuracy',
#                            cv = 5,
#                            verbose=0)


# grid_search.fit(X_train, y_train)

# y_pred = grid_search.predict(X_test)
# accuracy = accuracy_score(y_test, y_pred)
# f1 = f1_score(y_test, y_pred)
# print(f"Genauigkeit des besten Modells auf dem Testdatensatz: {accuracy:.4f}")
# print(f"Genauigkeit des besten Modells auf dem Testdatensatz (f1): {f1:.4f}")

 

## Model Evaluation

In [None]:
# Visualize model errors: one row per evaluated model, in the order the cells appended them
model_errors_df = pd.DataFrame(model_errors, columns=['Model', 'Test MSE', 'Train MSE', 'Test Accuracy', 'Train Accuracy', 'Test F1', 'Train F1'])
plt.figure(figsize=(12, 8))
# Both series get a marker so each model's individual value is visible on both lines
sns.lineplot(x='Model', y='Test Accuracy', data=model_errors_df, color='blue', label='Test Accuracy', marker='o')
sns.lineplot(x='Model', y='Train Accuracy', data=model_errors_df, color='orange', label='Train Accuracy', marker='o')
# The axis carries both series, so use a neutral label rather than the name of one column
plt.ylabel('Accuracy')
# Right-align the rotated tick labels so each one ends under its own tick
plt.xticks(rotation=45, ha='right')
plt.title('Model Performance Comparison')
plt.legend()
plt.tight_layout()
plt.show()