Support Vector Machines (SVM) beschreiben eine ML-Methode für supervised Learning.

In [1]:
import pandas as pd

# Load the raw patient table.
dataset = pd.read_csv("lung_cancer_dataset.csv")

# Low-cardinality text columns become pandas categoricals.
dataset['gender'] = dataset['gender'].astype('category')
dataset['radon_exposure'] = dataset['radon_exposure'].astype('category')
# Missing entries are filled with the string 'None' first, so they form their own category level.
dataset['alcohol_consumption'] = dataset['alcohol_consumption'].fillna('None').astype('category')

# Yes/No text columns become booleans; any value other than 'Yes'/'No' maps to NaN.
yes_no_columns = [
    'asbestos_exposure',
    'secondhand_smoke_exposure',
    'copd_diagnosis',
    'family_history',
    'lung_cancer',
]
for column in yes_no_columns:
    dataset[column] = dataset[column].map({'Yes': True, 'No': False})

# Report the number of fully duplicated rows. Only the last expression of a cell is
# displayed, so the count has to be printed explicitly to be visible.
print("Duplicate rows:", dataset.duplicated().sum())

# Show the first rows of the converted table.
dataset.head()

Unnamed: 0,patient_id,age,gender,pack_years,radon_exposure,asbestos_exposure,secondhand_smoke_exposure,copd_diagnosis,alcohol_consumption,family_history,lung_cancer
0,100000,69,Male,66.025244,High,False,False,True,Moderate,False,False
1,100001,32,Female,12.7808,High,False,True,True,Moderate,True,True
2,100002,89,Female,0.408278,Medium,True,True,True,,False,True
3,100003,78,Female,44.065232,Low,False,True,False,Moderate,False,True
4,100004,38,Female,44.43244,Medium,True,False,True,,True,True


In [2]:
from sklearn.model_selection import train_test_split

# Target is the 'lung_cancer' column; features are all columns from 'age'
# through 'family_history' (label-based slice, both ends inclusive).
target = dataset['lung_cancer']
data = dataset.loc[:, 'age':'family_history']

# Hold out 30 % of the rows for testing; the fixed seed keeps the split reproducible.
train_data, test_data, train_label, test_label = train_test_split(
    data,
    target,
    test_size=0.3,
    random_state=0,
)

In [None]:
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder

# Split the feature columns by dtype
categorical_cols = data.select_dtypes(include=['category']).columns.tolist()
numeric_cols = data.select_dtypes(include=['number', 'bool']).columns.tolist()

# ColumnTransformer drops every column it is not told about (default remainder='drop'),
# so make sure no feature silently falls through the two dtype filters above.
unassigned_cols = [col for col in data.columns if col not in categorical_cols + numeric_cols]
assert not unassigned_cols, f"Columns not covered by the preprocessor: {unassigned_cols}"

# Preprocessing: numeric/boolean columns are passed through unchanged,
# categorical columns are one-hot encoded with the first level dropped.
preprocessor = ColumnTransformer([
    ("num", "passthrough", numeric_cols),  # author's note: RobustScaler() had little to no effect
    ("cat", OneHotEncoder(drop='first'), categorical_cols)
])

X_processed = preprocessor.fit_transform(train_data)
# Depending on the overall density the transformer may return a scipy sparse matrix,
# which pd.DataFrame cannot combine with explicit column names; densify for display.
if hasattr(X_processed, "toarray"):
    X_processed = X_processed.toarray()

# Column labels: passthrough columns keep their names, followed by the encoder's generated names
cat_feature_names = preprocessor.named_transformers_['cat'].get_feature_names_out(categorical_cols)
all_feature_names = numeric_cols + list(cat_feature_names)

# Preview of the transformed training features
X_processed_df = pd.DataFrame(X_processed, columns=all_feature_names)
X_processed_df.head()

InvalidParameterError: The 'drop' parameter of OneHotEncoder must be a str among {'if_binary', 'first'}, an array-like or None. Got 'fist' instead.

Allg. gilt:
- k beschreibt die Anzahl der Folds einer Cross-Validation (wie oft die Daten in Trainings- und Testsets aufgeteilt werden)
    - -> beeinflusst die Robustheit der Modellbewertung & nicht das Modell selbst
- C beschreibt die Stärke der Regularisierung vom SVM-Modell
    - -> kleiner C-Wert (z.B. 0.0001) = starke Regularisierung // das Modell toleriert mehr Fehler im Training
    - -> großer C-Wert (z.B. 1) = schwächere Regularisierung // das Modell passt sich stärker an die Trainingsdaten an

In [None]:
from sklearn.svm import SVC
from sklearn.pipeline import Pipeline

# Chain the column preprocessing and the SVM classifier so that both are
# fitted together; the step names are used to address nested parameters.
svc_pipeline = Pipeline([
    ('preprocessor', preprocessor),
    ('classifier', SVC()),
])

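Mithilfe von GridSearchCV lassen sich (laut Copilot) die Hyperparameter (z. B. C, gamma) systematisch verbessern: Alle Kombinationen des Parameter-Rasters werden per Cross-Validation bewertet und die beste Kombination wird ausgewählt.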

In [None]:
from sklearn.model_selection import GridSearchCV

# Parameter grid.
# The estimator being tuned is the Pipeline, so every SVC parameter must be addressed
# as '<step name>__<parameter>'. Unprefixed keys such as 'C' are not parameters of the
# Pipeline and make GridSearchCV fail with an "invalid parameter" error.
common_params = {
    'classifier__C': [0.01, 0.1, 1, 10, 100],
    'classifier__class_weight': ['balanced', None],  # 'balanced' compensates for unequal class frequencies
    'classifier__shrinking': [True, False],  # whether the shrinking heuristic is used
    # NOTE(review): probability estimates make fitting slower and are not used by the
    # cells below; consider removing this entry to halve the search.
    'classifier__probability': [True, False],
    'classifier__tol': [1e-3, 1e-4],  # stopping tolerance of the optimizer
}
# NOTE(review): gamma=0.0 looks degenerate (kernel no longer depends on the inputs) - confirm it is wanted.
gamma_values = ['auto', 'scale', 0.0, 0.1, 0.3, 0.5, 0.7, 1]
coef0_values = [0.0, 0.1, 0.5]

# One sub-grid per kernel, so kernel-specific parameters are only varied where they
# matter (gamma: rbf/poly/sigmoid, degree: poly, coef0: poly/sigmoid). This covers
# the same distinct models as one big grid without refitting identical ones.
# The 'precomputed' kernel is left out because it requires a precomputed kernel matrix.
param_grid = [
    {**common_params, 'classifier__kernel': ['linear']},
    {**common_params, 'classifier__kernel': ['rbf'],
     'classifier__gamma': gamma_values},
    {**common_params, 'classifier__kernel': ['poly'],
     'classifier__gamma': gamma_values,
     'classifier__degree': [2, 3, 4],  # degree of the polynomial
     'classifier__coef0': coef0_values},
    {**common_params, 'classifier__kernel': ['sigmoid'],
     'classifier__gamma': gamma_values,
     'classifier__coef0': coef0_values},
]

# Grid search over the whole pipeline with 5-fold cross-validation.
# n_jobs=-1 uses all available CPU cores; verbose=2 prints detailed progress.
grid_search = GridSearchCV(svc_pipeline, param_grid, cv=5, scoring='accuracy', n_jobs=-1, verbose=2)
grid_search.fit(train_data, train_label)

# Best parameter combination and the pipeline refitted with it
print("Beste Parameter:", grid_search.best_params_)
best_model = grid_search.best_estimator_

In [None]:
import pandas as pd
import matplotlib.pyplot as plt

# Cross-validation results as a DataFrame (one row per evaluated parameter combination)
results = pd.DataFrame(grid_search.cv_results_)


def find_param_column(columns, param):
    """Return the cv_results_ column that stores hyperparameter `param`.

    Accepts both the plain form ('param_C') and the pipeline-prefixed form
    ('param_<step>__C'), so the plot works regardless of how the grid keys
    were written.

    Raises:
        KeyError: if no matching column exists.
    """
    for column in columns:
        if column == f'param_{param}' or (column.startswith('param_') and column.endswith(f'__{param}')):
            return column
    raise KeyError(f"No cv_results_ column found for parameter '{param}'")


kernel_col = find_param_column(results.columns, 'kernel')
c_col = find_param_column(results.columns, 'C')

# Compare the kernels across the different C values
kernels = results[kernel_col].unique()
c_values = results[c_col].unique()

fig, ax = plt.subplots(figsize=(10, 7))
for kernel in kernels:
    kernel_rows = results[results[kernel_col] == kernel]
    # Each (kernel, C) pair occurs once per combination of the remaining parameters.
    # Plotting all rows would draw many points per C and a zigzagging line, so keep
    # only the best mean CV score reached for each C.
    best_per_c = (
        kernel_rows['mean_test_score']
        .groupby(kernel_rows[c_col].astype(float))
        .max()
    )
    ax.plot(
        best_per_c.index,
        best_per_c.values,
        marker='o',
        label=f'Kernel: {kernel}'
    )

# The C values span several orders of magnitude, hence the logarithmic axis.
ax.set_xscale('log')
ax.set(
    xlabel='C',
    ylabel='Best mean CV accuracy',
    title='GridSearchCV: Vergleich der SVM-Kernel'
)
ax.legend()
plt.show()

In [None]:
from sklearn.metrics import accuracy_score, f1_score, confusion_matrix

# Predict the held-out test split with the tuned pipeline
y_pred = best_model.predict(test_data)

# Print each test-set metric under its caption
for caption, metric in (
    ("Accuracy:", accuracy_score),
    ("F1-Score:", f1_score),
    ("Confusion Matrix:\n", confusion_matrix),
):
    print(caption, metric(test_label, y_pred))

In [None]:
from sklearn.metrics import classification_report

# Text report built from the true test labels and the predictions
report_text = classification_report(test_label, y_pred)
print(report_text)

In [None]:
from sklearn.metrics import ConfusionMatrixDisplay
import matplotlib.pyplot as plt

# Confusion matrix of the tuned model on the held-out test split
fig, ax = plt.subplots(1, figsize=(10, 8))
_ = ConfusionMatrixDisplay.from_estimator(
    best_model,
    test_data,
    test_label,
    # Take the tick labels from the fitted estimator so they follow the same class
    # order as the matrix rows/columns. Series.unique() returns values in order of
    # first appearance, which can differ and would then mislabel the axes.
    display_labels=best_model.classes_,
    cmap=plt.cm.Blues,
    ax=ax
)

In [None]:
from sklearn.model_selection import validation_curve

# C values to evaluate (same range as in the grid search)
c_values = [0.01, 0.1, 1, 10, 100]

# best_model is the fitted Pipeline, so C has to be addressed through the
# 'classifier' step; a plain 'C' is not a parameter of the Pipeline.
train_scores, valid_scores = validation_curve(
    best_model,
    X=train_data,
    y=train_label,
    param_name='classifier__C',
    param_range=c_values,
    cv=5
)

# Mean and standard deviation across the CV folds (one value per C)
train_scores_mean = train_scores.mean(axis=1)
train_scores_std = train_scores.std(axis=1)
valid_scores_mean = valid_scores.mean(axis=1)
valid_scores_std = valid_scores.std(axis=1)

# Kernel of the tuned classifier, read from the model itself rather than from a
# loop variable left over from an earlier cell.
best_kernel = best_model.named_steps['classifier'].kernel

# create the figure
fig, ax = plt.subplots(figsize=(10, 10))

ax.plot(c_values, train_scores_mean, label="Training score", color="darkorange")
ax.fill_between(
    c_values,
    train_scores_mean - train_scores_std,
    train_scores_mean + train_scores_std,
    color="darkorange",
    alpha=0.2
)

ax.plot(c_values, valid_scores_mean, label="Cross-validation score", color="navy")
ax.fill_between(
    c_values,
    valid_scores_mean - valid_scores_std,
    valid_scores_mean + valid_scores_std,
    color="navy",
    alpha=0.2
)

# The C values span several orders of magnitude, hence the logarithmic axis.
ax.set_xscale("log")
ax.set(
    title=f"Validation Curve with best model (Kernel: {best_kernel})",
    xlabel="C",
    ylabel="Score"
)
ax.legend(loc="best")
# plt.show() instead of fig.show(): the latter warns on the non-GUI inline backend.
plt.show()