In [1]:
# 1. Imports
# ===============================
import pandas as pd
import numpy as np

from sklearn.base import clone
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.impute import SimpleImputer
from sklearn.metrics import accuracy_score, classification_report

from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier, VotingClassifier

# ===============================
# 2. Load the data
# ===============================
# Read the dataset into a DataFrame and print a quick overview of it.
df = pd.read_csv("penguins (2).csv")
print(df.head())
# DataFrame.info() writes its summary straight to stdout and returns None,
# so wrapping it in print() would emit an extra "None" line after the summary.
df.info()
print(df.describe())

# ===============================
# 3. Split into features and labels
# ===============================
# "Species" is the target; every other column is used as a feature.
y = df["Species"]
X = df.drop(columns="Species")

# Hold out 20% of the rows for testing. The split is stratified on y and
# seeded so that it is reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y
)

# ===============================
# 4. Preprocessing
# ===============================
# Two-step preprocessing: mean imputation of missing values, followed by
# standard scaling of the features.
mean_imputer = SimpleImputer(strategy="mean")
standardizer = StandardScaler()
preprocess = Pipeline(
    steps=[
        ("imputer", mean_imputer),
        ("scaler", standardizer),
    ]
)

# ===============================
# 5. Models
# ===============================

# --- KNN ---
# Pipeline.fit() fits its steps in place, and the module-level `preprocess`
# object is nested in other pipelines in this script as well. Nesting it
# directly would let a later fit of one of those pipelines refit the
# imputer/scaler this model relies on. clone() gives this pipeline its own
# unfitted copy with the same parameters.
knn = Pipeline([
    ("prep", clone(preprocess)),
    ("model", KNeighborsClassifier(n_neighbors=5))
])
knn.fit(X_train, y_train)
y_pred_knn = knn.predict(X_test)

# --- Decision Tree ---
# Only imputation precedes the tree; this pipeline has no scaling step.
dt_steps = [
    ("imputer", SimpleImputer(strategy="mean")),
    ("model", DecisionTreeClassifier(random_state=42)),
]
dt = Pipeline(steps=dt_steps)
dt.fit(X_train, y_train)
y_pred_dt = dt.predict(X_test)

# --- SVC ---
# Pipeline.fit() fits its steps in place, and the module-level `preprocess`
# object is nested in other pipelines in this script as well. Fitting this
# pipeline with the shared object would overwrite the fitted state those
# pipelines depend on, so it gets its own unfitted copy via clone().
svc = Pipeline([
    ("prep", clone(preprocess)),
    ("model", SVC())
])
svc.fit(X_train, y_train)
y_pred_svc = svc.predict(X_test)

# --- Random Forest ---
# Mean imputation followed by a seeded random forest (no scaling step).
rf_imputer = SimpleImputer(strategy="mean")
rf_model = RandomForestClassifier(random_state=42)
rf = Pipeline(steps=[("imputer", rf_imputer), ("model", rf_model)])
rf.fit(X_train, y_train)
y_pred_rf = rf.predict(X_test)

# --- AdaBoost ---
# Mean imputation followed by a seeded AdaBoost classifier (no scaling step).
ada = Pipeline(
    steps=[
        ("imputer", SimpleImputer(strategy="mean")),
        ("model", AdaBoostClassifier(random_state=42)),
    ]
)
ada.fit(X_train, y_train)
y_pred_ada = ada.predict(X_test)

# ===============================
# 6. Voting Ensemble
# ===============================
# Hard (majority) vote over the KNN, SVC and random-forest pipelines.
ensemble_members = [("knn", knn), ("svc", svc), ("rf", rf)]
voting = VotingClassifier(estimators=ensemble_members, voting="hard")

voting.fit(X_train, y_train)
y_pred_voting = voting.predict(X_test)

# ===============================
# 7. GridSearchCV (SVC)
# ===============================
# Pipeline whose hyperparameters are tuned: preprocessing followed by an SVC.
svc_pipeline = Pipeline(steps=[("prep", preprocess), ("model", SVC())])

# The "model__" prefix routes each parameter to the pipeline's "model" step.
param_grid = {
    "model__C": [0.1, 1, 10],
    "model__gamma": ["scale", "auto"],
}

# 5-fold cross-validated search over the grid, scored by accuracy.
grid = GridSearchCV(
    estimator=svc_pipeline,
    param_grid=param_grid,
    scoring="accuracy",
    cv=5,
)

grid.fit(X_train, y_train)
y_pred_grid = grid.predict(X_test)

# ===============================
# 8. Results
# ===============================
# Test-set predictions keyed by the label shown in the printed summary.
predictions_by_model = {
    "KNN": y_pred_knn,
    "DecisionTree": y_pred_dt,
    "SVC": y_pred_svc,
    "RandomForest": y_pred_rf,
    "AdaBoost": y_pred_ada,
    "VotingEnsemble": y_pred_voting,
    "SVC_GridSearch": y_pred_grid,
}

# Accuracy of every model on the held-out test labels.
models_results = {
    label: accuracy_score(y_test, y_hat)
    for label, y_hat in predictions_by_model.items()
}

print("\nACCURACY MODELI:")
for model, acc in models_results.items():
    print(f"{model}: {acc:.3f}")

# Header and best parameter set, each on its own line.
print("\nNajlepsze parametry GridSearchCV:", grid.best_params_, sep="\n")

# ===============================
# 9. Classification report (example)
# ===============================
# Classification report for the voting ensemble on the test set.
voting_report = classification_report(y_test, y_pred_voting)
print("\nClassification Report – Voting Ensemble")
print(voting_report)

   CulmenLength  CulmenDepth  FlipperLength  BodyMass  Species
0          39.1         18.7          181.0    3750.0        0
1          39.5         17.4          186.0    3800.0        0
2          40.3         18.0          195.0    3250.0        0
3           NaN          NaN            NaN       NaN        0
4          36.7         19.3          193.0    3450.0        0
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 344 entries, 0 to 343
Data columns (total 5 columns):
 #   Column         Non-Null Count  Dtype  
---  ------         --------------  -----  
 0   CulmenLength   342 non-null    float64
 1   CulmenDepth    342 non-null    float64
 2   FlipperLength  342 non-null    float64
 3   BodyMass       342 non-null    float64
 4   Species        344 non-null    int64  
dtypes: float64(4), int64(1)
memory usage: 13.6 KB
None
       CulmenLength  CulmenDepth  FlipperLength     BodyMass     Species
count    342.000000   342.000000     342.000000   342.000000  344.000000
mean    