In [None]:
import pandas as pd
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.metrics import accuracy_score
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.linear_model import LogisticRegression

# 1. Load the data
df = pd.read_csv('penguins.csv')

# 2. Explore the data: first rows, column summary, and class balance
print(df.head())
# DataFrame.info() writes its report to stdout itself and returns None,
# so wrapping it in print() only added a stray "None" line to the output.
df.info()
print(df['Species'].value_counts())

# 3. Drop rows with missing values
df = df.dropna()

# 4./5. Separate features from labels, then one-hot encode the text columns.
# The target is split off BEFORE encoding: running get_dummies on the whole
# frame would expand a textual 'Species' column into dummy columns, after
# which neither df.drop('Species') nor df['Species'] could find it.
# Numeric feature columns pass through get_dummies unchanged.
X = pd.get_dummies(df.drop(columns='Species'), drop_first=True)
y = df['Species']

# 6. Encode the labels as integers (e.g. a name becomes a number).
# The fitted encoder is kept in `le` so predictions can be mapped back
# to the original labels with le.inverse_transform().
le = LabelEncoder().fit(y)
y = le.transform(y)

# 7. Split into training and test sets: 20% of the rows are held out,
# the split is stratified on the labels, and the seed is fixed.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
    stratify=y,
)


# 8. Feature scaling (used by KNN and logistic regression).
# The scaler learns its statistics from the training set only and is then
# applied to both sets, so nothing from the test set influences the scaling.
# NOTE(review): the scaler is fit on the full training set before the
# grid-search cross-validation below, so the CV folds share scaling
# statistics; wrapping scaler + model in a Pipeline would avoid that.
scaler = StandardScaler().fit(X_train)
X_train_scaled = scaler.transform(X_train)
X_test_scaled = scaler.transform(X_test)

# 9. KNN – grid search over the number of neighbours (5-fold CV, scaled features)
param_knn = {'n_neighbors': [3, 5, 7]}
knn = GridSearchCV(estimator=KNeighborsClassifier(), param_grid=param_knn, cv=5)
knn.fit(X_train_scaled, y_train)

# Score the best estimator found by the search on the held-out test set.
knn_test_predictions = knn.predict(X_test_scaled)
print("KNN – najlepsze parametry:", knn.best_params_)
print("KNN – dokładność:", accuracy_score(y_test, knn_test_predictions))

# 10. Decision tree – grid search over max_depth (5-fold CV, unscaled features)
param_tree = {'max_depth': [3, 5, 7]}
# random_state is fixed because the tree permutes the candidate features at
# every split; without a seed, ties between equally good splits can be broken
# differently on each run, so the chosen depth and the test accuracy could
# vary. 42 matches the seed used for the train/test split.
tree = GridSearchCV(DecisionTreeClassifier(random_state=42), param_tree, cv=5)
tree.fit(X_train, y_train)
print("Drzewo – najlepsze parametry:", tree.best_params_)
print("Drzewo – dokładność:", accuracy_score(y_test, tree.predict(X_test)))

# 11. Logistic regression – grid search over C (5-fold CV, scaled features).
# C is the inverse of the regularisation strength.
param_logreg = {'C': [0.01, 0.1, 1.0, 10.0]}
logreg = GridSearchCV(
    estimator=LogisticRegression(max_iter=1000),
    param_grid=param_logreg,
    cv=5,
)
logreg.fit(X_train_scaled, y_train)

# Score the best estimator found by the search on the held-out test set.
logreg_test_predictions = logreg.predict(X_test_scaled)
print("Regresja logistyczna – najlepsze parametry:", logreg.best_params_)
print("Regresja logistyczna – dokładność:", accuracy_score(y_test, logreg_test_predictions))


   CulmenLength  CulmenDepth  FlipperLength  BodyMass  Species
0          39.1         18.7          181.0    3750.0        0
1          39.5         17.4          186.0    3800.0        0
2          40.3         18.0          195.0    3250.0        0
3           NaN          NaN            NaN       NaN        0
4          36.7         19.3          193.0    3450.0        0
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 344 entries, 0 to 343
Data columns (total 5 columns):
 #   Column         Non-Null Count  Dtype  
---  ------         --------------  -----  
 0   CulmenLength   342 non-null    float64
 1   CulmenDepth    342 non-null    float64
 2   FlipperLength  342 non-null    float64
 3   BodyMass       342 non-null    float64
 4   Species        344 non-null    int64  
dtypes: float64(4), int64(1)
memory usage: 13.6 KB
None
Species
0    152
1    124
2     68
Name: count, dtype: int64
KNN – najlepsze parametry: {'n_neighbors': 5}
KNN – dokładność: 0.9855072463768116
Drzewo – n