In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the Kaggle input directory and print the full path of every file found.
input_root = '/kaggle/input'
for dirname, _, filenames in os.walk(input_root):
    full_paths = [os.path.join(dirname, name) for name in filenames]
    for full_path in full_paths:
        print(full_path)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All"
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [None]:
# Uncomment if the `boruta` package (imported further down) is not installed in this environment:
# %pip install boruta

## Porównanie modeli: XGBoost vs Random Forest + Wykresy ROC-AUC

W tym zadaniu zbudujemy modele klasyfikacyjne XGBoost i RandomForestClassifier, wytrenujemy je na tym samym zbiorze danych, narysujemy wykresy ROC-AUC dla obu, a następnie przeprowadzimy selekcję cech algorytmem Boruta (dla RF).


In [None]:
# 1. Przygotuj dane
import numpy as np
import pandas as pd
from sklearn.datasets import make_classification
from sklearn.model_selection import train_test_split

# Dataset: synthetic binary classification problem with 1200 samples and 12 features
# (7 informative, 3 redundant); flip_y adds label noise, random_state fixes the draw.
X, y = make_classification(
    n_samples=1200,
    n_features=12,
    n_informative=7,
    n_redundant=3,
    n_classes=2,
    flip_y=0.04,
    class_sep=1.2,
    random_state=42,
)

# Wrap the array in a DataFrame with 1-based column names feat_1 ... feat_12.
feature_names = [f'feat_{idx}' for idx in range(1, X.shape[1] + 1)]
df = pd.DataFrame(X, columns=feature_names)

# 70/30 split, stratified on y so both parts keep the same class proportions.
X_train, X_test, y_train, y_test = train_test_split(
    df,
    y,
    test_size=0.3,
    stratify=y,
    random_state=1,
)



In [None]:
# Show the feature array produced by make_classification as this cell's output.
X

In [None]:
# Show the label array produced by make_classification as this cell's output.
y

In [None]:
# 2. Trening XGBoost i RandomForest
import xgboost as xgb
from sklearn.ensemble import RandomForestClassifier

# Both classifiers are fitted on the same training split and scored on the same test
# split. The y_pred_* arrays hold the predicted probability from column 1 of
# predict_proba (not hard class labels), which is what the ROC computation consumes.
xgb_params = dict(eval_metric='logloss', use_label_encoder=False, random_state=1)
rf_params = dict(n_estimators=200, random_state=1)

# XGBoost
xgb_model = xgb.XGBClassifier(**xgb_params)
xgb_model.fit(X_train, y_train)
xgb_proba = xgb_model.predict_proba(X_test)
y_pred_xgb = xgb_proba[:, 1]

# RandomForest
rf_model = RandomForestClassifier(**rf_params)
rf_model.fit(X_train, y_train)
rf_proba = rf_model.predict_proba(X_test)
y_pred_rf = rf_proba[:, 1]

In [None]:
# 3. Wykres ROC AUC dla obu modeli
from sklearn.metrics import roc_curve, auc
import matplotlib.pyplot as plt

# ROC points and area under the curve for each model, computed from the test-set scores.
fpr_xgb, tpr_xgb, _ = roc_curve(y_test, y_pred_xgb)
auc_xgb = auc(fpr_xgb, tpr_xgb)
fpr_rf, tpr_rf, _ = roc_curve(y_test, y_pred_rf)
auc_rf = auc(fpr_rf, tpr_rf)

# Draw both curves on one axes so they can be compared directly.
fig, ax = plt.subplots(figsize=(7, 5))
ax.plot(fpr_xgb, tpr_xgb, label=f'XGBoost (AUC={auc_xgb:.3f})', lw=2)
ax.plot(fpr_rf, tpr_rf, label=f'Random Forest (AUC={auc_rf:.3f})', lw=2, linestyle='--')
ax.plot([0, 1], [0, 1], 'k--', lw=1)  # diagonal reference line
ax.set_xlabel('False Positive Rate')
ax.set_ylabel('True Positive Rate')
ax.set_title('ROC Curve: XGBoost vs RandomForest')
ax.legend()
ax.grid()
plt.show()

## Boruta – selekcja cech z Random Forest

Poniżej – porównanie AUC modelu RF przed i po selekcji cech.

In [None]:
# 4. Boruta z RandomForest
from boruta import BorutaPy

# Feature selection is fitted on the training split only, so the test split plays no
# part in choosing features.
# NOTE(review): with n_estimators='auto' BorutaPy appears to pick the forest size itself,
# so the n_estimators=2000 given to the base estimator is probably not what is used -
# confirm against the BorutaPy documentation.
boruta_base_rf = RandomForestClassifier(n_estimators=2000, random_state=1)
boruta_selector = BorutaPy(
    estimator=boruta_base_rf,
    n_estimators='auto',
    perc=85,
    max_iter=50,
    random_state=1,
    verbose=2,
)
boruta_selector.fit(X_train.values, y_train)  # the selector is given the raw ndarray

# support_ is used as a mask over the training columns to recover the selected names.
selected = boruta_selector.support_
selected_features = X_train.columns[selected]
print(f"Wybrane cechy przez Borutę: {list(selected_features)}")

if len(selected_features) == 0:
    # Nothing was selected: leave None as a sentinel so the plotting cell can skip the curve.
    auc_rf_boruta = None
    print("Boruta nie wybrała żadnych cech – porównanie ograniczone do pełnego zbioru.")
else:
    # Train a forest with the same settings as rf_model, restricted to the selected columns.
    X_train_selected = X_train[selected_features]
    X_test_selected = X_test[selected_features]
    rf_model_boruta = RandomForestClassifier(n_estimators=200, random_state=1)
    rf_model_boruta.fit(X_train_selected, y_train)
    y_pred_rf_boruta = rf_model_boruta.predict_proba(X_test_selected)[:, 1]
    fpr_rf_boruta, tpr_rf_boruta, _ = roc_curve(y_test, y_pred_rf_boruta)
    auc_rf_boruta = auc(fpr_rf_boruta, tpr_rf_boruta)

In [None]:
# 5. ROC curve of the random forest before and after Boruta feature selection
plt.figure(figsize=(7,5))
plt.plot(fpr_rf, tpr_rf, label=f'RF - wszystkie cechy (AUC={auc_rf:.3f})', lw=2)
# auc_rf_boruta is None only when Boruta selected no features. Compare against None
# explicitly: a plain truthiness test would also skip a legitimate AUC of 0.0.
if auc_rf_boruta is not None:
    plt.plot(fpr_rf_boruta, tpr_rf_boruta, label=f'RF + Boruta (AUC={auc_rf_boruta:.3f})', lw=2, linestyle='--')
plt.plot([0, 1], [0, 1], 'k--', lw=1)  # diagonal reference line
plt.xlabel('False Positive Rate')
plt.ylabel('True Positive Rate')
plt.title('RF: ROC Curve (pełny vs Boruta)')
plt.legend()
plt.grid()
plt.show()

In [None]:
import seaborn as sns
import matplotlib.pyplot as plt

# Plot the features Boruta actually selected rather than a hand-copied list, so the
# figure stays in sync with the selector's output on every re-run. If nothing was
# selected, fall back to all feature columns so the pairplot still has variables to draw.
pairplot_features = list(selected_features) if len(selected_features) > 0 else list(df.columns)

# Work on a copy so the label column is added only to the plotting frame, not to df.
df_plot = df.copy()
df_plot['target'] = y

sns.pairplot(df_plot[pairplot_features + ['target']], hue='target', palette='Set1', plot_kws={'alpha':0.5})
plt.suptitle('Pairplot wybranych cech', y=1.02)
plt.show()

In [None]:
# Scatter of feat_1 against feat_2, with points coloured by the class label.
fig, ax = plt.subplots(figsize=(6, 5))
sns.scatterplot(data=df_plot, x='feat_1', y='feat_2', hue='target', ax=ax)
ax.set_title('feat_1 vs feat_2')
plt.show()

In [None]:
# Scatter of feat_3 against feat_4, with points coloured by the class label.
fig, ax = plt.subplots(figsize=(6, 5))
sns.scatterplot(data=df_plot, x='feat_3', y='feat_4', hue='target', ax=ax)
ax.set_title('feat_3 vs feat_4')
plt.show()