In [1]:
# import bibliotek
import pandas as pd
import numpy as np

from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, classification_report

In [2]:
# Load the diabetes CSV into a DataFrame and preview its first rows.
# The relative path is resolved against the notebook's working directory.
DATA_FILE = "diabetes (2).csv"
df = pd.read_csv(DATA_FILE)
df.head()

Unnamed: 0,PatientID,Pregnancies,PlasmaGlucose,DiastolicBloodPressure,TricepsThickness,SerumInsulin,BMI,DiabetesPedigree,Age,Diabetic
0,1354778,0,171,80,34,23,43.509726,1.213191,21,0
1,1147438,8,92,93,47,36,21.240576,0.158365,23,0
2,1640031,7,115,47,52,35,41.511523,0.079019,23,0
3,1883350,9,103,78,25,304,29.582192,1.28287,43,1
4,1424119,1,85,59,27,35,42.604536,0.549542,22,0


In [4]:
# Data preparation: separate the target column from the feature matrix.
TARGET_COLUMN = "Diabetic"
ID_COLUMN = "PatientID"

y = df[TARGET_COLUMN]
# PatientID is excluded from the features together with the target
# (presumably a record identifier rather than a predictor - confirm).
X = df.drop([TARGET_COLUMN, ID_COLUMN], axis=1)
# %%
# Hold out 25% of the rows for testing; stratifying on y keeps the class
# proportions equal in both parts, and the fixed seed makes the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.25, random_state=42, stratify=y
)

In [5]:
# Baseline model: a random forest fitted directly on the features, without a Pipeline.
# fit() returns the estimator itself, so construction and training can be chained.
baseline_model = RandomForestClassifier(random_state=42).fit(X_train, y_train)

# Predict on the held-out rows and measure the fraction classified correctly.
y_pred_baseline = baseline_model.predict(X_test)
baseline_accuracy = accuracy_score(y_true=y_test, y_pred=y_pred_baseline)

baseline_accuracy

# %%
# Per-class precision / recall / F1 for the baseline predictions.
print(classification_report(y_test, y_pred_baseline))

              precision    recall  f1-score   support

           0       0.94      0.96      0.95      2500
           1       0.91      0.89      0.90      1250

    accuracy                           0.93      3750
   macro avg       0.93      0.92      0.93      3750
weighted avg       0.93      0.93      0.93      3750



In [7]:
# Preprocessing + model combined in one estimator:
# standardize the features, then fit a logistic regression on the scaled values.
scaling_step = ("scaler", StandardScaler())
classifier_step = ("classifier", LogisticRegression(max_iter=1000, random_state=42))
pipeline = Pipeline([scaling_step, classifier_step])


# %%
# Fitting the pipeline fits the scaler and the classifier on the training rows only.
pipeline.fit(X_train, y_train)

# Predict on the held-out rows and measure the fraction classified correctly.
y_pred_pipeline = pipeline.predict(X_test)
pipeline_accuracy = accuracy_score(y_true=y_test, y_pred=y_pred_pipeline)

pipeline_accuracy
# %%
# Per-class precision / recall / F1 for the pipeline predictions.
print(classification_report(y_test, y_pred_pipeline))

              precision    recall  f1-score   support

           0       0.82      0.88      0.85      2500
           1       0.72      0.60      0.65      1250

    accuracy                           0.79      3750
   macro avg       0.77      0.74      0.75      3750
weighted avg       0.78      0.79      0.78      3750



In [8]:
# Model comparison: one row per model with its accuracy on the test split.
model_names = ["Random Forest (baseline)", "Logistic Regression (Pipeline)"]
model_accuracies = [baseline_accuracy, pipeline_accuracy]

results = pd.DataFrame({"Model": model_names, "Accuracy": model_accuracies})

results

Unnamed: 0,Model,Accuracy
0,Random Forest (baseline),0.933867
1,Logistic Regression (Pipeline),0.788


In [13]:
### Wnioski:
# - Random Forest osiąga wyraźnie wyższą skuteczność (accuracy ~0.93).
# - Pipeline z regresją logistyczną daje niższy wynik (~0.79),
#   ale zapewnia poprawny i bezpieczny proces przetwarzania danych
#   (skalowanie jest dopasowywane wyłącznie na zbiorze treningowym).
# - Pipeline ułatwia walidację, tuning hiperparametrów i wdrożenie modelu.