In [5]:
import pandas as pd


data = pd.read_csv("diabetes_dataset.csv")
print(data.head())
print(data.info())
print(data.describe())


   Pregnancies  Glucose  BloodPressure  SkinThickness  Insulin   BMI  \
0            6      148             72             35        0  33.6   
1            1       85             66             29        0  26.6   
2            8      183             64              0        0  23.3   
3            1       89             66             23       94  28.1   
4            0      137             40             35      168  43.1   

   DiabetesPedigreeFunction  Age  Outcome  
0                     0.627   50        1  
1                     0.351   31        0  
2                     0.672   32        1  
3                     0.167   21        0  
4                     2.288   33        1  
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 768 entries, 0 to 767
Data columns (total 9 columns):
 #   Column                    Non-Null Count  Dtype  
---  ------                    --------------  -----  
 0   Pregnancies               768 non-null    int64  
 1   Glucose                   768

In [6]:

print((data == 0).sum())

print(data.isnull().sum())


Pregnancies                 111
Glucose                       5
BloodPressure                35
SkinThickness               227
Insulin                     374
BMI                          11
DiabetesPedigreeFunction      0
Age                           0
Outcome                     500
dtype: int64
Pregnancies                 0
Glucose                     0
BloodPressure               0
SkinThickness               0
Insulin                     0
BMI                         0
DiabetesPedigreeFunction    0
Age                         0
Outcome                     0
dtype: int64


In [7]:
import numpy as np
from sklearn.ensemble import RandomForestRegressor


cols_with_missing = ["Pregnancies", "Glucose", "BloodPressure", "SkinThickness", "Insulin", "BMI"]
data[cols_with_missing] = data[cols_with_missing].replace(0, np.nan)


def fill_missing_with_rf(data, target_col):

    if data[target_col].isnull().sum() > 0:

        not_null_data = data[data[target_col].notnull()]

        null_data = data[data[target_col].isnull()]


        X_train = not_null_data.drop(columns=[target_col])
        y_train = not_null_data[target_col]
        X_predict = null_data.drop(columns=[target_col])

        rf = RandomForestRegressor(n_estimators=100, random_state=0)
        rf.fit(X_train, y_train)

        data.loc[data[target_col].isnull(), target_col] = rf.predict(X_predict)


for col in cols_with_missing:
    fill_missing_with_rf(data, col)


print(data.isnull().sum())


Pregnancies                 0
Glucose                     0
BloodPressure               0
SkinThickness               0
Insulin                     0
BMI                         0
DiabetesPedigreeFunction    0
Age                         0
Outcome                     0
dtype: int64


In [8]:

X = data.drop(columns=["Outcome"])
y = data["Outcome"]


In [9]:
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=0)


In [10]:
from sklearn.preprocessing import StandardScaler


scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)


In [11]:
from sklearn.linear_model import LogisticRegression

model = LogisticRegression(random_state=0)

model.fit(X_train_scaled, y_train)


In [12]:
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix


y_pred = model.predict(X_test_scaled)

accuracy = accuracy_score(y_test, y_pred)
print("Doğruluk Oranı:", accuracy)

print("Sınıflandırma Raporu:\n", classification_report(y_test, y_pred))

print("Confusion Matrix:\n", confusion_matrix(y_test, y_pred))


Doğruluk Oranı: 0.8181818181818182
Sınıflandırma Raporu:
               precision    recall  f1-score   support

           0       0.84      0.91      0.87       107
           1       0.74      0.62      0.67        47

    accuracy                           0.82       154
   macro avg       0.79      0.76      0.77       154
weighted avg       0.81      0.82      0.81       154

Confusion Matrix:
 [[97 10]
 [18 29]]
