In [1]:
import numpy as np
import pandas as pd 
pd.set_option('display.max_columns', 500)
pd.set_option('display.max_rows', None)
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split, GridSearchCV, cross_val_score, cross_validate
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report, roc_auc_score, roc_curve, auc, f1_score, jaccard_score
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC, LinearSVC
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import AdaBoostClassifier, GradientBoostingClassifier, RandomForestClassifier
from xgboost import XGBClassifier
from lightgbm import LGBMClassifier
import warnings
warnings.filterwarnings("ignore")

In [2]:
data = pd.read_csv("../SistersLab-Project/PCOS_data.csv")

In [3]:
df = data.drop(["Sl. No", "Patient File No.",  "Unnamed: 44"], axis=1)

In [4]:
df.rename(columns={'Height(Cm) ': 'Height(Cm)'}, inplace=True)
df.rename(columns={'Marraige Status (Yrs)': 'Marriage Status (Yrs)'}, inplace=True)
df.rename(columns={'Pulse rate(bpm) ': 'Pulse rate(bpm)'}, inplace=True)
df.rename(columns={'II    beta-HCG(mIU/mL)': 'II_beta_HCG(mIU/mL)'}, inplace=True)
df.rename(columns={' Age (yrs)': 'Age (yrs)'}, inplace=True)
df.rename(columns={'  I   beta-HCG(mIU/mL)': 'I_beta_HCG(mIU/mL)'}, inplace=True)
df.rename(columns={'No. of abortions': 'No_of_abortions'}, inplace=True)
df.rename(columns={'BP _Systolic (mmHg)': 'BP_Systolic(mmHg)'}, inplace=True)
df.rename(columns={'BP _Diastolic (mmHg)': 'BP_Diastolic(mmHg)'}, inplace=True)
df.rename(columns={'Waist:Hip Ratio': 'WaistHip_Ratio'}, inplace=True)

In [5]:
df.loc[df['II_beta_HCG(mIU/mL)'] == '1.99.', 'II_beta_HCG(mIU/mL)'] = 1.99
df.loc[df['AMH(ng/mL)'] == 'a', 'AMH(ng/mL)'] = np.nan # eksik değer
df['II_beta_HCG(mIU/mL)'] = pd.to_numeric(df['II_beta_HCG(mIU/mL)'], errors='coerce').astype('float64')
df['AMH(ng/mL)'] = pd.to_numeric(df['AMH(ng/mL)'], errors='coerce').astype('float64')

In [9]:
# df[df.index == 329]
df.drop(329, inplace=True);

In [10]:
df.isnull().sum()

PCOS (Y/N)               0
Age (yrs)                0
Weight (Kg)              0
Height(Cm)               0
BMI                      0
Blood Group              0
Pulse rate(bpm)          0
RR (breaths/min)         0
Hb(g/dl)                 0
Cycle(R/I)               0
Cycle length(days)       0
Marriage Status (Yrs)    1
Pregnant(Y/N)            0
No_of_abortions          0
I_beta_HCG(mIU/mL)       0
II_beta_HCG(mIU/mL)      0
FSH(mIU/mL)              0
LH(mIU/mL)               0
FSH/LH                   0
Hip(inch)                0
Waist(inch)              0
WaistHip_Ratio           0
TSH (mIU/L)              0
AMH(ng/mL)               1
PRL(ng/mL)               0
Vit D3 (ng/mL)           0
PRG(ng/mL)               0
RBS(mg/dl)               0
Weight gain(Y/N)         0
hair growth(Y/N)         0
Skin darkening (Y/N)     0
Hair loss(Y/N)           0
Pimples(Y/N)             0
Fast food (Y/N)          1
Reg.Exercise(Y/N)        0
BP_Systolic(mmHg)        0
BP_Diastolic(mmHg)       0
F

In [11]:
df["Marriage Status (Yrs)"].fillna(df["Marriage Status (Yrs)"].median(), inplace=True)
df["AMH(ng/mL)"].fillna(df["AMH(ng/mL)"].median(), inplace=True)
df["Fast food (Y/N)"].fillna(df["Fast food (Y/N)"].mode()[0], inplace=True)

In [17]:
df.drop(['I_beta_HCG(mIU/mL)', 'II_beta_HCG(mIU/mL)'],axis=1, inplace=True)

In [18]:
y = df[["PCOS (Y/N)"]]
X = df.drop("PCOS (Y/N)", axis=1)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=100, stratify=y)

In [19]:
X.columns

Index(['Age (yrs)', 'Weight (Kg)', 'Height(Cm)', 'BMI', 'Blood Group',
       'Pulse rate(bpm)', 'RR (breaths/min)', 'Hb(g/dl)', 'Cycle(R/I)',
       'Cycle length(days)', 'Marriage Status (Yrs)', 'Pregnant(Y/N)',
       'No_of_abortions', 'FSH(mIU/mL)', 'LH(mIU/mL)', 'FSH/LH', 'Hip(inch)',
       'Waist(inch)', 'WaistHip_Ratio', 'TSH (mIU/L)', 'AMH(ng/mL)',
       'PRL(ng/mL)', 'Vit D3 (ng/mL)', 'PRG(ng/mL)', 'RBS(mg/dl)',
       'Weight gain(Y/N)', 'hair growth(Y/N)', 'Skin darkening (Y/N)',
       'Hair loss(Y/N)', 'Pimples(Y/N)', 'Fast food (Y/N)',
       'Reg.Exercise(Y/N)', 'BP_Systolic(mmHg)', 'BP_Diastolic(mmHg)',
       'Follicle No. (L)', 'Follicle No. (R)', 'Avg. F size (L) (mm)',
       'Avg. F size (R) (mm)', 'Endometrium (mm)'],
      dtype='object')

In [24]:
cols = ['Age (yrs)', 'Weight (Kg)', 'Height(Cm)', 'BMI', 'Blood Group',
       'Pulse rate(bpm)', 'RR (breaths/min)', 'Hb(g/dl)', 'Cycle(R/I)',
       'Cycle length(days)', 'Marriage Status (Yrs)', 'Pregnant(Y/N)',
       'No_of_abortions', #
       'FSH(mIU/mL)', 'LH(mIU/mL)', 'FSH/LH', 'Hip(inch)', 'Waist(inch)',
       'WaistHip_Ratio', 'TSH (mIU/L)', 'AMH(ng/mL)', 'PRL(ng/mL)',
       'Vit D3 (ng/mL)', 'PRG(ng/mL)', 'RBS(mg/dl)', 'Weight gain(Y/N)',
       'hair growth(Y/N)', 'Skin darkening (Y/N)', 'Hair loss(Y/N)',
       'Pimples(Y/N)', 'Fast food (Y/N)', 'Reg.Exercise(Y/N)',
       'BP_Systolic(mmHg)', 'BP_Diastolic(mmHg)', 'Follicle No. (L)',
       'Follicle No. (R)', 'Avg. F size (L) (mm)', 'Avg. F size (R) (mm)',
       'Endometrium (mm)']

sc = StandardScaler()
for col in cols:
    X_train[col] = sc.fit_transform(X_train[[col]])
    X_test[col] = sc.transform(X_test[[col]])

In [16]:
dt = DecisionTreeClassifier(random_state=42)
dt.fit(X_train,y_train)
y_pred = dt.predict(X_test)
print("Accuracy:",accuracy_score(y_test,y_pred))
print("F1-score:", f1_score(y_test, y_pred, average='weighted')) 
print("Jaccard score:", jaccard_score(y_test, y_pred))
print("AUC score:", roc_auc_score(y_test, y_pred))
print("Confusion matrix: \n", confusion_matrix(y_test, y_pred))
print("Classification Report: \n", classification_report(y_test, y_pred))

Accuracy: 0.8703703703703703
F1-score: 0.8720481044424706
Jaccard score: 0.6818181818181818
AUC score: 0.8669275929549902
Confusion matrix: 
 [[64  9]
 [ 5 30]]
Classification Report: 
               precision    recall  f1-score   support

           0       0.93      0.88      0.90        73
           1       0.77      0.86      0.81        35

    accuracy                           0.87       108
   macro avg       0.85      0.87      0.86       108
weighted avg       0.88      0.87      0.87       108



In [25]:
dt = LogisticRegression(random_state=42)
dt.fit(X_train,y_train)
y_pred = dt.predict(X_test)
print("Accuracy:",accuracy_score(y_test,y_pred))
print("F1-score:", f1_score(y_test, y_pred, average='weighted')) 
print("Jaccard score:", jaccard_score(y_test, y_pred))
print("AUC score:", roc_auc_score(y_test, y_pred))
print("Confusion matrix: \n", confusion_matrix(y_test, y_pred))
print("Classification Report: \n", classification_report(y_test, y_pred))

Accuracy: 0.9351851851851852
F1-score: 0.9337654006528178
Jaccard score: 0.8055555555555556
AUC score: 0.9074363992172212
Confusion matrix: 
 [[72  1]
 [ 6 29]]
Classification Report: 
               precision    recall  f1-score   support

           0       0.92      0.99      0.95        73
           1       0.97      0.83      0.89        35

    accuracy                           0.94       108
   macro avg       0.94      0.91      0.92       108
weighted avg       0.94      0.94      0.93       108



In [33]:
dt = SVC(random_state=42)
dt.fit(X_train,y_train)
y_pred = dt.predict(X_test)
print("Accuracy:",accuracy_score(y_test,y_pred))
print("F1-score:", f1_score(y_test, y_pred, average='weighted')) 
print("Jaccard score:", jaccard_score(y_test, y_pred))
print("AUC score:", roc_auc_score(y_test, y_pred))
print("Confusion matrix: \n", confusion_matrix(y_test, y_pred))
print("Classification Report: \n", classification_report(y_test, y_pred))

Accuracy: 0.9259259259259259
F1-score: 0.9239156920077972
Jaccard score: 0.7777777777777778
AUC score: 0.8931506849315068
Confusion matrix: 
 [[72  1]
 [ 7 28]]
Classification Report: 
               precision    recall  f1-score   support

           0       0.91      0.99      0.95        73
           1       0.97      0.80      0.88        35

    accuracy                           0.93       108
   macro avg       0.94      0.89      0.91       108
weighted avg       0.93      0.93      0.92       108



In [26]:
dt = LGBMClassifier(random_state=42)
dt.fit(X_train,y_train)
y_pred = dt.predict(X_test)
print("Accuracy:",accuracy_score(y_test,y_pred))
print("F1-score:", f1_score(y_test, y_pred, average='weighted')) 
print("Jaccard score:", jaccard_score(y_test, y_pred))
print("AUC score:", roc_auc_score(y_test, y_pred))
print("Confusion matrix:", confusion_matrix(y_test, y_pred))
print("Classification Report:", classification_report(y_test, y_pred))

[LightGBM] [Info] Number of positive: 142, number of negative: 290
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.000426 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 1570
[LightGBM] [Info] Number of data points in the train set: 432, number of used features: 39
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.328704 -> initscore=-0.714054
[LightGBM] [Info] Start training from score -0.714054
Accuracy: 0.9629629629629629
F1-score: 0.962334455667789
Jaccard score: 0.8857142857142857
AUC score: 0.9428571428571428
Confusion matrix: [[73  0]
 [ 4 31]]
Classification Report:               precision    recall  f1-score   support

           0       0.95      1.00      0.97        73
           1       1.00      0.89      0.94        35

    accuracy                           0.96       108
   macro avg       0.97      0.94      0.96       108
weighted avg       0.96      0.96      0.96       108



In [27]:
dt = XGBClassifier(random_state=42)
dt.fit(X_train,y_train)
y_pred = dt.predict(X_test)
print("Accuracy:",accuracy_score(y_test,y_pred))
print("F1-score:", f1_score(y_test, y_pred, average='weighted')) 
print("Jaccard score:", jaccard_score(y_test, y_pred))
print("AUC score:", roc_auc_score(y_test, y_pred))
print("Confusion matrix: \n", confusion_matrix(y_test, y_pred))
print("Classification Report: \n", classification_report(y_test, y_pred))

Accuracy: 0.9629629629629629
F1-score: 0.9626685509038451
Jaccard score: 0.8888888888888888
AUC score: 0.9502935420743639
Confusion matrix: 
 [[72  1]
 [ 3 32]]
Classification Report: 
               precision    recall  f1-score   support

           0       0.96      0.99      0.97        73
           1       0.97      0.91      0.94        35

    accuracy                           0.96       108
   macro avg       0.96      0.95      0.96       108
weighted avg       0.96      0.96      0.96       108



In [48]:
# Modellerin ve metriklerin isimleri
models = {
    'LogisticRegression': LogisticRegression(),
    'SVC': SVC(),
    'DecisionTree': DecisionTreeClassifier(),
    'XGBoost': XGBClassifier(),
    'LightGBM': LGBMClassifier()
}

metrics = ['accuracy', 'f1', 'jaccard', 'auc']

results_list = []

# Her modeli eğit ve metrikleri hesapla
for model_name, model in models.items():
    model.fit(X_train, y_train)
    y_pred = model.predict(X_test)
    
    accuracy = accuracy_score(y_test, y_pred)
    f1 = f1_score(y_test, y_pred)
    jaccard = jaccard_score(y_test, y_pred)
    
    # AUC score sadece binary sınıflandırma modelleri içindir, bu nedenle kontrol yapalım
    if isinstance(model, (LogisticRegression, SVC, DecisionTreeClassifier, XGBClassifier, LGBMClassifier)):
        auc = roc_auc_score(y_test, y_pred)
    else:
        auc = None
    
    # Liste üzerinde tuple olarak sonuçları ekleyin
    results_list.append((model_name, accuracy, f1, jaccard, auc))

# Listeyi kullanarak sonuçları DataFrame'e çevir
results_df = pd.DataFrame(results_list, columns=['Model', 'Accuracy', 'F1 Score', 'Jaccard Score', 'AUC Score'])

[LightGBM] [Info] Number of positive: 142, number of negative: 290
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.000354 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 1723
[LightGBM] [Info] Number of data points in the train set: 432, number of used features: 41
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.328704 -> initscore=-0.714054
[LightGBM] [Info] Start training from score -0.714054


In [49]:
results_df

Unnamed: 0,Model,Accuracy,F1 Score,Jaccard Score,AUC Score
0,LogisticRegression,0.935185,0.892308,0.805556,0.907436
1,SVC,0.925926,0.875,0.777778,0.893151
2,DecisionTree,0.861111,0.805195,0.673913,0.867515
3,XGBoost,0.962963,0.941176,0.888889,0.950294
4,LightGBM,0.962963,0.939394,0.885714,0.942857


In [None]:
plt.figure(figsize=(5,4))
index =["non PCOS", "PCOS"]
columns=["non PCOS","PCOS"]
cm = pd.DataFrame(confusion_matrix(y_test,y_pred), index=index, columns=columns)
sns.heatmap(cm, annot=True, fmt='d', annot_kws={'size': 15, 'fontweight': 'semibold'})
plt.xlabel("Predicted value")
plt.ylabel("Actual value")
plt.show()

In [None]:
fpr, tpr, threshold = roc_curve(y_test, y_pred)
roc_auc = auc(fpr, tpr)
plt.title('ROC eğrisi')
plt.plot(fpr, tpr, 'b', label = 'AUC = %0.2f' % roc_auc)
plt.legend(loc = 'lower right')
plt.plot([0, 1], [0, 1],'r--')
plt.xlim([0, 1])
plt.ylim([0, 1])
plt.ylabel('Doğru Pozitif Oranı')
plt.xlabel('Yanlış Pozitif Oranı')
plt.show()

In [None]:

parameters = {'criterion':['gini','entropy'],'max_depth':[2,5,10,20,30,90],'random_state':[42,34,2]}
grid = GridSearchCV(DecisionTreeClassifier(), parameters, cv=10)

grid.fit(X_train, y_train)
best_parameters=grid.best_params_

print("en iyi parametreler: \n",best_parameters)

grid_model = DecisionTreeClassifier(criterion=best_parameters["criterion"],
                               max_depth=best_parameters["max_depth"])

tuned_model = grid_model.fit(X_train, y_train)
y_pred = tuned_model.predict(X_test)

accuracy_score(y_test, y_pred)

In [None]:

# Modelleri ve hiperparametre uzayını tanımlayın
models = {
    'LogisticRegression': (LogisticRegression(), {'C': [0.001, 0.01, 0.1, 1, 10, 100]}),
    'SVC': (SVC(), {'C': [0.001, 0.01, 0.1, 1, 10, 100], 'kernel': ['linear', 'rbf']}),
    'DecisionTree': (DecisionTreeClassifier(), {'max_depth': [None, 10, 20, 30, 40]}),
    'XGBoost': (XGBClassifier(), {'learning_rate': [0.001, 0.01, 0.1, 0.2, 0.3], 'n_estimators': [50, 100, 200, 300]}),
    'LightGBM': (LGBMClassifier(), {'learning_rate': [0.001, 0.01, 0.1, 0.2, 0.3], 'n_estimators': [50, 100, 200, 300]})
}

# Her model için GridSearchCV uygula
for model_name, (model, param_grid) in models.items():
    grid_search = GridSearchCV(model, param_grid, cv=5, scoring='accuracy')
    grid_search.fit(X_train, y_train)
    
    # En iyi hiperparametreleri ve doğruluk skorunu yazdır
    print(f"Best parameters for {model_name}: {grid_search.best_params_}")
    print(f"Best accuracy score for {model_name}: {grid_search.best_score_}")

    # Test seti üzerinde performansı değerlendir
    y_pred = grid_search.best_estimator_.predict(X_test)
    accuracy = accuracy_score(y_test, y_pred)
    print(f"Test accuracy score for {model_name}: {accuracy}")
    print("\n")