In [132]:
import numpy as np
import pandas as pd 
pd.set_option('display.max_columns', 500)
pd.set_option('display.max_rows', None)
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report, roc_auc_score, roc_curve, auc, f1_score, jaccard_score
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC, LinearSVC
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import AdaBoostClassifier, GradientBoostingClassifier, RandomForestClassifier
from xgboost import XGBClassifier
from lightgbm import LGBMClassifier
import warnings
warnings.filterwarnings("ignore")

In [115]:
data = pd.read_csv("../SistersLab-Project/PCOS_data.csv")
df = data.drop(["Sl. No", "Patient File No.",  "Unnamed: 44"], axis=1)

In [116]:
df.rename(columns={'Height(Cm) ': 'Height(Cm)'}, inplace=True)
df.rename(columns={'Marraige Status (Yrs)': 'Marriage Status (Yrs)'}, inplace=True)
df.rename(columns={'Pulse rate(bpm) ': 'Pulse rate(bpm)'}, inplace=True)
df.rename(columns={'II    beta-HCG(mIU/mL)': 'II_beta_HCG(mIU/mL)'}, inplace=True)
df.rename(columns={' Age (yrs)': 'Age (yrs)'}, inplace=True)
df.rename(columns={'  I   beta-HCG(mIU/mL)': 'I_beta_HCG(mIU/mL)'}, inplace=True)
df.rename(columns={'No. of abortions': 'No_of_abortions'}, inplace=True)
df.rename(columns={'BP _Systolic (mmHg)': 'BP_Systolic(mmHg)'}, inplace=True)
df.rename(columns={'BP _Diastolic (mmHg)': 'BP_Diastolic(mmHg)'}, inplace=True)
df.rename(columns={'Waist:Hip Ratio': 'WaistHip_Ratio'}, inplace=True)

df.loc[df['II_beta_HCG(mIU/mL)'] == '1.99.', 'II_beta_HCG(mIU/mL)'] = 1.99
df.loc[df['AMH(ng/mL)'] == 'a', 'AMH(ng/mL)'] = np.nan # eksik değer
df['II_beta_HCG(mIU/mL)'] = pd.to_numeric(df['II_beta_HCG(mIU/mL)'], errors='coerce').astype('float64')
df['AMH(ng/mL)'] = pd.to_numeric(df['AMH(ng/mL)'], errors='coerce').astype('float64')

# df[df.index == 329]
# df.drop(329, inplace=True);

In [117]:
df.isnull().sum()

PCOS (Y/N)               0
Age (yrs)                0
Weight (Kg)              0
Height(Cm)               0
BMI                      0
Blood Group              0
Pulse rate(bpm)          0
RR (breaths/min)         0
Hb(g/dl)                 0
Cycle(R/I)               0
Cycle length(days)       0
Marriage Status (Yrs)    1
Pregnant(Y/N)            0
No_of_abortions          0
I_beta_HCG(mIU/mL)       0
II_beta_HCG(mIU/mL)      0
FSH(mIU/mL)              0
LH(mIU/mL)               0
FSH/LH                   0
Hip(inch)                0
Waist(inch)              0
WaistHip_Ratio           0
TSH (mIU/L)              0
AMH(ng/mL)               1
PRL(ng/mL)               0
Vit D3 (ng/mL)           0
PRG(ng/mL)               0
RBS(mg/dl)               0
Weight gain(Y/N)         0
hair growth(Y/N)         0
Skin darkening (Y/N)     0
Hair loss(Y/N)           0
Pimples(Y/N)             0
Fast food (Y/N)          1
Reg.Exercise(Y/N)        0
BP_Systolic(mmHg)        0
BP_Diastolic(mmHg)       0
F

In [118]:
df["Marriage Status (Yrs)"].fillna(df["Marriage Status (Yrs)"].median(), inplace=True)
df["AMH(ng/mL)"].fillna(df["AMH(ng/mL)"].median(), inplace=True)
df["Fast food (Y/N)"].fillna(df["Fast food (Y/N)"].mode()[0], inplace=True)

In [119]:
def outlier_thresholds(dataframe, col_name, q1=0.01, q3=0.99):
    quartile1 = dataframe[col_name].quantile(q1)
    quartile3 = dataframe[col_name].quantile(q3)
    interquantile_range = quartile3 - quartile1
    up_limit = quartile3 + 1.5 * interquantile_range
    low_limit = quartile1 - 1.5 * interquantile_range
    return low_limit, up_limit

def replace_with_thresholds(dataframe, variable):
    low_limit, up_limit = outlier_thresholds(dataframe, variable)
    dataframe.loc[(dataframe[variable] < low_limit), variable] = low_limit
    dataframe.loc[(dataframe[variable] > up_limit), variable] = up_limit

def check_outlier(dataframe, col_name, q1=0.01, q3=0.99):
    low_limit, up_limit = outlier_thresholds(dataframe, col_name, q1, q3)
    if dataframe[(dataframe[col_name] > up_limit) | (dataframe[col_name] < low_limit)].any(axis=None):
        return True
    else:
        return False

In [120]:
num_cols = ['Age (yrs)', 'Weight (Kg)', 'Height(Cm)', 'BMI',
       'Blood Group', 'Pulse rate(bpm)', 'RR (breaths/min)', 'Hb(g/dl)',
        'Cycle length(days)', 'Marriage Status (Yrs)',
       'Pregnant(Y/N)', 'No_of_abortions', 'I_beta_HCG(mIU/mL)',
       'II_beta_HCG(mIU/mL)', 'FSH(mIU/mL)', 'LH(mIU/mL)', 'FSH/LH',
       'Hip(inch)', 'Waist(inch)', 'WaistHip_Ratio', 'TSH (mIU/L)',
       'AMH(ng/mL)', 'PRL(ng/mL)', 'Vit D3 (ng/mL)', 'PRG(ng/mL)',
       'RBS(mg/dl)', 'BP_Systolic(mmHg)',
       'BP_Diastolic(mmHg)', 'Follicle No. (L)', 'Follicle No. (R)',
       'Avg. F size (L) (mm)', 'Avg. F size (R) (mm)', 'Endometrium (mm)']

In [121]:
for col in num_cols:
    print(col, check_outlier(df, col, 0.01, 0.99))

Age (yrs) False
Weight (Kg) False
Height(Cm) False
BMI False
Blood Group False
Pulse rate(bpm) True
RR (breaths/min) False
Hb(g/dl) False
Cycle length(days) False
Marriage Status (Yrs) False
Pregnant(Y/N) False
No_of_abortions False
I_beta_HCG(mIU/mL) False
II_beta_HCG(mIU/mL) True
FSH(mIU/mL) True
LH(mIU/mL) True
FSH/LH True
Hip(inch) False
Waist(inch) False
WaistHip_Ratio False
TSH (mIU/L) True
AMH(ng/mL) True
PRL(ng/mL) False
Vit D3 (ng/mL) True
PRG(ng/mL) True
RBS(mg/dl) True
BP_Systolic(mmHg) True
BP_Diastolic(mmHg) True
Follicle No. (L) False
Follicle No. (R) False
Avg. F size (L) (mm) False
Avg. F size (R) (mm) False
Endometrium (mm) False


In [122]:
replace_with_thresholds(df, "II_beta_HCG(mIU/mL)")
replace_with_thresholds(df, "FSH(mIU/mL)")
replace_with_thresholds(df, "FSH/LH")
replace_with_thresholds(df, "TSH (mIU/L)")
replace_with_thresholds(df, "PRL(ng/mL)")
replace_with_thresholds(df, "PRG(ng/mL)")

I_beta_HCG(mIU/mL) True
II_beta_HCG(mIU/mL) True

FSH(mIU/mL) True

FSH/LH True
TSH (mIU/L) True >> non pcos

PRL(ng/mL) True non pcos
PRG(ng/mL) True non pcos

BP_Systolic(mmHg) True
BP_Diastolic(mmHg) True

LH(mIU/mL) True >> pcoslu
AMH(ng/mL) True >> 66.00 pcoslu
RBS(mg/dl) True >> burada 350 değeri pcoslu biri
Vit D3 (ng/mL) True >> 6014.66 pcoslu

In [123]:
df.drop(["I_beta_HCG(mIU/mL)", "II_beta_HCG(mIU/mL)"], axis=1, inplace=True)

In [124]:
y = df[["PCOS (Y/N)"]]
X = df.drop("PCOS (Y/N)", axis=1)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=100, stratify=y)

In [125]:
cols = ['Age (yrs)', 'Weight (Kg)', 'Height(Cm)', 'BMI', 'Blood Group',
       'Pulse rate(bpm)', 'RR (breaths/min)', 'Hb(g/dl)', 'Cycle(R/I)',
       'Cycle length(days)', 'Marriage Status (Yrs)', 'Pregnant(Y/N)',
       'No_of_abortions',
       'FSH(mIU/mL)', 'LH(mIU/mL)', 'FSH/LH', 'Hip(inch)', 'Waist(inch)',
       'WaistHip_Ratio', 'TSH (mIU/L)', 'AMH(ng/mL)', 'PRL(ng/mL)',
       'Vit D3 (ng/mL)', 'PRG(ng/mL)', 'RBS(mg/dl)', 'Weight gain(Y/N)',
       'hair growth(Y/N)', 'Skin darkening (Y/N)', 'Hair loss(Y/N)',
       'Pimples(Y/N)', 'Fast food (Y/N)', 'Reg.Exercise(Y/N)',
       'BP_Systolic(mmHg)', 'BP_Diastolic(mmHg)', 'Follicle No. (L)',
       'Follicle No. (R)', 'Avg. F size (L) (mm)', 'Avg. F size (R) (mm)',
       'Endometrium (mm)']

sc = StandardScaler()
for col in cols:
    X_train[col] = sc.fit_transform(X_train[[col]])
    X_test[col] = sc.transform(X_test[[col]])

In [126]:
dt = LogisticRegression(random_state=42)
dt.fit(X_train,y_train)
y_pred = dt.predict(X_test)
print("Accuracy:",accuracy_score(y_test,y_pred))
print("F1-score:", f1_score(y_test, y_pred, average='weighted')) 
print("Jaccard score:", jaccard_score(y_test, y_pred))
print("AUC score:", roc_auc_score(y_test, y_pred))
print("Confusion matrix: \n", confusion_matrix(y_test, y_pred))
print("Classification Report: \n", classification_report(y_test, y_pred))

Accuracy: 0.944954128440367
F1-score: 0.9435384750449937
Jaccard score: 0.8333333333333334
AUC score: 0.9166666666666667
Confusion matrix: 
 [[73  0]
 [ 6 30]]
Classification Report: 
               precision    recall  f1-score   support

           0       0.92      1.00      0.96        73
           1       1.00      0.83      0.91        36

    accuracy                           0.94       109
   macro avg       0.96      0.92      0.93       109
weighted avg       0.95      0.94      0.94       109



In [127]:
dt = LinearSVC(random_state=42)
dt.fit(X_train,y_train)
y_pred = dt.predict(X_test)
print("Accuracy:",accuracy_score(y_test,y_pred))
print("F1-score:", f1_score(y_test, y_pred, average='weighted')) 
print("Jaccard score:", jaccard_score(y_test, y_pred))
print("AUC score:", roc_auc_score(y_test, y_pred))
print("Confusion matrix: \n", confusion_matrix(y_test, y_pred))
print("Classification Report: \n", classification_report(y_test, y_pred))

Accuracy: 0.9357798165137615
F1-score: 0.9337908957984511
Jaccard score: 0.8055555555555556
AUC score: 0.9027777777777778
Confusion matrix: 
 [[73  0]
 [ 7 29]]
Classification Report: 
               precision    recall  f1-score   support

           0       0.91      1.00      0.95        73
           1       1.00      0.81      0.89        36

    accuracy                           0.94       109
   macro avg       0.96      0.90      0.92       109
weighted avg       0.94      0.94      0.93       109



In [133]:
dt = LGBMClassifier(random_state=42)
dt.fit(X_train,y_train)
y_pred = dt.predict(X_test)
print("Accuracy:",accuracy_score(y_test,y_pred))
print("F1-score:", f1_score(y_test, y_pred, average='weighted')) 
print("Jaccard score:", jaccard_score(y_test, y_pred))
print("AUC score:", roc_auc_score(y_test, y_pred))
print("Confusion matrix:", confusion_matrix(y_test, y_pred))
print("Classification Report:", classification_report(y_test, y_pred))

[LightGBM] [Info] Number of positive: 141, number of negative: 291
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.000620 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 1566
[LightGBM] [Info] Number of data points in the train set: 432, number of used features: 39
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.326389 -> initscore=-0.724563
[LightGBM] [Info] Start training from score -0.724563
Accuracy: 0.944954128440367
F1-score: 0.9435384750449937
Jaccard score: 0.8333333333333334
AUC score: 0.9166666666666667
Confusion matrix: [[73  0]
 [ 6 30]]
Classification Report:               precision    recall  f1-score   support

           0       0.92      1.00      0.96        73
           1       1.00      0.83      0.91        36

    accuracy                           0.94       109
   macro avg       0.96      0.92      0.93       109
weighted avg       0.95      0.94      0.94       109



In [134]:
dt = XGBClassifier(random_state=42)
dt.fit(X_train,y_train)
y_pred = dt.predict(X_test)
print("Accuracy:",accuracy_score(y_test,y_pred))
print("F1-score:", f1_score(y_test, y_pred, average='weighted')) 
print("Jaccard score:", jaccard_score(y_test, y_pred))
print("AUC score:", roc_auc_score(y_test, y_pred))
print("Confusion matrix: \n", confusion_matrix(y_test, y_pred))
print("Classification Report: \n", classification_report(y_test, y_pred))

Accuracy: 0.9724770642201835
F1-score: 0.9721558160640141
Jaccard score: 0.9166666666666666
AUC score: 0.9583333333333333
Confusion matrix: 
 [[73  0]
 [ 3 33]]
Classification Report: 
               precision    recall  f1-score   support

           0       0.96      1.00      0.98        73
           1       1.00      0.92      0.96        36

    accuracy                           0.97       109
   macro avg       0.98      0.96      0.97       109
weighted avg       0.97      0.97      0.97       109

