In [15]:
import seaborn as sns
from sklearn.metrics import accuracy_score , confusion_matrix , classification_report
import numpy as np
import pandas as pd
from sklearn.model_selection import cross_val_score, GridSearchCV
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler, LabelEncoder, OneHotEncoder
from sklearn.neighbors import KNeighborsClassifier
from sklearn.pipeline import Pipeline
from sklearn.model_selection import train_test_split
from sklearn.compose import ColumnTransformer
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier
import joblib

In [3]:
data = pd.read_excel(r"C:\Users\cdhan\Downloads\PCOS_data_without_infertility.xlsx")
data["AMH(ng/mL)"] = pd.to_numeric(data["AMH(ng/mL)"], errors='coerce')
data["II    beta-HCG(mIU/mL)"] = pd.to_numeric(data["II    beta-HCG(mIU/mL)"], errors='coerce')
data['Marraige Status (Yrs)'].fillna(data['Marraige Status (Yrs)'].median(),inplace=True)
data['II    beta-HCG(mIU/mL)'].fillna(data['II    beta-HCG(mIU/mL)'].median(),inplace=True)
data['AMH(ng/mL)'].fillna(data['AMH(ng/mL)'].median(),inplace=True)
data['Fast food (Y/N)'].fillna(data['Fast food (Y/N)'].median(),inplace=True)

#Clearing up the extra space in the column names (optional)

data.columns = [col.strip() for col in data.columns]
X = data.drop(["Sl. No",'Unnamed: 44',"Patient File No.", "PCOS (Y/N)"], axis=1) 
X = pd.get_dummies(X)
y = data["PCOS (Y/N)"].ravel()
label = LabelEncoder()
y = label.fit_transform(y)


In [4]:
numeric_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='median')),  # You can change the strategy as needed (e.g., 'median', 'most_frequent', etc.)
    ('scaler', StandardScaler()),
    ('onehot', OneHotEncoder(handle_unknown='ignore'))
])
preprocessor = ColumnTransformer(
    transformers=[
        ('num', numeric_transformer, X.select_dtypes(include=['float64', 'int64']).columns)
        
    ])

In [5]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

pipeline = Pipeline(steps=[
    ('preprocessor', preprocessor),
    ('knn', KNeighborsClassifier(n_neighbors=3))
])
# Fit the pipeline on the training data
pipeline.fit(X_train, y_train)

# Make predictions on the test data
predictions = pipeline.predict(X_test)

# Evaluate the model
accuracy = accuracy_score(y_test, predictions)*100
conf_matrix = confusion_matrix(y_test, predictions)
classification_report_knn = classification_report(y_test,predictions)

print(f'Accuracy: {accuracy}')
print('Confusion Matrix:')
print(conf_matrix)
print("classification_report_knn")
print(classification_report_knn)

Accuracy: 83.4862385321101
Confusion Matrix:
[[72  5]
 [13 19]]
classification_report_knn
              precision    recall  f1-score   support

           0       0.85      0.94      0.89        77
           1       0.79      0.59      0.68        32

    accuracy                           0.83       109
   macro avg       0.82      0.76      0.78       109
weighted avg       0.83      0.83      0.83       109



In [7]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

pipeline_knn = Pipeline(steps=[
    ('preprocessor', preprocessor),
    ('knn', KNeighborsClassifier(n_neighbors=3))
])
# Fitting the pipeline on the training data
pipeline_knn.fit(X_train, y_train)

# Making predictions on the test data
predictions_knn = pipeline_knn.predict(X_test)

# Evaluate the model
accuracy_knn = accuracy_score(y_test, predictions_knn)*100
conf_matrix_knn = confusion_matrix(y_test, predictions_knn)
classification_report_knn = classification_report(y_test,predictions_knn)

print(f'Accuracy: {accuracy_knn}')
print('Confusion Matrix:')
print(conf_matrix_knn)
print("classification_report_knn")
print(classification_report_knn)

Accuracy: 83.4862385321101
Confusion Matrix:
[[72  5]
 [13 19]]
classification_report_knn
              precision    recall  f1-score   support

           0       0.85      0.94      0.89        77
           1       0.79      0.59      0.68        32

    accuracy                           0.83       109
   macro avg       0.82      0.76      0.78       109
weighted avg       0.83      0.83      0.83       109



In [8]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

pipeline_svm = Pipeline(steps=[
    ('preprocessor', preprocessor),
    ('svm', SVC())
])
pipeline_svm.fit(X_train, y_train)
predictions_svm = pipeline_svm.predict(X_test)
accuracy_svm = accuracy_score(y_test, predictions_svm)*100
conf_matrix_svm = confusion_matrix(y_test, predictions_svm)
classification_report_svm = classification_report(y_test,predictions_svm)

print(f'Accuracy: {accuracy_svm}')
print('Confusion Matrix:')
print(conf_matrix_svm)
print("classification_report_svm")
print(classification_report_svm)

Accuracy: 87.1559633027523
Confusion Matrix:
[[74  3]
 [11 21]]
classification_report_svm
              precision    recall  f1-score   support

           0       0.87      0.96      0.91        77
           1       0.88      0.66      0.75        32

    accuracy                           0.87       109
   macro avg       0.87      0.81      0.83       109
weighted avg       0.87      0.87      0.87       109



In [11]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

pipeline3 = Pipeline(steps=[
    ('preprocessor', preprocessor),
    ('Random forest',RandomForestClassifier(n_estimators=100))
])

pipeline3.fit(X_train, y_train)

predictions_Rf = pipeline3.predict(X_test)

accuracy_Rf = accuracy_score(y_test, predictions_svm)*100
conf_matrix_Rf = confusion_matrix(y_test, predictions_Rf)
classification_report_Random_forest = classification_report(y_test,predictions_Rf)

print(f'Accuracy: {accuracy_Rf}')
print('Confusion Matrix:')
print(conf_matrix_Rf)
print("classification_report_Random_forest")
print(classification_report_Random_forest)

Accuracy: 87.1559633027523
Confusion Matrix:
[[74  3]
 [16 16]]
classification_report_Random_forest
              precision    recall  f1-score   support

           0       0.82      0.96      0.89        77
           1       0.84      0.50      0.63        32

    accuracy                           0.83       109
   macro avg       0.83      0.73      0.76       109
weighted avg       0.83      0.83      0.81       109



In [13]:
data1 =np.array([33,68.8,165,25.27089073,11,	72	,18,	11.8,2,	5,10,1,0,494.08,494.08,5.54,0.88,6.295454545,40	,36,	0.9,	2.54,6.63,	10.52,49.7,0.36,84	,0	,0	,0	,1,	1,	1,	0	,120,	80,	13	,15	,18	,20,	10])

In [14]:
data1_ = np.array(data1).reshape(1, -1)
input_data = pd.DataFrame([data1], columns=X.columns)

# Predict using the pipeline
predic = pipeline_svm.predict(input_data)
print("Prediction:", predic)


Prediction: [0]


In [16]:
import joblib
joblib.dump(pipeline3, 'random_forest_model.pkl')
joblib.dump(pipeline_knn, 'knn_model.pkl')
joblib.dump(pipeline_svm, 'svm_model.pkl')

# Load the model from the file
loaded_model_Rf = joblib.load('random_forest_model.pkl')
loaded_model_knn = joblib.load('knn_model.pkl')
loaded_model_svm = joblib.load('svm_model.pkl')