In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
sns.set(style='darkgrid')

from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import confusion_matrix

# Silence warnings for the rest of the session. The "ignore" action is installed
# without any category/module/message filter, so it applies to every warning
# raised after this point, not just a specific one.
from warnings import filterwarnings
filterwarnings("ignore")


In [None]:
# Load both CSV files (read as ISO-8859-1) and report their dimensions.
csv_encoding = 'iso-8859-1'
df_inf = pd.read_csv("PCOS_infertility.csv", encoding=csv_encoding)
df_noinf = pd.read_csv("data without infertility _final.csv", encoding=csv_encoding)

for frame_name, frame in (("df_inf", df_inf), ("df_noinf", df_noinf)):
    print(f"Shape of {frame_name}:{frame.shape}")

In [None]:
# Peek at five randomly drawn rows of df_inf.
df_inf.sample(n=5)

In [None]:
# Peek at five randomly drawn rows of df_noinf.
df_noinf.sample(n=5)

## Feature Selection

In [None]:
# Identify the columns whose absolute correlation with the PCOS label exceeds
# CORR_THRESHOLD. The label column correlates perfectly with itself, so it is
# always part of the result; later cells rely on that when they index
# df_noinf[corr_features] and still read 'PCOS (Y/N)' from it.
TARGET_COL = "PCOS (Y/N)"
CORR_THRESHOLD = 0.4

# Correlate numeric/bool columns only. pandas < 2.0 dropped non-numeric columns
# silently inside corrwith; pandas >= 2.0 raises if one is present (if the CSV
# contains any text-typed column). Selecting explicitly behaves the same on both.
numeric_noinf = df_noinf.select_dtypes(include=["number", "bool"])
target_corr = (
    numeric_noinf
    .corrwith(numeric_noinf[TARGET_COL])
    .abs()
    .sort_values(ascending=False)
)

# Keep only the column labels (a pandas Index), strongest correlation first.
corr_features = target_corr[target_corr > CORR_THRESHOLD].index
corr_features

In [None]:
# Distinct values present in the cycle regularity column.
cycle_column = df_noinf['Cycle(R/I)']
cycle_column.unique()

In [None]:
# Absolute correlation of every df_inf column with the PCOS label.
# Only numeric/bool columns are correlated: pandas >= 2.0 no longer drops
# non-numeric columns inside corrwith and raises if one is present, whereas
# older versions skipped them silently. Explicit selection works on both.
numeric_inf = df_inf.select_dtypes(include=["number", "bool"])
numeric_inf.corrwith(numeric_inf["PCOS (Y/N)"]).abs()

In [None]:
# Narrow df_noinf down to the columns selected in corr_features, then preview it.
df_noinf = df_noinf.loc[:, corr_features]
df_noinf.head(5)

In [None]:
# Column labels that remain in df_noinf.
selected_columns = df_noinf.columns
selected_columns

In [None]:
# Box plot of each listed feature against the PCOS label, one panel per feature.
# The panels are generated from a single list so that adding or removing a
# feature does not require renumbering subplot indices by hand.
boxplot_features = [
    'Follicle No. (R)',
    'Follicle No. (L)',
    'Skin darkening (Y/N)',
    'hair growth(Y/N)',
    'Weight gain(Y/N)',
    'Cycle(R/I)',
]

fig, axes = plt.subplots(1, len(boxplot_features), figsize=(14, 5))
for ax, feature in zip(axes, boxplot_features):
    sns.boxplot(x='PCOS (Y/N)', y=feature, data=df_noinf, ax=ax)

# Six narrow panels side by side: let matplotlib space them so the y-axis
# labels of one panel do not run into the neighbouring panel.
fig.tight_layout()
plt.show()

In [None]:
# Annotated heatmap of the pairwise correlations between the df_noinf columns.
fig, ax = plt.subplots(figsize=(6, 5))
corr_matrix = df_noinf.corr()
sns.heatmap(corr_matrix, annot=True, ax=ax)
plt.show()

In [None]:
# Separate the label column (y) from the remaining feature columns (X).
label_column = 'PCOS (Y/N)'
X = df_noinf.drop(columns=[label_column])
y = df_noinf[label_column]

In [None]:
# Hold out 20% of the rows for testing.
# random_state pins the shuffle so the split (and every score computed from it)
# is the same on each Restart & Run All; stratify=y keeps the class proportions
# of the label equal in the train and test partitions.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y
)

## Training Model

In [None]:
# Fit a logistic regression with default settings and report its training accuracy.
model = LogisticRegression().fit(X_train, y_train)
train_score = model.score(X_train, y_train)
print(f"Score in Train Data : {train_score}")

In [None]:
# Fit a random forest and report its training accuracy.
# The forest draws bootstrap samples and feature subsets at random, so the seed
# is fixed to make the fitted model and its score repeatable across runs.
model2 = RandomForestClassifier(random_state=42)
model2.fit(X_train, y_train)
print(f"Score in Train Data: {model2.score(X_train,y_train)}")

In [None]:
# Fit a decision tree and report its training accuracy.
# This is the estimator that is written to disk at the end of the notebook, so
# its seed is fixed: the tree permutes features at each split, and without a
# seed two runs on identical data can produce different trees.
model3 = DecisionTreeClassifier(random_state=42)
model3.fit(X_train, y_train)
print(f'Score in train data: {model3.score(X_train, y_train)}')

## Prediction

In [None]:
# Labels predicted by the logistic-regression model for the held-out rows.
y_pred = model.predict(X_test)

## Model Evaluation

In [None]:
# Held-out accuracy of every fitted model. model3 (the decision tree) is the
# one persisted with joblib at the end of the notebook, so its test score is
# reported here alongside the logistic regression instead of only its
# training score.
print(f"Score in Test Data : {model.score(X_test,y_test)}")
print(f"Score in Test Data (Random Forest) : {model2.score(X_test, y_test)}")
print(f"Score in Test Data (Decision Tree) : {model3.score(X_test, y_test)}")

# Confusion matrix of the logistic-regression predictions (y_pred).
cm = confusion_matrix(y_test, y_pred)

# Correct predictions lie on the diagonal; everything else is a miss. Using the
# trace instead of cm[0][0] + cm[1][1] gives the same counts for a 2x2 matrix
# and does not raise IndexError if the matrix has a different size (for
# example when the test split happens to contain a single class).
p_right = np.trace(cm)
p_wrong = cm.sum() - p_right

print(f"Right classification : {p_right}")
print(f"Wrong classification : {p_wrong}")
cm

In [None]:
import joblib

In [None]:
# Persist the fitted decision tree (model3) to a joblib file in the working directory.
model_path = 'Decision_Tree.joblib'
joblib.dump(model3, model_path)