# Importing Data

In [None]:
import matplotlib.pyplot as plt
import seaborn as sns
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import confusion_matrix
from sklearn.metrics import classification_report
from sklearn.metrics import f1_score

In [None]:
# Read the dataset and discard the columns that, judging by their names, are
# only row indices / patient identifiers.
ID_COLUMNS = ["Unnamed: 0", "Sl. No", "Patient File No."]
data = pd.read_csv("final_data.csv").drop(columns=ID_COLUMNS)
data.head()

In [None]:
# Column dtypes and non-null counts, to spot missing values and non-numeric columns.
data.info()

In [None]:
# Summary statistics (count, mean, spread, quartiles) of the numeric columns.
data.describe()

In [None]:
# Number of rows per value of the target column, i.e. the class balance.
data["PCOS (Y/N)"].value_counts()

# Data Cleaning and EDA

In [None]:
# Coerce both columns to numeric; values that cannot be parsed become NaN
# (errors='coerce') and are imputed below.
data["AMH(ng/mL)"] = pd.to_numeric(data["AMH(ng/mL)"], errors='coerce')
data["II    beta-HCG(mIU/mL)"] = pd.to_numeric(data["II    beta-HCG(mIU/mL)"], errors='coerce')

# Dealing with missing values: fill NA values with the median of that feature.
# The filled column is assigned back to `data` rather than calling
# fillna(inplace=True) on the column selection: that form is chained
# assignment, which pandas warns about and which does not modify `data` when
# Copy-on-Write is enabled.
median_imputed_columns = [
    'Marraige Status (Yrs)',
    'II    beta-HCG(mIU/mL)',
    'AMH(ng/mL)',
    'Fast food (Y/N)',
]
for column in median_imputed_columns:
    data[column] = data[column].fillna(data[column].median())

# Clearing up the extra space around the column names (optional)
data.columns = [col.strip() for col in data.columns]

In [None]:
# Pairwise correlation between all features, drawn as a square heatmap.
corrmat = data.corr()

fig, ax = plt.subplots(figsize=(18, 18))
sns.heatmap(corrmat, cmap="Pastel2", square=True, ax=ax);

In [None]:
# Correlation of every feature with the PCOS label, sorted from the largest
# (most positive) value down to the smallest.

corrmat["PCOS (Y/N)"].sort_values(ascending=False)

Now that we have the correlation of every feature with PCOS (Y/N), we will run a chi-squared test, select the 20 best features, and proceed with training (this is performed after the graphs below).

In [None]:
# Length of menstrual phase in PCOS vs normal:
# cycle length against age, with one regression fit per PCOS class.
color = ["red", "green"]  # palette reused by the plotting cells that follow
fig = sns.lmplot(data=data, x="Age (yrs)", y="Cycle length(days)", hue="PCOS (Y/N)", palette=color)
# plt.show() does not take a figure; it renders all open figures, so the
# FacetGrid must not be passed to it.
plt.show()

The length of the menstrual phase is overall consistent across ages for normal cases, whereas in the case of PCOS the length increases with age.

In [None]:
# Pattern of weight gain (BMI) over years in PCOS and Normal:
# BMI against age, with one regression fit per PCOS class.
fig = sns.lmplot(data=data, x="Age (yrs)", y="BMI", hue="PCOS (Y/N)", palette=color)
# plt.show() does not take a figure; it renders all open figures, so the
# FacetGrid must not be passed to it.
plt.show()

Body mass index (BMI) is showing consistency for normal cases. Whereas for PCOS the BMI increases with age.

Patterns of irregularity in menstruation: it is not fully clear what this feature and its values indicate, but,

Apparently in the feature "Cycle(R/I)" value:
4 indicates irregular menstrual cycle and
2 indicates a regular menstrual cycle

In [None]:
# Follicle count in the left ovary against the right ovary, with one
# regression fit per PCOS class.
sns.lmplot(
    data=data,
    x='Follicle No. (R)',
    y='Follicle No. (L)',
    hue="PCOS (Y/N)",
    palette=color,
)
plt.show()

The distribution of follicles in the left and right ovaries is not equal for women with PCOS in comparison with the "Normal" patients. This is curious, so let's explore it with a boxen plot.

The menstrual cycle becomes more regular with age for normal cases, whereas for PCOS the irregularity increases with age.

In [None]:
# For each ovary, overlay the individual observations (swarm plot) on the
# distribution of follicle counts (boxen plot), split by PCOS class.
features = ["Follicle No. (L)", "Follicle No. (R)"]
pcos_label = data["PCOS (Y/N)"]
for feature_name in features:
    feature_values = data[feature_name]
    sns.swarmplot(x=pcos_label, y=feature_values, color="black", alpha=0.5)
    sns.boxenplot(x=pcos_label, y=feature_values, palette=color)
    plt.show()

The number of follicles in women with PCOS is higher, as expected, and the counts in the two ovaries are unequal as well.

We are performing some more EDA to see how other features relate with a woman having PCOS or not.

In [None]:
# Same swarm-over-boxen view as for the follicle counts, for a handful of
# other features, split by PCOS class (one figure per feature).
features = [
    "Age (yrs)",
    "Weight (Kg)",
    "BMI",
    "Hb(g/dl)",
    "Cycle length(days)",
    "Endometrium (mm)",
]
pcos_label = data["PCOS (Y/N)"]
for feature_name in features:
    feature_values = data[feature_name]
    sns.swarmplot(x=pcos_label, y=feature_values, color="black", alpha=0.5)
    sns.boxenplot(x=pcos_label, y=feature_values, palette=color)
    plt.show()

In [None]:
# Box plots of the measured blood-pressure and AMH values, to make outliers
# visible before they are filtered. The columns are taken from `data` itself;
# plotting synthetic random numbers under these column names would show noise
# rather than the dataset.
outlier_check_columns = ["BP _Diastolic (mmHg)", "AMH(ng/mL)", "BP _Systolic (mmHg)"]
df1 = data[outlier_check_columns]

# pd.melt reshapes the three columns into long form ("variable", "value") so
# that seaborn draws one box per original column.
sns.boxplot(x="variable", y="value", data=pd.melt(df1))

plt.show()

In [None]:
#Dropping the outliers.
# Each line keeps only the rows on the stated side of a hand-picked threshold.

data = data[(data["BP _Diastolic (mmHg)"]>20)]
data = data[(data["AMH(ng/mL)"]<40)]
data = data[(data["BP _Systolic (mmHg)"]>20)]
data = data[(data["Endometrium (mm)"]>0)]
# NOTE(review): the (R) filter used to appear twice in a row. The first of the
# two is assumed to have been meant for the left-ovary column; confirm that
# "Avg. F size (L) (mm)" exists in final_data.csv.
data = data[(data["Avg. F size (L) (mm)"]>0)]
data = data[(data["Avg. F size (R) (mm)"]>0)]
data = data[(data["RBS(mg/dl)"]<200)]
data = data[(data["PRG(ng/mL)"]<20)]
data = data[(data["Pulse rate(bpm)"]>20)]
data = data[(data["FSH(mIU/mL)"]<4000)]
data = data[(data["LH(mIU/mL)"]<1500)]
data = data[(data["Cycle(R/I)"]<4.5)]

# Rows and columns remaining after filtering.
data.shape

In [None]:
from sklearn.datasets import load_digits
from sklearn.feature_selection import SelectKBest, chi2

# Separate the target column from the candidate features.
y = data['PCOS (Y/N)']
X = data.drop('PCOS (Y/N)', axis = 1)

# Preview of the feature matrix reduced to the k best features by chi-squared
# score. k is 20 so that this preview agrees with the 20-feature selection
# described in the narrative and performed in the next step.
X_new = SelectKBest(chi2, k=20).fit_transform(X, y)
X_new[:5]



In [None]:
# Fit a chi-squared selector for the 20 best features and recover the names of
# the columns it keeps.
model = SelectKBest(chi2, k=20)
model.fit(X, y)

support_mask = model.get_support()  # boolean mask over X's columns
selected_feature_names = X.columns[support_mask]
print(selected_feature_names)

## Modeling 

In [None]:
#Splitting the data into test and training sets
# - Only the chi-squared-selected columns are used, so that the models are
#   trained on the selected features rather than on every column of X.
# - random_state pins the shuffle so the reported metrics are reproducible.
# - stratify=y keeps the PCOS / non-PCOS ratio the same in both splits.
# NOTE(review): the selector was fitted on the full dataset before this split,
# so the test rows influenced which features were chosen; consider fitting the
# selector on the training split only.

X_train, X_test, y_train, y_test = train_test_split(
    X[selected_feature_names], y, test_size=0.3, random_state=42, stratify=y
)

### Random Forest

In [None]:
from sklearn.ensemble import RandomForestClassifier

# random_state is fixed so the forest (and therefore every metric below) is
# reproducible, in line with the other seeded models in this notebook.
rfc = RandomForestClassifier(random_state=0)
rfc.fit(X_train, y_train)

# Predict on the test set once and reuse the result for every metric.
predictions = rfc.predict(X_test)
pred_rfc = predictions  # earlier name for the same predictions, kept for compatibility

accuracy = accuracy_score(y_test, predictions)
acccuracy_final = accuracy  # (sic) name kept for compatibility
print(accuracy)

# Per-class precision / recall / F1.
classi_report = classification_report(y_test, predictions)
print(classi_report)

# F1 Score
a = f1_score(y_test, predictions, zero_division=1)
print(a)

In [None]:
from yellowbrick.classifier import ROCAUC
# ROC / AUC visualisation for the random forest (`rfc`), scored on the test split.
visualizer = ROCAUC(rfc, classes=[0, 1])

visualizer.fit(X_train, y_train)        # Fit the training data to the visualizer
visualizer.score(X_test, y_test)        # Evaluate the model on the test data
visualizer.show()                       # Finalize and show the figure

In [None]:
from yellowbrick.classifier import ClassPredictionError

# Class prediction error chart for the random forest (`rfc`).
visualizer = ClassPredictionError(rfc, classes=[0,1])
visualizer.fit(X_train, y_train)      # fit on the training split
visualizer.score(X_test, y_test)      # evaluate on the test split
visualizer.show()                     # draw the figure

### Logistic Regression

In [None]:
from sklearn.linear_model import LogisticRegression

# NOTE(review): the features are not scaled before fitting; if the solver
# emits a ConvergenceWarning, consider scaling them or raising max_iter.
clf = LogisticRegression(random_state=0).fit(X_train, y_train)

# Predict on the test set once and reuse the result for every metric.
predictions = clf.predict(X_test)
pred_rfc = predictions  # earlier name for the same predictions, kept for compatibility

accuracy = accuracy_score(y_test, predictions)
acccuracy_final = accuracy  # (sic) name kept for compatibility
print(accuracy)

# Per-class precision / recall / F1.
classi_report = classification_report(y_test, predictions)
print(classi_report)

# F1 Score
b = f1_score(y_test, predictions, zero_division=1)
print(b)

In [None]:
# ROC / AUC visualisation for `clf`, which at this point in the notebook is the
# logistic regression fitted in the previous cell; scored on the test split.
visualizer = ROCAUC(clf, classes=[0, 1])

visualizer.fit(X_train, y_train)        # Fit the training data to the visualizer
visualizer.score(X_test, y_test)        # Evaluate the model on the test data
visualizer.show()                       # Finalize and show the figure

In [None]:
# Class prediction error chart for `clf` (the logistic regression when the
# notebook is run top to bottom).
visualizer = ClassPredictionError(clf, classes=[0,1])
visualizer.fit(X_train, y_train)      # fit on the training split
visualizer.score(X_test, y_test)      # evaluate on the test split
visualizer.show()                     # draw the figure




### MLP Classifier(Neural Networks)

In [None]:
from sklearn.neural_network import MLPClassifier
from sklearn.datasets import make_classification

# Multi-layer perceptron with a fixed seed and at most 300 training iterations.
clf = MLPClassifier(random_state=1, max_iter=300).fit(X_train, y_train)

# Predict on the test set once and reuse the result for every metric.
predictions = clf.predict(X_test)
pred_rfc = predictions  # earlier name for the same predictions, kept for compatibility

accuracy = accuracy_score(y_test, predictions)
acccuracy_final = accuracy  # (sic) name kept for compatibility
print(accuracy)

# Per-class precision / recall / F1.
classi_report = classification_report(y_test, predictions)
print(classi_report)

# F1 Score
c = f1_score(y_test, predictions, zero_division=1)
print(c)

In [None]:
# ROC / AUC visualisation for `clf`, which at this point in the notebook is the
# MLP classifier fitted in the previous cell; scored on the test split.
visualizer = ROCAUC(clf, classes=[0, 1])

visualizer.fit(X_train, y_train)        # Fit the training data to the visualizer
visualizer.score(X_test, y_test)        # Evaluate the model on the test data
visualizer.show()                       # Finalize and show the figure

In [None]:
# Class prediction error chart for `clf` (the MLP classifier when the notebook
# is run top to bottom).
visualizer = ClassPredictionError(clf, classes=[0,1])
visualizer.fit(X_train, y_train)      # fit on the training split
visualizer.score(X_test, y_test)      # evaluate on the test split
visualizer.show()                     # draw the figure

### Decision Trees

In [None]:
from sklearn.model_selection import cross_val_score
from sklearn.tree import DecisionTreeClassifier

# Single decision tree with a fixed seed.
clf = DecisionTreeClassifier(random_state=0).fit(X_train, y_train)

# Predict on the test set once and reuse the result for every metric.
predictions = clf.predict(X_test)
pred_rfc = predictions  # earlier name for the same predictions, kept for compatibility

accuracy = accuracy_score(y_test, predictions)
acccuracy_final = accuracy  # (sic) name kept for compatibility
print(accuracy)

# Per-class precision / recall / F1.
classi_report = classification_report(y_test, predictions)
print(classi_report)

# F1 Score
d = f1_score(y_test, predictions, zero_division=1)
print(d)

In [None]:
# ROC / AUC visualisation for `clf`, which at this point in the notebook is the
# decision tree fitted in the previous cell; scored on the test split.
visualizer = ROCAUC(clf, classes=[0, 1])

visualizer.fit(X_train, y_train)        # Fit the training data to the visualizer
visualizer.score(X_test, y_test)        # Evaluate the model on the test data
visualizer.show()                       # Finalize and show the figure

In [None]:
# Class prediction error chart for `clf` (the decision tree when the notebook
# is run top to bottom).
visualizer = ClassPredictionError(clf, classes=[0,1])
visualizer.fit(X_train, y_train)      # fit on the training split
visualizer.score(X_test, y_test)      # evaluate on the test split
visualizer.show()                     # draw the figure

In [None]:
from sklearn.naive_bayes import GaussianNB

# Gaussian naive Bayes classifier with default settings.
clf = GaussianNB().fit(X_train, y_train)

# Predict on the test set once and reuse the result for every metric.
predictions = clf.predict(X_test)
pred_rfc = predictions  # earlier name for the same predictions, kept for compatibility

accuracy = accuracy_score(y_test, predictions)
acccuracy_final = accuracy  # (sic) name kept for compatibility
print(accuracy)

# Per-class precision / recall / F1.
classi_report = classification_report(y_test, predictions)
print(classi_report)

# F1 Score
e = f1_score(y_test, predictions, zero_division=1)
print(e)

In [None]:
# ROC / AUC visualisation for `clf`, which at this point in the notebook is the
# Gaussian naive Bayes model fitted in the previous cell; scored on the test split.
visualizer = ROCAUC(clf, classes=[0, 1])

visualizer.fit(X_train, y_train)        # Fit the training data to the visualizer
visualizer.score(X_test, y_test)        # Evaluate the model on the test data
visualizer.show()                       # Finalize and show the figure

In [None]:
# Class prediction error chart for `clf` (the Gaussian naive Bayes model when
# the notebook is run top to bottom).
visualizer = ClassPredictionError(clf, classes=[0,1])
visualizer.fit(X_train, y_train)      # fit on the training split
visualizer.score(X_test, y_test)      # evaluate on the test split
visualizer.show()                     # draw the figure