In [None]:
# Mount Google Drive into the Colab filesystem at /content/drive.
# NOTE(review): the dataset loaded later is read from /content/, not from this
# mount point — confirm whether the mount is actually required.
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import warnings
# Silence every warning for the rest of the session (this also hides
# deprecation notices from pandas / scikit-learn).
warnings.filterwarnings('ignore')

# Seed passed to the stochastic steps below (SMOTE and the train/test split).
random_seed = 42

In [None]:
# Load the PCOS survey spreadsheet into a DataFrame.
# NOTE(review): the path points at /content/, not at the /content/drive mount —
# confirm the file is uploaded there before running.
df = pd.read_csv("/content/CLEAN- PCOS SURVEY SPREADSHEET.csv")

In [None]:
# Replace the original headers with short snake_case names (positional, so the
# order must match the columns of the loaded file).
df.columns = [
    'age',
    'weight',
    'height',
    'blood_group',
    'period_intervial',
    'have_weight_gain',
    'have_ex_bodyhair_growth',
    'have_darkening_skin',
    'have_hair_loss',
    'have_pimples',
    'have_consume_fastfood',
    'have_exercise',
    'PCOS_diagnosis',
    'have_mood_swings',
    'have_regular_periods',
    'period_last',
]

# Decode the numeric blood-group codes into readable labels in a single pass.
blood_group_labels = {
    11: 'A+',
    12: 'A-',
    13: 'B+',
    14: 'B-',
    15: 'O+',
    16: 'O-',
    17: 'AB+',
    18: 'AB-',
}
df['blood_group'] = df['blood_group'].replace(blood_group_labels)

In [None]:
# A few height entries are not given in centimetres and are implausibly small;
# keep only the rows whose height is at least 100.
height_is_plausible = df["height"] >= 100
df = df[height_is_plausible]

Feature encoding

In [None]:
# Expand 'blood_group' into one indicator column per label present in the data.
one_hot_encoded = pd.get_dummies(df['blood_group'], prefix='blood_group')

# Swap the categorical column for its indicator columns (appended at the end).
# Multiplying by 1 turns boolean indicators into 0/1 integers.
df = pd.concat([df.drop(columns='blood_group'), one_hot_encoded * 1], axis=1)

Upsample

In [None]:
# Separate the diagnosis label (y) from the predictor columns (X).
target_column = "PCOS_diagnosis"
X = df.drop(columns=[target_column])
y = df[target_column]

In [None]:
from imblearn.over_sampling import SMOTE
from collections import Counter  # not used in any visible cell
# SMOTE oversampler, seeded so the synthetic rows are reproducible.
oversampler = SMOTE(random_state=random_seed)
# Overwrites X and y with the resampled data.
# NOTE(review): resampling runs before the train/test split further down, so
# synthetic rows derived from the whole dataset can end up in the test set —
# consider splitting first and resampling only the training portion.
X, y = oversampler.fit_resample(X, y)

Feature engineering

In [None]:
# Body-mass index: weight divided by squared height, with height converted
# from centimetres to metres first.
height_m = X['height'] / 100
BMI = pd.DataFrame({'BMI': X['weight'] / height_m ** 2})

# Append BMI as the last feature column.
X = pd.concat([X, BMI], axis=1)

In [None]:
# Remove the raw body measurements from the feature set.
X = X.drop(columns=['weight', 'height'])

In [None]:
# Row count of the feature matrix, followed by a preview of its first rows.
print(X.shape[0])
X.head()

461


Unnamed: 0,age,period_intervial,have_weight_gain,have_ex_bodyhair_growth,have_darkening_skin,have_hair_loss,have_pimples,have_consume_fastfood,have_exercise,have_mood_swings,have_regular_periods,period_last,blood_group_A+,blood_group_A-,blood_group_B+,blood_group_B-,blood_group_O+,blood_group_O-,BMI
0,21,2,0,0,1,1,0,0,0,1,0,6,0,0,0,0,1,0,22.959184
1,20,1,1,0,0,1,0,1,0,1,1,6,0,0,1,0,0,0,28.061224
2,19,1,0,0,0,0,1,0,0,1,1,4,0,0,1,0,0,0,23.308867
3,27,1,1,0,0,1,0,0,0,0,1,5,1,0,0,0,0,0,26.780401
4,23,1,1,0,0,1,0,0,0,1,1,3,0,0,0,0,1,0,24.300734


In [None]:
# Row count of the label vector, followed by a preview of its first entries.
print(y.shape[0])
y.head()

718


Unnamed: 0,PCOS_diagnosis
0,0
1,0
2,0
3,0
4,0


In [None]:
# Write features and label side by side to one CSV, without the index column.
full_dataset = pd.concat([X, y], axis=1)
full_dataset.to_csv("pcos_full.csv", index=False)

Train/test split

In [None]:
from sklearn.model_selection import train_test_split

# Fraction of rows held out for testing.
split_ratio = 0.1

# Stratify on y so both partitions keep the same class proportions; the seed
# makes the partition reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=split_ratio,
    random_state=random_seed,
    stratify=y,
)

In [None]:
# Recombine the training features and labels into a single frame.
# The frame is built before its size is printed so the cell runs on a fresh
# kernel (reading `train_dataset` before assigning it raises NameError).
train_dataset = pd.concat([X_train, y_train], axis=1)
print(len(train_dataset))
train_dataset.head()

646


Unnamed: 0,age,period_intervial,have_weight_gain,have_ex_bodyhair_growth,have_darkening_skin,have_hair_loss,have_pimples,have_consume_fastfood,have_exercise,have_mood_swings,have_regular_periods,period_last,blood_group_A+,blood_group_A-,blood_group_B+,blood_group_B-,blood_group_O+,blood_group_O-,BMI,PCOS_diagnosis
440,27,1,1,0,0,1,0,1,0,1,1,5,0,0,1,0,0,0,30.778701,1
45,23,1,1,1,0,1,0,1,0,1,1,4,0,0,1,0,0,0,28.507522,0
336,16,1,0,0,0,0,0,0,0,1,1,5,1,0,0,0,0,0,19.195303,0
633,26,1,1,0,1,1,1,0,0,1,0,7,1,0,0,0,0,0,29.336155,1
518,33,1,1,0,0,1,0,1,0,0,1,5,0,0,1,0,0,0,30.171773,1


In [None]:
# Recombine the held-out features and labels into a single frame.
# The frame is built before its size is printed so the cell runs on a fresh
# kernel (reading `test_dataset` before assigning it raises NameError).
test_dataset = pd.concat([X_test, y_test], axis=1)
print(len(test_dataset))
test_dataset.head()

72


Unnamed: 0,age,period_intervial,have_weight_gain,have_ex_bodyhair_growth,have_darkening_skin,have_hair_loss,have_pimples,have_consume_fastfood,have_exercise,have_mood_swings,have_regular_periods,period_last,blood_group_A+,blood_group_A-,blood_group_B+,blood_group_B-,blood_group_O+,blood_group_O-,BMI,PCOS_diagnosis
483,18,1,1,1,1,1,1,0,0,1,0,6,0,0,1,0,0,0,25.922426,1
325,27,1,1,1,0,0,0,1,1,1,1,5,0,0,1,0,0,0,14.86054,0
466,23,1,1,0,1,0,0,0,0,1,0,4,1,0,0,0,0,0,21.572652,1
538,19,1,1,1,1,1,1,1,0,1,0,4,0,0,1,0,0,0,28.691275,1
244,22,1,0,0,0,0,1,0,0,0,0,3,0,0,1,0,0,0,21.09375,1


Feature scaling

In [None]:
from sklearn.preprocessing import Normalizer

# The four numeric columns that get rescaled; every other column is left as is.
numeric_columns = ['age', 'period_intervial', 'period_last', 'BMI']

norm = Normalizer()

# NOTE(review): Normalizer rescales each *row* to unit norm across these four
# columns; it does not scale each column independently. If per-feature scaling
# was intended, a column-wise scaler would be needed — confirm.
train_dataset[numeric_columns] = norm.fit_transform(train_dataset[numeric_columns])

In [None]:
# Display the training frame to inspect the rescaled numeric columns.
train_dataset

Unnamed: 0,age,period_intervial,have_weight_gain,have_ex_bodyhair_growth,have_darkening_skin,have_hair_loss,have_pimples,have_consume_fastfood,have_exercise,have_mood_swings,have_regular_periods,period_last,blood_group_A+,blood_group_A-,blood_group_B+,blood_group_B-,blood_group_O+,blood_group_O-,BMI,PCOS_diagnosis
440,0.654398,0.024237,1,0,0,1,0,1,0,1,1,0.121185,0,0,1,0,0,0,0.745982,1
45,0.623978,0.027129,1,1,0,1,0,1,0,1,1,0.108518,0,0,1,0,0,0,0.773394,0
336,0.627350,0.039209,0,0,0,0,0,0,0,1,1,0.196047,1,0,0,0,0,0,0.752636,0
633,0.652737,0.025105,1,0,1,1,1,0,0,1,0,0.175737,1,0,0,0,0,0,0.736492,1
518,0.733273,0.022220,1,0,0,1,0,1,0,0,1,0.111102,0,0,1,0,0,0,0.670428,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
599,0.561501,0.084225,1,0,1,1,1,0,0,1,0,0.168450,0,0,0,0,0,0,0.805759,1
404,0.712856,0.035643,0,0,1,1,1,1,0,1,1,0.178214,0,0,0,0,1,0,0.677352,0
405,0.799509,0.028554,0,1,0,1,1,0,0,0,1,0.085662,1,0,0,0,0,0,0.593828,0
250,0.655727,0.034512,1,0,0,1,0,0,0,0,1,0.138048,1,0,0,0,0,0,0.741467,0


In [None]:
# Apply the transformer that was fitted on the training data; the test set
# must never be used for fitting. Normalizer is stateless, so the values
# produced here are unchanged, but `transform` stays correct if `norm` is ever
# replaced by a scaler that learns statistics.
test_dataset[['age', 'period_intervial', 'period_last', 'BMI']] = norm.transform(
    test_dataset[['age', 'period_intervial', 'period_last', 'BMI']]
)

In [None]:
# Display the test frame to inspect the rescaled numeric columns.
test_dataset

Unnamed: 0,age,period_intervial,have_weight_gain,have_ex_bodyhair_growth,have_darkening_skin,have_hair_loss,have_pimples,have_consume_fastfood,have_exercise,have_mood_swings,have_regular_periods,period_last,blood_group_A+,blood_group_A-,blood_group_B+,blood_group_B-,blood_group_O+,blood_group_O-,BMI,PCOS_diagnosis
483,0.560052,0.031114,1,1,1,1,1,0,0,1,0,0.186684,0,0,1,0,0,0,0.806550,1
325,0.864322,0.032012,1,1,0,0,0,1,1,1,1,0.160060,0,0,1,0,0,0,0.475714,0
466,0.723221,0.031444,1,0,1,0,0,0,0,1,0,0.125777,1,0,0,0,0,0,0.678339,1
538,0.548211,0.028853,1,1,1,1,1,1,0,1,0,0.115413,0,0,1,0,0,0,0.827836,1
244,0.717963,0.032635,0,0,0,0,1,0,0,0,0,0.097904,0,0,1,0,0,0,0.688388,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
208,0.803113,0.022946,1,0,0,0,0,0,1,1,1,0.091784,0,0,1,0,0,0,0.588267,0
317,0.718522,0.029938,0,0,0,0,0,1,1,0,1,0.119754,0,0,0,0,1,0,0.684463,0
112,0.704268,0.028171,1,0,0,0,0,0,1,1,1,0.169024,0,0,1,0,0,0,0.688944,0
82,0.497349,0.047367,1,1,0,1,1,0,0,0,0,0.094733,0,0,1,0,0,0,0.861061,1


Export the preprocessed training and testing datasets

In [None]:
# Persist both partitions as CSV files, omitting the index column.
for frame, csv_path in (
    (train_dataset, "pcos_train.csv"),
    (test_dataset, "pcos_test.csv"),
):
    frame.to_csv(csv_path, index=False)