# Load Packages

In [6]:
import numpy as np
import pandas as pd
import seaborn as sns
import scipy
from scipy import stats
from statsmodels.stats.multicomp import pairwise_tukeyhsd
from statsmodels.stats.multicomp import MultiComparison
from statsmodels.stats.proportion import proportions_ztest
from statsmodels.stats.proportion import proportions_chisquare

# Load in Data

In [7]:
# Read the raw PCOS table from a CSV file; the relative path is resolved
# against the current working directory of the kernel.
PCOS_data = pd.read_csv(filepath_or_buffer='../Data/PCOS_Data.csv')

# View Data

In [8]:
# Preview the first five rows of the freshly loaded frame.
PCOS_data.head(n=5)

Unnamed: 0,Sl. No,Patient File No.,PCOS (Y/N),Age (yrs),Weight (Kg),Height(Cm),BMI,Blood Group,Pulse rate(bpm),RR (breaths/min),...,Fast food (Y/N),Reg.Exercise(Y/N),BP _Systolic (mmHg),BP _Diastolic (mmHg),Follicle No. (L),Follicle No. (R),Avg. F size (L) (mm),Avg. F size (R) (mm),Endometrium (mm),Unnamed: 44
0,1,1,0,28,44.6,152.0,19.3,15,78,22,...,1.0,0,110,80,3,3,18.0,18.0,8.5,
1,2,2,0,36,65.0,161.5,24.9,15,74,20,...,0.0,0,120,70,3,5,15.0,14.0,3.7,
2,3,3,1,33,68.8,165.0,25.3,11,72,18,...,1.0,0,120,80,13,15,18.0,20.0,10.0,
3,4,4,0,37,65.0,148.0,29.7,13,72,20,...,0.0,0,120,70,2,2,15.0,14.0,7.5,
4,5,5,0,25,52.0,161.0,20.1,11,72,18,...,0.0,0,120,80,3,4,16.0,14.0,7.0,


# Rename and drop columns

In [9]:
# Overwrite all 45 column labels positionally with shorter names.
# NOTE: the labels are kept exactly as the later drop cells reference them,
# including the leading space in ' Age', the trailing '(' on a few names and
# the 'AvgFsize' label that is used for two adjacent columns.
PCOS_data.columns = [
    # identifiers and outcome
    'Sl. No', 'Patient File No.', 'PCOS',
    # basic measurements
    ' Age', 'Weight', 'Height(', 'BMI',
    'Blood Group', 'Pulse rate(', 'RR', 'Hb',
    # cycle / pregnancy history
    'Cycle', 'Cycle length', 'Marraige Status',
    'Pregnant', 'No. of abortions',
    # lab values
    'Ibeta-HCG', 'IIbeta-HCG', 'FSH', 'LH(', 'FSH/LH',
    'Hip', 'Waist', 'WaistHip Ratio',
    'TSH', 'AMH', 'PRL', 'VitD3', 'PRG', 'RBS',
    # yes/no indicators
    'Weightgain', 'hairgrowth', 'Skindarkening', 'Hairloss',
    'Pimples', 'Fastfood', 'Regexercise',
    # blood pressure and follicle measurements
    'BPSystolic', 'BPDiastolic',
    'FollicleNo', 'Follicle No2',
    'AvgFsize', 'AvgFsize',
    'Endometrium',
    # trailing unnamed column from the CSV
    'Unnamed44',
]

In [10]:
# Display the column labels to confirm that the renaming above took effect.
PCOS_data.columns

Index(['Sl. No', 'Patient File No.', 'PCOS', ' Age', 'Weight', 'Height(',
       'BMI', 'Blood Group', 'Pulse rate(', 'RR', 'Hb', 'Cycle',
       'Cycle length', 'Marraige Status', 'Pregnant', 'No. of abortions',
       'Ibeta-HCG', 'IIbeta-HCG', 'FSH', 'LH(', 'FSH/LH', 'Hip', 'Waist',
       'WaistHip Ratio', 'TSH', 'AMH', 'PRL', 'VitD3', 'PRG', 'RBS',
       'Weightgain', 'hairgrowth', 'Skindarkening', 'Hairloss', 'Pimples',
       'Fastfood', 'Regexercise', 'BPSystolic', 'BPDiastolic', 'FollicleNo',
       'Follicle No2', 'AvgFsize', 'AvgFsize', 'Endometrium', 'Unnamed44'],
      dtype='object')

In [11]:
# Remove the last two columns of the renamed table.
# The result is bound back to the name instead of using inplace=True:
# inplace offers no performance benefit and hides the mutation from the reader,
# and columns= states the axis explicitly instead of relying on axis=1.
PCOS_data = PCOS_data.drop(columns=['Unnamed44', 'Endometrium'])

In [12]:
# Remove the lab values, yes/no indicators, blood-pressure and follicle
# columns that are not used further in this notebook.
# The result is bound back to the name instead of using inplace=True, and
# columns= states the axis explicitly instead of relying on axis=1.
# NOTE: 'AvgFsize' labels two columns of the frame; the label list is left
# exactly as it was (with the label repeated).
PCOS_data = PCOS_data.drop(
    columns=[
        'No. of abortions',
        'Ibeta-HCG', 'IIbeta-HCG', 'FSH', 'LH(', 'FSH/LH',
        'Hip', 'Waist', 'WaistHip Ratio',
        'TSH', 'AMH', 'PRL', 'VitD3', 'PRG', 'RBS',
        'Weightgain', 'hairgrowth', 'Skindarkening', 'Hairloss', 'Pimples',
        'Fastfood', 'Regexercise',
        'BPSystolic', 'BPDiastolic',
        'FollicleNo', 'Follicle No2',
        'AvgFsize', 'AvgFsize',
    ]
)

In [13]:
# Remove the identifier, vital-sign and cycle columns, leaving only
# PCOS, age, weight, height, BMI and pregnancy status.
# The result is bound back to the name instead of using inplace=True, and
# columns= states the axis explicitly instead of relying on axis=1.
PCOS_data = PCOS_data.drop(
    columns=[
        'Sl. No', 'Patient File No.',
        'Blood Group', 'Pulse rate(', 'RR', 'Hb',
        'Cycle', 'Cycle length', 'Marraige Status',
    ]
)

In [14]:
# Give the six remaining columns their final, clean labels.
# The assignment is positional, so the order here must match the frame's
# current column order.
PCOS_data.columns = [
    'PCOS',
    'Age',
    'Weight',
    'Height',
    'BMI',
    'Pregnant',
]

In [15]:
# Display the column labels to confirm that only the six cleaned names remain.
PCOS_data.columns

Index(['PCOS', 'Age', 'Weight', 'Height', 'BMI', 'Pregnant'], dtype='object')

In [16]:
# Preview the first five rows of the reduced frame.
PCOS_data.head(n=5)

Unnamed: 0,PCOS,Age,Weight,Height,BMI,Pregnant
0,0,28,44.6,152.0,19.3,0
1,0,36,65.0,161.5,24.9,1
2,1,33,68.8,165.0,25.3,1
3,0,37,65.0,148.0,29.7,0
4,0,25,52.0,161.0,20.1,1


# Drop rows with missing data

In [17]:
# DataFrame.dropna() returns a new frame and leaves the original untouched,
# so the result must be bound back to the name; otherwise rows with missing
# values would only be hidden in this cell's output and still be present in
# every later cell.
PCOS_data = PCOS_data.dropna()

# Show the cleaned frame (pandas truncates the display of long frames).
PCOS_data

Unnamed: 0,PCOS,Age,Weight,Height,BMI,Pregnant
0,0,28,44.6,152.000,19.3,0
1,0,36,65.0,161.500,24.9,1
2,1,33,68.8,165.000,25.3,1
3,0,37,65.0,148.000,29.7,0
4,0,25,52.0,161.000,20.1,1
...,...,...,...,...,...,...
536,0,35,50.0,164.592,18.5,0
537,0,30,63.2,158.000,25.3,1
538,0,36,54.0,152.000,23.4,0
539,0,27,50.0,150.000,22.2,0


# Check the data type of each column

In [18]:
# Print a summary of the frame: index range, per-column non-null counts,
# dtypes and memory usage.
PCOS_data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 541 entries, 0 to 540
Data columns (total 6 columns):
 #   Column    Non-Null Count  Dtype  
---  ------    --------------  -----  
 0   PCOS      541 non-null    int64  
 1   Age       541 non-null    int64  
 2   Weight    541 non-null    float64
 3   Height    541 non-null    float64
 4   BMI       541 non-null    float64
 5   Pregnant  541 non-null    int64  
dtypes: float64(3), int64(3)
memory usage: 25.5 KB
