# Data Cleaning

In [52]:
# import packages
import pandas as pd
import numpy as np

# load the dataset
df = pd.read_csv('Menstural_Cycle_Data.csv')
print(df)


     ClientID  CycleNumber  Group  CycleWithPeakorNot  ReproductiveCategory  \
0     nfp8122            1      0                   1                     0   
1     nfp8122            2      0                   1                     0   
2     nfp8122            3      0                   1                     0   
3     nfp8122            4      0                   1                     0   
4     nfp8122            5      0                   1                     0   
5     nfp8122            6      0                   1                     0   
6     nfp8122            7      0                   1                     0   
7     nfp8122            8      0                   1                     2   
8     nfp8122            9      0                   1                     0   
9     nfp8122           10      0                   1                     0   
10    nfp8122           11      0                   1                     0   
11    nfp8122           12      0                   

In [24]:
# check the first few rows of the dataset
print(df.head())

  ClientID  CycleNumber  Group  CycleWithPeakorNot  ReproductiveCategory  \
0  nfp8122            1      0                   1                     0   
1  nfp8122            2      0                   1                     0   
2  nfp8122            3      0                   1                     0   
3  nfp8122            4      0                   1                     0   
4  nfp8122            5      0                   1                     0   

   LengthofCycle MeanCycleLength EstimatedDayofOvulation LengthofLutealPhase  \
0             29           27.33                      17                  12   
1             27             NaN                      15                  12   
2             29             NaN                      15                  14   
3             27             NaN                      15                  12   
4             28             NaN                      16                  12   

  FirstDayofHigh  ... Method Prevmethod Methoddate Whychart Ne

In [25]:
# Get basic information about the dataset
print(df.info())

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1665 entries, 0 to 1664
Data columns (total 80 columns):
 #   Column                      Non-Null Count  Dtype 
---  ------                      --------------  ----- 
 0   ClientID                    1665 non-null   object
 1   CycleNumber                 1665 non-null   int64 
 2   Group                       1665 non-null   int64 
 3   CycleWithPeakorNot          1665 non-null   int64 
 4   ReproductiveCategory        1665 non-null   int64 
 5   LengthofCycle               1665 non-null   int64 
 6   MeanCycleLength             141 non-null    object
 7   EstimatedDayofOvulation     1515 non-null   object
 8   LengthofLutealPhase         1514 non-null   object
 9   FirstDayofHigh              1407 non-null   object
 10  TotalNumberofHighDays       1653 non-null   object
 11  TotalHighPostPeak           1662 non-null   object
 12  TotalNumberofPeakDays       1649 non-null   object
 13  TotalDaysofFertility        1634 non-null   obje

In [51]:
# Summary statistics
pd.set_option('display.max_rows', None)
print(df.describe())

       CycleNumber        Group  CycleWithPeakorNot  ReproductiveCategory  \
count  1653.000000  1653.000000         1653.000000           1653.000000   
mean      8.072595     0.385360            0.913491              0.055656   
std       6.604134     0.486828            0.281200              0.481422   
min       1.000000     0.000000            0.000000              0.000000   
25%       3.000000     0.000000            1.000000              0.000000   
50%       7.000000     0.000000            1.000000              0.000000   
75%      11.000000     1.000000            1.000000              0.000000   
max      45.000000     1.000000            1.000000              9.000000   

       LengthofCycle  EstimatedDayofOvulation  LengthofLutealPhase  
count    1653.000000              1653.000000          1653.000000  
mean       29.320629                15.791289            13.261343  
std         3.889520                 3.436821             2.545363  
min        18.000000          

# Handling Duplicates

In [42]:
# Check for duplicates
df.duplicated().sum()
df.drop_duplicates(inplace=True)

# Handling missing values

In [43]:
#replace empty cells with Nan
df.replace(' ', np.nan, inplace=True)
missing_count  = df.isnull().sum()
pd.set_option('display.max_rows', None )
print(missing_count)

ClientID                         0
CycleNumber                      0
Group                            0
CycleWithPeakorNot               0
ReproductiveCategory             0
LengthofCycle                    0
MeanCycleLength               1515
EstimatedDayofOvulation          0
LengthofLutealPhase              0
FirstDayofHigh                 253
TotalNumberofHighDays           12
TotalHighPostPeak                3
TotalNumberofPeakDays           16
TotalDaysofFertility            31
TotalFertilityFormula            2
LengthofMenses                   4
MeanMensesLength              1515
MensesScoreDayOne                4
MensesScoreDayTwo                4
MensesScoreDayThree             25
MensesScoreDayFour              87
MensesScoreDayFive             430
MensesScoreDaySix             1054
MensesScoreDaySeven           1431
MensesScoreDayEight           1585
MensesScoreDayNine            1627
MensesScoreDayTen             1647
MensesScoreDay11              1651
MensesScoreDay12    

# Replacing missing values(Nan) with the mode in identified columns

In [44]:
# Replacing missing values(Nan) in EstimatedDayofOvulation with the mode 
mode_EstimatedDayofOvulation = df['EstimatedDayofOvulation'].mode()[0]
df['EstimatedDayofOvulation'].fillna(mode_EstimatedDayofOvulation, inplace=True)

# Replacing missing values(Nan) in LengthofLutealPhase with the mode 
mode_LengthofLutealPhase = df['LengthofLutealPhase'].mode()[0]
df['LengthofLutealPhase'].fillna(mode_LengthofLutealPhase, inplace=True)

# Replacing missing values(Nan) in LengthofCycle with the mode 
mode_LengthofCycle = df['LengthofCycle'].mode()[0]
df['LengthofCycle'].fillna(mode_LengthofCycle, inplace=True)

# Data type conversion

In [45]:
# Convert the column to float
df['EstimatedDayofOvulation'] = df['EstimatedDayofOvulation'].astype(float)
df['LengthofLutealPhase'] = df['LengthofLutealPhase'].astype(float)
df['LengthofCycle'] = df['LengthofCycle'].astype(float)

In [41]:
print(mode_EstimatedDayofOvulation, mode_LengthofLutealPhase, mode_LengthofCycle)

14.0 13.0 28.0


# Renaming Columns

In [None]:
# Rename columns in under MensesScoreDays for consistency and easy readability
df.rename(columns={
    'MensesScoreDayOne': 'MensesScoreDay1',
    'MensesScoreDayTwo': 'MensesScoreDay2',
    'MensesScoreDayThree': 'MensesScoreDay3',
    'MensesScoreDayFour': 'MensesScoreDay4',
    'MensesScoreDayFive' : 'MensesScoreDay5',
    'MensesScoreDaySix' : 'MensesScoreDay6',
    'MensesScoreDaySeven' : 'MensesScoreDay7',
    'MensesScoreDayEight' : 'MensesScoreDay8',
    'MensesScoreDayNine' : 'MensesScoreDay9',
    'MensesScoreDayTen' : 'MensesScoreDay10'
}, inplace= True)

In [53]:
df.to_csv('cleaned_menstrual_cycle_data.csv', index=False)