# Data Preparation

In [32]:
# Libraries
import os
import fnmatch
import re
import datetime
import pandas as pd
import numpy as np

# Ignore warnings
import warnings
warnings.filterwarnings("ignore")


Ressource 1: Kaggle

In [33]:
# Read the data to a pandas data frame
dfmens1 = pd.read_csv('FedCycleData071012.csv', sep=',', encoding='utf-8')
dfmens1.head(5)


Unnamed: 0,ClientID,CycleNumber,Group,CycleWithPeakorNot,ReproductiveCategory,LengthofCycle,MeanCycleLength,EstimatedDayofOvulation,LengthofLutealPhase,FirstDayofHigh,...,Method,Prevmethod,Methoddate,Whychart,Nextpreg,NextpregM,Spousesame,SpousesameM,Timeattemptpreg,BMI
0,nfp8122,1,0,1,0,29,27.33,17,12,12.0,...,9.0,,,2.0,7.0,7.0,1.0,1.0,0.0,21.254724111867
1,nfp8122,2,0,1,0,27,,15,12,13.0,...,,,,,,,,,,
2,nfp8122,3,0,1,0,29,,15,14,,...,,,,,,,,,,
3,nfp8122,4,0,1,0,27,,15,12,13.0,...,,,,,,,,,,
4,nfp8122,5,0,1,0,28,,16,12,12.0,...,,,,,,,,,,


In [34]:
# Dimension (rows, columns)
print('Dimension:', dfmens1.shape)

# Number of rows
print('Number of rows:', dfmens1.shape[0])

# Number of columns
print('Number of columns:', dfmens1.shape[1])

# Gibt die Anzahl der Personen aus. Eine Column ist nicht eine Person, sondern einen Zyklus.
dfmens1[['ClientID']].value_counts()


Dimension: (1665, 80)
Number of rows: 1665
Number of columns: 80


ClientID
nfp8122     45
nfp8328     33
nfp8323     33
nfp8289     32
nfp8066     31
            ..
nfp8252      1
nfp8236      1
nfp8244      1
nfp8302      1
nfp8284      1
Name: count, Length: 159, dtype: int64

In [35]:
# calculate the total number of women in the data set
n_women = dfmens1['ClientID'].nunique()
print('Number of Women: ', n_women)
print('Number of Cycles: ', dfmens1.shape[0])


Number of Women:  159
Number of Cycles:  1665


### Remove Outliers

The columns MensesScoreDay12-15 are not considered.

### D-Types Cleansing

In [36]:
dfmens2 = dfmens1[['ClientID',
                   'CycleNumber',
                   'CycleWithPeakorNot',
                   'LengthofCycle',
                   'EstimatedDayofOvulation',
                   'LengthofLutealPhase',
                   'FirstDayofHigh',
                   'TotalNumberofHighDays',
                   'TotalDaysofFertility',
                   'LengthofMenses',
                   'MensesScoreDayOne',
                   'MensesScoreDayTwo',
                   'MensesScoreDayThree',
                   'MensesScoreDayFour',
                   'MensesScoreDayFive',
                   'MensesScoreDaySix',
                   'MensesScoreDaySeven',
                   'MensesScoreDayEight',
                   'MensesScoreDayNine',
                   'MensesScoreDayTen',
                   'MensesScoreDay11',
                   'TotalMensesScore',
                   'Age',
                   'Height',
                   'Weight',
                   'Numberpreg',
                   'BMI']]

dfmens2.head(5)


Unnamed: 0,ClientID,CycleNumber,CycleWithPeakorNot,LengthofCycle,EstimatedDayofOvulation,LengthofLutealPhase,FirstDayofHigh,TotalNumberofHighDays,TotalDaysofFertility,LengthofMenses,...,MensesScoreDayEight,MensesScoreDayNine,MensesScoreDayTen,MensesScoreDay11,TotalMensesScore,Age,Height,Weight,Numberpreg,BMI
0,nfp8122,1,1,29,17,12,12.0,5,9,5,...,,,,,10,36.0,63.0,120.0,3.0,21.254724111867
1,nfp8122,2,1,27,15,12,13.0,2,6,5,...,,,,,10,,,,,
2,nfp8122,3,1,29,15,14,,1,5,5,...,,,,,10,,,,,
3,nfp8122,4,1,27,15,12,13.0,2,6,5,...,,,,,12,,,,,
4,nfp8122,5,1,28,16,12,12.0,4,8,5,...,,,,,11,,,,,


In [37]:
dfmens2.dtypes


ClientID                   object
CycleNumber                 int64
CycleWithPeakorNot          int64
LengthofCycle               int64
EstimatedDayofOvulation    object
LengthofLutealPhase        object
FirstDayofHigh             object
TotalNumberofHighDays      object
TotalDaysofFertility       object
LengthofMenses             object
MensesScoreDayOne          object
MensesScoreDayTwo          object
MensesScoreDayThree        object
MensesScoreDayFour         object
MensesScoreDayFive         object
MensesScoreDaySix          object
MensesScoreDaySeven        object
MensesScoreDayEight        object
MensesScoreDayNine         object
MensesScoreDayTen          object
MensesScoreDay11           object
TotalMensesScore           object
Age                        object
Height                     object
Weight                     object
Numberpreg                 object
BMI                        object
dtype: object

In [38]:
dfmens2[['EstimatedDayofOvulation',
         'LengthofLutealPhase',
         'FirstDayofHigh',
         'TotalNumberofHighDays',
         'TotalDaysofFertility',
         'LengthofMenses',
         'MensesScoreDayOne',
         'MensesScoreDayTwo',
         'MensesScoreDayThree',
         'MensesScoreDayFour',
         'MensesScoreDayFive',
         'MensesScoreDaySix',
         'MensesScoreDaySeven',
         'MensesScoreDayEight',
         'MensesScoreDayNine',
         'MensesScoreDayTen',
         'MensesScoreDay11',
         'TotalMensesScore']] = dfmens2[['EstimatedDayofOvulation',
                                         'LengthofLutealPhase',
                                         'FirstDayofHigh',
                                         'TotalNumberofHighDays',
                                         'TotalDaysofFertility',
                                         'LengthofMenses',
                                         'MensesScoreDayOne',
                                         'MensesScoreDayTwo',
                                         'MensesScoreDayThree',
                                         'MensesScoreDayFour',
                                         'MensesScoreDayFive',
                                         'MensesScoreDaySix',
                                         'MensesScoreDaySeven',
                                         'MensesScoreDayEight',
                                         'MensesScoreDayNine',
                                         'MensesScoreDayTen',
                                         'MensesScoreDay11',
                                         'TotalMensesScore']].apply(pd.to_numeric, errors='coerce')


In [39]:


dfmens2[['MensesScoreDayOne',
         'MensesScoreDayTwo',
         'MensesScoreDayThree',
         'MensesScoreDayFour',
         'MensesScoreDayFive',
         'MensesScoreDaySix',
         'MensesScoreDaySeven',
         'MensesScoreDayEight',
         'MensesScoreDayNine',
         'MensesScoreDayTen',
         'MensesScoreDay11',]] = dfmens2[['MensesScoreDayOne',
                                          'MensesScoreDayTwo',
                                          'MensesScoreDayThree',
                                          'MensesScoreDayFour',
                                          'MensesScoreDayFive',
                                          'MensesScoreDaySix',
                                          'MensesScoreDaySeven',
                                          'MensesScoreDayEight',
                                          'MensesScoreDayNine',
                                          'MensesScoreDayTen',
                                          'MensesScoreDay11',]].fillna(0)


In [40]:

Age = []
previous_value_age = None

for value in dfmens2['Age']:
    if value.strip() != '':
        previous_value_age = float(value)
    Age.append(previous_value_age)

# Save as new variable in the pandas data frame
dfmens2['Age'] = pd.Series(Age, dtype="float64")

# Print first 5 values
print(dfmens2['Age'].head(5), '\n')


0    36.0
1    36.0
2    36.0
3    36.0
4    36.0
Name: Age, dtype: float64 



In [41]:
# Extract values from 'Height' strings and calculate to metric System
Height = []
previous_value_h = None

for value in dfmens2['Height']:
    if value.strip() != '':
        previous_value_h = float(value)
    Height.append(previous_value_h)

# Save as new variable in the pandas data frame
dfmens2['Height'] = pd.Series(Height, dtype="float64") * 2.54
dfmens2['Height'] = round(dfmens2['Height'], 2)
# Print first 5 values
print(dfmens2['Height'].head(5), '\n')


0    160.02
1    160.02
2    160.02
3    160.02
4    160.02
Name: Height, dtype: float64 



In [42]:
# Extract values from 'Weight' strings and calculate to KG System
Weight = []
previous_value_w = None

for value in dfmens2['Weight']:
    if value.strip() != '':
        previous_value_w = float(value)
    Weight.append(previous_value_w)

# Save as new variable in the pandas data frame
dfmens2['Weight'] = pd.Series(Weight, dtype="float64") * 0.453592
dfmens2['Weight'] = round(dfmens2['Weight'], 2)
# Print first 5 values
print(dfmens2['Weight'].head(5), '\n')


0    54.43
1    54.43
2    54.43
3    54.43
4    54.43
Name: Weight, dtype: float64 



In [43]:
# Extract values from 'Numberpreg' strings
Numberpreg = []
previous_value_preg = None

for value in dfmens2['Numberpreg']:
    if value.strip() != '':
        previous_value_preg = float(value)
    Numberpreg.append(previous_value_preg)

# Save as new variable in the pandas data frame
dfmens2['Numberpreg'] = pd.Series(Numberpreg, dtype="float64")

# Print first 5 values
print(dfmens2['Numberpreg'].head(5), '\n')


0    3.0
1    3.0
2    3.0
3    3.0
4    3.0
Name: Numberpreg, dtype: float64 



In [44]:
# Extract values from 'bmi' strings
bmi = []
previous_value_bmi = None

for value in dfmens2['BMI']:
    if value.strip() != '':
        previous_value_bmi = float(value)
    bmi.append(previous_value_bmi)

# Save as new variable in the pandas data frame
dfmens2['BMI'] = pd.Series(bmi, dtype="float64")
dfmens2['BMI'] = round(dfmens2['BMI'], 2)

# Print first 5 values
print(dfmens2['BMI'].head(5), '\n')


0    21.25
1    21.25
2    21.25
3    21.25
4    21.25
Name: BMI, dtype: float64 



In [45]:
dfmens2.dtypes


ClientID                    object
CycleNumber                  int64
CycleWithPeakorNot           int64
LengthofCycle                int64
EstimatedDayofOvulation    float64
LengthofLutealPhase        float64
FirstDayofHigh             float64
TotalNumberofHighDays      float64
TotalDaysofFertility       float64
LengthofMenses             float64
MensesScoreDayOne          float64
MensesScoreDayTwo          float64
MensesScoreDayThree        float64
MensesScoreDayFour         float64
MensesScoreDayFive         float64
MensesScoreDaySix          float64
MensesScoreDaySeven        float64
MensesScoreDayEight        float64
MensesScoreDayNine         float64
MensesScoreDayTen          float64
MensesScoreDay11           float64
TotalMensesScore           float64
Age                        float64
Height                     float64
Weight                     float64
Numberpreg                 float64
BMI                        float64
dtype: object

# Data Aggregation > MERGE

Ressource 2: Salome

In [46]:
# Read the data to a pandas data frame
dfmensSalome = pd.read_csv('cycles.csv', sep=';', encoding='utf-8')
dfmensSalome.head(5)


Unnamed: 0,Katalog,Zyklusnummer,Zykluslänge,1. Zyklustag,Kinderwunsch,Messort,Messdauer,Thermometerart,Auswertung,Auswertung am Zyklusanfang,Manuell ausgewertet,Unfruchtbar am Zyklusanfang bis,Unfruchtbar ab,1. hM,Temperaturauswertung abgeschlossen,Hilfslinie,Zervixschleim-Höhepunkt,Muttermund-Höhepunkt,Menstruationsprognose,Kommentar
0,NFP-Zyklen,1,30,25.12.2022,nein,,,,Zervixschleim + Temperatur,Zervixschleim + Temperatur,nein,,,,,,,,,
1,NFP-Zyklen,2,28,24.01.2023,nein,,,,Zervixschleim + Temperatur,Zervixschleim + Temperatur,nein,,,14.0,16.0,3640.0,,,27.0,
2,NFP-Zyklen,3,28,21.02.2023,nein,,,,Zervixschleim + Temperatur,Zervixschleim + Temperatur,nein,5.0,,15.0,18.0,3655.0,,,28.0,
3,NFP-Zyklen,4,33,21.03.2023,nein,,,,Zervixschleim + Temperatur,Zervixschleim + Temperatur,nein,5.0,,23.0,26.0,3680.0,,,35.0,
4,NFP-Zyklen,5,25,23.04.2023,nein,,,,Zervixschleim + Temperatur,Zervixschleim + Temperatur,nein,5.0,,,,,,,,


In [47]:
dfmensSalome = dfmensSalome.rename(columns={
                                   'Zyklusnummer': 'CycleNumber', 'Zykluslänge': 'LengthofCycle', '1. hM': 'FirstDayofHigh'})

dfmensSalome[['CycleNumber',
              'LengthofCycle',
              'FirstDayofHigh']] = dfmens2[['CycleNumber',
                                            'LengthofCycle',
                                            'FirstDayofHigh']].apply(pd.to_numeric, errors='coerce')
dfmensSalome.head(5)


Unnamed: 0,Katalog,CycleNumber,LengthofCycle,1. Zyklustag,Kinderwunsch,Messort,Messdauer,Thermometerart,Auswertung,Auswertung am Zyklusanfang,Manuell ausgewertet,Unfruchtbar am Zyklusanfang bis,Unfruchtbar ab,FirstDayofHigh,Temperaturauswertung abgeschlossen,Hilfslinie,Zervixschleim-Höhepunkt,Muttermund-Höhepunkt,Menstruationsprognose,Kommentar
0,NFP-Zyklen,1,29,25.12.2022,nein,,,,Zervixschleim + Temperatur,Zervixschleim + Temperatur,nein,,,12.0,,,,,,
1,NFP-Zyklen,2,27,24.01.2023,nein,,,,Zervixschleim + Temperatur,Zervixschleim + Temperatur,nein,,,13.0,16.0,3640.0,,,27.0,
2,NFP-Zyklen,3,29,21.02.2023,nein,,,,Zervixschleim + Temperatur,Zervixschleim + Temperatur,nein,5.0,,,18.0,3655.0,,,28.0,
3,NFP-Zyklen,4,27,21.03.2023,nein,,,,Zervixschleim + Temperatur,Zervixschleim + Temperatur,nein,5.0,,13.0,26.0,3680.0,,,35.0,
4,NFP-Zyklen,5,28,23.04.2023,nein,,,,Zervixschleim + Temperatur,Zervixschleim + Temperatur,nein,5.0,,12.0,,,,,,


In [48]:
dfmensSalome = dfmensSalome.drop(['1. Zyklustag',
                                  'Katalog',
                                  'Kinderwunsch',
                                  'Messort',
                                  'Messdauer',
                                  'Thermometerart',
                                  'Auswertung',
                                  'Auswertung am Zyklusanfang',
                                  'Manuell ausgewertet',
                                  'Unfruchtbar am Zyklusanfang bis',
                                  'Unfruchtbar ab',
                                  'Temperaturauswertung abgeschlossen',
                                  'Hilfslinie',
                                  'Zervixschleim-Höhepunkt',
                                  'Muttermund-Höhepunkt',
                                  'Menstruationsprognose',
                                  'Kommentar'], axis=1)
dfmensSalome.head(5)

Unnamed: 0,CycleNumber,LengthofCycle,FirstDayofHigh
0,1,29,12.0
1,2,27,13.0
2,3,29,
3,4,27,13.0
4,5,28,12.0


In [49]:
dfmensSalome['ClientID'] = 'nfp001'
dfmensSalome['CycleWithPeakorNot'] = np.nan
dfmensSalome['EstimatedDayofOvulation'] = np.nan
dfmensSalome['LengthofLutealPhase'] = dfmensSalome['LengthofCycle'] - dfmensSalome['FirstDayofHigh'] 
dfmensSalome['TotalNumberofHighDays'] = np.nan
dfmensSalome['TotalDaysofFertility'] = np.nan
dfmensSalome['LengthofMenses'] = [5, 5, 7, 6, 4, 6]

columns_to_nan = ['MensesScoreDayOne', 
                  'MensesScoreDayTwo', 
                  'MensesScoreDayThree',
                  'MensesScoreDayFour', 
                  'MensesScoreDayFive', 
                  'MensesScoreDaySix',
                  'MensesScoreDaySeven', 
                  'MensesScoreDayEight', 
                  'MensesScoreDayNine',
                  'MensesScoreDayTen', 
                  'MensesScoreDay11', 
                  'TotalMensesScore']
dfmensSalome[columns_to_nan] = np.nan

dfmensSalome['Age'] = 24
dfmensSalome['Weight'] = 58.00
dfmensSalome['Height'] = 180.00
bmi = dfmensSalome['Weight'] / dfmensSalome['Height']  / dfmensSalome['Height'] *10000
dfmensSalome['BMI'] = round(bmi, 2)
dfmensSalome['Numberpreg'] = 0
dfmensSalome.head(6)

Unnamed: 0,CycleNumber,LengthofCycle,FirstDayofHigh,ClientID,CycleWithPeakorNot,EstimatedDayofOvulation,LengthofLutealPhase,TotalNumberofHighDays,TotalDaysofFertility,LengthofMenses,...,MensesScoreDayEight,MensesScoreDayNine,MensesScoreDayTen,MensesScoreDay11,TotalMensesScore,Age,Weight,Height,BMI,Numberpreg
0,1,29,12.0,nfp001,,,17.0,,,5,...,,,,,,24,58.0,180.0,17.9,0
1,2,27,13.0,nfp001,,,14.0,,,5,...,,,,,,24,58.0,180.0,17.9,0
2,3,29,,nfp001,,,,,,7,...,,,,,,24,58.0,180.0,17.9,0
3,4,27,13.0,nfp001,,,14.0,,,6,...,,,,,,24,58.0,180.0,17.9,0
4,5,28,12.0,nfp001,,,16.0,,,4,...,,,,,,24,58.0,180.0,17.9,0
5,6,26,10.0,nfp001,,,16.0,,,6,...,,,,,,24,58.0,180.0,17.9,0


In [50]:
dfmensSalome.dtypes


CycleNumber                  int64
LengthofCycle                int64
FirstDayofHigh             float64
ClientID                    object
CycleWithPeakorNot         float64
EstimatedDayofOvulation    float64
LengthofLutealPhase        float64
TotalNumberofHighDays      float64
TotalDaysofFertility       float64
LengthofMenses               int64
MensesScoreDayOne          float64
MensesScoreDayTwo          float64
MensesScoreDayThree        float64
MensesScoreDayFour         float64
MensesScoreDayFive         float64
MensesScoreDaySix          float64
MensesScoreDaySeven        float64
MensesScoreDayEight        float64
MensesScoreDayNine         float64
MensesScoreDayTen          float64
MensesScoreDay11           float64
TotalMensesScore           float64
Age                          int64
Weight                     float64
Height                     float64
BMI                        float64
Numberpreg                   int64
dtype: object

### MergeMerge

In [51]:
dfmens2 = pd.concat([dfmens2, dfmensSalome])
dfmens2.head(5)

Unnamed: 0,ClientID,CycleNumber,CycleWithPeakorNot,LengthofCycle,EstimatedDayofOvulation,LengthofLutealPhase,FirstDayofHigh,TotalNumberofHighDays,TotalDaysofFertility,LengthofMenses,...,MensesScoreDayEight,MensesScoreDayNine,MensesScoreDayTen,MensesScoreDay11,TotalMensesScore,Age,Height,Weight,Numberpreg,BMI
0,nfp8122,1,1.0,29,17.0,12.0,12.0,5.0,9.0,5.0,...,0.0,0.0,0.0,0.0,10.0,36.0,160.02,54.43,3.0,21.25
1,nfp8122,2,1.0,27,15.0,12.0,13.0,2.0,6.0,5.0,...,0.0,0.0,0.0,0.0,10.0,36.0,160.02,54.43,3.0,21.25
2,nfp8122,3,1.0,29,15.0,14.0,,1.0,5.0,5.0,...,0.0,0.0,0.0,0.0,10.0,36.0,160.02,54.43,3.0,21.25
3,nfp8122,4,1.0,27,15.0,12.0,13.0,2.0,6.0,5.0,...,0.0,0.0,0.0,0.0,12.0,36.0,160.02,54.43,3.0,21.25
4,nfp8122,5,1.0,28,16.0,12.0,12.0,4.0,8.0,5.0,...,0.0,0.0,0.0,0.0,11.0,36.0,160.02,54.43,3.0,21.25


# Data Cleaning

## Missing Values

In [52]:
# Count missing values
print('Count missing values per variable')
print(pd.isna(dfmens2).sum(), '\n')

# Identify rows with missing values
print('Identify rows with missing values')
print(dfmens2.loc[dfmens2.isna().any(axis=1)]
      [['ClientID', 'CycleNumber']], '\n')

# Drop rows where at least one element is missing.
dfmens2.dropna(how='all')
# df.dropna(subset=['name', 'toy'])

dfmens2.head(5)


Count missing values per variable
ClientID                     0
CycleNumber                  0
CycleWithPeakorNot           6
LengthofCycle                0
EstimatedDayofOvulation    156
LengthofLutealPhase        152
FirstDayofHigh             259
TotalNumberofHighDays       18
TotalDaysofFertility        37
LengthofMenses               4
MensesScoreDayOne            6
MensesScoreDayTwo            6
MensesScoreDayThree          6
MensesScoreDayFour           6
MensesScoreDayFive           6
MensesScoreDaySix            6
MensesScoreDaySeven          6
MensesScoreDayEight          6
MensesScoreDayNine           6
MensesScoreDayTen            6
MensesScoreDay11             6
TotalMensesScore            10
Age                          0
Height                       0
Weight                       0
Numberpreg                   0
BMI                          0
dtype: int64 

Identify rows with missing values
   ClientID  CycleNumber
2   nfp8122            3
6   nfp8122            7
17  n

Unnamed: 0,ClientID,CycleNumber,CycleWithPeakorNot,LengthofCycle,EstimatedDayofOvulation,LengthofLutealPhase,FirstDayofHigh,TotalNumberofHighDays,TotalDaysofFertility,LengthofMenses,...,MensesScoreDayEight,MensesScoreDayNine,MensesScoreDayTen,MensesScoreDay11,TotalMensesScore,Age,Height,Weight,Numberpreg,BMI
0,nfp8122,1,1.0,29,17.0,12.0,12.0,5.0,9.0,5.0,...,0.0,0.0,0.0,0.0,10.0,36.0,160.02,54.43,3.0,21.25
1,nfp8122,2,1.0,27,15.0,12.0,13.0,2.0,6.0,5.0,...,0.0,0.0,0.0,0.0,10.0,36.0,160.02,54.43,3.0,21.25
2,nfp8122,3,1.0,29,15.0,14.0,,1.0,5.0,5.0,...,0.0,0.0,0.0,0.0,10.0,36.0,160.02,54.43,3.0,21.25
3,nfp8122,4,1.0,27,15.0,12.0,13.0,2.0,6.0,5.0,...,0.0,0.0,0.0,0.0,12.0,36.0,160.02,54.43,3.0,21.25
4,nfp8122,5,1.0,28,16.0,12.0,12.0,4.0,8.0,5.0,...,0.0,0.0,0.0,0.0,11.0,36.0,160.02,54.43,3.0,21.25


In [53]:
print('Dimension:', dfmens2.shape)


Dimension: (1671, 27)


In [54]:
median = dfmens2['CycleWithPeakorNot'].median()
dfmens2['CycleWithPeakorNot'].fillna(median, inplace=True)

median = dfmens2['TotalNumberofHighDays'].median()
dfmens2['TotalNumberofHighDays'].fillna(median, inplace=True)

median = dfmens2['TotalDaysofFertility'].median()
dfmens2['TotalDaysofFertility'].fillna(median, inplace=True)

median = dfmens2['LengthofMenses'].median()
dfmens2['LengthofMenses'].fillna(median, inplace=True)

median = dfmens2['EstimatedDayofOvulation'].median()
dfmens2['EstimatedDayofOvulation'].fillna(median, inplace=True)

median = dfmens2['LengthofLutealPhase'].median()
dfmens2['LengthofLutealPhase'].fillna(median, inplace=True)

median = dfmens2['FirstDayofHigh'].median()
dfmens2['FirstDayofHigh'].fillna(median, inplace=True)

median = dfmens2['MensesScoreDayOne'].median()
dfmens2['MensesScoreDayOne'].fillna(median, inplace=True)

median = dfmens2['MensesScoreDayTwo'].median()
dfmens2['MensesScoreDayTwo'].fillna(median, inplace=True)

median = dfmens2['MensesScoreDayThree'].median()
dfmens2['MensesScoreDayThree'].fillna(median, inplace=True)

median = dfmens2['MensesScoreDayFour'].median()
dfmens2['MensesScoreDayFour'].fillna(median, inplace=True)

median = dfmens2['MensesScoreDayFive'].median()
dfmens2['MensesScoreDayFive'].fillna(median, inplace=True)

median = dfmens2['MensesScoreDaySix'].median()
dfmens2['MensesScoreDaySix'].fillna(median, inplace=True)

median = dfmens2['MensesScoreDaySeven'].median()
dfmens2['MensesScoreDaySeven'].fillna(median, inplace=True)

median = dfmens2['MensesScoreDayEight'].median()
dfmens2['MensesScoreDayEight'].fillna(median, inplace=True)

median = dfmens2['MensesScoreDayNine'].median()
dfmens2['MensesScoreDayNine'].fillna(median, inplace=True)

median = dfmens2['MensesScoreDayTen'].median()
dfmens2['MensesScoreDayTen'].fillna(median, inplace=True)

median = dfmens2['MensesScoreDay11'].median()
dfmens2['MensesScoreDay11'].fillna(median, inplace=True)

median = dfmens2['TotalMensesScore'].median()
dfmens2['TotalMensesScore'].fillna(median, inplace=True)


In [55]:
# Count missing values
print('Count missing values per variable')
print(pd.isna(dfmens2).sum(), '\n')


Count missing values per variable
ClientID                   0
CycleNumber                0
CycleWithPeakorNot         0
LengthofCycle              0
EstimatedDayofOvulation    0
LengthofLutealPhase        0
FirstDayofHigh             0
TotalNumberofHighDays      0
TotalDaysofFertility       0
LengthofMenses             0
MensesScoreDayOne          0
MensesScoreDayTwo          0
MensesScoreDayThree        0
MensesScoreDayFour         0
MensesScoreDayFive         0
MensesScoreDaySix          0
MensesScoreDaySeven        0
MensesScoreDayEight        0
MensesScoreDayNine         0
MensesScoreDayTen          0
MensesScoreDay11           0
TotalMensesScore           0
Age                        0
Height                     0
Weight                     0
Numberpreg                 0
BMI                        0
dtype: int64 



In [56]:
from sklearn.impute import SimpleImputer

imputer = SimpleImputer(strategy="median")
imputer.fit(dfmens2[['CycleWithPeakorNot', 'LengthofCycle', 'EstimatedDayofOvulation',
            'FirstDayofHigh', 'TotalNumberofHighDays', 'LengthofMenses', 'Height', 'Weight']])

np.set_printoptions(suppress=True)
print(imputer.statistics_)
print('-----')
print(dfmens2[['CycleWithPeakorNot', 'LengthofCycle', 'EstimatedDayofOvulation',
      'FirstDayofHigh', 'TotalNumberofHighDays', 'LengthofMenses', 'Height', 'Weight']].median())


[  1.    29.    15.    11.     3.     5.   162.56  61.69]
-----
CycleWithPeakorNot           1.00
LengthofCycle               29.00
EstimatedDayofOvulation     15.00
FirstDayofHigh              11.00
TotalNumberofHighDays        3.00
LengthofMenses               5.00
Height                     162.56
Weight                      61.69
dtype: float64


## Duplicated Values

In [57]:
# Count duplicated values in the whole data set
print('Sum of missing values:', dfmens2.duplicated().sum(), '\n')

# Identify duplicated values in 'rooms', 'area', 'price'
print('Duplicated values')
print(dfmens2.loc[dfmens2.duplicated(keep='last')])

# Drop the rows with duplicated values
dfCleaned = dfmens2.drop_duplicates()


Sum of missing values: 6 

Duplicated values
   ClientID  CycleNumber  CycleWithPeakorNot  LengthofCycle   
45  nfp8114            1                 1.0             28  \
47  nfp8109            1                 1.0             29   
48  nfp8109            2                 1.0             23   
49  nfp8109            3                 1.0             27   
58  nfp8106            1                 1.0             25   
60  nfp8106            3                 1.0             24   

    EstimatedDayofOvulation  LengthofLutealPhase  FirstDayofHigh   
45                     14.0                 14.0            11.0  \
47                     19.0                 10.0            14.0   
48                     14.0                  9.0             9.0   
49                     17.0                 10.0            11.0   
58                     11.0                 14.0             6.0   
60                     12.0                 12.0             8.0   

    TotalNumberofHighDays  TotalDays

In [58]:
dfCleaned[['EstimatedDayofOvulation',
           'LengthofLutealPhase',
           'FirstDayofHigh',
           'TotalNumberofHighDays',
           'TotalDaysofFertility',
           'LengthofMenses',
           'MensesScoreDayOne',
           'MensesScoreDayTwo',
           'MensesScoreDayThree',
           'MensesScoreDayFour',
           'MensesScoreDayFive',
           'MensesScoreDaySix',
           'MensesScoreDaySeven',
           'MensesScoreDayEight',
           'MensesScoreDayNine',
           'MensesScoreDayTen',
           'MensesScoreDay11',
           'TotalMensesScore']] = dfCleaned[['EstimatedDayofOvulation',
                                             'LengthofLutealPhase',
                                             'FirstDayofHigh',
                                             'TotalNumberofHighDays',
                                             'TotalDaysofFertility',
                                             'LengthofMenses',
                                             'MensesScoreDayOne',
                                             'MensesScoreDayTwo',
                                             'MensesScoreDayThree',
                                             'MensesScoreDayFour',
                                             'MensesScoreDayFive',
                                             'MensesScoreDaySix',
                                             'MensesScoreDaySeven',
                                             'MensesScoreDayEight',
                                             'MensesScoreDayNine',
                                             'MensesScoreDayTen',
                                             'MensesScoreDay11',
                                             'TotalMensesScore']].astype('Int64')


# Feature Engineering
<p>As there is already a lot of data and they are only in numeric values, the Feature Engineering Section is short. </p>

### Create new categorical variable based on Length of Cycle

In [59]:
labels = ['0 - 27', '28 - 32', '33 - 40']
dfCleaned['lengthCycle_cat'] = pd.cut(dfCleaned.LengthofCycle, bins=[0, 26, 31, 40], labels=labels)
dfCleaned[['LengthofCycle', 'lengthCycle_cat']].head(10)

Unnamed: 0,LengthofCycle,lengthCycle_cat
0,29,28 - 32
1,27,28 - 32
2,29,28 - 32
3,27,28 - 32
4,28,28 - 32
5,26,0 - 27
6,29,28 - 32
7,24,0 - 27
8,28,28 - 32
9,28,28 - 32


In [60]:
labels = ['Short', 'Normal', 'Long']
dfCleaned['lengthMenses_cat'] = pd.cut(dfCleaned.LengthofMenses, bins=[0, 3, 6, 10], labels=labels)
dfCleaned[['LengthofMenses', 'lengthMenses_cat']].head(10)

Unnamed: 0,LengthofMenses,lengthMenses_cat
0,5,Normal
1,5,Normal
2,5,Normal
3,5,Normal
4,5,Normal
5,5,Normal
6,5,Normal
7,4,Normal
8,6,Normal
9,5,Normal


In [61]:
dfFinishedforCSVCycle = dfCleaned[['ClientID',
                                   'CycleNumber',
                                   'LengthofCycle',
                                   'lengthCycle_cat',
                                   'CycleWithPeakorNot',
                                   'EstimatedDayofOvulation',
                                   'LengthofLutealPhase',
                                   'FirstDayofHigh',
                                   'TotalNumberofHighDays',
                                   'TotalDaysofFertility',
                                   'LengthofMenses',
                                   'lengthMenses_cat',
                                   'Age',
                                   'Height',
                                   'Weight',
                                   'BMI']]

dfFinishedforCSVCycle.head(10)


Unnamed: 0,ClientID,CycleNumber,LengthofCycle,lengthCycle_cat,CycleWithPeakorNot,EstimatedDayofOvulation,LengthofLutealPhase,FirstDayofHigh,TotalNumberofHighDays,TotalDaysofFertility,LengthofMenses,lengthMenses_cat,Age,Height,Weight,BMI
0,nfp8122,1,29,28 - 32,1.0,17,12,12,5,9,5,Normal,36.0,160.02,54.43,21.25
1,nfp8122,2,27,28 - 32,1.0,15,12,13,2,6,5,Normal,36.0,160.02,54.43,21.25
2,nfp8122,3,29,28 - 32,1.0,15,14,11,1,5,5,Normal,36.0,160.02,54.43,21.25
3,nfp8122,4,27,28 - 32,1.0,15,12,13,2,6,5,Normal,36.0,160.02,54.43,21.25
4,nfp8122,5,28,28 - 32,1.0,16,12,12,4,8,5,Normal,36.0,160.02,54.43,21.25
5,nfp8122,6,26,0 - 27,1.0,15,11,10,5,9,5,Normal,36.0,160.02,54.43,21.25
6,nfp8122,7,29,28 - 32,1.0,16,13,11,1,5,5,Normal,36.0,160.02,54.43,21.25
7,nfp8122,8,24,0 - 27,1.0,14,10,9,5,9,4,Normal,36.0,160.02,54.43,21.25
8,nfp8122,9,28,28 - 32,1.0,16,12,9,7,11,6,Normal,36.0,160.02,54.43,21.25
9,nfp8122,10,28,28 - 32,1.0,17,11,13,4,8,5,Normal,36.0,160.02,54.43,21.25


In [62]:
dfFinishedforCSVCycle.to_csv('../backend/menstrual_cycle_data_enriched_per_CYCLE.csv',
                             sep=",",
                             encoding='utf-8',
                             index=False)


### Splitting Cycles into Cycle Days
<p>The following code will take each Cycle and create a new row for each day in the Cycle. Then, every day is assigned to it's Menstration Phase. </p>

In [63]:
dfCleaned['DayofCycle'] = pd.Series(dtype="int")
print(dfCleaned['DayofCycle'].head(5), '\n')


0   NaN
1   NaN
2   NaN
3   NaN
4   NaN
Name: DayofCycle, dtype: float64 



In [64]:
# Create an empty list to store the expanded rows
expanded_rows = []

# Iterate over each row in the DataFrame
for index, row in dfCleaned.iterrows():
    cycle_number = row['CycleNumber']
    cycle_length = row['LengthofCycle']
    cycle_length_cat = row['lengthCycle_cat']
    ClientID = row['ClientID']
    CycleWithPeakorNot = row['CycleWithPeakorNot']
    EstimatedDayofOvulation = row['EstimatedDayofOvulation']
    LengthofLutealPhase = row['LengthofLutealPhase']
    FirstDayofHigh = row['FirstDayofHigh']
    TotalNumberofHighDays = row['TotalNumberofHighDays']
    TotalDaysofFertility = row['TotalDaysofFertility']
    LengthofMenses = row['LengthofMenses']
    LengthofMenses_cat = row['lengthMenses_cat']
    MensesScoreDayOne = row['MensesScoreDayOne']
    MensesScoreDayTwo = row['MensesScoreDayTwo']
    MensesScoreDayThree = row['MensesScoreDayThree']
    MensesScoreDayFour = row['MensesScoreDayFour']
    MensesScoreDayFive = row['MensesScoreDayFive']
    MensesScoreDaySix = row['MensesScoreDaySix']
    MensesScoreDaySeven = row['MensesScoreDaySeven']
    MensesScoreDayEight = row['MensesScoreDayEight']
    MensesScoreDayNine = row['MensesScoreDayNine']
    MensesScoreDayTen = row['MensesScoreDayTen']
    MensesScoreDay11 = row['MensesScoreDay11']
    TotalMensesScore = row['TotalMensesScore']
    Age = row['Age']
    Height = row['Height']
    Weight = row['Weight']
    Numberpreg = row['Numberpreg']
    BMI = row['BMI']

    # Iterate from 1 to cycle_length and create new rows
    for day in range(1, cycle_length + 1):
        # Create a new row with CycleNumber, LengthofCycle, and DayofCycle
        new_row = [ClientID,
                   cycle_number,
                   day,
                   cycle_length,
                   cycle_length_cat,
                   CycleWithPeakorNot,
                   EstimatedDayofOvulation,
                   LengthofLutealPhase,
                   FirstDayofHigh,
                   TotalNumberofHighDays,
                   TotalDaysofFertility,
                   LengthofMenses,
                   LengthofMenses_cat,
                   MensesScoreDayOne,
                   MensesScoreDayTwo,
                   MensesScoreDayThree,
                   MensesScoreDayFour,
                   MensesScoreDayFive,
                   MensesScoreDaySix,
                   MensesScoreDaySeven,
                   MensesScoreDayEight,
                   MensesScoreDayNine,
                   MensesScoreDayTen,
                   MensesScoreDay11,
                   TotalMensesScore,
                   Age,
                   Height,
                   Weight,
                   Numberpreg,
                   BMI]
        expanded_rows.append(new_row)

        # Create a new DataFrame from the expanded rows
new_columns = ['ClientID',
               'CycleNumber',
               'DayofCycle',
               'LengthofCycle',
               'lengthCycle_cat',
               'CycleWithPeakorNot',
               'EstimatedDayofOvulation',
               'LengthofLutealPhase',
               'FirstDayofHigh',
               'TotalNumberofHighDays',
               'TotalDaysofFertility',
               'LengthofMenses',
               'lengthMenses_cat',
               'MensesScoreDayOne',
               'MensesScoreDayTwo',
               'MensesScoreDayThree',
               'MensesScoreDayFour',
               'MensesScoreDayFive',
               'MensesScoreDaySix',
               'MensesScoreDaySeven',
               'MensesScoreDayEight',
               'MensesScoreDayNine',
               'MensesScoreDayTen',
               'MensesScoreDay11',
               'TotalMensesScore',
               'Age',
               'Height',
               'Weight',
               'Numberpreg',
               'BMI']
dfExpanded = pd.DataFrame(expanded_rows, columns=new_columns)
dfExpanded.head(5)


Unnamed: 0,ClientID,CycleNumber,DayofCycle,LengthofCycle,lengthCycle_cat,CycleWithPeakorNot,EstimatedDayofOvulation,LengthofLutealPhase,FirstDayofHigh,TotalNumberofHighDays,...,MensesScoreDayEight,MensesScoreDayNine,MensesScoreDayTen,MensesScoreDay11,TotalMensesScore,Age,Height,Weight,Numberpreg,BMI
0,nfp8122,1,1,29,28 - 32,1.0,17,12,12,5,...,0,0,0,0,10,36.0,160.02,54.43,3.0,21.25
1,nfp8122,1,2,29,28 - 32,1.0,17,12,12,5,...,0,0,0,0,10,36.0,160.02,54.43,3.0,21.25
2,nfp8122,1,3,29,28 - 32,1.0,17,12,12,5,...,0,0,0,0,10,36.0,160.02,54.43,3.0,21.25
3,nfp8122,1,4,29,28 - 32,1.0,17,12,12,5,...,0,0,0,0,10,36.0,160.02,54.43,3.0,21.25
4,nfp8122,1,5,29,28 - 32,1.0,17,12,12,5,...,0,0,0,0,10,36.0,160.02,54.43,3.0,21.25


In [65]:
labels = ['Menstruation', 'Follicular', 'Ovulation', 'Luteal']
dfExpanded['CurrentPhase'] = ''

# Iterate over each row in the DataFrame
for index, row in dfExpanded.iterrows():
    day_of_cycle = row['DayofCycle']
    length_of_mens = row['LengthofMenses']
    ovulation = row['FirstDayofHigh']

    # Determine the label based on the length of the menstrual phase and the day of the cycle
    if day_of_cycle <= length_of_mens:
        phase_label = labels[0]  # Menstruation phase
    elif day_of_cycle == ovulation:
        phase_label = labels[2]  # Follicular phase
    elif day_of_cycle >= ovulation:
        phase_label = labels[3]  # Ovulation phase
    else:
        phase_label = labels[1]  # Luteal phase

    # Assign the label to the 'Phase' column
    dfExpanded.at[index, 'CurrentPhase'] = phase_label

dfExpanded['CurrentPhase'].head(10)


0    Menstruation
1    Menstruation
2    Menstruation
3    Menstruation
4    Menstruation
5      Follicular
6      Follicular
7      Follicular
8      Follicular
9      Follicular
Name: CurrentPhase, dtype: object

In [66]:
dfExpanded.head(5)


Unnamed: 0,ClientID,CycleNumber,DayofCycle,LengthofCycle,lengthCycle_cat,CycleWithPeakorNot,EstimatedDayofOvulation,LengthofLutealPhase,FirstDayofHigh,TotalNumberofHighDays,...,MensesScoreDayNine,MensesScoreDayTen,MensesScoreDay11,TotalMensesScore,Age,Height,Weight,Numberpreg,BMI,CurrentPhase
0,nfp8122,1,1,29,28 - 32,1.0,17,12,12,5,...,0,0,0,10,36.0,160.02,54.43,3.0,21.25,Menstruation
1,nfp8122,1,2,29,28 - 32,1.0,17,12,12,5,...,0,0,0,10,36.0,160.02,54.43,3.0,21.25,Menstruation
2,nfp8122,1,3,29,28 - 32,1.0,17,12,12,5,...,0,0,0,10,36.0,160.02,54.43,3.0,21.25,Menstruation
3,nfp8122,1,4,29,28 - 32,1.0,17,12,12,5,...,0,0,0,10,36.0,160.02,54.43,3.0,21.25,Menstruation
4,nfp8122,1,5,29,28 - 32,1.0,17,12,12,5,...,0,0,0,10,36.0,160.02,54.43,3.0,21.25,Menstruation


In [67]:
dfFinishedforCSVDay = dfExpanded[['ClientID',
                                  'CycleNumber',
                                  'DayofCycle',
                                  'CurrentPhase',
                                  'LengthofCycle',
                                  'lengthCycle_cat',
                                  'CycleWithPeakorNot',
                                  'EstimatedDayofOvulation',
                                  'LengthofLutealPhase',
                                  'FirstDayofHigh',
                                  'TotalNumberofHighDays',
                                  'TotalDaysofFertility',
                                  'LengthofMenses',
                                  'lengthMenses_cat',
                                  'Age',
                                  'Height',
                                  'Weight',
                                  'BMI']]

dfFinishedforCSVDay.head(50)


Unnamed: 0,ClientID,CycleNumber,DayofCycle,CurrentPhase,LengthofCycle,lengthCycle_cat,CycleWithPeakorNot,EstimatedDayofOvulation,LengthofLutealPhase,FirstDayofHigh,TotalNumberofHighDays,TotalDaysofFertility,LengthofMenses,lengthMenses_cat,Age,Height,Weight,BMI
0,nfp8122,1,1,Menstruation,29,28 - 32,1.0,17,12,12,5,9,5,Normal,36.0,160.02,54.43,21.25
1,nfp8122,1,2,Menstruation,29,28 - 32,1.0,17,12,12,5,9,5,Normal,36.0,160.02,54.43,21.25
2,nfp8122,1,3,Menstruation,29,28 - 32,1.0,17,12,12,5,9,5,Normal,36.0,160.02,54.43,21.25
3,nfp8122,1,4,Menstruation,29,28 - 32,1.0,17,12,12,5,9,5,Normal,36.0,160.02,54.43,21.25
4,nfp8122,1,5,Menstruation,29,28 - 32,1.0,17,12,12,5,9,5,Normal,36.0,160.02,54.43,21.25
5,nfp8122,1,6,Follicular,29,28 - 32,1.0,17,12,12,5,9,5,Normal,36.0,160.02,54.43,21.25
6,nfp8122,1,7,Follicular,29,28 - 32,1.0,17,12,12,5,9,5,Normal,36.0,160.02,54.43,21.25
7,nfp8122,1,8,Follicular,29,28 - 32,1.0,17,12,12,5,9,5,Normal,36.0,160.02,54.43,21.25
8,nfp8122,1,9,Follicular,29,28 - 32,1.0,17,12,12,5,9,5,Normal,36.0,160.02,54.43,21.25
9,nfp8122,1,10,Follicular,29,28 - 32,1.0,17,12,12,5,9,5,Normal,36.0,160.02,54.43,21.25


In [68]:
dfFinishedforCSVDay.columns


Index(['ClientID', 'CycleNumber', 'DayofCycle', 'CurrentPhase',
       'LengthofCycle', 'lengthCycle_cat', 'CycleWithPeakorNot',
       'EstimatedDayofOvulation', 'LengthofLutealPhase', 'FirstDayofHigh',
       'TotalNumberofHighDays', 'TotalDaysofFertility', 'LengthofMenses',
       'lengthMenses_cat', 'Age', 'Height', 'Weight', 'BMI'],
      dtype='object')

In [69]:
dfFinishedforCSVDay.to_csv('../backend/menstrual_cycle_data_enriched_per_DAY.csv',
                           sep=",",
                           encoding='utf-8',
                           index=False)
