# Data Cleaning and Preprocessing

In [2]:
import pandas as pd
import numpy as np

In [3]:
# Read the raw (not yet cleaned) CTG export into a DataFrame; the first line of
# the CSV is used as the column labels
CTG = pd.read_csv(
    "data/CTG_data.csv",
    header=0,
)
# Print the frame's text repr to inspect the raw layout
print(CTG)

     Unnamed: 0 Unnamed: 1 Unnamed: 2 Unnamed: 3 Unnamed: 4 Unnamed: 5  \
0             b          e         AC         FM         UC         DL   
1           240        357          0          0          0          0   
2             5        632          4          0          4          2   
3           177        779          2          0          5          2   
4           411       1192          2          0          6          2   
...         ...        ...        ...        ...        ...        ...   
2125       1576       3049          1          0          9          0   
2126       2796       3415          1          1          5          0   
2127        NaN        NaN        NaN        NaN        NaN        NaN   
2128        NaN        NaN        NaN        NaN        NaN          0   
2129        NaN        NaN        NaN        564         23         16   

     Unnamed: 6 Unnamed: 7 Unnamed: 8  Unnamed: 9  ... Unnamed: 36  \
0            DS         DP         DR    

In [4]:
# Collect every column label containing 'Unnamed' and remove those columns
columns_to_drop = list(filter(lambda label: 'Unnamed' in label, CTG.columns))
CTG = CTG.drop(columns=columns_to_drop)
print(CTG.head())
# Descriptive names listed in the order of the numeric column labels '1'..'23'
descriptive_names = [
    'FHR_Baseline',
    'Accelerations',
    'Fetal_Movements',
    'Uterine_Contractions',
    'Light_Decels',
    'Severe_Decels',
    'Prolonged_Decels',
    'Abnormal_STV_Percent',
    'STV_Mean',
    'Abnormal_LTV_Percent',
    'LTV_Mean',
    'Hist_Width',
    'Hist_Min',
    'Hist_Max',
    'Hist_Peaks',
    'Hist_Zeros',
    'Hist_Mode',
    'Hist_Mean',
    'Hist_Median',
    'Hist_Variance',
    'Hist_Tendency',
    'FHR_Class',
    'Fetal_State',
]
# Map each numeric label ('1', '2', ...) to the descriptive name at that position
new_column_names = {
    str(position): name
    for position, name in enumerate(descriptive_names, start=1)
}
# Apply the mapping; labels that are not keys of the mapping are left unchanged
CTG = CTG.rename(columns=new_column_names)

     1      2      3      4      5      6      7     8     9    10  ...   14  \
0   LB     AC     FM     UC     DL     DS     DP  ASTV  MSTV  ALTV  ...  Max   
1  120  0.000  0.000  0.000  0.000  0.000  0.000    73   0.5    43  ...  126   
2  132  0.006  0.000  0.006  0.003  0.000  0.000    17   2.1     0  ...  198   
3  133  0.003  0.000  0.008  0.003  0.000  0.000    16   2.1     0  ...  198   
4  134  0.003  0.000  0.008  0.003  0.000  0.000    16   2.4     0  ...  170   

     15      16    17    18      19        20        21     22   23  
0  Nmax  Nzeros  Mode  Mean  Median  Variance  Tendency  CLASS  NSP  
1     2       0   120   137     121        73         1      9    2  
2     6       1   141   136     140        12         0      6    1  
3     5       1   141   135     138        13         0      6    1  
4    11       0   137   134     137        13         1      6    1  

[5 rows x 23 columns]


In [5]:
# Report how many columns the frame has and what they are called
print("Column Number:", CTG.shape[1])
print("Column Names:", list(CTG.columns))

# Per-column count of missing entries
missing_per_column = CTG.isnull().sum()
print("\nNumber of Missing Values for Each Column:")
print(missing_per_column)

# Per-column count of distinct values
unique_per_column = CTG.nunique()
print("\nNumber of Unique Values for Each Column:")
print(unique_per_column)

Column Number: 23
Column Names: ['FHR_Baseline', 'Accelerations', 'Fetal_Movements', 'Uterine_Contractions', 'Light_Decels', 'Severe_Decels', 'Prolonged_Decels', 'Abnormal_STV_Percent', 'STV_Mean', 'Abnormal_LTV_Percent', 'LTV_Mean', 'Hist_Width', 'Hist_Min', 'Hist_Max', 'Hist_Peaks', 'Hist_Zeros', 'Hist_Mode', 'Hist_Mean', 'Hist_Median', 'Hist_Variance', 'Hist_Tendency', 'FHR_Class', 'Fetal_State']

Number of Missing Values for Each Column:
FHR_Baseline            3
Accelerations           3
Fetal_Movements         2
Uterine_Contractions    2
Light_Decels            1
Severe_Decels           1
Prolonged_Decels        1
Abnormal_STV_Percent    2
STV_Mean                2
Abnormal_LTV_Percent    2
LTV_Mean                2
Hist_Width              3
Hist_Min                3
Hist_Max                3
Hist_Peaks              3
Hist_Zeros              3
Hist_Mode               3
Hist_Mean               3
Hist_Median             3
Hist_Variance           3
Hist_Tendency           3
FHR_Clas

In [6]:
# Coerce every column to a numeric dtype; entries that cannot be parsed become NaN
CTG = CTG.apply(pd.to_numeric, errors='coerce')

# Count the missing entries per column (this includes values just coerced to NaN)
missing_values = CTG.isnull().sum()
print("Number of missing values in each column:\n", missing_values)

Number of missing values in each column:
 FHR_Baseline            4
Accelerations           4
Fetal_Movements         3
Uterine_Contractions    3
Light_Decels            2
Severe_Decels           2
Prolonged_Decels        2
Abnormal_STV_Percent    3
STV_Mean                3
Abnormal_LTV_Percent    3
LTV_Mean                3
Hist_Width              4
Hist_Min                4
Hist_Max                4
Hist_Peaks              4
Hist_Zeros              4
Hist_Mode               4
Hist_Mean               4
Hist_Median             4
Hist_Variance           4
Hist_Tendency           4
FHR_Class               4
Fetal_State             4
dtype: int64


In [7]:
# Record the number of rows before removing incomplete rows
initial_row_count = len(CTG)

# Treat +/-inf as missing, then drop every row that has at least one missing value.
# CTG itself is rebound to the cleaned frame because the following cells keep
# working on CTG; CTG_clean is an independent copy of the same result.
# NOTE: because CTG is rebound here, re-running this cell reports 0 removed rows.
CTG = CTG.replace([np.inf, -np.inf], np.nan).dropna()
CTG_clean = CTG.copy()

# Record the number of rows after cleaning missing data
final_row_count = len(CTG_clean)

# Calculate and print the number of rows removed
rows_removed = initial_row_count - final_row_count
print(f"Removed {rows_removed} rows with missing data.")

# Check the final state of the dataset.
# DataFrame.info() prints its summary itself and returns None, so it is called
# directly; wrapping it in print() would append a stray "None" line.
CTG_clean.info()

Removed 4 rows with missing data.
<class 'pandas.core.frame.DataFrame'>
Index: 2126 entries, 1 to 2126
Data columns (total 23 columns):
 #   Column                Non-Null Count  Dtype  
---  ------                --------------  -----  
 0   FHR_Baseline          2126 non-null   float64
 1   Accelerations         2126 non-null   float64
 2   Fetal_Movements       2126 non-null   float64
 3   Uterine_Contractions  2126 non-null   float64
 4   Light_Decels          2126 non-null   float64
 5   Severe_Decels         2126 non-null   float64
 6   Prolonged_Decels      2126 non-null   float64
 7   Abnormal_STV_Percent  2126 non-null   float64
 8   STV_Mean              2126 non-null   float64
 9   Abnormal_LTV_Percent  2126 non-null   float64
 10  LTV_Mean              2126 non-null   float64
 11  Hist_Width            2126 non-null   float64
 12  Hist_Min              2126 non-null   float64
 13  Hist_Max              2126 non-null   float64
 14  Hist_Peaks            2126 non-null   float

In [8]:
# Drop fully duplicated rows (keeping the first occurrence of each), then display
# the number of duplicated rows that remain, which should be 0.
CTG = CTG.drop_duplicates()
CTG.duplicated().sum()

0

In [9]:
# Renumber the rows from 0 (the old index labels are discarded), then display
# the frame's (rows, columns) shape
CTG = CTG.reset_index(drop=True)
CTG.shape

(2115, 23)

In [10]:
# Write the cleaned frame to a new CSV file, leaving the row index out of the file
CTG.to_csv(
    "data/CTG_clean.csv",
    index=False,
)