# 4 Pre-processing & Training Data Development

#### Objective: Process the data into a format suitable for modeling, which includes feature selection, normalization or standardization, and splitting the dataset into training and testing sets.

## Import

In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer

## Load The Data

In [2]:
# Read the cleaned training table; the path is relative to the notebook's working directory.
cleaned_csv_path = 'data/train_cleaned.csv'
data = pd.read_csv(cleaned_csv_path)

In [3]:
# Show the column labels so the feature lists defined further down can be checked against the table.
column_index = data.columns
print(column_index)

Index(['Patient Id', 'Patient Age', 'Genes in mother's side',
       'Inherited from father', 'Maternal gene', 'Paternal gene',
       'Blood cell count (mcL)', 'Patient First Name', 'Family Name',
       'Father's name', 'Mother's age', 'Father's age', 'Institute Name',
       'Location of Institute', 'Status', 'Respiratory Rate (breaths/min)',
       'Heart Rate (rates/min', 'Test 1', 'Test 2', 'Test 3', 'Test 4',
       'Test 5', 'Parental consent', 'Follow-up', 'Gender', 'Birth asphyxia',
       'Autopsy shows birth defect (if applicable)', 'Place of birth',
       'Folic acid details (peri-conceptional)',
       'H/O serious maternal illness', 'H/O radiation exposure (x-ray)',
       'H/O substance abuse', 'Assisted conception IVF/ART',
       'History of anomalies in previous pregnancies',
       'No. of previous abortion', 'Birth defects',
       'White Blood cell count (thousand per microliter)', 'Blood test result',
       'Symptom 1', 'Symptom 2', 'Symptom 3', 'Symptom 4', 'S

## Preprocessing & Training Data Development

In [4]:
# Columns handed to the categorical (impute + one-hot) branch of the preprocessor.
categorical_cols = [
    "Genes in mother's side",
    'Inherited from father',
    'Maternal gene',
    'Paternal gene',
    'Gender',
    'Birth asphyxia',
    'Autopsy shows birth defect (if applicable)',
    'Blood test result',
]

# Columns handed to the numeric (impute + scale) branch of the preprocessor.
numeric_cols = [
    'Patient Age',
    "Mother's age",
    "Father's age",
    'Blood cell count (mcL)',
    'White Blood cell count (thousand per microliter)',
]

In [5]:
# Numeric branch: fill missing values with the column median, then apply StandardScaler.
numeric_steps = [
    ('imputer', SimpleImputer(strategy='median')),
    ('scaler', StandardScaler()),
]
numeric_transformer = Pipeline(steps=numeric_steps)

In [6]:
# Categorical branch: replace missing values with the literal 'missing', then one-hot encode.
# handle_unknown='ignore' lets transform() accept categories that were not present during fit.
categorical_steps = [
    ('imputer', SimpleImputer(strategy='constant', fill_value='missing')),
    ('onehot', OneHotEncoder(handle_unknown='ignore')),
]
categorical_transformer = Pipeline(steps=categorical_steps)

In [7]:
# Send each column group through its own branch. No `remainder` is passed, so
# ColumnTransformer's default applies to columns that appear in neither list.
column_branches = [
    ('num', numeric_transformer, numeric_cols),
    ('cat', categorical_transformer, categorical_cols),
]
preprocessor = ColumnTransformer(transformers=column_branches)

In [8]:
# The label to predict, plus every column that is kept out of the feature frame.
target_col = 'Genetic Disorder'
excluded_cols = [
    target_col,
    'Disorder Subclass',
    'Patient Id',
    'Patient First Name',
    'Family Name',
    "Father's name",
    'Institute Name',
    'Location of Institute',
]

y = data[target_col]
X = data.drop(columns=excluded_cols)

In [9]:
X_preprocessed = preprocessor.fit_transform(X)

In [10]:
X_train, X_test, y_train, y_test = train_test_split(X_preprocessed, y, test_size=0.2, random_state=42)

In [11]:
# Check the basic invariants of the split before claiming success, so the
# message below is only printed when features and labels actually line up.
assert X_train.shape[0] == len(y_train), "training features and labels are misaligned"
assert X_test.shape[0] == len(y_test), "test features and labels are misaligned"
assert X_train.shape[1] == X_test.shape[1], "train and test have different feature counts"

print("Preprocessing and data splitting completed successfully.")
print(f"Training data shape: {X_train.shape}")
print(f"Test data shape: {X_test.shape}")

Preprocessing and data splitting completed successfully.
Training data shape: (17666, 33)
Test data shape: (4417, 33)


In [12]:
def _labels_for_npy(labels):
    """Return `labels` as an array np.save can write without pickling, when possible.

    np.save pickles object-dtype arrays, and np.load then refuses to read the
    file unless the caller passes allow_pickle=True. If every element is a
    str, the array is converted to a fixed-width unicode dtype, which is
    stored as plain data. Any other input is returned as np.asarray(labels)
    without further conversion.
    """
    arr = np.asarray(labels)
    if arr.dtype == object and all(isinstance(value, str) for value in arr.ravel()):
        return arr.astype(str)
    return arr


# Persist the four splits next to the input CSV.
np.save('data/X_train.npy', X_train)
np.save('data/X_test.npy', X_test)
np.save('data/y_train.npy', _labels_for_npy(y_train))
np.save('data/y_test.npy', _labels_for_npy(y_test))

In [13]:
# Confirmation message printed after the save cell has run.
save_message = "File saved successfully!"
print(save_message)

File saved successfully!
