# 2 Data Wrangling - "Of Genomes And Genetics"

#### Objective: Prepare the "Of Genomes and Genetics" dataset for analysis by loading the data, inspecting its structure, handling missing values, removing duplicate rows, clipping outliers, and saving the cleaned files.

## Import

In [1]:
from pathlib import Path

import numpy as np
import pandas as pd

## Load The Data

In [2]:
# Locations of the raw train/test splits, relative to the notebook's working directory.
train_csv_path = 'raw_data/train.csv'
test_csv_path = 'raw_data/test.csv'

train = pd.read_csv(filepath_or_buffer=train_csv_path)
test = pd.read_csv(filepath_or_buffer=test_csv_path)

## Data Wrangling

In [3]:
# Show both schemas so differences between the train and test columns are easy to spot.
train_column_names = train.columns.tolist()
test_column_names = test.columns.tolist()
print("Train columns:", train_column_names)
print("Test columns:", test_column_names)

Train columns: ['Patient Id', 'Patient Age', "Genes in mother's side", 'Inherited from father', 'Maternal gene', 'Paternal gene', 'Blood cell count (mcL)', 'Patient First Name', 'Family Name', "Father's name", "Mother's age", "Father's age", 'Institute Name', 'Location of Institute', 'Status', 'Respiratory Rate (breaths/min)', 'Heart Rate (rates/min', 'Test 1', 'Test 2', 'Test 3', 'Test 4', 'Test 5', 'Parental consent', 'Follow-up', 'Gender', 'Birth asphyxia', 'Autopsy shows birth defect (if applicable)', 'Place of birth', 'Folic acid details (peri-conceptional)', 'H/O serious maternal illness', 'H/O radiation exposure (x-ray)', 'H/O substance abuse', 'Assisted conception IVF/ART', 'History of anomalies in previous pregnancies', 'No. of previous abortion', 'Birth defects', 'White Blood cell count (thousand per microliter)', 'Blood test result', 'Symptom 1', 'Symptom 2', 'Symptom 3', 'Symptom 4', 'Symptom 5', 'Genetic Disorder', 'Disorder Subclass']
Test columns: ['Patient Id', 'Patient

In [4]:
# DataFrame.info() writes its summary straight to stdout and returns None,
# so it is called directly; wrapping it in print() appends a stray "None"
# line after each report.
train.info()
test.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 22083 entries, 0 to 22082
Data columns (total 45 columns):
 #   Column                                            Non-Null Count  Dtype  
---  ------                                            --------------  -----  
 0   Patient Id                                        22083 non-null  object 
 1   Patient Age                                       20656 non-null  float64
 2   Genes in mother's side                            22083 non-null  object 
 3   Inherited from father                             21777 non-null  object 
 4   Maternal gene                                     19273 non-null  object 
 5   Paternal gene                                     22083 non-null  object 
 6   Blood cell count (mcL)                            22083 non-null  float64
 7   Patient First Name                                22083 non-null  object 
 8   Family Name                                       12392 non-null  object 
 9   Father's name    

In [5]:
def fill_missing_values(data):
    """Impute missing values in ``data`` in place.

    The numeric columns listed below are filled with their own median and the
    listed categorical columns with their most frequent value. Listed columns
    that are absent from ``data`` are skipped, and columns that are not listed
    are left untouched.

    Parameters
    ----------
    data : pandas.DataFrame
        Frame to modify. It is mutated in place.

    Returns
    -------
    None
        Nothing is returned, so calling this as the last statement of a cell
        produces no output.
    """
    numeric_columns = ['Patient Age', "Mother's age", "Father's age", 'Blood cell count (mcL)',
                       'White Blood cell count (thousand per microliter)']
    for column in numeric_columns:
        if column in data.columns:
            # Assign the result back instead of calling fillna(inplace=True) on
            # the selected column: the chained in-place form is deprecated and
            # does not update the frame when pandas Copy-on-Write is enabled.
            data[column] = data[column].fillna(data[column].median())

    # NOTE(review): 'Genetic Disorder' and 'Disorder Subclass' look like the
    # prediction targets; confirm that imputing them with the mode is intended.
    categorical_columns = ['Gender', 'Genetic Disorder', 'Disorder Subclass']
    for column in categorical_columns:
        if column in data.columns:
            modes = data[column].mode()
            # mode() is empty when every value is missing; there is nothing to
            # fill with, so leave the column as it is rather than raising.
            if not modes.empty:
                data[column] = data[column].fillna(modes.iloc[0])

In [6]:
# Impute each split in turn; fill_missing_values mutates the frame it receives.
# NOTE(review): the test split is filled using its own statistics, not the
# train split's - confirm this is intended.
for frame in (train, test):
    fill_missing_values(frame)

In [7]:
# Remove fully duplicated rows from both splits, modifying each frame in place.
for frame in (train, test):
    frame.drop_duplicates(inplace=True)

In [8]:
def handle_outliers(data, column, factor=1.5):
    """Clip ``data[column]`` in place to IQR-based bounds.

    Values below ``Q1 - factor * IQR`` or above ``Q3 + factor * IQR`` are
    replaced by the respective bound, where Q1 and Q3 are the 25th and 75th
    percentiles of the column and ``IQR = Q3 - Q1``. If ``column`` is not
    present in ``data`` the function does nothing.

    Parameters
    ----------
    data : pandas.DataFrame
        Frame to modify. It is mutated in place.
    column : str
        Name of the column to clip.
    factor : float, default 1.5
        Multiplier applied to the IQR when computing the bounds. The default
        reproduces the previously hard-coded value.

    Returns
    -------
    None
    """
    if column not in data.columns:
        return

    values = data[column]
    q1 = values.quantile(0.25)
    q3 = values.quantile(0.75)
    iqr = q3 - q1
    lower_bound = q1 - factor * iqr
    upper_bound = q3 + factor * iqr
    # Series.clip keeps the index and leaves missing values as they are.
    data[column] = values.clip(lower=lower_bound, upper=upper_bound)

# Only the train split's 'Patient Age' column is clipped here.
# NOTE(review): the test split and the other numeric columns are left as-is - confirm this is intended.
handle_outliers(data=train, column='Patient Age')

In [9]:
# DataFrame.to_csv does not create missing parent directories, so make sure
# the output folder exists before writing (e.g. on a fresh checkout).
output_dir = Path('data')
output_dir.mkdir(parents=True, exist_ok=True)

# index=False keeps the pandas row index out of the saved files.
train.to_csv(output_dir / 'train_cleaned.csv', index=False)
test.to_csv(output_dir / 'test_cleaned.csv', index=False)

In [10]:
# Unconditional confirmation message; it does not itself check that the CSV files exist.
print('File saved successfully!')

File saved successfully!
