# Missing Values Imputation Function Using ML
### `Author:` [`Khizer Rehman`](khizerr776@gmail.com)
### `Date:` 01-05-2025

#### **Steps:**
1. Import all the necessary Libraries
2. Load the data
3. Find the columns with the missing values and store in a object
4. Find the columns based on data type
   1. Numeric
   2. Categoricals
   3. Boolean 
5. Define the function to impute missing values
6. Apply the function to our dataset with missing values 
7. Check the missing values after imputation

In [6]:
# import all the necessary libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.preprocessing import LabelEncoder
from sklearn.ensemble import RandomForestClassifier, RandomForestRegressor
from sklearn.metrics import mean_absolute_error, accuracy_score, precision_score, r2_score, mean_squared_error

# importing the iterative imputer 
from sklearn.experimental import enable_iterative_imputer
from sklearn.impute import IterativeImputer

# import the train_test_split function
from sklearn.model_selection import train_test_split

In [7]:
# load the dataset
df = pd.read_csv("./heart_disease_uci.csv")   # . indecated the file is in the same directory as the script
df.head()

Unnamed: 0,id,age,sex,dataset,cp,trestbps,chol,fbs,restecg,thalch,exang,oldpeak,slope,ca,thal,num
0,1,63,Male,Cleveland,typical angina,145.0,233.0,True,lv hypertrophy,150.0,False,2.3,downsloping,0.0,fixed defect,0
1,2,67,Male,Cleveland,asymptomatic,160.0,286.0,False,lv hypertrophy,108.0,True,1.5,flat,3.0,normal,2
2,3,67,Male,Cleveland,asymptomatic,120.0,229.0,False,lv hypertrophy,129.0,True,2.6,flat,2.0,reversable defect,1
3,4,37,Male,Cleveland,non-anginal,130.0,250.0,False,normal,187.0,False,3.5,downsloping,0.0,normal,0
4,5,41,Female,Cleveland,atypical angina,130.0,204.0,False,lv hypertrophy,172.0,False,1.4,upsloping,0.0,normal,0


In [8]:
# we want the only columns with missing values
missing_data_cols= df.isnull().sum()[df.isnull().sum() > 0 ].index.tolist()
missing_data_cols


['trestbps',
 'chol',
 'fbs',
 'restecg',
 'thalch',
 'exang',
 'oldpeak',
 'slope',
 'ca',
 'thal']

In [9]:
# find only the categorical columns from the dataset
cat_cols = df.select_dtypes(include= 'object').columns.tolist()

# find the numerical columns from the dataset
num_cols = df.select_dtypes(exclude = 'object').columns.tolist()

print("Categorical columns: ", cat_cols)
print("Numerical columns: ", num_cols)

Categorical columns:  ['sex', 'dataset', 'cp', 'fbs', 'restecg', 'exang', 'slope', 'thal']
Numerical columns:  ['id', 'age', 'trestbps', 'chol', 'thalch', 'oldpeak', 'ca', 'num']


In [10]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 920 entries, 0 to 919
Data columns (total 16 columns):
 #   Column    Non-Null Count  Dtype  
---  ------    --------------  -----  
 0   id        920 non-null    int64  
 1   age       920 non-null    int64  
 2   sex       920 non-null    object 
 3   dataset   920 non-null    object 
 4   cp        920 non-null    object 
 5   trestbps  861 non-null    float64
 6   chol      890 non-null    float64
 7   fbs       830 non-null    object 
 8   restecg   918 non-null    object 
 9   thalch    865 non-null    float64
 10  exang     865 non-null    object 
 11  oldpeak   858 non-null    float64
 12  slope     611 non-null    object 
 13  ca        309 non-null    float64
 14  thal      434 non-null    object 
 15  num       920 non-null    int64  
dtypes: float64(5), int64(3), object(8)
memory usage: 115.1+ KB


In [37]:
# create a list of the categorical columns that we want to encode
categorical_cols = ['thal', 'ca', 'slope', 'exang', 'restecg', 'fbs', 'cp', 'sex', 'num']

# create the list for boolean columns
bool_cols = ['fbs', 'exang']

# create the list same for numerical columns
numerical_cols = ['oldpeak', 'thalch', 'chol', 'trestbps', 'age']

In [33]:
# we may use this function to store all the label encoders in one place

class store_encoders:

    def __init__(self):
    
        # create a dictionary to store all the encoders
        self.encoders = {}

    # add the label encoder to the dictionary
    def add_encoder(self, name, encoder):
        if name in self.encoders:
            raise ValueError(f"Label Encoder for '{name}' already exists.") 
        self.encoders[name] = encoder
    
    # get the encoder by its name    
    def get_encoder(self, name):
        if name not in self.encoders:
            raise ValueError(f"Label Encoder for '{name}' does not exist.")
        return self.encoders[name]

In [None]:
# another way to store all the encoders in one place 
'''
label_encoders = {}
le = encoder_store()
label_encoders[col] = le
'''

In [39]:
# define a function to impute missing values in the dataset

def impute_categorical_missing_data(pass_col):
    '''
    * first of all we will create a dataFrame with all the missing values
    of pass_col and other columns remain same as in original dataframe
    * secondly we will create another dataFrame without the missing values
    of pass_col and other will remain same as in original dataframe
    '''

    df_null = df[df[pass_col].isnull()]                      
    df_not_null = df[df[pass_col].notnull()]     

    '''
    Spliting the dataframe into freature and target so,
    df_not_null and drop the pass_col in X.
    y having only pass_col column of that dataframe
    '''
    # spliting the df_without_missing_values into features and target variables in X and y
    X = df_not_null.drop(pass_col, axis=1)    # drop the pass_col from the original dataframe so features are all the columns other than pass_col 
    y = df_not_null[pass_col]                 # target variable is the pass_col in the original dataframe which we have to predict for the missing values

    '''
    Extract other columns with missing valuse from X
    to impute their missing values to train the model
    '''
    other_missing_cols = [col for col in missing_data_cols if col != pass_col]

    '''
    use the labelEncoder on all the categorical and object dtype columns
    of X
    '''
    
    # Encode the categorical columns by using labelEncoder
    # label_Encoders = {}                      # dictionary to store all the label encoders 

    # initialize the store_encoder class
    Label_encoders = store_encoders()

    for col in X.columns:

        # check if the column is categorical or object
        if X[col].dtype == 'category' or X[col].dtype == 'object':
            
            # initialize the LabelEncoder
            le = LabelEncoder()

            X[col] = le.fit_transform(X[col])


            # using add function of the store_encoders class
            Label_encoders.add_encoder(col, le)
            
            # label_Encoders[col] = le          # store the all the encoder in the dictionary
    
    # check if the pass_col is a boolean then also encode it
    if pass_col in bool_cols:
        le = LabelEncoder()
        y = le.fit_transform(y)
    
    # initialize the IterativeImputer to impute the missing values 
    iterative_imputer = IterativeImputer(estimator = RandomForestRegressor(random_state = 42), add_indicator = True)
    
    # impute the missing values of other missing columns of X
    for col in other_missing_cols:
        if X[col].isnull().sum() > 0:

            # change the shape of 1D array to 2D because iterativeImputer expects 2D array
            col_with_missing_values = X[col].values.reshape(-1, 1)
            
            # fit_transform the IterativeImputer on that column to remove missing values
            imputed_values = iterative_imputer.fit_transform(col_with_missing_values)

            # slice the first column of the imputed_values 2D array and convert into 1D
            X[col] = imputed_values[:, 0]
        else:
            pass
    
    # spliting the data into training and testing sets
    X_train, X_test, y_train, y_test = train_test_split(X, y , test_size=0.2, random_state= 42)           

    # initialize the random forest classifier
    rf_classifier = RandomForestClassifier()

    # fit the model on the training data
    rf_classifier.fit(X_train, y_train)

    # predict the target variable for the test data
    y_pred = rf_classifier.predict(X_test)

    # calculate the accuracy of the model
    accuracy = accuracy_score(y_test, y_pred)
    
    # print the accuracy and precision of the model
    print("The Feature '" +pass_col+ "' has been imputed with ", round(accuracy * 100),2), "accuracy\n"
    
    # dataset with all the null values of pass_col and we will remove pass_col from X 
    X = df_null.drop(pass_col, axis=1)

    # use the label encoders to encode the categorical and object dtype columns of X
    for col in X.columns:
        if X[col].dtype == 'category' or X[col].dtype == 'object':
            X[col] = Label_encoders.get_encoder(col).transform(X[col])
    
    # impute the missing values of other missing columns of X
    for col in other_missing_cols:
        if X[col].isnull().sum() > 0:

            # change the shape of 1D array to 2D because iterativeImputer expects 2D array
            col_with_missing_values = X[col].values.reshape(-1, 1)
            
            # fit_transform the IterativeImputer on that column to remove missing values
            imputed_values = iterative_imputer.fit_transform(col_with_missing_values)

            # slice the first column of the imputed_values 2D array and convert into 1D
            X[col] = imputed_values[:, 0]
        else:
            pass
    
    # predict the target variable for the null values of 
    if len(df_null)>0:
        df_null[pass_col] = rf_classifier.predict(X)
        if pass_col in bool_cols:
            df_null[pass_col] = df_null[pass_col].map({0: False, 1: True})
        else:
            pass
    else:
        pass

    df_combined = pd.concat([df_not_null, df_null])
    
    return df_combined[pass_col]


In [40]:
# function to impute the continuous missing values
def impute_continuous_missing_data(pass_col):
    
    df_null = df[df[pass_col].isnull()]
    df_not_null = df[df[pass_col].notnull()]

    X = df_not_null.drop(pass_col, axis=1)
    y = df_not_null[pass_col]
    
    other_missing_cols = [col for col in missing_data_cols if col != pass_col]
    
    label_encoder = LabelEncoder()

    for col in X.columns:
        if X[col].dtype == 'object' or X[col].dtype == 'category':
            X[col] = label_encoder.fit_transform(X[col])
    
    iterative_imputer = IterativeImputer(estimator=RandomForestRegressor(random_state=42), add_indicator=True)

    for col in other_missing_cols:
        if X[col].isnull().sum() > 0:
            col_with_missing_values = X[col].values.reshape(-1, 1)
            imputed_values = iterative_imputer.fit_transform(col_with_missing_values)
            X[col] = imputed_values[:, 0]
        else:
            pass
    
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

    rf_regressor = RandomForestRegressor()

    rf_regressor.fit(X_train, y_train)

    y_pred = rf_regressor.predict(X_test)

    print("MAE =", mean_absolute_error(y_test, y_pred), "\n")
    print("RMSE =", np.sqrt(mean_squared_error(y_test, y_pred)), "\n")
    print("R2 =", r2_score(y_test, y_pred), "\n")

    X = df_null.drop(pass_col, axis=1)

    for col in X.columns:
        if X[col].dtype == 'object' or X[col].dtype == 'category':
            X[col] = label_encoder.fit_transform(X[col])

    for col in other_missing_cols:
        if X[col].isnull().sum() > 0:
            col_with_missing_values = X[col].values.reshape(-1, 1)
            imputed_values = iterative_imputer.fit_transform(col_with_missing_values)
            X[col] = imputed_values[:, 0]
        else:
            pass
                
    if len(df_null) > 0: 
        df_null[pass_col] = rf_regressor.predict(X)
    else:
        pass

    df_combined = pd.concat([df_not_null, df_null])
    
    return df_combined[pass_col]


In [35]:
df.isnull().sum()[df.isnull().sum() > 0].sort_values(ascending=False)

ca          611
thal        486
slope       309
fbs          90
oldpeak      62
trestbps     59
thalch       55
exang        55
chol         30
restecg       2
dtype: int64

In [41]:
# remove warning
import warnings
warnings.filterwarnings('ignore')

# impute missing values using our functions
for col in missing_data_cols:
    print("Missing Values", col, ":", str(round((df[col].isnull().sum() / len(df)) * 100, 2))+"%")
    if col in categorical_cols:
        df[col] = impute_categorical_missing_data(col)
    elif col in numerical_cols:
        df[col] = impute_continuous_missing_data(col)
    else:
        pass

Missing Values trestbps : 6.41%
MAE = 13.062774566473987 

RMSE = 17.05816885997707 

R2 = 0.0914630061972822 

Missing Values chol : 3.26%
MAE = 45.03202247191012 

RMSE = 64.06515594878056 

R2 = 0.6748659747124507 

Missing Values fbs : 9.78%
The Feature 'fbs' has been imputed with  81 2
Missing Values restecg : 0.22%
The Feature 'restecg' has been imputed with  65 2
Missing Values thalch : 5.98%
MAE = 16.695664739884393 

RMSE = 21.633393275773393 

R2 = 0.3194835865910084 

Missing Values exang : 5.98%
The Feature 'exang' has been imputed with  80 2
Missing Values oldpeak : 6.74%
MAE = 0.5633837209302327 

RMSE = 0.7911434982882188 

R2 = 0.4036949703841892 

Missing Values slope : 33.59%
The Feature 'slope' has been imputed with  66 2
Missing Values ca : 66.41%
The Feature 'ca' has been imputed with  63 2
Missing Values thal : 52.83%
The Feature 'thal' has been imputed with  72 2


In [42]:
df.isnull().sum()

id          0
age         0
sex         0
dataset     0
cp          0
trestbps    0
chol        0
fbs         0
restecg     0
thalch      0
exang       0
oldpeak     0
slope       0
ca          0
thal        0
num         0
dtype: int64