In [1]:
import pandas as pd
import catboost as cb

Dataset: https://www.kaggle.com/datasets/uciml/horse-colic

In [2]:
# Read horse.csv (path is relative to the current working directory) into a DataFrame.
df = pd.read_csv('horse.csv')

In [3]:
# Per-column dtype and non-null count, to see which columns have gaps.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 299 entries, 0 to 298
Data columns (total 28 columns):
 #   Column                 Non-Null Count  Dtype  
---  ------                 --------------  -----  
 0   surgery                299 non-null    object 
 1   age                    299 non-null    object 
 2   hospital_number        299 non-null    int64  
 3   rectal_temp            239 non-null    float64
 4   pulse                  275 non-null    float64
 5   respiratory_rate       241 non-null    float64
 6   temp_of_extremities    243 non-null    object 
 7   peripheral_pulse       230 non-null    object 
 8   mucous_membrane        252 non-null    object 
 9   capillary_refill_time  267 non-null    object 
 10  pain                   244 non-null    object 
 11  peristalsis            255 non-null    object 
 12  abdominal_distention   243 non-null    object 
 13  nasogastric_tube       195 non-null    object 
 14  nasogastric_reflux     193 non-null    object 
 15  nasoga

In [4]:
def prep_df(df_):
    """Return a copy of ``df_`` whose object/category columns are cast to ``str``.

    Columns of any other dtype are passed through untouched and the column
    order of ``df_`` is kept. ``df_`` itself is not modified.

    NOTE(review): ``astype(str)`` typically turns missing values into literal
    strings such as ``'nan'`` -- confirm this is what downstream code expects.
    """
    stringified = df_.select_dtypes(['object', 'category']).astype(str)
    # Overwrite only the converted columns; assign() returns a new frame.
    replacements = {name: stringified[name] for name in stringified.keys()}
    return df_.assign(**replacements)

def predict_missing(df_, col, iterations=20):
    """Impute the missing values of numeric column ``col`` with a CatBoost regressor.

    A ``CatBoostRegressor`` is fitted on the rows where ``col`` is present,
    using every other column of ``df_`` as a feature, and is then used to
    predict ``col`` for the rows where it is missing.

    Parameters
    ----------
    df_ : pandas.DataFrame
        Frame containing ``col`` and the feature columns. Not modified.
    col : str
        Name of the numeric column to impute.
    iterations : int, default 20
        Number of boosting iterations passed to ``CatBoostRegressor``.

    Returns
    -------
    pandas.Series
        ``df_[col]`` with its missing entries replaced by model predictions;
        observed entries are returned unchanged, on the index of ``df_``.

    Raises
    ------
    TypeError
        If ``col`` has missing values but is not a numeric column.
    ValueError
        If ``col`` has no observed values to train on.
    """
    # Detect missing rows on the ORIGINAL column: prep_df() stringifies
    # object/category columns, which would hide their missing values.
    target = df_[col]
    is_missing = target.isna()

    # Nothing to impute: skip training and predicting on an empty frame.
    if not is_missing.any():
        return target.copy()

    if not pd.api.types.is_numeric_dtype(target):
        raise TypeError(
            f"predict_missing() fits a regressor and needs a numeric column; "
            f"{col!r} has dtype {target.dtype}")

    if is_missing.all():
        raise ValueError(
            f"column {col!r} has no observed values to train on")

    # Only the features are prepared; object/category features are cast to
    # str (presumably because CatBoost rejects NaN in categorical features).
    features = prep_df(df_.drop(columns=[col]))
    cat_cols = list(features.select_dtypes(include=['object']).columns)

    X_train = features[~is_missing]
    y_train = target[~is_missing]
    X_pred = features[is_missing]

    model = cb.CatBoostRegressor(iterations=iterations,
                                 cat_features=cat_cols,
                                 silent=True)
    model.fit(X_train, y_train)

    # Re-attach the original row labels so where() aligns each prediction
    # with the row it was made for.
    predictions = pd.Series(model.predict(X_pred), index=X_pred.index)

    return target.where(~is_missing, predictions)

In [5]:
# assign() evaluates its callables in order, so the packed_cell_volume model
# is fitted on a frame whose respiratory_rate has already been imputed.
df2 = df.assign(
    respiratory_rate=lambda frame: predict_missing(frame, 'respiratory_rate'),
    packed_cell_volume=lambda frame: predict_missing(frame, 'packed_cell_volume'),
)

In [6]:
# Fraction of rows with respiratory_rate missing in the original frame.
df.respiratory_rate.isna().mean()

0.1939799331103679

In [7]:
# Fraction of rows with respiratory_rate missing in the imputed frame df2.
df2.respiratory_rate.isna().mean()

0.0

In [8]:
# Fraction of rows with packed_cell_volume missing in the original frame.
df.packed_cell_volume.isna().mean()

0.09698996655518395

In [9]:
# First ten rows of the original frame, before imputation.
df.head(10)

Unnamed: 0,surgery,age,hospital_number,rectal_temp,pulse,respiratory_rate,temp_of_extremities,peripheral_pulse,mucous_membrane,capillary_refill_time,...,packed_cell_volume,total_protein,abdomo_appearance,abdomo_protein,outcome,surgical_lesion,lesion_1,lesion_2,lesion_3,cp_data
0,no,adult,530101,38.5,66.0,28.0,cool,reduced,,more_3_sec,...,45.0,8.4,,,died,no,11300,0,0,no
1,yes,adult,534817,39.2,88.0,20.0,,,pale_cyanotic,less_3_sec,...,50.0,85.0,cloudy,2.0,euthanized,no,2208,0,0,no
2,no,adult,530334,38.3,40.0,24.0,normal,normal,pale_pink,less_3_sec,...,33.0,6.7,,,lived,no,0,0,0,yes
3,yes,young,5290409,39.1,164.0,84.0,cold,normal,dark_cyanotic,more_3_sec,...,48.0,7.2,serosanguious,5.3,died,yes,2208,0,0,yes
4,no,adult,530255,37.3,104.0,35.0,,,dark_cyanotic,more_3_sec,...,74.0,7.4,,,died,no,4300,0,0,no
5,no,adult,528355,,,,warm,normal,pale_pink,less_3_sec,...,,,,,lived,no,0,0,0,no
6,yes,adult,526802,37.9,48.0,16.0,normal,normal,normal_pink,less_3_sec,...,37.0,7.0,,,lived,yes,3124,0,0,no
7,yes,adult,529607,,60.0,,cool,,,less_3_sec,...,44.0,8.3,,,died,yes,2208,0,0,no
8,no,adult,530051,,80.0,36.0,cool,absent,pale_pink,less_3_sec,...,38.0,6.2,,,euthanized,yes,3205,0,0,no
9,no,young,5299629,38.3,90.0,,normal,,normal_pink,less_3_sec,...,40.0,6.2,clear,2.2,lived,no,0,0,0,yes


In [10]:
# First ten rows of df2, the frame produced by the imputation cell.
df2.head(10)

Unnamed: 0,surgery,age,hospital_number,rectal_temp,pulse,respiratory_rate,temp_of_extremities,peripheral_pulse,mucous_membrane,capillary_refill_time,...,packed_cell_volume,total_protein,abdomo_appearance,abdomo_protein,outcome,surgical_lesion,lesion_1,lesion_2,lesion_3,cp_data
0,no,adult,530101,38.5,66.0,28.0,cool,reduced,,more_3_sec,...,45.0,8.4,,,died,no,11300,0,0,no
1,yes,adult,534817,39.2,88.0,20.0,,,pale_cyanotic,less_3_sec,...,50.0,85.0,cloudy,2.0,euthanized,no,2208,0,0,no
2,no,adult,530334,38.3,40.0,24.0,normal,normal,pale_pink,less_3_sec,...,33.0,6.7,,,lived,no,0,0,0,yes
3,yes,young,5290409,39.1,164.0,84.0,cold,normal,dark_cyanotic,more_3_sec,...,48.0,7.2,serosanguious,5.3,died,yes,2208,0,0,yes
4,no,adult,530255,37.3,104.0,35.0,,,dark_cyanotic,more_3_sec,...,74.0,7.4,,,died,no,4300,0,0,no
5,no,adult,528355,,,26.575245,warm,normal,pale_pink,less_3_sec,...,44.260957,,,,lived,no,0,0,0,no
6,yes,adult,526802,37.9,48.0,16.0,normal,normal,normal_pink,less_3_sec,...,37.0,7.0,,,lived,yes,3124,0,0,no
7,yes,adult,529607,,60.0,24.355778,cool,,,less_3_sec,...,44.0,8.3,,,died,yes,2208,0,0,no
8,no,adult,530051,,80.0,36.0,cool,absent,pale_pink,less_3_sec,...,38.0,6.2,,,euthanized,yes,3205,0,0,no
9,no,young,5299629,38.3,90.0,33.019102,normal,,normal_pink,less_3_sec,...,40.0,6.2,clear,2.2,lived,no,0,0,0,yes
