In [1]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import KBinsDiscretizer
from collections import Counter
import warnings
warnings.filterwarnings("ignore")

In [2]:
class NumAndNumImputer:

    '''
    - Imputes a numeric feature using the values in another specified numeric feature.
      Column pairs must be passed as input in the list of lists format like
      [src_col, dest_col], where src_col and dest_col are numeric features.

    - src_col is the column to be used as key for imputation and dest_col is column to be imputed.

    - During model fit, src_col is binned with KBinsDiscretizer and the median value of dest_col
      within each bin is computed and stored in impute_value_dict under the key
      src_col + "_" + dest_col, as a list of [bin, median] entries.

    - During transform, when a missing value is found in dest_col, it is imputed using the median value of
      the corresponding bin in src_col computed while fitting the model. A bin with no observed
      dest_col value during fit has no entry, so rows falling into it stay missing.

    - Missing values in src_col are replaced by 0 before binning, both in fit and in transform.

      Inputs:
      column_pairs: list[list[str]]
          Column pairs to be used for imputation

      n_bins: int
          Number of bins into which the src_col must be grouped using KBinsDiscretizer.

      binning_strategy: str: ["uniform", "quantile"]
          Binning strategy to be used for binning the src_col using KBinsDiscretizer.

      Raises:
      ValueError
          From fit, when two different column pairs produce the same impute_value_dict key.
    '''


    def __init__(self, column_pairs, n_bins=5, binning_strategy='quantile'):
        self.n_bins = n_bins
        self.binning_strategy = binning_strategy
        self.column_pairs = column_pairs
        self.src_cols = None
        self.dest_cols = None
        self.kbins_discretizor = KBinsDiscretizer(encode='ordinal', n_bins=self.n_bins, strategy=self.binning_strategy)
        self.impute_value_dict = dict()

    def fit(self, x):
        '''Learns the bin edges of every src_col and the per-bin median of every dest_col. Returns self.'''
        src_cols = [src for src, _ in self.column_pairs] # key columns based on which dest cols are imputed
        dest_cols = [dest for _, dest in self.column_pairs] # cols to be imputed based on src cols
        # Two different pairs must never share a key, otherwise one lookup table silently replaces the other
        # (e.g. ['a_b', 'c'] and ['a', 'b_c'] both give 'a_b_c'). Checked before anything is fitted.
        key_owner = dict()
        for src_col, dest_col in zip(src_cols, dest_cols):
            key = src_col+"_"+dest_col
            if key_owner.setdefault(key, (src_col, dest_col)) != (src_col, dest_col):
                raise ValueError(f"column pairs {list(key_owner[key])} and {[src_col, dest_col]} both map to "
                                 f"the impute_value_dict key '{key}'; rename a column to disambiguate")
        src_frame = x[src_cols].fillna(0) # missing src values are binned as if they were 0
        self.kbins_discretizor.fit(src_frame) # fitting src cols into keys, so that they can be grouped
        binned_x = self.kbins_discretizor.transform(src_frame) # transforming src cols into bins
        binned_x = pd.DataFrame(binned_x, columns=src_cols, index=x.index) # converting bin result to df from array
        impute_value_dict = dict() # built from scratch so that a refit never keeps entries of an earlier fit
        for src_col, dest_col in zip(src_cols, dest_cols):
            # creating df with one col as binned src col and other as dest col.
            # selecting first column using .iloc[:, 0] in case multiple cols are returned when src_cols=['a', 'a']
            # or dest_cols=['a', 'a']. This happens when column_pairs are [['a', 'b'], ['a', 'c']]
            df = pd.DataFrame({src_col: binned_x[[src_col]].iloc[:, 0], dest_col: x[[dest_col]].iloc[:, 0]}).dropna()
            # grouping using binned src col and calculating median of dest col which can be used for imputation
            df = df.groupby(src_col, as_index=False).median()
            # Transforming result df into list of lists where first val is src bin val and second val is dest median val to impute
            impute_value_dict[src_col+"_"+dest_col] = df.to_dict(orient="split")['data']
        # state is published only once everything succeeded
        self.src_cols = src_cols
        self.dest_cols = dest_cols
        self.impute_value_dict = impute_value_dict
        return self

    def transform(self, x):
        '''Returns a copy of x in which missing dest_col values are replaced by the median of their src_col bin.'''
        x = x.copy()
        binned_x = self.kbins_discretizor.transform(x[self.src_cols].fillna(0)) # transforming src cols into bins using prefitted kbinsdiscretizor
        binned_x = pd.DataFrame(binned_x, columns=self.src_cols, index=x.index) # converting bin result to df from array
        for src_col, dest_col in zip(self.src_cols, self.dest_cols):
            src_bins = binned_x[[src_col]].iloc[:, 0]
            # computed once per pair: bins are disjoint, so filling one bin never changes the rows of another
            dest_missing = x[[dest_col]].iloc[:, 0].isna()
            if not dest_missing.any():
                continue # nothing to impute for this pair
            for src_val, dest_val in self.impute_value_dict[src_col+"_"+dest_col]: # src_val = bin value, dest_val = median val to impute
                # if binned_x src col val = src_val and dest_col in x == NaN then impute dest col with dest_val
                x.loc[(src_bins == src_val) & dest_missing, dest_col] = dest_val
        return x

    def __repr__(self):
        return f'NumAndNumImputer(column_pairs={self.column_pairs}, n_bins={self.n_bins}, binning_strategy={self.binning_strategy})'

In [3]:
class CatAndNumImputer:

    '''
    - Imputes a numeric feature using the values in another specified categorical feature.
      Column pairs must be passed as input in the list of lists format like
      [src_col, dest_col], where src_col and dest_col are categorical and numeric features respectively.

    - src_col is the column to be used as key for imputation and dest_col is column to be imputed.

    - During model fit, src_col is grouped and the median value of dest_col within each category
      is computed and stored in impute_value_dict under the key src_col + "_" + dest_col,
      as a list of [category, median] entries.

    - During transform, when a missing value is found in dest_col, it is imputed using the median value of
      the corresponding category in src_col computed while fitting the model. A category with no
      observed dest_col value during fit has no entry, so its rows stay missing.

      Inputs:
      column_pairs: list[list[str]]
          Column pairs to be used for imputation

      Raises:
      ValueError
          From fit, when two different column pairs produce the same impute_value_dict key.
    '''


    def __init__(self, column_pairs):
        self.column_pairs = column_pairs
        self.src_cols = None
        self.dest_cols = None
        self.impute_value_dict = dict()

    def fit(self, x):
        '''Learns the median of every dest_col per category of its src_col. Returns self.'''
        src_cols = [src for src, _ in self.column_pairs] # key columns based on which dest cols are imputed
        dest_cols = [dest for _, dest in self.column_pairs] # cols to be imputed based on src cols
        # Two different pairs must never share a key, otherwise one lookup table silently replaces the other
        # (e.g. ['a_b', 'c'] and ['a', 'b_c'] both give 'a_b_c').
        key_owner = dict()
        for src_col, dest_col in zip(src_cols, dest_cols):
            key = src_col+"_"+dest_col
            if key_owner.setdefault(key, (src_col, dest_col)) != (src_col, dest_col):
                raise ValueError(f"column pairs {list(key_owner[key])} and {[src_col, dest_col]} both map to "
                                 f"the impute_value_dict key '{key}'; rename a column to disambiguate")
        impute_value_dict = dict() # built from scratch so that a refit never keeps entries of an earlier fit
        for src_col, dest_col in zip(src_cols, dest_cols):
            # creating df with one col as src col and other as dest col; rows missing either value are dropped.
            # .iloc[:, 0] picks the first column in case the selection returns more than one.
            df = pd.DataFrame({src_col: x[[src_col]].iloc[:, 0], dest_col: x[[dest_col]].iloc[:, 0]}).dropna()
            # grouping using src col and calculating median of dest col which can be used for imputation
            df = df.groupby(src_col, as_index=False).median()
            # Transforming result df into list of lists where first val is src category and second val is dest median val to impute
            impute_value_dict[src_col+"_"+dest_col] = df.to_dict(orient="split")['data']
        # state is published only once everything succeeded
        self.src_cols = src_cols
        self.dest_cols = dest_cols
        self.impute_value_dict = impute_value_dict
        return self

    def transform(self, x):
        '''Returns a copy of x in which missing dest_col values are replaced by the median of their src_col category.'''
        x = x.copy()
        for src_col, dest_col in zip(self.src_cols, self.dest_cols):
            src_values = x[[src_col]].iloc[:, 0]
            # computed once per pair: categories are disjoint, so filling one never changes the rows of another
            dest_missing = x[[dest_col]].iloc[:, 0].isna()
            if not dest_missing.any():
                continue # nothing to impute for this pair
            for src_val, dest_val in self.impute_value_dict[src_col+"_"+dest_col]: # src_val = groupby value, dest_val = median val to impute
                # if x src col val = src_val and dest_col in x == NaN then impute dest col with dest_val
                x.loc[(src_values == src_val) & dest_missing, dest_col] = dest_val
        return x

    def __repr__(self):
        return f'CatAndNumImputer(column_pairs={self.column_pairs})'

In [4]:
class CatAndCatImputer:

    '''
    - Imputes a categorical feature using the values in another specified categorical feature.
      Column pairs must be passed as input in the list of lists format like
      [src_col, dest_col], where src_col and dest_col are categorical features.

    - src_col is the column to be used as key for imputation and dest_col is column to be imputed.

    - During model fit, src_col is grouped and the mode (most common value) of dest_col within each
      category is computed and stored in impute_value_dict under the key src_col + "_" + dest_col,
      as a list of [category, mode] entries.

    - During transform, when a missing value is found in dest_col, it is imputed using the mode value of
      the corresponding category in src_col computed while fitting the model. A category with no
      observed dest_col value during fit has no entry, so its rows stay missing.

      Inputs:
      column_pairs: list[list[str]]
          Column pairs to be used for imputation

      Raises:
      ValueError
          From fit, when two different column pairs produce the same impute_value_dict key.
    '''


    def __init__(self, column_pairs):
        self.column_pairs = column_pairs
        self.src_cols = None
        self.dest_cols = None
        self.impute_value_dict = dict()

    def fit(self, x):
        '''Learns the most common value of every dest_col per category of its src_col. Returns self.'''
        src_cols = [src for src, _ in self.column_pairs] # key columns based on which dest cols are imputed
        dest_cols = [dest for _, dest in self.column_pairs] # cols to be imputed based on src cols
        # Two different pairs must never share a key, otherwise one lookup table silently replaces the other
        # (e.g. ['a_b', 'c'] and ['a', 'b_c'] both give 'a_b_c').
        key_owner = dict()
        for src_col, dest_col in zip(src_cols, dest_cols):
            key = src_col+"_"+dest_col
            if key_owner.setdefault(key, (src_col, dest_col)) != (src_col, dest_col):
                raise ValueError(f"column pairs {list(key_owner[key])} and {[src_col, dest_col]} both map to "
                                 f"the impute_value_dict key '{key}'; rename a column to disambiguate")
        impute_value_dict = dict() # built from scratch so that a refit never keeps entries of an earlier fit
        for src_col, dest_col in zip(src_cols, dest_cols):
            # creating df with one col as src col and other as dest col; rows missing either value are dropped.
            # .iloc[:, 0] picks the first column in case the selection returns more than one.
            df = pd.DataFrame({src_col: x[[src_col]].iloc[:, 0], dest_col: x[[dest_col]].iloc[:, 0]}).dropna()
            # grouping using src col and calculating most common value of dest col which can be used for imputation.
            # most_common(1) avoids sorting every distinct value; on ties the value seen first in the group wins.
            df = df.groupby(src_col, as_index=False).agg(lambda k: Counter(k).most_common(1)[0][0])
            # Transforming result df into list of lists where first val is src category and second val is dest mode val to impute
            impute_value_dict[src_col+"_"+dest_col] = df.to_dict(orient="split")['data']
        # state is published only once everything succeeded
        self.src_cols = src_cols
        self.dest_cols = dest_cols
        self.impute_value_dict = impute_value_dict
        return self

    def transform(self, x):
        '''Returns a copy of x in which missing dest_col values are replaced by the mode of their src_col category.'''
        x = x.copy()
        for src_col, dest_col in zip(self.src_cols, self.dest_cols):
            src_values = x[[src_col]].iloc[:, 0]
            # computed once per pair: categories are disjoint, so filling one never changes the rows of another
            dest_missing = x[[dest_col]].iloc[:, 0].isna()
            if not dest_missing.any():
                continue # nothing to impute for this pair
            for src_val, dest_val in self.impute_value_dict[src_col+"_"+dest_col]: # src_val = groupby value, dest_val = mode val to impute
                # if x src col val = src_val and dest_col in x == NaN then impute dest col with dest_val
                x.loc[(src_values == src_val) & dest_missing, dest_col] = dest_val
        return x

    def __repr__(self):
        return f'CatAndCatImputer(column_pairs={self.column_pairs})'

In [5]:
class NumAndCatImputer:

    '''
    - Imputes a categorical feature using the values in another specified numeric feature.
      Column pairs must be passed as input in the list of lists format like
      [src_col, dest_col], where src_col and dest_col are numeric and categorical features respectively.

    - src_col is the column to be used as key for imputation and dest_col is column to be imputed.

    - During model fit, src_col is binned with KBinsDiscretizer and the mode (most common value) of
      dest_col within each bin is computed and stored in impute_value_dict under the key
      src_col + "_" + dest_col, as a list of [bin, mode] entries.

    - During transform, when a missing value is found in dest_col, it is imputed using the mode value of
      the corresponding bin in src_col computed while fitting the model. A bin with no observed
      dest_col value during fit has no entry, so rows falling into it stay missing.

    - Missing values in src_col are replaced by 0 before binning, both in fit and in transform.

      Inputs:
      column_pairs: list[list[str]]
          Column pairs to be used for imputation

      n_bins: int
          Number of bins into which the src_col must be grouped using KBinsDiscretizer.

      binning_strategy: str: ["uniform", "quantile"]
          Binning strategy to be used for binning the src_col using KBinsDiscretizer.

      Raises:
      ValueError
          From fit, when two different column pairs produce the same impute_value_dict key.
    '''


    def __init__(self, column_pairs, n_bins=5, binning_strategy='quantile'):
        self.n_bins = n_bins
        self.binning_strategy = binning_strategy
        self.column_pairs = column_pairs
        self.src_cols = None
        self.dest_cols = None
        self.kbins_discretizor = KBinsDiscretizer(encode='ordinal', n_bins=self.n_bins, strategy=self.binning_strategy)
        self.impute_value_dict = dict()

    def fit(self, x):
        '''Learns the bin edges of every src_col and the per-bin most common value of every dest_col. Returns self.'''
        src_cols = [src for src, _ in self.column_pairs] # key columns based on which dest cols are imputed
        dest_cols = [dest for _, dest in self.column_pairs] # cols to be imputed based on src cols
        # Two different pairs must never share a key, otherwise one lookup table silently replaces the other
        # (e.g. ['a_b', 'c'] and ['a', 'b_c'] both give 'a_b_c'). Checked before anything is fitted.
        key_owner = dict()
        for src_col, dest_col in zip(src_cols, dest_cols):
            key = src_col+"_"+dest_col
            if key_owner.setdefault(key, (src_col, dest_col)) != (src_col, dest_col):
                raise ValueError(f"column pairs {list(key_owner[key])} and {[src_col, dest_col]} both map to "
                                 f"the impute_value_dict key '{key}'; rename a column to disambiguate")
        src_frame = x[src_cols].fillna(0) # missing src values are binned as if they were 0
        self.kbins_discretizor.fit(src_frame) # fitting src cols into keys, so that they can be grouped
        binned_x = self.kbins_discretizor.transform(src_frame) # transforming src cols into bins
        binned_x = pd.DataFrame(binned_x, columns=src_cols, index=x.index) # converting bin result to df from array
        impute_value_dict = dict() # built from scratch so that a refit never keeps entries of an earlier fit
        for src_col, dest_col in zip(src_cols, dest_cols):
            # creating df with one col as binned src col and other as dest col.
            # selecting first column using .iloc[:, 0] in case multiple cols are returned when src_cols=['a', 'a']
            # or dest_cols=['a', 'a']. This happens when column_pairs are [['a', 'b'], ['a', 'c']]
            df = pd.DataFrame({src_col: binned_x[[src_col]].iloc[:, 0], dest_col: x[[dest_col]].iloc[:, 0]}).dropna()
            # grouping using binned src col and calculating most common value of dest col which can be used for imputation.
            # most_common(1) avoids sorting every distinct value; on ties the value seen first in the group wins.
            df = df.groupby(src_col, as_index=False).agg(lambda k: Counter(k).most_common(1)[0][0])
            # Transforming result df into list of lists where first val is src bin val and second val is dest mode val to impute
            impute_value_dict[src_col+"_"+dest_col] = df.to_dict(orient="split")['data']
        # state is published only once everything succeeded
        self.src_cols = src_cols
        self.dest_cols = dest_cols
        self.impute_value_dict = impute_value_dict
        return self

    def transform(self, x):
        '''Returns a copy of x in which missing dest_col values are replaced by the mode of their src_col bin.'''
        x = x.copy()
        binned_x = self.kbins_discretizor.transform(x[self.src_cols].fillna(0)) # transforming src cols into bins using prefitted kbinsdiscretizor
        binned_x = pd.DataFrame(binned_x, columns=self.src_cols, index=x.index) # converting bin result to df from array
        for src_col, dest_col in zip(self.src_cols, self.dest_cols):
            src_bins = binned_x[[src_col]].iloc[:, 0]
            # computed once per pair: bins are disjoint, so filling one bin never changes the rows of another
            dest_missing = x[[dest_col]].iloc[:, 0].isna()
            if not dest_missing.any():
                continue # nothing to impute for this pair
            for src_val, dest_val in self.impute_value_dict[src_col+"_"+dest_col]: # src_val = bin value, dest_val = mode val to impute
                # if binned_x src col val = src_val and dest_col in x == NaN then impute dest col with dest_val
                x.loc[(src_bins == src_val) & dest_missing, dest_col] = dest_val
        return x

    def __repr__(self):
        return f'NumAndCatImputer(column_pairs={self.column_pairs}, n_bins={self.n_bins}, binning_strategy={self.binning_strategy})'

In [6]:
class MissingImputer:

    '''
    - Imputes numeric and categorical features based on the strategies specified.

    - Strategies must be specified as a dict like {column_name: strategy}.

    - Strategies supported are "mean", "median", "mode".

    - Any other value (a number or a str) is used as-is as a constant to impute with.

    - Example: strategies={"col_1": "mean", "col_2": "median", "col_3": "mode",
                           "col_4": "not_available", "col_5": 9999}

    - "not_available" and 9999 are examples of constant values to be imputed in categorical and numeric
      columns respectively.

    - A column with no observed value at fit time gets NaN as its "mean"/"median"/"mode" fill value,
      which leaves the column unchanged in transform.

      Inputs:
      strategies: dict
          Dict specifying imputation strategies.
    '''


    def __init__(self, strategies: dict):
        self.strategies = strategies
        self.impute_values = {}

    def fit(self, x):
        '''Computes and stores the fill value of every column listed in strategies. Returns self.'''
        impute_values = {} # built from scratch so that a refit never keeps entries of an earlier fit
        for col, strategy in self.strategies.items():
            observed = x[col].dropna()
            if strategy == 'mean':
                impute_values[col] = np.mean(observed)
            elif strategy == 'median':
                impute_values[col] = np.median(observed)
            elif strategy == 'mode':
                # most_common(1) is empty when the column has no observed value; fall back to NaN
                # (same outcome as mean/median on an empty column) instead of raising IndexError.
                # On ties the value seen first wins.
                top = Counter(observed).most_common(1)
                impute_values[col] = top[0][0] if top else np.nan
            else:
                impute_values[col] = strategy # anything else is a constant fill value
        self.impute_values = impute_values
        return self

    def transform(self, x):
        '''Returns a copy of x with missing values of every fitted column replaced by its stored fill value.'''
        x = x.copy()
        for col, fill_value in self.impute_values.items():
            x[col] = x[col].fillna(fill_value)
        return x

    def __repr__(self):
        return f'MissingImputer(strategies={self.strategies})'

### Example

##### data source: https://www.kaggle.com/datasets/arashnic/hr-analytics-job-change-of-data-scientists/data

In [7]:
# Load the training set; the relative path expects aug_train.csv in the working directory
data = pd.read_csv('aug_train.csv')

In [8]:
# Inspect column dtypes to see which features are numeric and which are object typed
data.dtypes

enrollee_id                 int64
city                       object
city_development_index    float64
gender                     object
relevent_experience        object
enrolled_university        object
education_level            object
major_discipline           object
experience                 object
company_size               object
company_type               object
last_new_job               object
training_hours              int64
target                    float64
dtype: object

In [9]:
# Missing values per column before any imputation
data.isna().sum()

enrollee_id                  0
city                         0
city_development_index       0
gender                    4508
relevent_experience          0
enrolled_university        386
education_level            460
major_discipline          2813
experience                  65
company_size              5938
company_type              6140
last_new_job               423
training_hours               0
target                       0
dtype: int64

##### Using CatAndCatImputer to impute categorical features using another categorical feature. 

Using city feature to impute missing values in gender feature (['city', 'gender']), etc.

In [10]:
# Each pair is [src_col, dest_col]: a missing dest_col value is filled with the most common
# dest_col value seen, at fit time, for the row's src_col category
cat_cat_imputer = CatAndCatImputer(column_pairs=[['city', 'gender'], ['city', 'education_level'], ['education_level', 'major_discipline'], 
                                                 ['education_level', 'experience'], ['experience', 'last_new_job'], ['education_level', 'enrolled_university']])

In [11]:
cat_cat_imputer.fit(data) # fitting imputer to data

CatAndCatImputer(column_pairs=[['city', 'gender'], ['city', 'education_level'], ['education_level', 'major_discipline'], ['education_level', 'experience'], ['experience', 'last_new_job'], ['education_level', 'enrolled_university']])

In [12]:
# Uncomment to inspect the fitted lookup tables ([src value, imputed value] entries) of every pair
# cat_cat_imputer.impute_value_dict

In [13]:
res = cat_cat_imputer.transform(data) # transform returns an imputed copy stored in res; data itself is left unchanged

##### Using NumAndCatImputer to impute categorical features using a numeric feature.

Using city_development_index feature to impute missing values in company_size feature (['city_development_index', 'company_size']), etc.

In [14]:
# Each pair is [src_col, dest_col]: the numeric src_col is binned and a missing dest_col value
# is filled with the most common dest_col value of the row's bin
num_cat_imputer = NumAndCatImputer(column_pairs=[['city_development_index', 'company_size'], ['city_development_index', 'company_type']])

In [15]:
# Fitted on res, i.e. on the output of the categorical imputation step above
num_cat_imputer.fit(res)

NumAndCatImputer(column_pairs=[['city_development_index', 'company_size'], ['city_development_index', 'company_type']], n_bins=5, binning_strategy=quantile)

In [16]:
# One [bin, most common value] entry per source bin, for every column pair
num_cat_imputer.impute_value_dict

{'city_development_index_company_size': [[0.0, '50-99'],
  [1.0, '50-99'],
  [2.0, '50-99'],
  [3.0, '50-99']],
 'city_development_index_company_type': [[0.0, 'Pvt Ltd'],
  [1.0, 'Pvt Ltd'],
  [2.0, 'Pvt Ltd'],
  [3.0, 'Pvt Ltd']]}

In [17]:
# NOTE(review): res is rebound to its own transformed copy, so re-running this cell alone
# feeds already-imputed data back into the imputer
res = num_cat_imputer.transform(res)

##### There are missing values left in gender and major_discipline features which couldn't be filled

In [18]:
# Missing values per column after the two pairwise imputers
res.isna().sum()

enrollee_id                  0
city                         0
city_development_index       0
gender                       1
relevent_experience          0
enrolled_university          0
education_level              0
major_discipline          2327
experience                   0
company_size                 0
company_type                 0
last_new_job                 0
training_hours               0
target                       0
dtype: int64

##### Below we can see that gender still has 1 missing value, for city_171

In [19]:
# Rows where gender is still missing, shown together with their city
res.loc[res['gender'].isna(), ['city', 'gender']]

Unnamed: 0,city,gender
14451,city_171,


##### We can see that there is only 1 sample with city_171, and its gender value is missing, so no gender value could be learned for that city and the missing value couldn't be imputed

In [20]:
# Every row of city_171, to see which gender values exist for that city
res.loc[res['city'] == 'city_171', ['city', 'gender']]

Unnamed: 0,city,gender
14451,city_171,


##### Below we can see that where the major_discipline is still missing, the education_level is ['High School', 'Primary School'] 

In [21]:
# Distinct education_level values among the rows where major_discipline is still missing
res.loc[res['major_discipline'].isna(), ['education_level', 'major_discipline']].drop_duplicates()

Unnamed: 0,education_level,major_discipline
6,High School,
213,Primary School,


##### But we couldn't find ['High School', 'Primary School'] in imputation list below. As seen above, all the major_discipline values are missing for education_level ['High School', 'Primary School']

In [22]:
# Lookup table learned for the ['education_level', 'major_discipline'] pair
cat_cat_imputer.impute_value_dict['education_level_major_discipline']

[['Graduate', 'STEM'], ['Masters', 'STEM'], ['Phd', 'STEM']]

##### This seems logical as a person with ['High School', 'Primary School'] education cannot have a major_discipline. So these values must be marked separately, instead of marking them with mode. So we'll be imputing it with a constant value "not_applicable"

##### Finally, we'll use MissingImputer to impute any left over missing values with mean/mode/constant. We'll add strategy for every column to impute any left over missing values, just in case 

In [23]:
# Per-column fallback: "mean" / "median" / "mode" are computed from the data at fit time,
# any other value (e.g. -1, "missing", "not_applicable") is used as a constant fill value
strategies = {"enrollee_id": -1, "city": "missing", "city_development_index": "mean", "gender": "mode", 
"relevent_experience": "mode", "enrolled_university": "mode", "education_level": "mode", 
"major_discipline": "not_applicable", "experience": "mode", "company_size": "mode", 
"company_type": "mode", "last_new_job": "not_available", "training_hours": "mean", "target": "mode"}

missing_imputer = MissingImputer(strategies=strategies)

In [24]:
# Computes the fill value of every column listed in strategies
missing_imputer.fit(res)

MissingImputer(strategies={'enrollee_id': -1, 'city': 'missing', 'city_development_index': 'mean', 'gender': 'mode', 'relevent_experience': 'mode', 'enrolled_university': 'mode', 'education_level': 'mode', 'major_discipline': 'not_applicable', 'experience': 'mode', 'company_size': 'mode', 'company_type': 'mode', 'last_new_job': 'not_available', 'training_hours': 'mean', 'target': 'mode'})

In [25]:
# Fill values resolved during fit; constant strategies are stored unchanged
missing_imputer.impute_values

{'enrollee_id': -1,
 'city': 'missing',
 'city_development_index': 0.8288480008351603,
 'gender': 'Male',
 'relevent_experience': 'Has relevent experience',
 'enrolled_university': 'no_enrollment',
 'education_level': 'Graduate',
 'major_discipline': 'not_applicable',
 'experience': '>20',
 'company_size': '50-99',
 'company_type': 'Pvt Ltd',
 'last_new_job': 'not_available',
 'training_hours': 65.36689633573442,
 'target': 0.0}

In [26]:
# Fill whatever is still missing; transform returns a new frame, which replaces res
res = missing_imputer.transform(res)

In [27]:
res.isna().sum() # sanity check: no missing values left in any column

enrollee_id               0
city                      0
city_development_index    0
gender                    0
relevent_experience       0
enrolled_university       0
education_level           0
major_discipline          0
experience                0
company_size              0
company_type              0
last_new_job              0
training_hours            0
target                    0
dtype: int64