# Encoding and Imputation Lab

## RankFrequencyEncoding
Categorical encoding based on how frequently each value appears in the dataset and on the rank of the feature value.

In [1]:
from __future__ import annotations
import sys
import pandas as pd
import logging
import numpy as np
from studioai.analysis.prep.encode import RankFrequencyEncoder
from sklearn.experimental import enable_iterative_imputer
from sklearn.impute import IterativeImputer

In [2]:

# Logging setup: this notebook's logger emits DEBUG records, and the root
# handler installed by basicConfig writes to stdout.
logger = logging.getLogger(__name__)
logger.setLevel(logging.DEBUG)
logging.basicConfig(stream=sys.stdout)

In [3]:
# Path of the CSV file that is read with pd.read_csv and passed to CaseImputer below.
FP = "data/meta/1_staged/calc_cases.csv"

In [4]:
class CaseImputer:
    """Imputes missing values through an encode -> impute -> inverse-transform pipeline.

    The data are encoded with ``RankFrequencyEncoder``, missing entries are
    estimated with scikit-learn's ``IterativeImputer``, every estimate is
    snapped to the nearest encoded value observed during ``fit``, and the
    result is handed to the encoder's ``inverse_transform``.

    Args:
        max_iter (int): Forwarded to ``IterativeImputer`` as ``max_iter``.
        initial_strategy (str): Forwarded to ``IterativeImputer`` as
            ``initial_strategy``.
        random_state (int | None): Forwarded to ``IterativeImputer`` as
            ``random_state``.
    """

    def __init__(
        self,
        max_iter: int = 50,
        initial_strategy: str = "most_frequent",
        random_state: int | None = None,
    ) -> None:
        self._max_iter = max_iter
        self._initial_strategy = initial_strategy
        self._random_state = random_state
        # Column name -> array of encoded values seen at fit time.
        self._encoded_values = {}
        self._enc = None
        self._imp = None

    def fit(self, df: pd.DataFrame) -> CaseImputer:
        """Fits the encoder and the imputer to the data.

        Instantiates the encoder, encodes the data and trains the imputer
        on the complete (NaN-free) encoded rows. A map of columns to valid
        encoded values is captured so that imputed values can be mapped
        back to valid values before the inverse transform.

        Args:
            df (pd.DataFrame): Data used to fit the encoder and the imputer.

        Returns:
            CaseImputer: This instance, so calls can be chained.

        Raises:
            ValueError: If the encoded data contain no complete rows.
        """
        encoder = RankFrequencyEncoder()
        df_enc = encoder.fit_transform(df=df)

        # Get complete cases for imputer training (fit)
        df_enc_complete = df_enc.dropna(axis=0)
        if df_enc_complete.empty:
            raise ValueError(
                "Cannot fit the imputer: the encoded data contain no complete "
                "(NaN-free) rows."
            )

        imputer = IterativeImputer(
            max_iter=self._max_iter,
            initial_strategy=self._initial_strategy,
            random_state=self._random_state,
        )
        imputer.fit(X=df_enc_complete.values)

        # Commit state only after everything succeeded so a failed fit does
        # not leave the instance half-configured.
        self._enc = encoder
        self._imp = imputer
        self._extract_encoded_values(df=df_enc)
        return self

    def transform(self, df: pd.DataFrame) -> pd.DataFrame:
        """Performs the imputation and returns the imputed DataFrame.

        Args:
            df (pd.DataFrame): Data containing the missing values to impute.

        Returns:
            pd.DataFrame: The result of the encoder's ``inverse_transform``
            applied to the imputed, value-snapped data.

        Raises:
            sklearn.exceptions.NotFittedError: If ``fit`` has not been called.
        """
        if self._enc is None or self._imp is None:
            # NotFittedError subclasses AttributeError, which is what the
            # unguarded attribute access on None used to raise.
            from sklearn.exceptions import NotFittedError

            raise NotFittedError(
                "This CaseImputer instance is not fitted yet. Call 'fit' "
                "before calling 'transform'."
            )

        df_enc = self._enc.transform(df=df)
        imputed = self._imp.transform(X=df_enc.values)
        # Label the array with the encoded frame's own axes: the array was
        # produced from df_enc, and keeping its index preserves row identity.
        df_imp = pd.DataFrame(data=imputed, columns=df_enc.columns, index=df_enc.index)
        logger.debug(df_imp.head())
        df_imp = self._map_imputed_values(df=df_imp)
        logger.debug(df_imp.head())
        return self._enc.inverse_transform(df=df_imp)

    def _extract_encoded_values(self, df: pd.DataFrame) -> None:
        """Creates a dictionary of valid (non-null, unique) values by column.

        The dictionary is rebuilt from scratch on every call so that columns
        from an earlier fit do not linger.
        """
        self._encoded_values = {col: df[col].dropna().unique() for col in df.columns}

    def _map_imputed_values(self, df: pd.DataFrame) -> pd.DataFrame:
        """Maps values to valid values (used after imputation).

        Each entry is replaced by the closest value recorded for its column
        by ``_extract_encoded_values``; on a tie the smaller value wins.
        The input frame is left unmodified.

        Args:
            df (pd.DataFrame): Imputed, still-encoded data.

        Returns:
            pd.DataFrame: A copy of ``df`` holding only valid encoded values.
        """
        mapped = df.copy()
        if mapped.empty:
            return mapped

        for col in mapped.columns:
            valid = np.sort(np.asarray(self._encoded_values[col]))
            observed = mapped[col].to_numpy()
            # (rows x valid values) distance matrix; argmin returns the first
            # minimum, i.e. the smaller candidate because `valid` is sorted.
            nearest = np.abs(observed[:, None] - valid[None, :]).argmin(axis=1)
            mapped[col] = valid[nearest]
        return mapped



In [5]:
# Read the CSV at FP into a DataFrame.
df = pd.read_csv(FP)
# random_state is fixed at 50 and forwarded by CaseImputer to IterativeImputer.
imp = CaseImputer(random_state=50)
# fit() returns the imputer itself; left as a bare expression, its repr may be
# echoed in the cell output.
imp.fit(df=df)
# Impute the same frame that was used for fitting.
df_imp = imp.transform(df=df)

<__main__.CaseImputer at 0x7fd7642ceb60>

DEBUG:__main__:   patient_id  breast_density  laterality  image_view  \
0      161.00            3.00                870.00      888.00   
1      161.00            3.00                870.00      985.00   
2      475.00            4.00              1,003.00      888.00   
3      475.00            4.00              1,003.00      985.00   
4      765.00            1.00              1,003.00      888.00   

   abnormality_id  abnormality_type  calc_type  calc_distribution  assessment  \
0            1.00          1,872.00     226.00             942.00        3.00   
1            1.00          1,872.00     226.00             942.00        3.00   
2            1.00          1,872.00     859.00             117.00        4.00   
3            1.00          1,872.00     859.00             117.00        4.00   
4            1.00          1,872.00     130.48             107.00        2.00   

   pathology  subtlety  fileset  mmg_id  cancer  
0     675.00      3.00 1,547.00     1.00    1.00  
1   

In [8]:
# Total number of missing cells remaining after imputation.
# NOTE(review): this and the loop body below are bare expressions; they are only
# rendered if the notebook is configured to display every expression result
# (e.g. InteractiveShell.ast_node_interactivity = "all") - confirm.
df_imp.isna().sum().sum()
# Per-column frequency table of the imputed values.
for col in df_imp.columns:
    df_imp[col].value_counts().to_frame()


0

Unnamed: 0_level_0,count
patient_id,Unnamed: 1_level_1
P_01437,24
P_00112,23
P_01838,17
P_00008,16
P_01099,16
...,...
P_00977,1
P_00397,1
P_00393,1
P_00466,1


Unnamed: 0_level_0,count
breast_density,Unnamed: 1_level_1
3,654
2,616
4,401
1,199
0,2


Unnamed: 0_level_0,count
laterality,Unnamed: 1_level_1
LEFT,1002
RIGHT,870


Unnamed: 0_level_0,count
image_view,Unnamed: 1_level_1
MLO,984
CC,888


Unnamed: 0_level_0,count
abnormality_id,Unnamed: 1_level_1
1,1441
2,267
3,94
4,37
5,21
6,10
7,2


Unnamed: 0_level_0,count
abnormality_type,Unnamed: 1_level_1
calcification,1872


Unnamed: 0_level_0,count
calc_type,Unnamed: 1_level_1
PLEOMORPHIC,814
AMORPHOUS,186
PUNCTATE,132
LUCENT_CENTER,120
FINE_LINEAR_BRANCHING,102
VASCULAR,96
COARSE,41
PLEOMORPHIC-FINE_LINEAR_BRANCHING,31
ROUND_AND_REGULAR-LUCENT_CENTER,31
PUNCTATE-PLEOMORPHIC,30


Unnamed: 0_level_0,count
calc_distribution,Unnamed: 1_level_1
CLUSTERED,949
SEGMENTAL,627
LINEAR,112
REGIONAL,103
DIFFUSELY_SCATTERED,40
CLUSTERED-LINEAR,29
LINEAR-SEGMENTAL,7
CLUSTERED-SEGMENTAL,5


Unnamed: 0_level_0,count
assessment,Unnamed: 1_level_1
4,931
2,553
5,199
3,113
0,76


Unnamed: 0_level_0,count
pathology,Unnamed: 1_level_1
MALIGNANT,673
BENIGN,658
BENIGN_WITHOUT_CALLBACK,541


Unnamed: 0_level_0,count
subtlety,Unnamed: 1_level_1
3,606
5,427
4,422
2,298
1,119


Unnamed: 0_level_0,count
fileset,Unnamed: 1_level_1
train,1546
test,326


Unnamed: 0_level_0,count
mmg_id,Unnamed: 1_level_1
P_00005_RIGHT_calcification_CC_1,1
P_01582_RIGHT_calcification_CC_1,1
P_01616_RIGHT_calcification_MLO_1,1
P_01616_RIGHT_calcification_CC_1,1
P_01610_RIGHT_calcification_MLO_1,1
...,...
P_00780_RIGHT_calcification_CC_1,1
P_00780_LEFT_calcification_MLO_2,1
P_00780_LEFT_calcification_MLO_1,1
P_00780_LEFT_calcification_CC_2,1


Unnamed: 0_level_0,count
cancer,Unnamed: 1_level_1
False,1199
True,673
