In [1]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt

from sklearn.pipeline import Pipeline
from sklearn.impute import KNNImputer, SimpleImputer
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.preprocessing import StandardScaler, PowerTransformer, MinMaxScaler
from sklearn.metrics import mean_squared_error, max_error

In [2]:
# Location of the raw dataset, relative to the notebook's working directory.
DATA_PATH = "biodegradable_a.csv"

# Read the whole CSV into memory and show its (rows, columns) as the cell output.
df = pd.read_csv(DATA_PATH)
df.shape

(4564, 42)

In [3]:
# Name of the label column; every other column is treated as a feature.
TARGET = "Biodegradable"

y = df[TARGET]
X = df.drop(columns=[TARGET])

In [4]:
# Hold out 355 rows as the test set; the fixed random_state keeps the split repeatable.
split_parts = train_test_split(
    X,
    y,
    test_size=355,
    random_state=22,
)
X_Train, X_Test, y_Train, y_Test = split_parts

In [5]:
# Per-column summary of the training features: non-null counts and dtypes.
X_Train.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 4209 entries, 919 to 2933
Data columns (total 41 columns):
 #   Column    Non-Null Count  Dtype  
---  ------    --------------  -----  
 0   SpMax_L   4209 non-null   float64
 1   J_Dz(e)   4209 non-null   float64
 2   nHM       4209 non-null   float64
 3   F01       3739 non-null   float64
 4   F04       4209 non-null   float64
 5   NssssC    4209 non-null   float64
 6   nCb       4209 non-null   float64
 7   C         3503 non-null   float64
 8   nCp       3595 non-null   float64
 9   nO        4209 non-null   float64
 10  F03       4209 non-null   float64
 11  SdssC     4209 non-null   float64
 12  HyWi_B    3769 non-null   float64
 13  LOC       4209 non-null   float64
 14  SM6_L     4209 non-null   float64
 15  F03_CO    4171 non-null   float64
 16  Me        3796 non-null   float64
 17  Mi        4209 non-null   float64
 18  nN_N      4209 non-null   float64
 19  nArNO2    4209 non-null   float64
 20  nCRX3     4209 non-null   fl

In [6]:
# Count the NaNs in each row, then keep the worst case across all training rows.
max_missing_per_row = X_Train.isna().sum(axis=1).max()
print(f"Maximum missing attributes on the rows: {max_missing_per_row}")

Maximum missing attributes on the rows: 6


In [7]:
# Number of missing values in each training column.
missing_cols = X_Train.isna().sum()
# Display only the columns that actually have missing values.
missing_cols.loc[missing_cols.gt(0)]

F01         470
C           706
nCp         614
HyWi_B      440
F03_CO       38
Me          413
nCIR        460
SpMax_A     618
SdO         199
nCrt        227
SpMax_B    1265
Psi_i_A     414
nX          634
dtype: int64

The number of null values is significant in many columns (up to roughly 30%, for `SpMax_B`). <br>
Dropping features is not an option for dealing with the missing data, because we do not yet know whether they are related to the class we want to predict.<br>

However, per sample, at most 6 out of 41 attributes are missing, which doesn't seem very significant.
This is before any feature selection.

## Classification Models

- [ ] Logit
- [ ] LDA
- [ ] SVM
- [ ] Naive Bayes
- [ ] DecisionTree
- [ ] KNN
- [ ] Bagging
- [ ] Boosting 

## Testing Imputation Models

In [8]:
# Complete-case subset: keep only the training rows with no missing value at all.
X_train_not_nan = X_Train.dropna(axis=0, how="any")

In [9]:
# Rows and columns left after dropping every row that has a missing value.
X_train_not_nan.shape

(834, 41)

In [10]:
# Shape of the full training feature matrix, for comparison with the complete-case subset.
X_Train.shape

(4209, 41)

The difference in the number of rows between *X_train_not_nan* (834) and *X_Train* (4209) indicates that a huge number of instances are missing at least one feature; hence, dropping rows is not a viable option.

In [11]:
# Fraction of missing values per training column; used later as the per-column
# probability of masking an entry.
missing_counts = X_Train.isna().sum()
priors = missing_counts / len(X_Train)

In [12]:
# One missing-value fraction per feature column.
priors.shape

(41,)

In [13]:
def get_mask(X, priors, rng=None):
    """Draw a random boolean mask with the same shape as ``X``.

    Every entry of column ``i`` is set to ``True`` independently with
    probability ``priors[i]``, so the mask can be used to blank out values at
    a chosen per-column rate.

    Parameters
    ----------
    X : object with a 2-D ``shape`` attribute (e.g. DataFrame or ndarray)
        Only its shape is used.
    priors : sequence of float
        Probability of ``True`` for each column of ``X``; must contain exactly
        one value in ``[0, 1]`` per column.
    rng : None, int or numpy.random.Generator, optional
        Source of randomness. ``None`` (the default) draws from NumPy's global
        random state, so ``np.random.seed`` still controls the result. An int
        or ``Generator`` makes the call reproducible on its own.

    Returns
    -------
    numpy.ndarray of bool with shape ``X.shape``.

    Raises
    ------
    ValueError
        If ``priors`` does not have one entry per column of ``X`` or contains
        a value outside ``[0, 1]``.
    """
    n_rows, n_cols = X.shape
    probs = np.asarray(priors, dtype=float)

    # A shorter `priors` would leave some columns without a defined
    # probability, so reject any length mismatch up front.
    if probs.shape != (n_cols,):
        raise ValueError(
            f"priors must have one entry per column: expected {n_cols}, got shape {probs.shape}"
        )
    # Written as "not all(...)" so that NaN probabilities are rejected as well.
    if not np.all((probs >= 0) & (probs <= 1)):
        raise ValueError("priors must all lie in the interval [0, 1]")

    if rng is None:
        draws = np.random.random_sample((n_rows, n_cols))
    else:
        draws = np.random.default_rng(rng).random((n_rows, n_cols))

    # Uniform draws lie in [0, 1): p == 0 never masks and p == 1 always masks.
    return draws < probs

In [20]:
# Fit a min-max scaler on the complete-case rows and rescale those same rows,
# so that per-feature imputation errors are measured on a comparable scale.
scaler = MinMaxScaler()
scaler.fit(X_train_not_nan)
scaled_values = scaler.transform(X_train_not_nan)

# Wrap the scaled array back into a frame that keeps the original column names.
X_train_not_nan_scaled = pd.DataFrame(
    scaled_values,
    columns=X_train_not_nan.columns,
)

In [21]:
# Number of random masking rounds. It is defined in this cell because the list
# below needs it; otherwise the cell raises NameError on a fresh kernel.
N = 500

# Seed NumPy's global random state so the generated masks are reproducible.
np.random.seed(22)

# One boolean mask per round, each shaped like the complete-case training rows.
masks = [get_mask(X_train_not_nan, priors) for _ in range(N)]

In [32]:
# Number of masking rounds scored per imputer (the first N entries of `masks`).
N = 500

# Candidate imputers and the column label each one gets in `results`.
imputers = (
    SimpleImputer(),
    SimpleImputer(strategy="median"),
    KNNImputer(),
)
labels = ["SimpleImpute_mean", "SimpleImpute_median", "KNN"]

# One row per feature, one column per imputer: mean per-feature RMSE over all rounds.
results = pd.DataFrame(index=X_train_not_nan.columns)
for label, model in zip(labels, imputers):
    rmse_per_round = []
    for mask in masks[:N]:
        # Blank out the masked entries of the scaled complete-case data.
        X_masked = X_train_not_nan_scaled.mask(mask)
        X_imputed = model.fit_transform(X_masked)

        # Per-feature RMSE against the known true values. np.sqrt of the raw
        # MSE replaces `squared=False`, which newer scikit-learn releases no
        # longer accept.
        # NOTE(review): the error is averaged over every row of a column, not
        # only the masked entries - confirm this is the intended metric.
        rmse_per_round.append(
            np.sqrt(
                mean_squared_error(
                    X_train_not_nan_scaled,
                    X_imputed,
                    multioutput="raw_values",
                )
            )
        )

    # Build the per-round error table once instead of growing it row by row.
    errors = pd.DataFrame(rmse_per_round, columns=X_train_not_nan_scaled.columns)
    results[label] = errors.mean()

In [39]:
# Keep only the features for which every imputer has a strictly positive mean error.
results.where(results > 0).dropna()

Unnamed: 0,SimpleImpute_mean,SimpleImpute_median,KNN
F01,0.012316,0.011602,0.011468
C,0.07101,0.072017,0.02797
nCp,0.02241,0.022667,0.016881
HyWi_B,0.042372,0.042425,0.015238
F03_CO,0.007959,0.008428,0.004201
Me,0.031815,0.032132,0.017047
nCIR,0.025187,0.029297,0.015826
SpMax_A,0.044149,0.044203,0.01711
SdO,0.036718,0.036876,0.016682
nCrt,0.007991,0.007299,0.006863


In [38]:
# Total error per imputer, summed over the features with strictly positive error.
results.where(results > 0).dropna().sum()

SimpleImpute_mean      0.408182
SimpleImpute_median    0.413740
KNN                    0.204209
dtype: float64

According to this test, the *KNNImputer* predicts the missing values best: it achieves the lowest error for every feature listed above, and therefore the lowest summed error. <br>
Only its default parameters were used, so tuning it could probably yield even better results.