In [28]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt

from sklearn.pipeline import Pipeline
from sklearn.impute import KNNImputer, SimpleImputer
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.preprocessing import StandardScaler, PowerTransformer, MinMaxScaler
from sklearn.metrics import mean_squared_error, max_error

In [29]:
# Load the raw dataset from a CSV file located in the working directory.
df = pd.read_csv("biodegradable_a.csv")
# Show (rows, columns) as the cell output.
df.shape

(4564, 42)

In [30]:
# Separate the target column from the predictor columns.
y = df["Biodegradable"]
X = df.drop(columns=["Biodegradable"])

In [31]:
# Encode the target numerically: 1 if the label is 'RB', -1 otherwise.
# The encoding is derived from df (not from y itself) so that re-running this
# cell is idempotent; mapping an already-encoded y would turn every label into -1.
y = df["Biodegradable"].map(lambda label: 1 if label == 'RB' else -1)

In [32]:
# Create the train + test set and the validation set.
# An integer test_size holds out that many rows (355 here).
X_Train, X_Test, y_Train, y_Test = train_test_split(
    X,
    y,
    test_size=355,
    random_state=22,
)

In [33]:
# Summarize dtype and non-null count per column to spot columns with missing values.
X_Train.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 4209 entries, 919 to 2933
Data columns (total 41 columns):
 #   Column    Non-Null Count  Dtype  
---  ------    --------------  -----  
 0   SpMax_L   4209 non-null   float64
 1   J_Dz(e)   4209 non-null   float64
 2   nHM       4209 non-null   float64
 3   F01       3739 non-null   float64
 4   F04       4209 non-null   float64
 5   NssssC    4209 non-null   float64
 6   nCb       4209 non-null   float64
 7   C         3503 non-null   float64
 8   nCp       3595 non-null   float64
 9   nO        4209 non-null   float64
 10  F03       4209 non-null   float64
 11  SdssC     4209 non-null   float64
 12  HyWi_B    3769 non-null   float64
 13  LOC       4209 non-null   float64
 14  SM6_L     4209 non-null   float64
 15  F03_CO    4171 non-null   float64
 16  Me        3796 non-null   float64
 17  Mi        4209 non-null   float64
 18  nN_N      4209 non-null   float64
 19  nArNO2    4209 non-null   float64
 20  nCRX3     4209 non-null   fl

In [34]:
# Largest number of missing attributes found in any single training row.
max_missing_per_row = X_Train.isna().sum(axis=1).max()
print(f"Maximum missing attributes on the rows: {max_missing_per_row}")

Maximum missing attributes on the rows: 6


In [35]:
# Count missing values per column and display only the columns that have any.
missing_cols = X_Train.isna().sum()
missing_cols.loc[missing_cols.gt(0)]

F01         470
C           706
nCp         614
HyWi_B      440
F03_CO       38
Me          413
nCIR        460
SpMax_A     618
SdO         199
nCrt        227
SpMax_B    1265
Psi_i_A     414
nX          634
dtype: int64

The number of null values is significant in many columns (more than 25% in the worst case). <br>
Dropping features is not an option for dealing with missing data, because we do not yet know whether they are related to the class we want to predict.<br>

However, per sample, at most 6 missing attributes out of 41 does not seem very significant.
This is before feature selection.

## Classification Models

- [ ] Logit
- [ ] LDA
- [ ] SVM
- [ ] Naive Bayes
- [ ] DecisionTree
- [ ] KNN
- [ ] Bagging
- [ ] Boosting 

## Testing Imputation Models

In [36]:
# Keep only the fully observed rows; later cells mask them artificially and
# compare the imputed values against these known values.
X_train_not_nan = X_Train.dropna()

In [37]:
# Number of rows (and columns) left after dropping every row with a missing value.
X_train_not_nan.shape

(834, 41)

In [38]:
# Size of the full training set, for comparison with the complete-rows subset.
X_Train.shape

(4209, 41)

The difference in the number of rows between *X_train_not_nan* and *X_Train* shows that a huge number of instances are missing at least one feature; hence, dropping rows is not a viable option.

In [39]:
# Per-column fraction of missing values in the training set; used later as the
# probability of masking an entry of that column.
n_rows = X_Train.shape[0]
priors = X_Train.isna().sum().div(n_rows)
priors.shape

(41,)

In [40]:
def get_mask(X, priors, rng=None):
    """Draw a random boolean "missingness" mask with the same shape as ``X``.

    Every entry of column ``i`` is set to ``True`` independently with
    probability ``priors[i]``.

    Parameters
    ----------
    X : object with a 2-D ``shape`` (e.g. DataFrame or ndarray)
        Only its shape is used.
    priors : sequence of float
        One probability in [0, 1] per column of ``X``.
    rng : optional
        Random source exposing ``random(size)`` (e.g. ``numpy.random.Generator``).
        Defaults to the global ``np.random`` module, so ``np.random.seed`` still
        controls the draw when no generator is passed.

    Returns
    -------
    numpy.ndarray of bool, shape ``X.shape``

    Raises
    ------
    ValueError
        If the number of priors differs from the number of columns, or if a
        prior is outside [0, 1] (NaN included).
    """
    probs = np.asarray(priors, dtype=float)
    n_rows, n_cols = X.shape

    # A shorter `priors` used to leave the remaining columns of an np.empty
    # buffer uninitialised; fail loudly instead of returning garbage.
    if probs.shape != (n_cols,):
        raise ValueError(
            f"expected {n_cols} priors (one per column), got shape {probs.shape}"
        )
    # Written as a negated "all in range" so that NaN is rejected as well.
    if not np.all((probs >= 0) & (probs <= 1)):
        raise ValueError("priors must be probabilities in [0, 1]")

    source = np.random if rng is None else rng
    # Uniform draws lie in [0, 1): p == 0 never masks and p == 1 always masks.
    return source.random((n_rows, n_cols)) < probs

In [41]:
# Fit a MinMaxScaler on the complete rows and rescale them, so that the
# per-feature imputation errors computed later are on a comparable scale.
# No index is passed, so the resulting frame gets a fresh default index.
scaler = MinMaxScaler()
X_train_not_nan_scaled = pd.DataFrame(
    data=scaler.fit_transform(X_train_not_nan),
    columns=X_train_not_nan.columns,
)

In [42]:
# Number of random masking trials used to compare the imputers.
N = 500
# Seed NumPy's global RNG (same value as the train/test split's random_state)
# so the masks, and therefore the reported imputation errors, are reproducible
# under Restart & Run All.
np.random.seed(22)
masks = [get_mask(X_train_not_nan, priors) for _ in range(N)]

In [43]:
# Candidate imputers and the labels used as column names in `results`.
imputers = (
        SimpleImputer(),
        SimpleImputer(strategy="median"),
        KNNImputer()
)
labels = ["SimpleImpute_mean", "SimpleImpute_median", "KNN"]

# One row per feature, one column per imputer: mean RMSE over the N masking trials.
results = pd.DataFrame(index=X_train_not_nan.columns)
for label, model in zip(labels, imputers):
    trial_errors = []
    for trial in range(N):
        # Hide the entries selected by this trial's mask, then impute them back.
        X_masked = X_train_not_nan_scaled.mask(masks[trial])
        X_imputed = model.fit_transform(X_masked)

        # Per-feature RMSE against the known (scaled) values. The square root is
        # taken explicitly because mean_squared_error's `squared=False` argument
        # is deprecated and removed in recent scikit-learn releases.
        # NOTE: the error is averaged over all rows, including entries that were
        # not masked in this trial.
        trial_errors.append(
            np.sqrt(
                mean_squared_error(
                    X_train_not_nan_scaled,
                    X_imputed,
                    multioutput="raw_values",
                )
            )
        )

    # Build the per-trial error table once instead of growing it row by row.
    errors = pd.DataFrame(trial_errors, columns=X_train_not_nan_scaled.columns)
    results[label] = errors.mean()

In [44]:
# Blank out non-positive errors, then drop every feature that has a blank
# for at least one imputer.
results.where(results > 0).dropna()

Unnamed: 0,SimpleImpute_mean,SimpleImpute_median,KNN
F01,0.012057,0.011302,0.011235
C,0.070986,0.072082,0.027642
nCp,0.021158,0.021369,0.015434
HyWi_B,0.042118,0.042154,0.01528
F03_CO,0.007594,0.008032,0.003928
Me,0.031325,0.031617,0.016507
nCIR,0.025592,0.029704,0.01624
SpMax_A,0.044536,0.04459,0.017372
SdO,0.036143,0.036309,0.016214
nCrt,0.007631,0.006909,0.006787


In [45]:
# Total error per imputer, summed over the features kept by the same filter
# (features with a non-positive error for any imputer are dropped first).
results.where(results > 0).dropna().sum()

SimpleImpute_mean      0.405573
SimpleImpute_median    0.411116
KNN                    0.201358
dtype: float64

The *KNNImputer* is the one that best predicts the missing values according to this test: it achieves the lowest error for every feature with missing values and, consequently, the lowest summed error. <br>
Only its default parameters were used, so it can probably achieve even better results with tuning.

# Standardize Data

In [46]:
# Preview the training features side by side with their encoded labels.
pd.concat([X_Train, y_Train], axis="columns")

Unnamed: 0,SpMax_L,J_Dz(e),nHM,F01,F04,NssssC,nCb,C,nCp,nO,...,C_026,F02_CN,nHDon,SpMax_B,Psi_i_A,nN,SM6_B,nArCOOR,nX,Biodegradable
919,5.323000,4.488700,2.0,,0.0,1.0,0.0,30.000000,,1.0,...,0.0,2.0,2.0,7.078000,,2.0,12.275000,0.0,,-1
4214,3.791996,2.636824,0.0,0.0,0.0,0.0,0.0,27.541395,2.0,2.0,...,0.0,0.0,1.0,2.937617,2.354723,0.0,6.868814,0.0,0.0,1
4299,4.347000,3.114317,0.0,0.0,0.0,0.0,0.0,31.481969,1.0,2.0,...,0.0,0.0,0.0,,2.183866,0.0,7.932197,0.0,0.0,1
2443,5.082463,3.276880,0.0,0.0,0.0,0.0,2.0,40.290079,,4.0,...,0.0,0.0,0.0,,3.278586,0.0,8.539939,2.0,0.0,1
602,4.303000,3.141300,0.0,0.0,0.0,0.0,0.0,23.500000,,1.0,...,0.0,2.0,1.0,,2.500000,1.0,6.959000,0.0,0.0,-1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
989,4.868000,3.025200,0.0,0.0,1.0,0.0,0.0,34.800000,3.0,1.0,...,0.0,5.0,1.0,,,2.0,8.261000,0.0,,-1
2527,3.974877,2.917110,0.0,0.0,0.0,0.0,0.0,,2.0,0.0,...,0.0,0.0,0.0,3.233164,1.800985,0.0,8.136023,0.0,0.0,1
2952,4.292865,3.156162,0.0,0.0,0.0,0.0,0.0,33.934010,2.0,2.0,...,0.0,0.0,0.0,3.344344,2.044372,0.0,8.317020,0.0,,1
356,4.596000,3.416100,2.0,0.0,0.0,0.0,0.0,45.500000,0.0,0.0,...,0.0,2.0,0.0,3.992000,2.569000,1.0,8.812000,0.0,2.0,-1


In [47]:
# Normalization with StandardScaler, fitted on the training set.
scaler = StandardScaler()
X_Train_scaled = scaler.fit_transform(X_Train)
# Wrap the scaled array in a DataFrame (original column names) for display.
pd.DataFrame(X_Train_scaled, columns=X_Train.columns)

Unnamed: 0,SpMax_L,J_Dz(e),nHM,F01,F04,NssssC,nCb,C,nCp,nO,...,nCrt,C_026,F02_CN,nHDon,SpMax_B,Psi_i_A,nN,SM6_B,nArCOOR,nX
0,1.598096,2.235510,2.364419,,-0.216492,1.733380,-0.631804,-0.620715,,-0.694275,...,-0.09824,-0.373039,1.194667,1.063459,5.956433,,2.507713,4.659511,-0.221905,
1,-1.586753,-0.755025,-0.232955,-0.081936,-0.216492,-0.129694,-0.631804,-0.918621,0.604436,-0.031794,...,-0.09824,-0.373039,-0.318985,0.194820,-1.125318,-0.456089,-0.391206,-1.420591,-0.221905,-0.148289
2,-0.432214,0.016064,-0.232955,-0.081936,-0.216492,-0.129694,-0.631804,-0.441147,-0.166386,-0.031794,...,-0.09824,-0.373039,-0.318985,-0.673820,,-0.732959,-0.391206,-0.224651,-0.221905,-0.148289
3,1.097722,0.278582,-0.232955,-0.081936,-0.216492,-0.129694,0.792928,0.626120,,1.293167,...,-0.09824,-0.373039,-0.318985,-0.673820,,1.041019,-0.391206,0.458851,4.567835,-0.148289
4,-0.523745,0.059638,-0.232955,-0.081936,-0.216492,-0.129694,-0.631804,-1.408311,,-0.694275,...,-0.09824,-0.373039,1.194667,0.194820,,-0.220669,1.058253,-1.319163,-0.221905,-0.148289
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
4204,0.651589,-0.127848,-0.232955,-0.081936,0.580722,-0.129694,-0.631804,-0.039106,1.375257,-0.694275,...,-0.09824,-0.373039,3.465145,0.194820,,,2.507713,0.145140,-0.221905,
4205,-1.206318,-0.302400,-0.232955,-0.081936,-0.216492,-0.129694,-0.631804,,0.604436,-1.356755,...,-0.09824,-0.373039,-0.318985,-0.673820,-0.619811,-1.353412,-0.391206,0.004583,-0.221905,-0.148289
4206,-0.544827,0.083638,-0.232955,-0.081936,-0.216492,-0.129694,-0.631804,-0.144036,0.604436,-0.031794,...,-0.09824,-0.373039,-0.318985,-0.673820,-0.429647,-0.959008,-0.391206,0.208143,-0.221905,
4207,0.085765,0.503403,2.364419,-0.081936,-0.216492,-0.129694,-0.631804,1.257399,-0.937208,-1.356755,...,-0.09824,-0.373039,1.194667,-0.673820,0.678109,-0.108855,1.058253,0.764825,-0.221905,1.575726
