In [1]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt

from sklearn.pipeline import Pipeline
from sklearn.impute import KNNImputer, SimpleImputer
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.preprocessing import StandardScaler, PowerTransformer, MinMaxScaler
from sklearn.metrics import mean_squared_error, max_error

In [2]:
# Load the dataset from the CSV file and display its (rows, columns) shape.
df = pd.read_csv("biodegradable_a.csv")
df.shape

(4564, 42)

In [3]:
# Separate the target column from the predictor columns.
y = df["Biodegradable"]
X = df.drop(columns=["Biodegradable"])

In [4]:
# Encode the target: +1 where the label is 'RB', -1 for any other value.
# The encoding is derived from the raw column in `df` rather than from `y`
# itself so that re-running this cell is idempotent (mapping an already
# encoded `y` a second time would turn every label into -1).
y = df["Biodegradable"].map(lambda label: 1 if label == 'RB' else -1)

In [5]:
# Creation of the train + test set and the validation set: 355 samples are
# held out in X_Test / y_Test, and the split is seeded with random_state=22.
X_Train, X_Test, y_Train, y_Test = train_test_split(X, y, test_size=355, random_state=22)

In [6]:
# Column dtypes and non-null counts of the training features.
X_Train.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 4209 entries, 919 to 2933
Data columns (total 41 columns):
 #   Column    Non-Null Count  Dtype  
---  ------    --------------  -----  
 0   SpMax_L   4209 non-null   float64
 1   J_Dz(e)   4209 non-null   float64
 2   nHM       4209 non-null   float64
 3   F01       3739 non-null   float64
 4   F04       4209 non-null   float64
 5   NssssC    4209 non-null   float64
 6   nCb       4209 non-null   float64
 7   C         3503 non-null   float64
 8   nCp       3595 non-null   float64
 9   nO        4209 non-null   float64
 10  F03       4209 non-null   float64
 11  SdssC     4209 non-null   float64
 12  HyWi_B    3769 non-null   float64
 13  LOC       4209 non-null   float64
 14  SM6_L     4209 non-null   float64
 15  F03_CO    4171 non-null   float64
 16  Me        3796 non-null   float64
 17  Mi        4209 non-null   float64
 18  nN_N      4209 non-null   float64
 19  nArNO2    4209 non-null   float64
 20  nCRX3     4209 non-null   fl

In [7]:
# Largest number of missing attributes found in any single row of X_Train.
max_missing_per_row = X_Train.isna().sum(axis=1).max()
print(f"Maximum missing attributes on the rows: {max_missing_per_row}")

Maximum missing attributes on the rows: 6


In [8]:
# Number of missing values per feature; display only the affected features.
missing_cols = X_Train.isnull().sum()
missing_cols.loc[missing_cols.gt(0)]

F01         470
C           706
nCp         614
HyWi_B      440
F03_CO       38
Me          413
nCIR        460
SpMax_A     618
SdO         199
nCrt        227
SpMax_B    1265
Psi_i_A     414
nX          634
dtype: int64

The number of null values is significant in many columns (over 25% in the worst case). <br>
Dropping features is not an option for dealing with the missing data, because we do not yet know whether they are related to the class we want to predict.<br>

However, per sample, at most 6 out of 41 attributes are missing, which doesn't seem very significant.
This is before feature selection.

## Classification Models

- [ ] Logit
- [ ] LDA
- [ ] SVM
- [ ] Naive Bayes
- [ ] DecisionTree
- [ ] KNN
- [ ] Bagging
- [ ] Boosting 

## Testing Imputation Models

In [9]:
# Complete cases only: the rows of X_Train with no missing value in any column.
X_train_not_nan = X_Train.dropna()

In [10]:
# Shape of the complete-case subset.
X_train_not_nan.shape

(834, 41)

In [11]:
# Shape of the full training set, for comparison with the complete-case subset.
X_Train.shape

(4209, 41)

The difference in the number of rows between *X_train_not_nan* (834) and *X_Train* (4209) indicates that a huge number of instances are missing at least one feature; hence dropping rows is not a viable option.

In [12]:
# Per-feature fraction of missing entries in the training set.
priors = X_Train.isna().mean()
priors.shape

(41,)

In [13]:
def get_mask(X, priors, random_state=None):
    """Draw a random boolean mask with the same shape as ``X``.

    Every entry of column ``i`` is drawn independently and is ``True`` with
    probability ``priors[i]``.

    Parameters
    ----------
    X : object with a 2-D ``shape``
        Only its shape is used.
    priors : sequence of float
        One probability per column of ``X``.
    random_state : None, int, or RNG object with a ``choice`` method, optional
        ``None`` (default) draws from NumPy's global RNG, so the function
        behaves as before and honours ``np.random.seed``. An int seeds a
        private ``np.random.RandomState``; an existing RNG is used as is.

    Returns
    -------
    numpy.ndarray of bool with shape ``X.shape``.

    Raises
    ------
    ValueError
        If ``priors`` does not contain exactly one entry per column of ``X``.
    """
    n_rows, n_cols = X.shape
    # Without this check a short `priors` would leave some columns of the
    # np.empty buffer uninitialised (arbitrary values).
    if len(priors) != n_cols:
        raise ValueError(
            f"priors has {len(priors)} entries but X has {n_cols} columns"
        )

    if random_state is None:
        rng = np.random  # legacy global state
    elif hasattr(random_state, "choice"):
        rng = random_state
    else:
        rng = np.random.RandomState(random_state)

    masks = np.empty(shape=X.shape, dtype=np.bool_)
    for i, p in enumerate(priors):
        masks[:, i] = rng.choice((True, False), size=n_rows, p=(p, 1 - p))
    return masks

In [28]:
# Min-max scale the complete-case rows (scaler fitted on those same rows) and
# wrap the result in a DataFrame that keeps the original column names.
scaler = MinMaxScaler()
X_train_not_nan_scaled = pd.DataFrame(
    scaler.fit_transform(X_train_not_nan),
    columns=X_train_not_nan.columns,
)

Unnamed: 0,SpMax_L,J_Dz(e),nHM,F01,F04,NssssC,nCb,C,nCp,nO,...,nCrt,C_026,F02_CN,nHDon,SpMax_B,Psi_i_A,nN,SM6_B,nArCOOR,nX
0,0.400098,0.178206,0.0,0.0,0.0,0.000000,0.000,0.389059,0.083333,0.166667,...,0.000000,0.000000,0.00,0.166667,0.076636,0.187479,0.00,0.189439,0.0,0.000000
1,0.425338,0.180657,0.0,0.0,0.0,0.000000,0.000,0.464066,0.083333,0.000000,...,0.000000,0.000000,0.00,0.000000,0.114990,0.038843,0.00,0.265356,0.0,0.000000
2,0.546647,0.278569,0.0,0.0,0.0,0.000000,0.000,0.518794,0.083333,0.333333,...,0.000000,0.000000,0.00,0.000000,0.133254,0.080332,0.00,0.414856,0.0,0.000000
3,0.420663,0.178213,0.0,0.0,0.0,0.000000,0.000,0.430563,0.083333,0.000000,...,0.000000,0.000000,0.00,0.166667,0.076673,0.110396,0.00,0.211937,0.0,0.000000
4,0.631267,0.268328,0.0,0.0,0.2,0.000000,0.125,0.599156,0.000000,0.166667,...,0.000000,0.166667,0.50,0.333333,0.186890,0.369703,0.50,0.358895,0.0,0.000000
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
829,0.560625,0.261577,0.0,0.0,0.0,0.000000,0.000,0.490511,0.041667,0.166667,...,0.000000,0.000000,0.00,0.000000,0.132536,0.152538,0.00,0.357274,0.0,0.000000
830,0.859361,0.097126,0.0,0.0,0.0,0.153846,0.000,0.552743,0.083333,0.166667,...,0.333333,0.000000,0.00,0.333333,0.165194,0.123859,0.00,0.324738,0.0,0.000000
831,0.757250,0.357773,0.2,0.0,0.0,0.076923,0.000,0.362869,0.166667,0.166667,...,0.000000,0.000000,0.00,0.333333,0.561887,0.280028,0.00,0.747243,0.0,0.074074
832,0.590139,0.276117,0.2,0.0,0.0,0.000000,0.000,0.767932,0.000000,0.000000,...,0.000000,0.000000,0.25,0.000000,0.206613,0.237649,0.25,0.390567,0.0,0.074074


In [15]:
N = 500  # number of random missingness masks to simulate
# Seed NumPy's global RNG (the one get_mask draws from) so that the simulated
# masks, and the imputation errors computed from them, are reproducible
# after a kernel restart.
np.random.seed(22)
masks = [get_mask(X_train_not_nan, priors) for _ in range(N)]

In [16]:
imputers = (
    SimpleImputer(),
    SimpleImputer(strategy="median"),
    KNNImputer(),
)
labels = ["SimpleImpute_mean", "SimpleImpute_median", "KNN"]

# For every imputer: hide entries of the complete, min-max scaled data with
# each simulated mask, impute them, and record the per-feature RMSE against
# the true values. The RMSE is taken over all rows of a feature, not only the
# masked entries.
results = pd.DataFrame(index=X_train_not_nan.columns)
for label, model in zip(labels, imputers):
    rmse_per_mask = []
    for mask in masks:
        X_masked = X_train_not_nan_scaled.mask(mask)
        X_imputed = model.fit_transform(X_masked)
        # np.sqrt of the raw per-column MSE replaces `squared=False`, which
        # newer scikit-learn releases no longer accept.
        rmse_per_mask.append(
            np.sqrt(
                mean_squared_error(
                    X_train_not_nan_scaled,
                    X_imputed,
                    multioutput="raw_values",
                )
            )
        )
    # Build the frame once (float dtype) instead of growing it row by row.
    errors = pd.DataFrame(rmse_per_mask, columns=X_train_not_nan_scaled.columns)
    results[label] = errors.mean()

In [17]:
# Keep only the features for which every imputer has a strictly positive error.
results.loc[(results > 0).all(axis=1)]

Unnamed: 0,SimpleImpute_mean,SimpleImpute_median,KNN
F01,0.011554,0.010765,0.010683
C,0.071134,0.072326,0.028237
nCp,0.022022,0.022253,0.016383
HyWi_B,0.042192,0.042221,0.015188
F03_CO,0.007748,0.008191,0.003876
Me,0.031209,0.031461,0.016304
nCIR,0.024974,0.028979,0.015725
SpMax_A,0.044495,0.044565,0.017455
SdO,0.036088,0.036291,0.01603
nCrt,0.008169,0.00746,0.007055


In [18]:
# Summed error per imputer over the features with strictly positive error.
results.loc[(results > 0).all(axis=1)].sum()

SimpleImpute_mean      0.406246
SimpleImpute_median    0.411756
KNN                    0.202484
dtype: float64

The *KNNImputer* is the one that best predicts the missing values according to this test: it achieves the lowest error for every feature with missing values, and therefore the lowest summed error. <br>
Only its default parameters were used, so it can probably achieve even better results.

# Standardize Data

In [19]:
# Training features side by side with their target column.
pd.concat([X_Train, y_Train], axis="columns")

Unnamed: 0,SpMax_L,J_Dz(e),nHM,F01,F04,NssssC,nCb,C,nCp,nO,...,C_026,F02_CN,nHDon,SpMax_B,Psi_i_A,nN,SM6_B,nArCOOR,nX,Biodegradable
919,5.323000,4.488700,2.0,,0.0,1.0,0.0,30.000000,,1.0,...,0.0,2.0,2.0,7.078000,,2.0,12.275000,0.0,,-1
4214,3.791996,2.636824,0.0,0.0,0.0,0.0,0.0,27.541395,2.0,2.0,...,0.0,0.0,1.0,2.937617,2.354723,0.0,6.868814,0.0,0.0,1
4299,4.347000,3.114317,0.0,0.0,0.0,0.0,0.0,31.481969,1.0,2.0,...,0.0,0.0,0.0,,2.183866,0.0,7.932197,0.0,0.0,1
2443,5.082463,3.276880,0.0,0.0,0.0,0.0,2.0,40.290079,,4.0,...,0.0,0.0,0.0,,3.278586,0.0,8.539939,2.0,0.0,1
602,4.303000,3.141300,0.0,0.0,0.0,0.0,0.0,23.500000,,1.0,...,0.0,2.0,1.0,,2.500000,1.0,6.959000,0.0,0.0,-1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
989,4.868000,3.025200,0.0,0.0,1.0,0.0,0.0,34.800000,3.0,1.0,...,0.0,5.0,1.0,,,2.0,8.261000,0.0,,-1
2527,3.974877,2.917110,0.0,0.0,0.0,0.0,0.0,,2.0,0.0,...,0.0,0.0,0.0,3.233164,1.800985,0.0,8.136023,0.0,0.0,1
2952,4.292865,3.156162,0.0,0.0,0.0,0.0,0.0,33.934010,2.0,2.0,...,0.0,0.0,0.0,3.344344,2.044372,0.0,8.317020,0.0,,1
356,4.596000,3.416100,2.0,0.0,0.0,0.0,0.0,45.500000,0.0,0.0,...,0.0,2.0,0.0,3.992000,2.569000,1.0,8.812000,0.0,2.0,-1


In [31]:
# Standardization with StandardScaler, fitted on the complete-case rows.
scaler = StandardScaler()
X_Train_scaled = scaler.fit(X_train_not_nan).transform(X_train_not_nan)
X_train_stdScaler = pd.DataFrame(X_Train_scaled, columns=X_Train.columns)
X_train_stdScaler

Unnamed: 0,SpMax_L,J_Dz(e),nHM,F01,F04,NssssC,nCb,C,nCp,nO,...,nCrt,C_026,F02_CN,nHDon,SpMax_B,Psi_i_A,nN,SM6_B,nArCOOR,nX
0,-1.534390,-0.728631,-0.229279,-0.081062,-0.255058,-0.126738,-0.584496,-0.884651,0.538213,-0.040027,...,-0.090755,-0.369753,-0.339912,0.172923,-1.037443,-0.455423,-0.398414,-1.338069,-0.211066,-0.136801
1,-1.315355,-0.698186,-0.229279,-0.081062,-0.255058,-0.126738,-0.584496,-0.452974,0.538213,-1.375317,...,-0.090755,-0.369753,-0.339912,-0.695857,-0.544696,-1.457718,-0.398414,-0.551293,-0.211066,-0.136801
2,-0.262643,0.518393,-0.229279,-0.081062,-0.255058,-0.126738,-0.584496,-0.138002,0.538213,1.295264,...,-0.090755,-0.369753,-0.339912,-0.695857,-0.310063,-1.177943,-0.398414,0.998060,-0.211066,-0.136801
3,-1.355929,-0.728546,-0.229279,-0.081062,-0.255058,-0.126738,-0.584496,-0.645787,0.538213,-1.375317,...,-0.090755,-0.369753,-0.339912,0.172923,-1.036967,-0.975212,-0.398414,-1.104908,-0.211066,-0.136801
4,0.471683,0.391145,-0.229279,-0.081062,2.148538,-0.126738,0.790594,0.324495,-0.855791,-0.040027,...,-0.090755,1.889396,3.365794,1.041703,0.379012,0.773361,2.766130,0.418098,-0.211066,-0.136801
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
829,-0.141338,0.307264,-0.229279,-0.081062,-0.255058,-0.126738,-0.584496,-0.300778,-0.158789,-0.040027,...,-0.090755,-0.369753,-0.339912,-0.695857,-0.319282,-0.691042,-0.398414,0.401303,-0.211066,-0.136801
830,2.451084,-1.736078,-0.229279,-0.081062,-0.255058,2.451293,-0.584496,0.057377,0.538213,-0.040027,...,5.515881,-0.369753,-0.339912,1.041703,0.100277,-0.884433,-0.398414,0.064116,-0.211066,-0.136801
831,1.564969,1.502528,2.337418,-0.081062,-0.255058,1.162278,-0.584496,-1.035377,1.932216,-0.040027,...,-0.090755,-0.369753,-0.339912,1.041703,5.196698,0.168661,-0.398414,4.442773,-0.211066,1.145128
832,0.114776,0.487937,2.337418,-0.081062,-0.255058,-0.126738,-0.584496,1.295832,-0.855791,-1.375317,...,-0.090755,-0.369753,1.512941,-0.695857,0.632408,-0.117111,1.183858,0.746337,-0.211066,1.145128


In [32]:
imputers = (
    SimpleImputer(),
    SimpleImputer(strategy="median"),
    KNNImputer(),
)
labels = ["SimpleImpute_mean", "SimpleImpute_median", "KNN"]

# For every imputer: hide entries of the complete, standardized data with each
# simulated mask, impute them, and record the per-feature RMSE against the
# true values. The RMSE is taken over all rows of a feature, not only the
# masked entries.
results = pd.DataFrame(index=X_train_not_nan.columns)
for label, model in zip(labels, imputers):
    rmse_per_mask = []
    for mask in masks:
        X_masked = X_train_stdScaler.mask(mask)
        X_imputed = model.fit_transform(X_masked)
        # np.sqrt of the raw per-column MSE replaces `squared=False`, which
        # newer scikit-learn releases no longer accept.
        rmse_per_mask.append(
            np.sqrt(
                mean_squared_error(
                    X_train_stdScaler,
                    X_imputed,
                    multioutput="raw_values",
                )
            )
        )
    # Build the frame once (float dtype) instead of growing it row by row.
    errors = pd.DataFrame(rmse_per_mask, columns=X_train_stdScaler.columns)
    results[label] = errors.mean()

In [33]:
# Keep only the features for which every imputer has a strictly positive error.
results.loc[(results > 0).all(axis=1)]

Unnamed: 0,SimpleImpute_mean,SimpleImpute_median,KNN
F01,0.223178,0.20794,0.198871
C,0.40939,0.416246,0.148901
nCp,0.368393,0.372249,0.261515
HyWi_B,0.321778,0.321996,0.11486
F03_CO,0.085571,0.090455,0.04371
Me,0.302268,0.304707,0.161725
nCIR,0.309862,0.359561,0.193257
SpMax_A,0.382195,0.382793,0.149928
SdO,0.2138,0.215004,0.093223
nCrt,0.137406,0.125475,0.112274


In [34]:
# Summed error per imputer over the features with strictly positive error.
results.loc[(results > 0).all(axis=1)].sum()

SimpleImpute_mean      3.925281
SimpleImpute_median    3.973920
KNN                    2.123615
dtype: float64