In [1]:
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns

from sklearn.ensemble import RandomForestClassifier
from sklearn.impute import KNNImputer, SimpleImputer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import mean_squared_error, max_error, f1_score, classification_report
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler, PowerTransformer, MinMaxScaler


In [2]:
# Load the dataset and display its (rows, columns) shape.
df = pd.read_csv("biodegradable_a.csv")
df.shape

(4564, 42)

In [3]:
# Separate the feature matrix from the target column.
X = df.drop(columns=["Biodegradable"])
y = df["Biodegradable"]

In [4]:
# Encode the target numerically: 'RB' -> 1, any other label -> -1.
# The mapping reads the raw column from `df` rather than `y` itself, so the
# cell is idempotent: mapping `y` onto itself would turn every label into -1
# the second time the cell is run.
y = df["Biodegradable"].map(lambda label: 1 if label == "RB" else -1)

In [5]:
# Hold out 25% of the samples as the test set; the fixed seed keeps the split
# reproducible.
X_Train, X_Test, y_Train, y_Test = train_test_split(
    X,
    y,
    test_size=0.25,
    random_state=22,
)

In [22]:
# Per-column non-null counts and dtypes of the training features.
X_Train.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 3423 entries, 4057 to 2933
Data columns (total 41 columns):
 #   Column    Non-Null Count  Dtype  
---  ------    --------------  -----  
 0   SpMax_L   3423 non-null   float64
 1   J_Dz(e)   3423 non-null   float64
 2   nHM       3423 non-null   float64
 3   F01       3034 non-null   float64
 4   F04       3423 non-null   float64
 5   NssssC    3423 non-null   float64
 6   nCb       3423 non-null   float64
 7   C         2850 non-null   float64
 8   nCp       2926 non-null   float64
 9   nO        3423 non-null   float64
 10  F03       3423 non-null   float64
 11  SdssC     3423 non-null   float64
 12  HyWi_B    3076 non-null   float64
 13  LOC       3423 non-null   float64
 14  SM6_L     3423 non-null   float64
 15  F03_CO    3396 non-null   float64
 16  Me        3091 non-null   float64
 17  Mi        3423 non-null   float64
 18  nN_N      3423 non-null   float64
 19  nArNO2    3423 non-null   float64
 20  nCRX3     3423 non-null   f

In [23]:
# Largest number of missing attributes found in any single training row.
max_missing_per_row = X_Train.isna().sum(axis=1).max()
print(f"Maximum missing attributes on the rows: {max_missing_per_row}")

Maximum missing attributes on the rows: 6


In [24]:
# Count missing values per column and show only the columns that have any.
missing_cols = X_Train.isna().sum()
missing_cols.loc[missing_cols.gt(0)]

F01         389
C           573
nCp         497
HyWi_B      347
F03_CO       27
Me          332
nCIR        373
SpMax_A     494
SdO         166
nCrt        177
SpMax_B    1003
Psi_i_A     317
nX          513
dtype: int64

The number of null values is significant in many columns (up to about 29% of the rows, for `SpMax_B`). <br>
Dropping features is not an option for dealing with the missing data, because we do not yet know whether they are related to the class we want to predict.<br>

However, per sample, at most 6 out of 41 attributes are missing, which doesn't seem very significant.
This is before feature selection.

## Classification Models

- [ ] Logit
- [ ] LDA
- [ ] SVM
- [ ] Naive Bayes
- [ ] DecisionTree
- [ ] KNN
- [ ] Bagging
- [ ] Boosting 

# Testing Imputation Models

## Test with MinMaxScaler

In [25]:
# Keep only the training rows that have no missing value in any column.
X_train_not_nan = X_Train.dropna()

In [26]:
# Shape of the fully observed subset.
X_train_not_nan.shape

(698, 41)

In [27]:
# Shape of the full training set, for comparison with the complete rows.
X_Train.shape

(3423, 41)

The difference in the number of rows between *X_train_not_nan* and *X_Train* shows that a huge number of instances are missing at least one feature (only 698 of the 3423 rows are complete), hence dropping rows is not a viable option.

In [28]:
# Fraction of missing values in each training column; used further down as the
# per-column probability of hiding a value when building synthetic masks.
n_missing_per_column = X_Train.isna().sum()
priors = n_missing_per_column / len(X_Train)
priors.shape

(41,)

In [29]:
def get_mask(X, priors, random_state=None):
    """Draw a random boolean mask with the same shape as ``X``.

    Each column ``i`` is sampled independently; every entry of that column is
    ``True`` with probability ``priors[i]`` and ``False`` otherwise.

    Parameters
    ----------
    X : object with a 2-D ``.shape``
        Only its shape is used; the values are never read.
    priors : sequence of float
        One probability per column of ``X``.
    random_state : int, numpy.random.Generator or None, default None
        Seed (or generator) used for sampling. With ``None`` the global
        ``np.random`` state is used, as in the original implementation, so
        ``np.random.seed`` still controls the result.

    Returns
    -------
    numpy.ndarray
        Boolean array of shape ``X.shape``.

    Raises
    ------
    ValueError
        If the number of priors differs from the number of columns.
    """
    n_rows, n_cols = X.shape
    # np.empty leaves memory uninitialised, so every column must be filled:
    # refuse a priors vector that does not cover exactly all columns.
    if len(priors) != n_cols:
        raise ValueError(
            f"expected {n_cols} priors (one per column), got {len(priors)}"
        )

    rng = np.random if random_state is None else np.random.default_rng(random_state)

    masks = np.empty(shape=(n_rows, n_cols), dtype=np.bool_)
    for i, p in enumerate(priors):
        masks[:, i] = rng.choice((True, False), size=n_rows, p=(p, 1 - p))
    return masks

In [30]:
# Fit a min-max scaler on the complete rows and keep the scaled values as a
# DataFrame that carries the original column names.
scaler = MinMaxScaler()
scaler.fit(X_train_not_nan)
X_train_not_nan_scaled = pd.DataFrame(
    scaler.transform(X_train_not_nan),
    columns=X_train_not_nan.columns,
)

In [31]:
# Number of independent synthetic masks the imputation errors are averaged over.
N = 50
# get_mask draws from NumPy's global random state by default; seed it so the
# masks (and every error table derived from them) are reproducible on re-run.
np.random.seed(22)
masks = [get_mask(X_train_not_nan, priors) for _ in range(N)]

In [32]:
# Benchmark the imputers on the min-max scaled complete rows: hide values with
# each synthetic mask, impute them back, and score against the known values.
imputers = (
    SimpleImputer(),
    SimpleImputer(strategy="median"),
    KNNImputer(),
)
labels = ["SimpleImpute_mean", "SimpleImpute_median", "KNN"]

results = pd.DataFrame(index=X_train_not_nan.columns)
for label, imputer in zip(labels, imputers):
    rmse_per_run = []
    for mask in masks:
        X_masked = X_train_not_nan_scaled.mask(mask)
        X_imputed = imputer.fit_transform(X_masked)

        # Per-column RMSE over all rows. np.sqrt(MSE) replaces the
        # `squared=False` argument, which was deprecated in scikit-learn 1.4
        # and removed in 1.6.
        rmse_per_run.append(
            np.sqrt(
                mean_squared_error(
                    X_train_not_nan_scaled,
                    X_imputed,
                    multioutput="raw_values",
                )
            )
        )

    # Build the per-run table once (instead of growing it row by row) and
    # average the error of every column across the runs.
    errors = pd.DataFrame(rmse_per_run, columns=X_train_not_nan_scaled.columns)
    results[label] = errors.mean()

In [33]:
# Keep only the columns (rows of this table) whose error is positive for every imputer.
results.where(results > 0).dropna()

Unnamed: 0,SimpleImpute_mean,SimpleImpute_median,KNN
F01,0.008584,0.007659,0.008324
C,0.070528,0.071237,0.028571
nCp,0.04463,0.045187,0.032091
HyWi_B,0.041045,0.041094,0.015498
F03_CO,0.015056,0.015548,0.007097
Me,0.032115,0.032364,0.016534
nCIR,0.023859,0.027678,0.015375
SpMax_A,0.044225,0.044246,0.01848
SdO,0.040453,0.040772,0.0166
nCrt,0.008774,0.007801,0.007475


In [34]:
# Total error per imputer, summed over the columns with a positive error.
results.where(results > 0).dropna().sum()

SimpleImpute_mean      0.436133
SimpleImpute_median    0.441253
KNN                    0.223760
dtype: float64

The *KNNImputer* is the one that best predicts the missing values according to this test: it has the lowest error for almost every feature with missing values (the exception is `F01`, where the median imputer is marginally better), and therefore the lowest summed error. <br>
Only its default parameters were used, so it can probably achieve even better results.

## Test with StandardScaler

In [35]:
# Show the training features side by side with their target labels.
pd.concat([X_Train, y_Train], axis="columns")

Unnamed: 0,SpMax_L,J_Dz(e),nHM,F01,F04,NssssC,nCb,C,nCp,nO,...,C_026,F02_CN,nHDon,SpMax_B,Psi_i_A,nN,SM6_B,nArCOOR,nX,Biodegradable
4057,3.776854,2.408741,0.0,0.0,0.0,0.0,0.0,30.000000,2.0,0.0,...,0.0,0.0,0.0,3.177099,2.479789,0.0,6.967228,0.0,0.0,1
4322,4.207577,3.405557,0.0,0.0,0.0,0.0,0.0,25.000000,1.0,2.0,...,0.0,0.0,1.0,3.552206,4.118984,0.0,7.700636,0.0,,1
194,4.650000,4.031300,0.0,0.0,1.0,0.0,0.0,31.300000,3.0,2.0,...,0.0,3.0,0.0,3.527000,2.372000,1.0,8.131000,0.0,0.0,1
2202,4.500517,3.039395,0.0,0.0,0.0,0.0,2.0,,0.0,0.0,...,0.0,0.0,1.0,,2.511390,1.0,8.096866,0.0,0.0,1
4351,4.344574,3.645214,0.0,0.0,0.0,0.0,0.0,31.457361,2.0,3.0,...,0.0,0.0,1.0,3.423525,3.051219,0.0,7.743159,0.0,,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
989,4.868000,3.025200,0.0,0.0,1.0,0.0,0.0,34.800000,3.0,1.0,...,0.0,5.0,1.0,,,2.0,8.261000,0.0,,-1
2527,3.974877,2.917110,0.0,0.0,0.0,0.0,0.0,,2.0,0.0,...,0.0,0.0,0.0,3.233164,1.800985,0.0,8.136023,0.0,0.0,1
2952,4.292865,3.156162,0.0,0.0,0.0,0.0,0.0,33.934010,2.0,2.0,...,0.0,0.0,0.0,3.344344,2.044372,0.0,8.317020,0.0,,1
356,4.596000,3.416100,2.0,0.0,0.0,0.0,0.0,45.500000,0.0,0.0,...,0.0,2.0,0.0,3.992000,2.569000,1.0,8.812000,0.0,2.0,-1


In [36]:
#Normalização por Standard Scaler
scaler = StandardScaler()
X_Train_scaled=scaler.fit_transform(X_train_not_nan)
X_train_stdScaler=pd.DataFrame(
    data = X_Train_scaled,
    columns=X_Train.columns
)
X_train_stdScaler

Unnamed: 0,SpMax_L,J_Dz(e),nHM,F01,F04,NssssC,nCb,C,nCp,nO,...,nCrt,C_026,F02_CN,nHDon,SpMax_B,Psi_i_A,nN,SM6_B,nArCOOR,nX
0,0.230733,1.389010,-0.232094,-0.071715,1.025951,-0.128021,-0.583471,-0.400740,1.488723,-0.034949,...,-0.096025,-0.335282,2.457302,-0.678067,-0.100319,-0.438917,1.213572,0.024492,-0.225584,-0.139311
1,1.261726,0.682137,-0.232094,-0.071715,-0.255111,1.167032,-0.583471,-1.006461,2.322026,-0.034949,...,-0.096025,-0.335282,-0.336178,1.036755,-0.489638,0.353644,-0.399901,-0.902555,-0.225584,-0.139311
2,0.163050,-0.299268,-0.232094,-0.071715,-0.255111,-0.128021,0.169325,0.895370,-0.177883,0.642670,...,-0.096025,-0.335282,-0.336178,0.179344,0.031227,-0.235835,-0.399901,0.100488,-0.225584,-0.139311
3,0.023820,0.060363,-0.232094,-0.071715,-0.255111,-0.128021,0.169325,1.419572,-1.011186,-0.034949,...,-0.096025,-0.335282,-0.336178,-0.678067,0.114656,0.382507,-0.399901,-0.009828,-0.225584,-0.139311
4,-1.193804,-0.517543,-0.232094,-0.071715,-0.255111,-0.128021,-0.583471,-0.421137,0.655420,-1.390186,...,-0.096025,-0.335282,-0.336178,-0.678067,-0.975716,-1.355502,-0.399901,-0.703168,-0.225584,-0.139311
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
693,-0.128030,0.289936,-0.232094,-0.071715,-0.255111,-0.128021,-0.583471,-0.273512,-0.177883,-0.034949,...,-0.096025,-0.335282,-0.336178,-0.678067,-0.312834,-0.692220,-0.399901,0.410503,-0.225584,-0.139311
694,2.407050,-1.678164,-0.232094,-0.071715,-0.255111,2.462085,-0.583471,0.083838,0.655420,-0.034949,...,5.059796,-0.335282,-0.336178,1.036755,0.101377,-0.878552,-0.399901,0.072831,-0.225584,-0.139311
695,1.540535,1.441187,2.241214,-0.071715,-0.255111,1.167032,-0.583471,-1.006461,2.322026,-0.034949,...,-0.096025,-0.335282,-0.336178,1.036755,5.132820,0.136108,-0.399901,4.457771,-0.225584,1.046530
696,0.122419,0.463956,2.241214,-0.071715,-0.255111,-0.128021,-0.583471,1.319510,-1.011186,-1.390186,...,-0.096025,-0.335282,1.526142,-0.678067,0.626723,-0.139235,1.213572,0.756031,-0.225584,1.046530


In [37]:
# Benchmark the imputers on the standardised complete rows: hide values with
# each synthetic mask, impute them back, and score against the known values.
imputers = (
    SimpleImputer(),
    SimpleImputer(strategy="median"),
    KNNImputer(),
)
labels = ["SimpleImpute_mean", "SimpleImpute_median", "KNN"]

results = pd.DataFrame(index=X_train_not_nan.columns)
for label, imputer in zip(labels, imputers):
    rmse_per_run = []
    for mask in masks:
        X_masked = X_train_stdScaler.mask(mask)
        X_imputed = imputer.fit_transform(X_masked)

        # Per-column RMSE over all rows. np.sqrt(MSE) replaces the
        # `squared=False` argument, which was deprecated in scikit-learn 1.4
        # and removed in 1.6.
        rmse_per_run.append(
            np.sqrt(
                mean_squared_error(
                    X_train_stdScaler,
                    X_imputed,
                    multioutput="raw_values",
                )
            )
        )

    # Build the per-run table once (instead of growing it row by row) and
    # average the error of every column across the runs.
    errors = pd.DataFrame(rmse_per_run, columns=X_train_stdScaler.columns)
    results[label] = errors.mean()

In [38]:
# Keep only the columns (rows of this table) whose error is positive for every imputer.
results.where(results > 0).dropna()

Unnamed: 0,SimpleImpute_mean,SimpleImpute_median,KNN
F01,0.171867,0.153363,0.150897
C,0.40499,0.40906,0.152144
nCp,0.371907,0.376549,0.262769
HyWi_B,0.313925,0.314297,0.112303
F03_CO,0.080004,0.082622,0.036944
Me,0.298827,0.301147,0.161031
nCIR,0.289752,0.336135,0.186764
SpMax_A,0.373608,0.373784,0.15411
SdO,0.21121,0.212875,0.089215
nCrt,0.135704,0.120664,0.106052


In [39]:
# Total error per imputer, summed over the columns with a positive error.
results.where(results > 0).dropna().sum()

SimpleImpute_mean      3.775017
SimpleImpute_median    3.810716
KNN                    2.039125
dtype: float64

Similarly to the MinMaxScaler case, the KNN imputer achieves the lowest error when predicting the missing values.

# Feature Selection

In [6]:
# Steps of the feature-ranking pipeline: scale, impute with KNN, then fit a
# random forest whose feature importances are inspected afterwards.
pipeline = [
    ("scaler", MinMaxScaler()),
    ("imputer", KNNImputer(n_neighbors=4)),
    ("classifier", RandomForestClassifier(n_estimators=200, random_state=47)),
]

In [7]:
# Build the pipeline from the step list and fit it on the training data;
# verbose=True prints the time spent in each step.
rf = Pipeline(steps=pipeline, verbose=True)
rf = rf.fit(X_Train, y_Train)

[Pipeline] ............ (step 1 of 3) Processing scaler, total=   0.0s
[Pipeline] ........... (step 2 of 3) Processing imputer, total=   1.0s
[Pipeline] ........ (step 3 of 3) Processing classifier, total=   2.0s


In [8]:
# Raw feature importances of the fitted random forest
# (presumably in the same order as the X_Train columns — verify).
rf["classifier"].feature_importances_

array([0.04161385, 0.02280242, 0.08690971, 0.00286961, 0.05453298,
       0.05805409, 0.05803263, 0.02117278, 0.0168544 , 0.01224928,
       0.04468076, 0.02097033, 0.01906134, 0.01992376, 0.02458327,
       0.01232904, 0.02018489, 0.02319297, 0.00092807, 0.00505014,
       0.00024832, 0.04060187, 0.02513782, 0.00049472, 0.01352648,
       0.0004414 , 0.03450688, 0.01420294, 0.00019239, 0.01625675,
       0.01686838, 0.01450246, 0.01740904, 0.04024602, 0.00640186,
       0.0490711 , 0.02541953, 0.01951893, 0.03622532, 0.00074949,
       0.06198195])

In [9]:
def get_sorted_labels(col_names : list[str], pipeline: Pipeline) -> list[str]:
    """Return the column names ordered from highest to lowest importance.

    The scores are read from ``pipeline["classifier"].feature_importances_``
    and paired with ``col_names`` by position; names with equal scores keep
    their input order.
    """
    importances = pipeline["classifier"].feature_importances_
    ranked = sorted(
        zip(col_names, importances),
        key=lambda pair: pair[1],
        reverse=True,
    )
    # Drop the scores, keeping only the ranked names.
    return [name for name, _score in ranked]

In [10]:
# Feature names ranked by the random forest's importance, most important first.
get_sorted_labels(X_Train.columns, rf)

['nHM',
 'nX',
 'NssssC',
 'nCb',
 'F04',
 'SpMax_B',
 'F03',
 'SpMax_L',
 'SpPosA_B',
 'F02_CN',
 'SM6_B',
 'SpMax_A',
 'Psi_i_A',
 'nCIR',
 'SM6_L',
 'Mi',
 'J_Dz(e)',
 'C',
 'SdssC',
 'Me',
 'LOC',
 'nN',
 'HyWi_B',
 'C_026',
 'TI2_L',
 'nCp',
 'SdO',
 'nCrt',
 'Psi_i_1d',
 'B03',
 'F03_CO',
 'nO',
 'nHDon',
 'nArNO2',
 'F01',
 'nN_N',
 'nArCOOR',
 'B01',
 'N_073',
 'nCRX3',
 'B04']

In [43]:
# Classification pipeline: min-max scaling, KNN imputation, then logistic
# regression. The step names are what grid-search keys are built from
# (e.g. "imputer__n_neighbors"), so they must stay stable.
# LogisticRegression lives in sklearn.linear_model and has to be imported in
# the imports cell; without that import this cell raises NameError on a fresh kernel.
model = Pipeline([
    ("scalar", MinMaxScaler()),
    ("imputer", KNNImputer(n_neighbors=4)),
    ("predicter", LogisticRegression())
])

In [54]:
# Hyper-parameters explored by the grid search: only the KNN imputer settings.
# NOTE: "predicter__penalty" (['l1', 'l2']) is currently left out of the search.
grid = {
    "imputer__n_neighbors": range(2, 5),
    "imputer__weights": ["uniform", "distance"],
}

results = GridSearchCV(estimator=model, param_grid=grid).fit(X_Train, y_Train)

In [55]:
# Parameter combination selected by the grid search.
results.best_params_

{'imputer__n_neighbors': 4, 'imputer__weights': 'uniform'}

In [56]:
# Pipeline configured with the selected parameters.
results.best_estimator_

In [58]:
# NOTE(review): repeats the earlier `results.best_params_` cell.
results.best_params_

{'imputer__n_neighbors': 4, 'imputer__weights': 'uniform'}

In [59]:
# NOTE(review): this builds a list holding the same `results` object ten times
# (no copies are made) — confirm that this is the intent.
list_results = [results]*10