In [45]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
import os

from sklearn.pipeline import Pipeline
from sklearn.impute import KNNImputer, SimpleImputer
from sklearn.model_selection import train_test_split, GridSearchCV, KFold
from sklearn.preprocessing import StandardScaler, PowerTransformer, MinMaxScaler
from sklearn.metrics import (mean_squared_error, max_error, 
                             confusion_matrix, ConfusionMatrixDisplay, 
                             classification_report, make_scorer, matthews_corrcoef)
from sklearn.feature_selection import SelectFromModel
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import BaggingClassifier
from sklearn.neighbors import KNeighborsClassifier

import joblib

In [2]:
df = pd.read_csv("biodegradable_a.csv")
df.shape

(4564, 42)

In [3]:
X = df.drop(columns="Biodegradable")
y = df.Biodegradable

In [4]:
#new biodegradable (new_x):  1 if RB else -1
y = y.map(lambda x: 1 if x=='RB' else -1)

In [5]:
#Criação do train + test e validation set
X_Train, X_Test, y_Train, y_Test = train_test_split(X, y, test_size=0.25, random_state=22)

In [6]:
y_Train.value_counts()

 1    2858
-1     565
Name: Biodegradable, dtype: int64

In [7]:
X_Train.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 3423 entries, 4057 to 2933
Data columns (total 41 columns):
 #   Column    Non-Null Count  Dtype  
---  ------    --------------  -----  
 0   SpMax_L   3423 non-null   float64
 1   J_Dz(e)   3423 non-null   float64
 2   nHM       3423 non-null   float64
 3   F01       3034 non-null   float64
 4   F04       3423 non-null   float64
 5   NssssC    3423 non-null   float64
 6   nCb       3423 non-null   float64
 7   C         2850 non-null   float64
 8   nCp       2926 non-null   float64
 9   nO        3423 non-null   float64
 10  F03       3423 non-null   float64
 11  SdssC     3423 non-null   float64
 12  HyWi_B    3076 non-null   float64
 13  LOC       3423 non-null   float64
 14  SM6_L     3423 non-null   float64
 15  F03_CO    3396 non-null   float64
 16  Me        3091 non-null   float64
 17  Mi        3423 non-null   float64
 18  nN_N      3423 non-null   float64
 19  nArNO2    3423 non-null   float64
 20  nCRX3     3423 non-null   f

In [8]:
print(
    f"Maximum missing attributes on the rows: {X_Train.isna().sum(axis=1).max()}"
)

Maximum missing attributes on the rows: 6


In [9]:
missing_cols = X_Train.isna().sum()
missing_cols[missing_cols>0]

F01         389
C           573
nCp         497
HyWi_B      347
F03_CO       27
Me          332
nCIR        373
SpMax_A     494
SdO         166
nCrt        177
SpMax_B    1003
Psi_i_A     317
nX          513
dtype: int64

Number of null values is significant on many columns ( > 25% ) <br>
Droping features is not an option for dealing with missing data, because we do not have the knowledge yet if they have relation with the class we want to predict<br>

However, per sample, 6 out of 40 attributes doesn't seem very significant.
This before the feature selection.

## Classification Models

- ~[ ] Logit~
- ~[ ] LDA~
- [x] SVM
- ~[ ] Naive Bayes~
- ~[ ] DecisionTree~
- ~[ ] KNN~
- [x] Bagging:
    - RandomForest
    - KNN
- ~[ ] Boosting~

# Testing Imputation Models

## Test with MinMaxScaler

In [10]:
X_train_not_nan = X_Train.dropna()

In [11]:
X_train_not_nan.shape

(698, 41)

In [12]:
X_Train.shape

(3423, 41)

The difference in the number of rows, from the variable *X_train_not_nan* and the variable *X_train* indicates that a huge number of instances are missing at least one of the features, hence droping rows is not a viable option

In [13]:
priors = X_Train.isna().sum()/X_Train.shape[0]
priors.shape

(41,)

In [14]:
def get_mask(X,priors):
    masks = np.empty(shape = X.shape, dtype=np.bool_)
    for i, p in enumerate(priors):
        masks[:, i] = np.random.choice((True,False), size=masks.shape[0], p=(p,1-p))
    return masks

In [15]:
scaler = MinMaxScaler().fit(X_train_not_nan)
X_train_not_nan_scaled = pd.DataFrame(data = scaler.transform(X_train_not_nan),
                                      columns=X_train_not_nan.columns)

In [16]:
N = 50
masks = [get_mask(X_train_not_nan, priors) for _ in range(N)]

In [17]:
imputers = (
        SimpleImputer(),
        SimpleImputer(strategy="median"),
        KNNImputer()
)
labels = ["SimpleImpute_mean", "SimpleImpute_median", "KNN"]

results = pd.DataFrame(index=X_train_not_nan.columns)
for label, model in zip(labels,imputers):
    errors=pd.DataFrame(columns = X_train_not_nan.columns)
    for _ in range(N):
        X_masked = X_train_not_nan_scaled.mask(masks[_])
        
        model = model.fit(X_masked)
        X_imputed = model.transform(X_masked)

        errors.loc[_] = dict(zip(X_train_not_nan_scaled.columns, 
                                 mean_squared_error(X_train_not_nan_scaled, 
                                                    X_imputed, 
                                                    squared=False, 
                                                    multioutput="raw_values")
                                ))
    results[label] = errors.mean()

In [18]:
results[results>0].dropna()

Unnamed: 0,SimpleImpute_mean,SimpleImpute_median,KNN
F01,0.010754,0.009998,0.010547
C,0.071441,0.072449,0.027826
nCp,0.044782,0.045537,0.033449
HyWi_B,0.041917,0.041995,0.016974
F03_CO,0.013934,0.013717,0.006619
Me,0.034073,0.034447,0.017974
nCIR,0.027865,0.031742,0.018134
SpMax_A,0.045051,0.045098,0.01909
SdO,0.041463,0.041685,0.015988
nCrt,0.00829,0.007409,0.007214


In [19]:
results[results>0].dropna().sum()

SimpleImpute_mean      0.449811
SimpleImpute_median    0.454975
KNN                    0.234306
dtype: float64

The *KNNImputer* is the one that better predicts the missing values, according to this test, since it is the one that gets closer results for every feature with missing values, which results having the least summed error. <br>
Not many different parameters were used for it, so it can probably achieve even better results.

In [20]:
imputers = (
        KNNImputer(n_neighbors=i) for i in range(3,11)
)

labels = [f"KNN_{i}_neighbors" for i in range(3,11)]

results = pd.DataFrame(index=X_train_not_nan.columns)
for label, model in zip(labels,imputers):
    errors=pd.DataFrame(columns = X_train_not_nan.columns)
    for _ in range(N):
        X_masked = X_train_not_nan_scaled.mask(masks[_])
        
        model = model.fit(X_masked)
        X_imputed = model.transform(X_masked)

        errors.loc[_] = dict(zip(X_train_not_nan_scaled.columns, 
                                 mean_squared_error(X_train_not_nan_scaled, 
                                                    X_imputed, 
                                                    squared=False, 
                                                    multioutput="raw_values")
                                ))
    results[label] = errors.mean()

In [21]:
results[results>0].dropna()

Unnamed: 0,KNN_3_neighbors,KNN_4_neighbors,KNN_5_neighbors,KNN_6_neighbors,KNN_7_neighbors,KNN_8_neighbors,KNN_9_neighbors,KNN_10_neighbors
F01,0.010922,0.010684,0.010547,0.01039,0.010224,0.0101,0.010088,0.010022
C,0.026605,0.027221,0.027826,0.028599,0.029189,0.03006,0.030693,0.031109
nCp,0.032792,0.033399,0.033449,0.033732,0.034061,0.034265,0.034352,0.034618
HyWi_B,0.015417,0.016176,0.016974,0.017557,0.017912,0.018342,0.018731,0.019007
F03_CO,0.006417,0.006447,0.006619,0.006668,0.006741,0.006807,0.006827,0.006811
Me,0.016164,0.017156,0.017974,0.018722,0.019162,0.019426,0.019758,0.020064
nCIR,0.017161,0.017539,0.018134,0.018452,0.018726,0.018962,0.019126,0.019357
SpMax_A,0.017711,0.018507,0.01909,0.019651,0.02015,0.020522,0.020912,0.021337
SdO,0.014794,0.015315,0.015988,0.016595,0.016955,0.017182,0.017621,0.018
nCrt,0.006305,0.006943,0.007214,0.007197,0.007133,0.007212,0.007246,0.007157


In [22]:
results[results>0].dropna().sum()

KNN_3_neighbors     0.217648
KNN_4_neighbors     0.226664
KNN_5_neighbors     0.234306
KNN_6_neighbors     0.240571
KNN_7_neighbors     0.245488
KNN_8_neighbors     0.249833
KNN_9_neighbors     0.253836
KNN_10_neighbors    0.257234
dtype: float64

In [23]:
results[results>0].dropna().sum().idxmin()

'KNN_3_neighbors'

## Test with StandardScaler

In [24]:
pd.concat( (X_Train, y_Train), axis=1)

Unnamed: 0,SpMax_L,J_Dz(e),nHM,F01,F04,NssssC,nCb,C,nCp,nO,...,C_026,F02_CN,nHDon,SpMax_B,Psi_i_A,nN,SM6_B,nArCOOR,nX,Biodegradable
4057,3.776854,2.408741,0.0,0.0,0.0,0.0,0.0,30.000000,2.0,0.0,...,0.0,0.0,0.0,3.177099,2.479789,0.0,6.967228,0.0,0.0,1
4322,4.207577,3.405557,0.0,0.0,0.0,0.0,0.0,25.000000,1.0,2.0,...,0.0,0.0,1.0,3.552206,4.118984,0.0,7.700636,0.0,,1
194,4.650000,4.031300,0.0,0.0,1.0,0.0,0.0,31.300000,3.0,2.0,...,0.0,3.0,0.0,3.527000,2.372000,1.0,8.131000,0.0,0.0,1
2202,4.500517,3.039395,0.0,0.0,0.0,0.0,2.0,,0.0,0.0,...,0.0,0.0,1.0,,2.511390,1.0,8.096866,0.0,0.0,1
4351,4.344574,3.645214,0.0,0.0,0.0,0.0,0.0,31.457361,2.0,3.0,...,0.0,0.0,1.0,3.423525,3.051219,0.0,7.743159,0.0,,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
989,4.868000,3.025200,0.0,0.0,1.0,0.0,0.0,34.800000,3.0,1.0,...,0.0,5.0,1.0,,,2.0,8.261000,0.0,,-1
2527,3.974877,2.917110,0.0,0.0,0.0,0.0,0.0,,2.0,0.0,...,0.0,0.0,0.0,3.233164,1.800985,0.0,8.136023,0.0,0.0,1
2952,4.292865,3.156162,0.0,0.0,0.0,0.0,0.0,33.934010,2.0,2.0,...,0.0,0.0,0.0,3.344344,2.044372,0.0,8.317020,0.0,,1
356,4.596000,3.416100,2.0,0.0,0.0,0.0,0.0,45.500000,0.0,0.0,...,0.0,2.0,0.0,3.992000,2.569000,1.0,8.812000,0.0,2.0,-1


In [25]:
#Normalização por Standard Scaler
scaler = StandardScaler()
X_Train_scaled=scaler.fit_transform(X_train_not_nan)
X_train_stdScaler=pd.DataFrame(
    data = X_Train_scaled,
    columns=X_Train.columns
)
X_train_stdScaler

Unnamed: 0,SpMax_L,J_Dz(e),nHM,F01,F04,NssssC,nCb,C,nCp,nO,...,nCrt,C_026,F02_CN,nHDon,SpMax_B,Psi_i_A,nN,SM6_B,nArCOOR,nX
0,0.230733,1.389010,-0.232094,-0.071715,1.025951,-0.128021,-0.583471,-0.400740,1.488723,-0.034949,...,-0.096025,-0.335282,2.457302,-0.678067,-0.100319,-0.438917,1.213572,0.024492,-0.225584,-0.139311
1,1.261726,0.682137,-0.232094,-0.071715,-0.255111,1.167032,-0.583471,-1.006461,2.322026,-0.034949,...,-0.096025,-0.335282,-0.336178,1.036755,-0.489638,0.353644,-0.399901,-0.902555,-0.225584,-0.139311
2,0.163050,-0.299268,-0.232094,-0.071715,-0.255111,-0.128021,0.169325,0.895370,-0.177883,0.642670,...,-0.096025,-0.335282,-0.336178,0.179344,0.031227,-0.235835,-0.399901,0.100488,-0.225584,-0.139311
3,0.023820,0.060363,-0.232094,-0.071715,-0.255111,-0.128021,0.169325,1.419572,-1.011186,-0.034949,...,-0.096025,-0.335282,-0.336178,-0.678067,0.114656,0.382507,-0.399901,-0.009828,-0.225584,-0.139311
4,-1.193804,-0.517543,-0.232094,-0.071715,-0.255111,-0.128021,-0.583471,-0.421137,0.655420,-1.390186,...,-0.096025,-0.335282,-0.336178,-0.678067,-0.975716,-1.355502,-0.399901,-0.703168,-0.225584,-0.139311
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
693,-0.128030,0.289936,-0.232094,-0.071715,-0.255111,-0.128021,-0.583471,-0.273512,-0.177883,-0.034949,...,-0.096025,-0.335282,-0.336178,-0.678067,-0.312834,-0.692220,-0.399901,0.410503,-0.225584,-0.139311
694,2.407050,-1.678164,-0.232094,-0.071715,-0.255111,2.462085,-0.583471,0.083838,0.655420,-0.034949,...,5.059796,-0.335282,-0.336178,1.036755,0.101377,-0.878552,-0.399901,0.072831,-0.225584,-0.139311
695,1.540535,1.441187,2.241214,-0.071715,-0.255111,1.167032,-0.583471,-1.006461,2.322026,-0.034949,...,-0.096025,-0.335282,-0.336178,1.036755,5.132820,0.136108,-0.399901,4.457771,-0.225584,1.046530
696,0.122419,0.463956,2.241214,-0.071715,-0.255111,-0.128021,-0.583471,1.319510,-1.011186,-1.390186,...,-0.096025,-0.335282,1.526142,-0.678067,0.626723,-0.139235,1.213572,0.756031,-0.225584,1.046530


In [26]:
imputers = (
        SimpleImputer(),
        SimpleImputer(strategy="median"),
        KNNImputer()
)
labels = ["SimpleImpute_mean", "SimpleImpute_median", "KNN"]

results = pd.DataFrame(index=X_train_not_nan.columns)
for label, model in zip(labels,imputers):
    errors=pd.DataFrame(columns = X_train_not_nan.columns)
    for _ in range(N):
        X_masked = X_train_stdScaler.mask(masks[_])
        
        model = model.fit(X_masked)
        X_imputed = model.transform(X_masked)

        errors.loc[_] = dict(zip(X_train_stdScaler.columns, 
                                 mean_squared_error(X_train_stdScaler, 
                                                    X_imputed, 
                                                    squared=False, 
                                                    multioutput="raw_values")
                                ))
    results[label] = errors.mean()

In [27]:
results[results>0].dropna()

Unnamed: 0,SimpleImpute_mean,SimpleImpute_median,KNN
F01,0.215332,0.200185,0.191391
C,0.41023,0.416021,0.149578
nCp,0.373169,0.37946,0.271968
HyWi_B,0.320588,0.321185,0.124093
F03_CO,0.074044,0.072893,0.035714
Me,0.317044,0.320524,0.172057
nCIR,0.3384,0.38549,0.218057
SpMax_A,0.380584,0.380988,0.159256
SdO,0.216484,0.217641,0.08949
nCrt,0.128232,0.114596,0.099171


In [28]:
results[results>0].dropna().sum()

SimpleImpute_mean      3.955529
SimpleImpute_median    3.996884
KNN                    2.182967
dtype: float64

Similarly to the MinMaxScaler, KNN imputer is the model that has the least error on it's predictions.

In [29]:
imputers = (
        KNNImputer(n_neighbors=i) for i in range(3,11)
)

labels = [f"KNN_{i}_neighbors" for i in range(3,11)]

results = pd.DataFrame(index=X_train_not_nan.columns)
for label, model in zip(labels,imputers):
    errors=pd.DataFrame(columns = X_train_not_nan.columns)
    for _ in range(N):
        X_masked = X_train_stdScaler.mask(masks[_])
        
        model = model.fit(X_masked)
        X_imputed = model.transform(X_masked)

        errors.loc[_] = dict(zip(X_train_stdScaler.columns, 
                                 mean_squared_error(X_train_stdScaler, 
                                                    X_imputed, 
                                                    squared=False, 
                                                    multioutput="raw_values")
                                ))
    results[label] = errors.mean()

In [30]:
results[results>0].dropna()

Unnamed: 0,KNN_3_neighbors,KNN_4_neighbors,KNN_5_neighbors,KNN_6_neighbors,KNN_7_neighbors,KNN_8_neighbors,KNN_9_neighbors,KNN_10_neighbors
F01,0.178269,0.187389,0.191391,0.195342,0.194926,0.196514,0.197779,0.198766
C,0.14459,0.145534,0.149578,0.154608,0.157112,0.15954,0.161733,0.164558
nCp,0.264548,0.268006,0.271968,0.274719,0.277843,0.279874,0.282284,0.284266
HyWi_B,0.113743,0.119429,0.124093,0.127168,0.129996,0.132486,0.135891,0.138935
F03_CO,0.035417,0.036313,0.035714,0.035875,0.035877,0.035821,0.03651,0.037104
Me,0.153361,0.164612,0.172057,0.18005,0.185503,0.189822,0.192359,0.194176
nCIR,0.20151,0.208021,0.218057,0.224472,0.228277,0.231699,0.234986,0.237708
SpMax_A,0.151723,0.155489,0.159256,0.16204,0.16536,0.169186,0.172343,0.175459
SdO,0.084991,0.087194,0.08949,0.091978,0.092081,0.093038,0.093842,0.09406
nCrt,0.090569,0.095914,0.099171,0.10247,0.10423,0.106363,0.107136,0.108274


In [31]:
results[results>0].dropna().sum()

KNN_3_neighbors     2.017022
KNN_4_neighbors     2.105754
KNN_5_neighbors     2.182967
KNN_6_neighbors     2.251150
KNN_7_neighbors     2.299200
KNN_8_neighbors     2.343646
KNN_9_neighbors     2.383182
KNN_10_neighbors    2.416884
dtype: float64

In [32]:
results[results>0].dropna().sum().idxmin()

'KNN_3_neighbors'

## Test with Power Transformer

In [33]:
#FAZER FUNÇAO PARA NAO REPETIR O CODIGO DEPOIS

#Normalização por Power Transform

X_Train_powerTscaled=PowerTransformer().fit_transform(X_train_not_nan)
X_train_powerTscaler=pd.DataFrame(
    data = X_Train_powerTscaled,
    columns=X_Train.columns
)
#X_train_powerTscaler

In [34]:
imputers = (
        SimpleImputer(),
        SimpleImputer(strategy="median"),
        KNNImputer()
)
labels = ["SimpleImpute_mean", "SimpleImpute_median", "KNN"]

results = pd.DataFrame(index=X_train_not_nan.columns)
for label, model in zip(labels,imputers):
    errors=pd.DataFrame(columns = X_train_not_nan.columns)
    for _ in range(N):
        X_masked = X_train_powerTscaler.mask(masks[_])
        
        model = model.fit(X_masked)
        X_imputed = model.transform(X_masked)

        errors.loc[_] = dict(zip(X_train_powerTscaler.columns, 
                                 mean_squared_error(X_train_powerTscaler, 
                                                    X_imputed, 
                                                    squared=False, 
                                                    multioutput="raw_values")
                                ))
    results[label] = errors.mean()

In [35]:
results[results>0].dropna().sum()

SimpleImpute_mean      4.040909
SimpleImpute_median    4.158115
KNN                    1.920304
dtype: float64

As we can see, KNN has the minimum error in every scalling method, so we will only use KNN-Imputer.

In [36]:
imputers = (
        KNNImputer(n_neighbors=i) for i in range(3,11)
)

labels = [f"KNN_{i}_neighbors" for i in range(3,11)]

results = pd.DataFrame(index=X_train_not_nan.columns)
for label, model in zip(labels,imputers):
    errors=pd.DataFrame(columns = X_train_not_nan.columns)
    for _ in range(N):
        X_masked = X_train_powerTscaler.mask(masks[_])
        
        model = model.fit(X_masked)
        X_imputed = model.transform(X_masked)

        errors.loc[_] = dict(zip(X_train_powerTscaler.columns, 
                                 mean_squared_error(X_train_powerTscaler, 
                                                    X_imputed, 
                                                    squared=False, 
                                                    multioutput="raw_values")
                                ))
    results[label] = errors.mean()

In [37]:
results[results>0].dropna().sum()

KNN_3_neighbors     1.780050
KNN_4_neighbors     1.867666
KNN_5_neighbors     1.920304
KNN_6_neighbors     1.974016
KNN_7_neighbors     2.027190
KNN_8_neighbors     2.071849
KNN_9_neighbors     2.107752
KNN_10_neighbors    2.137209
dtype: float64

In [38]:
results[results>0].dropna().sum().idxmin()

'KNN_3_neighbors'

## Imputer Results

For all the cases tested, the 3-NN imputer was the one that obtained better results

# Scale and impute the rest of the data

Perform imputation of missing values before scaling, as scaling could lead to distorted data if the missing values are not first replaced. This is because some calculations may include the missing values and their presence could lead to skewed results.

## Impute data with knn

In [39]:
# Create an instance of the KNNImputer
imputer = KNNImputer()

X_Train_imputed = imputer.fit_transform(X_Train)

X_Train_imputed

array([[3.77685373, 2.40874116, 0.        , ..., 6.96722794, 0.        ,
        0.        ],
       [4.20757702, 3.40555706, 0.        , ..., 7.70063609, 0.        ,
        0.        ],
       [4.65      , 4.0313    , 0.        , ..., 8.131     , 0.        ,
        0.        ],
       ...,
       [4.29286534, 3.15616198, 0.        , ..., 8.31702029, 0.        ,
        0.        ],
       [4.596     , 3.4161    , 2.        , ..., 8.812     , 0.        ,
        2.        ],
       [5.0138996 , 2.96835616, 0.        , ..., 8.8007641 , 2.        ,
        0.        ]])

## Scale with PowerTransform

In [40]:
pt = PowerTransformer()
X_Train_imputed_powerT = pt.fit_transform(X_Train_imputed)

# Feature Selection using RandomForest

Fitting a tree find the best features.

In [41]:
def get_sorted_labels(col_names : list[str], pipeline: Pipeline) -> list[str]:
    #join names and scores in tuples (column, score)
    label_scores = zip(col_names, pipeline["selector"].estimator_.feature_importances_)
    
    #sort tuples accoding to value in index 1 (column, -> score <-)
    sorted_labels = sorted(label_scores, key = lambda x: x[1], reverse=True)
    
    return sorted_labels

In [42]:
def conf_mat(y_test, pred):
    ConfusionMatrixDisplay(
        confusion_matrix(y_test, pred)
    ).plot()

In [43]:
(X_Train.std()==0).any()

False

There are no features with constant values, so there is no need to drop any column

In [44]:
temp_df = pd.DataFrame(columns=["Std dev", "Corr with Biodegradable"])
temp_df["Corr with Biodegradable"] = np.abs(pd.concat((X_Train, y_Train), axis=1).corr()["Biodegradable"]).sort_values(ascending=False)
temp_df["Std dev"] = X_Train.std()**2

temp_df

Unnamed: 0,Std dev,Corr with Biodegradable
Biodegradable,,1.0
nHM,0.593107,0.466193
C_026,0.831661,0.422011
F02_CN,1.78772,0.417339
B03,0.037895,0.399126
nCb,1.97521,0.388747
SM6_B,0.802933,0.385579
nN,0.480948,0.380732
F03,3.153011,0.380729
F04,1.554935,0.369411


> FALAR DE IMPORTÂNCIA DO SCORE DAS FEATURES

## Models with MinMaxScaler

## Random Forest Classifier

### First test, testing wit higher jumps to check 

In [54]:
FILENAME = "MinMax-RF_v3"
if os.path.exists(FILENAME):
    #if file exists, load it
    search_results = joblib.load(FILENAME)
else:
    pipeline = [
                ("scaler", MinMaxScaler()),
                ("imputer", KNNImputer(n_neighbors=3)),
                ("selector", SelectFromModel(
                    estimator = RandomForestClassifier(
                        random_state=0
                    ),
                    threshold=-np.inf,
                )),
                ("classifier", RandomForestClassifier(
                        min_samples_leaf= 1,
                ))
               ]

    model = Pipeline(pipeline)

    grid = {
        #{pipeline_name}__{feature_name} : [ ... ]
        "selector__max_features": range(22, 37+1, 1),
        "classifier__criterion": ["gini", "entropy"],
        "classifier__max_depth" : range(25, 35+1, 1),
        #classifier__min_samples_leaf : range(1,10)
    } 

    search_results = GridSearchCV(estimator = model,
                                  param_grid=grid,
                                  cv = 9,
                                  scoring=make_scorer(matthews_corrcoef),
                                  n_jobs=-1, #number of processes; -1 --> use all
                                  verbose=10, #text information
                                  return_train_score=True
                                 )
    search_results.fit(X_Train, y_Train)
    #save file
    joblib.dump(search_results, FILENAME)

Fitting 9 folds for each of 352 candidates, totalling 3168 fits


KeyboardInterrupt: 

In [None]:
search_results.best_params_

In [59]:
print("Best Parameters Test Score:", max(search_results.cv_results_["mean_test_score"])

SyntaxError: unexpected EOF while parsing (2577994034.py, line 1)

In [None]:
#
#
# Search results
#
#
method="gini"
is_2nd_half = 1 if method=="gini" else 0 
size = int(len(search_results.cv_results_["mean_test_score"]) / 2)

data = list(filter(lambda x: x["classifier__criterion"]==method, search_results.cv_results_["params"]))

#getting list of max depth values
x_max_depth = list(map(lambda x: x["classifier__max_depth"], data))
#min samples leaf list
y_min_samples_leaf = list(map(lambda x: x["classifier__min_samples_leaf"], data))
#max features list
z_max_features = list(map(lambda x: x["selector__max_features"], data))

#test scores list
#only gets half the data because subselection of "gini" data (in cv_results_ we can see that 
# the first results are all with gini and the rest is using entropy )
scores = search_results.cv_results_["mean_test_score"][ is_2nd_half*size : (is_2nd_half+1)*size ]

In [None]:
get_sorted_labels(X_Train.columns, search_results.best_estimator_)

In [None]:
list(filter(lambda item: item[1]>0.02, get_sorted_labels(X_Train.columns, search_results.best_estimator_)));

In [None]:
%matplotlib inline
im = plt.scatter(x_max_depth, z_max_features, c=scores)
plt.colorbar(im)

In [None]:
%matplotlib widget
#plt.figure(figsize=(12,12))

f, ax = plt.subplots(figsize=(8,8), subplot_kw={"projection":"3d"})
im = ax.scatter(x_max_depth, 
           y_min_samples_leaf, 
           z_max_features, 
           c = scores,
           cmap=plt.viridis(),
            #s=50
          )
ax.set_xlabel("Max Depth")
ax.set_ylabel("Min Samples Leaf")
ax.set_zlabel("Max Nº of Features")

cbar = plt.colorbar(im)
cbar.ax.set_ylabel('MCC Score', rotation=0)
plt.tight_layout()

## Using Standard Scaler

In [None]:

N,M=X_Train_imputed.shape

rfr=RandomForestRegressor(random_state=0)
#threshold is minus infinity
sel = SelectFromModel(estimator=rfr, threshold=-np.inf, max_features=5)

sel.fit(X_Train_imputed_powerT, y_Train)

print("Importances: ", sel.estimator_.feature_importances_)

print("Default threshold: ", sel.threshold_)

features=sel.get_support()
Features_selected =np.arange(M)[features]
print("The features selected are columns: ", Features_selected)



## Using Power Tranformer

### Another way to do the same. choose later

In [None]:
rfr = RandomForestRegressor()
rfr.fit(X_Train_imputed_powerT, y_Train)


# create a dictionary to store your feature importance scores
feature_imp = dict(zip(X_Train.columns, rfr.feature_importances_))

# specify the number of variables you want
num_vars = 5 # insert number of desired variables

# create an empty list to store
selected_features = []

# loop through each variable, sorted by their importance scores
for variable, score in sorted(feature_imp.items(), key=lambda x: x[1], reverse=True):
  # add the variable if below the specified number of variables
  if len(selected_features) < num_vars:
    selected_features.append(variable)

selected_features


# KNN with Bagging

### MinMaxScaler

In [68]:
FILENAME = "MinMax-KNN_v3"
if os.path.exists(FILENAME):
    #if file exists, load it
    search_results = joblib.load(FILENAME)
else:
    pipeline = [
                ("scaler", MinMaxScaler()),
                ("imputer", KNNImputer(n_neighbors=3)),
                ("selector", SelectFromModel(
                    estimator = RandomForestClassifier(
                        random_state=0
                    ),
                    threshold=-np.inf,
                )),
                ("classifier", BaggingClassifier(base_estimator=KNeighborsClassifier()))
               ]

    model = Pipeline(pipeline)

    grid = {
        #{pipeline_name}__{feature_name} : [ ... ]
        "selector__max_features": range(25,30), #testado de 1 a 32
        "classifier__n_estimators": range(15,19), #testado de 1 a 32
        "classifier__base_estimator__n_neighbors": range(2,4), #testado de 1 a 25
        "classifier__base_estimator__weights": ["distance"], #testado distance e uniforme
        "classifier__base_estimator__p":range(1,2) #testado de 1 a 5
    } 

    search_results = GridSearchCV(estimator = model,
                                  param_grid=grid,
                                  cv = 9,
                                  scoring=make_scorer(matthews_corrcoef),
                                  n_jobs=-1, #number of processes; -1 --> use all
                                  verbose=10, #text information
                                  return_train_score=True
                                 )
    search_results.fit(X_Train, y_Train)
    #save file
    joblib.dump(search_results, FILENAME)

Fitting 9 folds for each of 40 candidates, totalling 360 fits


In [69]:
search_results.best_params_

{'classifier__base_estimator__n_neighbors': 2,
 'classifier__base_estimator__p': 1,
 'classifier__base_estimator__weights': 'distance',
 'classifier__n_estimators': 15,
 'selector__max_features': 27}

In [70]:
print("Best Parameters Test Score:", max(search_results.cv_results_["mean_test_score"]))
#-------
#v_1 best score=0.842 Parâmetros: {'classifier__base_estimator__n_neighbors': 2,
 #'classifier__base_estimator__p': 1,
 #'classifier__base_estimator__weights': 'distance',
 #'classifier__n_estimators': 14,
 #'selector__max_features': 15}
#-------
#v_2 best score=0.858 Parâmetros: {'classifier__base_estimator__n_neighbors': 2,
 #'classifier__base_estimator__p': 1,
 #'classifier__base_estimator__weights': 'distance',
 #'classifier__n_estimators': 17,
 #'selector__max_features': 27}
#-------


Best Parameters Test Score: 0.8562796465074833


### Standard Scaler

In [74]:
FILENAME = "StdScaler-KNN_v2"
if os.path.exists(FILENAME):
    #if file exists, load it
    search_results = joblib.load(FILENAME)
else:
    pipeline = [
                ("scaler", StandardScaler()),
                ("imputer", KNNImputer(n_neighbors=3)),
                ("selector", SelectFromModel(
                    estimator = RandomForestClassifier(
                        random_state=0
                    ),
                    threshold=-np.inf,
                )),
                ("classifier", BaggingClassifier(base_estimator=KNeighborsClassifier()))
               ]

    model = Pipeline(pipeline)

    grid = {
        #{pipeline_name}__{feature_name} : [ ... ]
        "selector__max_features": range(31,35), #testado de 1 a 32
        "classifier__n_estimators": range(16,17), #testado de 1 a 32
        "classifier__base_estimator__n_neighbors": range(2,3), #testado de 1 a 25
        "classifier__base_estimator__weights": ["uniform"], #testado distance e uniforme
        "classifier__base_estimator__p":range(1,2) #testado de 1 a 5
    } 

    search_results = GridSearchCV(estimator = model,
                                  param_grid=grid,
                                  cv = 9,
                                  scoring=make_scorer(matthews_corrcoef),
                                  n_jobs=-1, #number of processes; -1 --> use all
                                  verbose=10, #text information
                                  return_train_score=True
                                 )
    search_results.fit(X_Train, y_Train)
    #save file
    joblib.dump(search_results, FILENAME)

Fitting 9 folds for each of 4 candidates, totalling 36 fits


In [75]:
search_results.best_params_

{'classifier__base_estimator__n_neighbors': 2,
 'classifier__base_estimator__p': 1,
 'classifier__base_estimator__weights': 'uniform',
 'classifier__n_estimators': 16,
 'selector__max_features': 31}

In [73]:
print("Best Parameters Test Score:", max(search_results.cv_results_["mean_test_score"]))
#v_1 best score=0.873 Parâmetros: {'classifier__base_estimator__n_neighbors': 2,
 #'classifier__base_estimator__p': 1,
 #'classifier__base_estimator__weights': 'uniform',
 #'classifier__n_estimators': 16,
 #'selector__max_features': 31}

Best Parameters Test Score: 0.8734163964130848


### Power Transformer

In [79]:
FILENAME = "PowerTransformer-KNN_v2"
if os.path.exists(FILENAME):
    #if file exists, load it
    search_results = joblib.load(FILENAME)
else:
    pipeline = [
                ("scaler", PowerTransformer()),
                ("imputer", KNNImputer(n_neighbors=3)),
                ("selector", SelectFromModel(
                    estimator = RandomForestClassifier(
                        random_state=0
                    ),
                    threshold=-np.inf,
                )),
                ("classifier", BaggingClassifier(base_estimator=KNeighborsClassifier()))
               ]

    model = Pipeline(pipeline)

    grid = {
        #{pipeline_name}__{feature_name} : [ ... ]
        "selector__max_features": range(22,28), #testado de 1 a 32
        "classifier__n_estimators": range(14,17), #testado de 1 a 32
        "classifier__base_estimator__n_neighbors": range(3,6), #testado de 1 a 25
        "classifier__base_estimator__weights": ["distance"], #testado distance e uniforme
        "classifier__base_estimator__p":range(1,2) #testado de 1 a 5
    } 

    search_results = GridSearchCV(estimator = model,
                                  param_grid=grid,
                                  cv = 9,
                                  scoring=make_scorer(matthews_corrcoef),
                                  n_jobs=-1, #number of processes; -1 --> use all
                                  verbose=10, #text information
                                  return_train_score=True
                                 )
    search_results.fit(X_Train, y_Train)
    #save file
    joblib.dump(search_results, FILENAME)

Fitting 9 folds for each of 54 candidates, totalling 486 fits


In [80]:
search_results.best_params_

{'classifier__base_estimator__n_neighbors': 3,
 'classifier__base_estimator__p': 1,
 'classifier__base_estimator__weights': 'distance',
 'classifier__n_estimators': 15,
 'selector__max_features': 25}

In [81]:
print("Best Parameters Test Score:", max(search_results.cv_results_["mean_test_score"]))
#v_1 best score=0.854 Parâmetros: {'classifier__base_estimator__n_neighbors': 3,
 #'classifier__base_estimator__p': 1,
 #'classifier__base_estimator__weights': 'distance',
 #'classifier__n_estimators': 16,
 #'selector__max_features': 27}
#-----
#v_2 best score=0.854 Parâmetros: {'classifier__base_estimator__n_neighbors': 3,
 #'classifier__base_estimator__p': 1,
 #'classifier__base_estimator__weights': 'distance',
 #'classifier__n_estimators': 15,
 #'selector__max_features': 27}

Best Parameters Test Score: 0.8542768235478974
