In [97]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt

from sklearn.pipeline import Pipeline
from sklearn.impute import KNNImputer, SimpleImputer
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.preprocessing import StandardScaler, PowerTransformer, MinMaxScaler
from sklearn.metrics import mean_squared_error, max_error
from sklearn.feature_selection import SelectFromModel
from sklearn.ensemble import RandomForestClassifier

In [150]:
df = pd.read_csv("biodegradable_a.csv")
df.shape

(4564, 42)

In [99]:
X = df.drop(columns="Biodegradable")
y = df.Biodegradable

In [100]:
#new biodegradable (new_x):  1 if RB else -1
y = y.map(lambda x: 1 if x=='RB' else -1)

In [101]:
#Criação do train + test e validation set
X_Train, X_Test, y_Train, y_Test = train_test_split(X, y, test_size=0.25, random_state=22)

In [102]:
X_Train.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 3423 entries, 4057 to 2933
Data columns (total 41 columns):
 #   Column    Non-Null Count  Dtype  
---  ------    --------------  -----  
 0   SpMax_L   3423 non-null   float64
 1   J_Dz(e)   3423 non-null   float64
 2   nHM       3423 non-null   float64
 3   F01       3034 non-null   float64
 4   F04       3423 non-null   float64
 5   NssssC    3423 non-null   float64
 6   nCb       3423 non-null   float64
 7   C         2850 non-null   float64
 8   nCp       2926 non-null   float64
 9   nO        3423 non-null   float64
 10  F03       3423 non-null   float64
 11  SdssC     3423 non-null   float64
 12  HyWi_B    3076 non-null   float64
 13  LOC       3423 non-null   float64
 14  SM6_L     3423 non-null   float64
 15  F03_CO    3396 non-null   float64
 16  Me        3091 non-null   float64
 17  Mi        3423 non-null   float64
 18  nN_N      3423 non-null   float64
 19  nArNO2    3423 non-null   float64
 20  nCRX3     3423 non-null   f

In [103]:
print(
    f"Maximum missing attributes on the rows: {X_Train.isna().sum(axis=1).max()}"
)

Maximum missing attributes on the rows: 6


In [104]:
missing_cols = X_Train.isna().sum()
missing_cols[missing_cols>0]

F01         389
C           573
nCp         497
HyWi_B      347
F03_CO       27
Me          332
nCIR        373
SpMax_A     494
SdO         166
nCrt        177
SpMax_B    1003
Psi_i_A     317
nX          513
dtype: int64

Number of null values is significant on many columns ( > 25% ) <br>
Droping features is not an option for dealing with missing data, because we do not have the knowledge yet if they have relation with the class we want to predict<br>

However, per sample, 6 out of 40 attributes doesn't seem very significant.
This before the feature selection.

## Classification Models

- [ ] Logit
- [ ] LDA
- [ ] SVM
- [ ] Naive Bayes
- [ ] DecisionTree
- [ ] KNN
- [ ] Bagging
- [ ] Boosting 

# Testing Imputation Models

## Test with MinMaxScaler

In [105]:
X_train_not_nan = X_Train.dropna()

In [106]:
X_train_not_nan.shape

(698, 41)

In [107]:
X_Train.shape

(3423, 41)

The difference in the number of rows, from the variable *X_train_not_nan* and the variable *X_train* indicates that a huge number of instances are missing at least one of the features, hence droping rows is not a viable option

In [108]:
priors = X_Train.isna().sum()/X_Train.shape[0]
priors.shape

(41,)

In [109]:
def get_mask(X,priors):
    masks = np.empty(shape = X.shape, dtype=np.bool_)
    for i, p in enumerate(priors):
        masks[:, i] = np.random.choice((True,False), size=masks.shape[0], p=(p,1-p))
    return masks

In [110]:
scaler = MinMaxScaler().fit(X_train_not_nan)
X_train_not_nan_scaled = pd.DataFrame(data = scaler.transform(X_train_not_nan),
                                      columns=X_train_not_nan.columns)

In [111]:
N = 500
masks = [get_mask(X_train_not_nan, priors) for _ in range(N)]

In [112]:
imputers = (
        SimpleImputer(),
        SimpleImputer(strategy="median"),
        KNNImputer()
)
labels = ["SimpleImpute_mean", "SimpleImpute_median", "KNN"]

results = pd.DataFrame(index=X_train_not_nan.columns)
for label, model in zip(labels,imputers):
    errors=pd.DataFrame(columns = X_train_not_nan.columns)
    for _ in range(N):
        X_masked = X_train_not_nan_scaled.mask(masks[_])
        
        model = model.fit(X_masked)
        X_imputed = model.transform(X_masked)

        errors.loc[_] = dict(zip(X_train_not_nan_scaled.columns, 
                                 mean_squared_error(X_train_not_nan_scaled, 
                                                    X_imputed, 
                                                    squared=False, 
                                                    multioutput="raw_values")
                                ))
    results[label] = errors.mean()

In [113]:
results[results>0].dropna()

Unnamed: 0,SimpleImpute_mean,SimpleImpute_median,KNN
F01,0.010483,0.009649,0.010376
C,0.071318,0.072185,0.029181
nCp,0.045277,0.045892,0.033346
HyWi_B,0.0416,0.041644,0.016341
F03_CO,0.015224,0.015884,0.007531
Me,0.032738,0.032997,0.01772
nCIR,0.024946,0.02884,0.015923
SpMax_A,0.044961,0.044988,0.018921
SdO,0.041475,0.041719,0.01739
nCrt,0.008369,0.00746,0.007198


In [114]:
results[results>0].dropna().sum()

SimpleImpute_mean      0.444530
SimpleImpute_median    0.449993
KNN                    0.233503
dtype: float64

The *KNNImputer* is the one that better predicts the missing values, according to this test, since it is the one that gets closer results for every feature with missing values, which results having the least summed error. <br>
Not many different parameters were used for it, so it can probably achieve even better results.

## Test with StandardScaler

In [115]:
pd.concat( (X_Train, y_Train), axis=1)

Unnamed: 0,SpMax_L,J_Dz(e),nHM,F01,F04,NssssC,nCb,C,nCp,nO,...,C_026,F02_CN,nHDon,SpMax_B,Psi_i_A,nN,SM6_B,nArCOOR,nX,Biodegradable
4057,3.776854,2.408741,0.0,0.0,0.0,0.0,0.0,30.000000,2.0,0.0,...,0.0,0.0,0.0,3.177099,2.479789,0.0,6.967228,0.0,0.0,1
4322,4.207577,3.405557,0.0,0.0,0.0,0.0,0.0,25.000000,1.0,2.0,...,0.0,0.0,1.0,3.552206,4.118984,0.0,7.700636,0.0,,1
194,4.650000,4.031300,0.0,0.0,1.0,0.0,0.0,31.300000,3.0,2.0,...,0.0,3.0,0.0,3.527000,2.372000,1.0,8.131000,0.0,0.0,1
2202,4.500517,3.039395,0.0,0.0,0.0,0.0,2.0,,0.0,0.0,...,0.0,0.0,1.0,,2.511390,1.0,8.096866,0.0,0.0,1
4351,4.344574,3.645214,0.0,0.0,0.0,0.0,0.0,31.457361,2.0,3.0,...,0.0,0.0,1.0,3.423525,3.051219,0.0,7.743159,0.0,,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
989,4.868000,3.025200,0.0,0.0,1.0,0.0,0.0,34.800000,3.0,1.0,...,0.0,5.0,1.0,,,2.0,8.261000,0.0,,-1
2527,3.974877,2.917110,0.0,0.0,0.0,0.0,0.0,,2.0,0.0,...,0.0,0.0,0.0,3.233164,1.800985,0.0,8.136023,0.0,0.0,1
2952,4.292865,3.156162,0.0,0.0,0.0,0.0,0.0,33.934010,2.0,2.0,...,0.0,0.0,0.0,3.344344,2.044372,0.0,8.317020,0.0,,1
356,4.596000,3.416100,2.0,0.0,0.0,0.0,0.0,45.500000,0.0,0.0,...,0.0,2.0,0.0,3.992000,2.569000,1.0,8.812000,0.0,2.0,-1


In [116]:
#Normalização por Standard Scaler
scaler = StandardScaler()
X_Train_scaled=scaler.fit_transform(X_train_not_nan)
X_train_stdScaler=pd.DataFrame(
    data = X_Train_scaled,
    columns=X_Train.columns
)
X_train_stdScaler

Unnamed: 0,SpMax_L,J_Dz(e),nHM,F01,F04,NssssC,nCb,C,nCp,nO,...,nCrt,C_026,F02_CN,nHDon,SpMax_B,Psi_i_A,nN,SM6_B,nArCOOR,nX
0,0.230733,1.389010,-0.232094,-0.071715,1.025951,-0.128021,-0.583471,-0.400740,1.488723,-0.034949,...,-0.096025,-0.335282,2.457302,-0.678067,-0.100319,-0.438917,1.213572,0.024492,-0.225584,-0.139311
1,1.261726,0.682137,-0.232094,-0.071715,-0.255111,1.167032,-0.583471,-1.006461,2.322026,-0.034949,...,-0.096025,-0.335282,-0.336178,1.036755,-0.489638,0.353644,-0.399901,-0.902555,-0.225584,-0.139311
2,0.163050,-0.299268,-0.232094,-0.071715,-0.255111,-0.128021,0.169325,0.895370,-0.177883,0.642670,...,-0.096025,-0.335282,-0.336178,0.179344,0.031227,-0.235835,-0.399901,0.100488,-0.225584,-0.139311
3,0.023820,0.060363,-0.232094,-0.071715,-0.255111,-0.128021,0.169325,1.419572,-1.011186,-0.034949,...,-0.096025,-0.335282,-0.336178,-0.678067,0.114656,0.382507,-0.399901,-0.009828,-0.225584,-0.139311
4,-1.193804,-0.517543,-0.232094,-0.071715,-0.255111,-0.128021,-0.583471,-0.421137,0.655420,-1.390186,...,-0.096025,-0.335282,-0.336178,-0.678067,-0.975716,-1.355502,-0.399901,-0.703168,-0.225584,-0.139311
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
693,-0.128030,0.289936,-0.232094,-0.071715,-0.255111,-0.128021,-0.583471,-0.273512,-0.177883,-0.034949,...,-0.096025,-0.335282,-0.336178,-0.678067,-0.312834,-0.692220,-0.399901,0.410503,-0.225584,-0.139311
694,2.407050,-1.678164,-0.232094,-0.071715,-0.255111,2.462085,-0.583471,0.083838,0.655420,-0.034949,...,5.059796,-0.335282,-0.336178,1.036755,0.101377,-0.878552,-0.399901,0.072831,-0.225584,-0.139311
695,1.540535,1.441187,2.241214,-0.071715,-0.255111,1.167032,-0.583471,-1.006461,2.322026,-0.034949,...,-0.096025,-0.335282,-0.336178,1.036755,5.132820,0.136108,-0.399901,4.457771,-0.225584,1.046530
696,0.122419,0.463956,2.241214,-0.071715,-0.255111,-0.128021,-0.583471,1.319510,-1.011186,-1.390186,...,-0.096025,-0.335282,1.526142,-0.678067,0.626723,-0.139235,1.213572,0.756031,-0.225584,1.046530


In [117]:
imputers = (
        SimpleImputer(),
        SimpleImputer(strategy="median"),
        KNNImputer()
)
labels = ["SimpleImpute_mean", "SimpleImpute_median", "KNN"]

results = pd.DataFrame(index=X_train_not_nan.columns)
for label, model in zip(labels,imputers):
    errors=pd.DataFrame(columns = X_train_not_nan.columns)
    for _ in range(N):
        X_masked = X_train_stdScaler.mask(masks[_])
        
        model = model.fit(X_masked)
        X_imputed = model.transform(X_masked)

        errors.loc[_] = dict(zip(X_train_stdScaler.columns, 
                                 mean_squared_error(X_train_stdScaler, 
                                                    X_imputed, 
                                                    squared=False, 
                                                    multioutput="raw_values")
                                ))
    results[label] = errors.mean()

In [118]:
results[results>0].dropna()

Unnamed: 0,SimpleImpute_mean,SimpleImpute_median,KNN
F01,0.209905,0.193205,0.189561
C,0.409528,0.414505,0.154902
nCp,0.377293,0.382418,0.271015
HyWi_B,0.318167,0.318503,0.119724
F03_CO,0.080901,0.084408,0.042085
Me,0.304623,0.307028,0.169343
nCIR,0.302958,0.350247,0.19274
SpMax_A,0.37983,0.380056,0.157847
SdO,0.216545,0.217817,0.093795
nCrt,0.129451,0.115391,0.10571


In [119]:
results[results>0].dropna().sum()

SimpleImpute_mean      3.879530
SimpleImpute_median    3.919937
KNN                    2.151959
dtype: float64

Similarmente ao caso do MinMaxScaler, o KNN imputer obtém o menor erro nas previsões dos valores em falta.

# Test with Power Transformer

In [126]:
#FAZER FUNÇAO PARA NAO REPETIR O CODIGO DEPOIS

#Normalização por Power Transform

X_Train_powerTscaled=PowerTransformer().fit_transform(X_train_not_nan)
X_train_powerTscaler=pd.DataFrame(
    data = X_Train_powerTscaled,
    columns=X_Train.columns
)
#X_train_powerTscaler

In [127]:
results = pd.DataFrame(index=X_train_not_nan.columns)
for label, model in zip(labels,imputers):
    errors=pd.DataFrame(columns = X_train_not_nan.columns)
    for _ in range(N):
        X_masked = X_train_powerTScaler.mask(masks[_])
        
        model = model.fit(X_masked)
        X_imputed = model.transform(X_masked)

        errors.loc[_] = dict(zip(X_train_powerTscaler.columns, 
                                 mean_squared_error(X_train_powerTscaler, 
                                                    X_imputed, 
                                                    squared=False, 
                                                    multioutput="raw_values")
                                ))
    results[label] = errors.mean()

In [122]:
results[results>0].dropna().sum()

SimpleImpute_mean      4.026007
SimpleImpute_median    4.141768
KNN                    1.914111
dtype: float64

As we can see, KNN has the minimum error in every scalling method, so we will only use KNN-Imputer.

# Scale and impute the rest of the data

Perform imputation of missing values before scaling, as scaling could lead to distorted data if the missing values are not first replaced. This is because some calculations may include the missing values and their presence could lead to skewed results.

## Impute data with knn

In [145]:
# Create an instance of the KNNImputer
imputer = KNNImputer()

X_Train_imputed = imputer.fit_transform(X_Train)

X_Train_imputed

array([[3.77685373, 2.40874116, 0.        , ..., 6.96722794, 0.        ,
        0.        ],
       [4.20757702, 3.40555706, 0.        , ..., 7.70063609, 0.        ,
        0.        ],
       [4.65      , 4.0313    , 0.        , ..., 8.131     , 0.        ,
        0.        ],
       ...,
       [4.29286534, 3.15616198, 0.        , ..., 8.31702029, 0.        ,
        0.        ],
       [4.596     , 3.4161    , 2.        , ..., 8.812     , 0.        ,
        2.        ],
       [5.0138996 , 2.96835616, 0.        , ..., 8.8007641 , 2.        ,
        0.        ]])

## Scale with PowerTransform

In [152]:
pt = PowerTransformer()
X_Train_imputed_powerT = pt.fit_transform(X_Train_imputed)

# Using RFs for feature selection

Fitting a tree find the best features.

In [172]:
from sklearn.ensemble import RandomForestRegressor

N,M=X_Train_imputed.shape

rfr=RandomForestRegressor(random_state=0)
#threshold is minus infinity
sel = SelectFromModel(estimator=rfr, threshold=-np.inf, max_features=5)

sel.fit(X_Train_imputed_powerT, y_Train)

print("Importances: ", sel.estimator_.feature_importances_)

print("Default threshold: ", sel.threshold_)

features=sel.get_support()
Features_selected =np.arange(M)[features]
print("The features selected are columns: ", Features_selected)



Importances:  [8.23988104e-03 2.12618470e-02 2.48864917e-01 3.12028883e-03
 5.47606973e-02 1.07662200e-01 4.33974418e-02 1.42828950e-02
 9.50600090e-03 5.02455626e-03 8.80909238e-02 2.29096233e-02
 1.13669699e-02 1.86241705e-02 1.07720765e-02 1.46932586e-02
 1.89888605e-02 1.64687295e-02 7.60795629e-04 6.51909379e-03
 9.24109738e-05 4.48890665e-02 4.38464263e-03 1.41660015e-04
 8.58188805e-04 1.51889169e-04 1.08706637e-02 1.11504208e-02
 1.94641808e-04 1.38882741e-02 1.38802928e-02 3.15696164e-02
 7.46717594e-03 4.62407919e-02 3.99746537e-03 2.55049900e-02
 2.38750647e-02 5.22950970e-03 8.17522961e-03 1.06066515e-03
 2.10621132e-02]
Default threshold:  -inf
The features selected are columns:  [ 2  4  5 10 33]


### Another way to do the same. choose later

In [173]:
rfr = RandomForestRegressor()
rfr.fit(X_Train_imputed_powerT, y_Train)


# create a dictionary to store your feature importance scores
feature_imp = dict(zip(X_Train.columns, rfr.feature_importances_))

# specify the number of variables you want
num_vars = 5 # insert number of desired variables

# create an empty list to store
selected_features = []

# loop through each variable, sorted by their importance scores
for variable, score in sorted(feature_imp.items(), key=lambda x: x[1], reverse=True):
  # add the variable if below the specified number of variables
  if len(selected_features) < num_vars:
    selected_features.append(variable)

selected_features


['nHM', 'NssssC', 'F03', 'F02_CN', 'F04']