In [None]:
import os

import joblib
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
from sklearn.ensemble import RandomForestClassifier, RandomForestRegressor
from sklearn.feature_selection import SelectFromModel
from sklearn.impute import KNNImputer, SimpleImputer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import (
    ConfusionMatrixDisplay,
    classification_report,
    confusion_matrix,
    make_scorer,
    matthews_corrcoef,
    max_error,
    mean_squared_error,
)
from sklearn.model_selection import GridSearchCV, KFold, train_test_split
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import MinMaxScaler, PowerTransformer, StandardScaler

In [None]:
df = pd.read_csv("biodegradable_a.csv")
df.shape

In [None]:
X = df.drop(columns="Biodegradable")
y = df.Biodegradable

In [None]:
# Encode the target: 1 if the label is 'RB', -1 otherwise.
# The encoding is built from the raw `df` column rather than from the current
# `y`, so re-running this cell is idempotent (re-mapping an already encoded `y`
# would turn every label into -1).
y = df["Biodegradable"].map(lambda label: 1 if label == 'RB' else -1)

In [None]:
# Create the train and test sets: 25% of the samples are held out for testing,
# the rest is used for training (validation is done by cross-validation on it).
X_Train, X_Test, y_Train, y_Test = train_test_split(
    X,
    y,
    test_size=0.25,
    random_state=22,
)

In [None]:
y_Train.value_counts()

In [None]:
X_Train.info()

In [None]:
print(
    f"Maximum missing attributes on the rows: {X_Train.isna().sum(axis=1).max()}"
)

In [None]:
missing_cols = X_Train.isna().sum()
missing_cols[missing_cols>0]

The number of null values is significant in many columns ( > 25% ). <br>
Dropping features is not an option for dealing with the missing data, because we do not yet know whether they are related to the class we want to predict.<br>

However, per sample, 6 missing attributes out of 40 doesn't seem very significant.
This is before the feature selection.

## Classification Models

- ~[ ] Logit~
- ~[ ] LDA~
- [x] SVM
- ~[ ] Naive Bayes~
- ~[ ] DecisionTree~
- ~[ ] KNN~
- [x] Bagging:
    - RandomForest
    - KNN
- ~[ ] Boosting~

# Testing Imputation Models

## Test with MinMaxScaler

In [None]:
X_train_not_nan = X_Train.dropna()

In [None]:
X_train_not_nan.shape

In [None]:
X_Train.shape

The difference in the number of rows between the variables *X_train_not_nan* and *X_Train* indicates that a huge number of instances are missing at least one of the features, hence dropping rows is not a viable option.

In [None]:
# Per-column fraction of missing values in the training set; used below as the
# probability of hiding a value when simulating missing data.
priors = X_Train.isna().mean()
priors.shape

In [None]:
def get_mask(X, priors, rng=None):
    """Draw a random boolean mask with the same shape as ``X``.

    Entry ``[r, c]`` of the result is ``True`` with probability ``priors[c]``,
    independently of every other entry.

    Parameters
    ----------
    X : 2-D array-like exposing ``.shape``
        Only its shape ``(n_rows, n_cols)`` is used.
    priors : sequence of float, length ``n_cols``
        Probability of ``True`` for each column; every value must lie in [0, 1].
    rng : None, int or numpy.random.Generator, optional
        Source of randomness. ``None`` (default) draws from NumPy's global
        ``np.random`` state, so ``np.random.seed`` still controls the result;
        anything else is passed to ``np.random.default_rng``.

    Returns
    -------
    numpy.ndarray of bool, shape ``(n_rows, n_cols)``

    Raises
    ------
    ValueError
        If ``priors`` does not hold exactly one valid probability per column.
    """
    n_rows, n_cols = X.shape
    probabilities = np.asarray(priors, dtype=float)
    if probabilities.shape != (n_cols,):
        raise ValueError(
            f"priors must have one entry per column of X "
            f"(expected {n_cols}, got shape {probabilities.shape})"
        )
    # Written as a positive check so that NaN probabilities are rejected too.
    if not np.all((probabilities >= 0) & (probabilities <= 1)):
        raise ValueError("priors must be probabilities in the range [0, 1]")

    if rng is None:
        uniform = np.random.random_sample((n_rows, n_cols))
    else:
        uniform = np.random.default_rng(rng).random((n_rows, n_cols))
    # uniform lies in [0, 1), so `uniform < p` is True with probability p;
    # the 1-D `probabilities` broadcasts across the rows.
    return uniform < probabilities

In [None]:
# Min-max scale the complete (NaN-free) training rows. The scaled values are
# wrapped in a DataFrame that keeps the original column names.
scaler = MinMaxScaler()
scaler.fit(X_train_not_nan)
X_train_not_nan_scaled = pd.DataFrame(
    scaler.transform(X_train_not_nan),
    columns=X_train_not_nan.columns,
)

In [None]:
# Number of independent random "missing value" masks used by every imputation
# benchmark below.
N = 50
# Seed NumPy's global generator so that the masks, and therefore the imputer
# ranking derived from them, are reproducible across kernel restarts.
np.random.seed(22)
masks = [get_mask(X_train_not_nan, priors) for _ in range(N)]

In [None]:
# Benchmark the candidate imputers on the min-max scaled complete rows: hide
# values with each random mask, impute them, and record the per-column RMSE
# against the known original values.
imputers = (
    SimpleImputer(),
    SimpleImputer(strategy="median"),
    KNNImputer(),
)
labels = ["SimpleImpute_mean", "SimpleImpute_median", "KNN"]

results = pd.DataFrame(index=X_train_not_nan.columns)
for label, model in zip(labels, imputers):
    # One array of per-column RMSE values per mask; collected in a list and
    # turned into a frame once instead of growing a DataFrame row by row.
    rmse_rows = []
    for mask in masks:
        X_masked = X_train_not_nan_scaled.mask(mask)
        X_imputed = model.fit(X_masked).transform(X_masked)
        # np.sqrt(MSE) replaces the `squared=False` flag, which recent
        # scikit-learn releases deprecated and then removed.
        rmse_rows.append(
            np.sqrt(
                mean_squared_error(
                    X_train_not_nan_scaled,
                    X_imputed,
                    multioutput="raw_values",
                )
            )
        )
    errors = pd.DataFrame(rmse_rows, columns=X_train_not_nan_scaled.columns)
    # Average over all masks: one score per column for this imputer.
    results[label] = errors.mean()

In [None]:
results[results>0].dropna()

In [None]:
results[results>0].dropna().sum()

According to this test, the *KNNImputer* is the one that best predicts the missing values: it gets the closest results for every feature with missing values, which results in the lowest summed error. <br>
Not many different parameters were tried for it, so it can probably achieve even better results.

In [None]:
# Compare KNN imputers with 3 to 10 neighbours on the min-max scaled complete
# rows, using the same masks and per-column RMSE as the previous benchmark.
neighbor_counts = range(3, 11)
imputers = (KNNImputer(n_neighbors=k) for k in neighbor_counts)
labels = [f"KNN_{k}_neighbors" for k in neighbor_counts]

results = pd.DataFrame(index=X_train_not_nan.columns)
for label, model in zip(labels, imputers):
    # One array of per-column RMSE values per mask; built into a frame once
    # instead of growing a DataFrame row by row.
    rmse_rows = []
    for mask in masks:
        X_masked = X_train_not_nan_scaled.mask(mask)
        X_imputed = model.fit(X_masked).transform(X_masked)
        # np.sqrt(MSE) replaces the `squared=False` flag, which recent
        # scikit-learn releases deprecated and then removed.
        rmse_rows.append(
            np.sqrt(
                mean_squared_error(
                    X_train_not_nan_scaled,
                    X_imputed,
                    multioutput="raw_values",
                )
            )
        )
    errors = pd.DataFrame(rmse_rows, columns=X_train_not_nan_scaled.columns)
    # Average over all masks: one score per column for this neighbour count.
    results[label] = errors.mean()

In [None]:
results[results>0].dropna()

In [None]:
results[results>0].dropna().sum()

In [None]:
results[results>0].dropna().sum().idxmin()

## Test with StandardScaler

In [None]:
pd.concat( (X_Train, y_Train), axis=1)

In [None]:
# Normalisation with StandardScaler, fitted on the complete (NaN-free)
# training rows only.
scaler = StandardScaler()
scaler.fit(X_train_not_nan)
X_Train_scaled = scaler.transform(X_train_not_nan)

# Wrap the scaled array in a DataFrame carrying the training column names.
X_train_stdScaler = pd.DataFrame(X_Train_scaled, columns=X_Train.columns)
X_train_stdScaler

In [None]:
# Benchmark the candidate imputers on the standard-scaled complete rows: hide
# values with each random mask, impute them, and record the per-column RMSE
# against the known original values.
imputers = (
    SimpleImputer(),
    SimpleImputer(strategy="median"),
    KNNImputer(),
)
labels = ["SimpleImpute_mean", "SimpleImpute_median", "KNN"]

results = pd.DataFrame(index=X_train_not_nan.columns)
for label, model in zip(labels, imputers):
    # One array of per-column RMSE values per mask; built into a frame once
    # instead of growing a DataFrame row by row.
    rmse_rows = []
    for mask in masks:
        X_masked = X_train_stdScaler.mask(mask)
        X_imputed = model.fit(X_masked).transform(X_masked)
        # np.sqrt(MSE) replaces the `squared=False` flag, which recent
        # scikit-learn releases deprecated and then removed.
        rmse_rows.append(
            np.sqrt(
                mean_squared_error(
                    X_train_stdScaler,
                    X_imputed,
                    multioutput="raw_values",
                )
            )
        )
    errors = pd.DataFrame(rmse_rows, columns=X_train_stdScaler.columns)
    # Average over all masks: one score per column for this imputer.
    results[label] = errors.mean()

In [None]:
results[results>0].dropna()

In [None]:
results[results>0].dropna().sum()

Similarly to the MinMaxScaler, the KNN imputer is the model that has the least error in its predictions.

In [None]:
# Compare KNN imputers with 3 to 10 neighbours on the standard-scaled complete
# rows, using the same masks and per-column RMSE as the previous benchmark.
neighbor_counts = range(3, 11)
imputers = (KNNImputer(n_neighbors=k) for k in neighbor_counts)
labels = [f"KNN_{k}_neighbors" for k in neighbor_counts]

results = pd.DataFrame(index=X_train_not_nan.columns)
for label, model in zip(labels, imputers):
    # One array of per-column RMSE values per mask; built into a frame once
    # instead of growing a DataFrame row by row.
    rmse_rows = []
    for mask in masks:
        X_masked = X_train_stdScaler.mask(mask)
        X_imputed = model.fit(X_masked).transform(X_masked)
        # np.sqrt(MSE) replaces the `squared=False` flag, which recent
        # scikit-learn releases deprecated and then removed.
        rmse_rows.append(
            np.sqrt(
                mean_squared_error(
                    X_train_stdScaler,
                    X_imputed,
                    multioutput="raw_values",
                )
            )
        )
    errors = pd.DataFrame(rmse_rows, columns=X_train_stdScaler.columns)
    # Average over all masks: one score per column for this neighbour count.
    results[label] = errors.mean()

In [None]:
results[results>0].dropna()

In [None]:
results[results>0].dropna().sum()

In [None]:
results[results>0].dropna().sum().idxmin()

## Test with Power Transformer

In [None]:
# TODO: write a function so this code does not have to be repeated later.

# Normalisation with PowerTransformer, fitted on the complete (NaN-free)
# training rows only.
X_Train_powerTscaled = PowerTransformer().fit_transform(X_train_not_nan)

# Wrap the transformed array in a DataFrame carrying the training column names.
X_train_powerTscaler = pd.DataFrame(X_Train_powerTscaled, columns=X_Train.columns)

In [None]:
# Benchmark the candidate imputers on the power-transformed complete rows: hide
# values with each random mask, impute them, and record the per-column RMSE
# against the known original values.
imputers = (
    SimpleImputer(),
    SimpleImputer(strategy="median"),
    KNNImputer(),
)
labels = ["SimpleImpute_mean", "SimpleImpute_median", "KNN"]

results = pd.DataFrame(index=X_train_not_nan.columns)
for label, model in zip(labels, imputers):
    # One array of per-column RMSE values per mask; built into a frame once
    # instead of growing a DataFrame row by row.
    rmse_rows = []
    for mask in masks:
        X_masked = X_train_powerTscaler.mask(mask)
        X_imputed = model.fit(X_masked).transform(X_masked)
        # np.sqrt(MSE) replaces the `squared=False` flag, which recent
        # scikit-learn releases deprecated and then removed.
        rmse_rows.append(
            np.sqrt(
                mean_squared_error(
                    X_train_powerTscaler,
                    X_imputed,
                    multioutput="raw_values",
                )
            )
        )
    errors = pd.DataFrame(rmse_rows, columns=X_train_powerTscaler.columns)
    # Average over all masks: one score per column for this imputer.
    results[label] = errors.mean()

In [None]:
results[results>0].dropna().sum()

As we can see, KNN has the minimum error with every scaling method, so we will only use the KNN imputer.

In [None]:
# Compare KNN imputers with 3 to 10 neighbours on the power-transformed
# complete rows, using the same masks and per-column RMSE as the previous
# benchmark.
neighbor_counts = range(3, 11)
imputers = (KNNImputer(n_neighbors=k) for k in neighbor_counts)
labels = [f"KNN_{k}_neighbors" for k in neighbor_counts]

results = pd.DataFrame(index=X_train_not_nan.columns)
for label, model in zip(labels, imputers):
    # One array of per-column RMSE values per mask; built into a frame once
    # instead of growing a DataFrame row by row.
    rmse_rows = []
    for mask in masks:
        X_masked = X_train_powerTscaler.mask(mask)
        X_imputed = model.fit(X_masked).transform(X_masked)
        # np.sqrt(MSE) replaces the `squared=False` flag, which recent
        # scikit-learn releases deprecated and then removed.
        rmse_rows.append(
            np.sqrt(
                mean_squared_error(
                    X_train_powerTscaler,
                    X_imputed,
                    multioutput="raw_values",
                )
            )
        )
    errors = pd.DataFrame(rmse_rows, columns=X_train_powerTscaler.columns)
    # Average over all masks: one score per column for this neighbour count.
    results[label] = errors.mean()

In [None]:
results[results>0].dropna().sum()

In [None]:
results[results>0].dropna().sum().idxmin()

## Imputer Results

For all the cases tested, the 3-NN imputer was the one that obtained the best results.

# Scale and impute the rest of the data

Perform imputation of missing values before scaling, as scaling could lead to distorted data if the missing values are not first replaced. This is because some calculations may include the missing values and their presence could lead to skewed results.

## Impute data with knn

In [None]:
# Create the KNN imputer with 3 neighbours: the setting reported as best in
# "Imputer Results" and the one used inside the grid-search pipeline.
imputer = KNNImputer(n_neighbors=3)

# Fit on the training set and fill in its missing values.
X_Train_imputed = imputer.fit_transform(X_Train)

X_Train_imputed

## Scale with PowerTransform

In [None]:
pt = PowerTransformer()
X_Train_imputed_powerT = pt.fit_transform(X_Train_imputed)

# Feature Selection using RandomForest

Fit a random forest to find the best features.

In [None]:
def get_sorted_labels(col_names: list[str], pipeline: "Pipeline") -> list[tuple[str, float]]:
    """Rank feature names by the importance scores of the pipeline's selector.

    Parameters
    ----------
    col_names : sequence of str
        Feature names, paired position by position with the importance scores.
    pipeline : Pipeline
        Fitted pipeline with a ``"selector"`` step whose fitted ``estimator_``
        exposes ``feature_importances_``.

    Returns
    -------
    list of (str, float)
        ``(column, score)`` pairs sorted by decreasing score.

    Raises
    ------
    ValueError
        If the number of names differs from the number of importance scores
        (``zip`` would otherwise silently drop the surplus entries).
    """
    importances = pipeline["selector"].estimator_.feature_importances_
    if len(col_names) != len(importances):
        raise ValueError(
            f"got {len(col_names)} column names for {len(importances)} feature importances"
        )

    # Pair each name with its score and sort on the score (index 1), highest first.
    return sorted(zip(col_names, importances), key=lambda pair: pair[1], reverse=True)

In [None]:
def conf_mat(y_test, pred):
    """Plot the confusion matrix of the predictions ``pred`` against ``y_test``."""
    matrix = confusion_matrix(y_test, pred)
    matrix_display = ConfusionMatrixDisplay(matrix)
    matrix_display.plot()

In [None]:
(X_Train.std()==0).any()

There are no features with constant values, so there is no need to drop any column

In [None]:
# Per-feature summary: standard deviation and absolute correlation with the
# target, ordered by decreasing correlation.
target_corr = (
    pd.concat((X_Train, y_Train), axis=1)
    .corr()["Biodegradable"]
    # The target's correlation with itself (always 1) is not a feature.
    .drop("Biodegradable")
    .abs()
    .sort_values(ascending=False)
)
temp_df = pd.DataFrame(
    {
        # Plain standard deviation, matching the column label (the earlier
        # `std()**2` stored the variance under this name).
        "Std dev": X_Train.std().reindex(target_corr.index),
        "Corr with Biodegradable": target_corr,
    }
)

temp_df

> TODO: DISCUSS THE IMPORTANCE OF THE FEATURE SCORES

## Models with MinMaxScaler

## Random Forest Classifier

### First test, testing with higher jumps to check 

In [None]:
# Grid search over a MinMaxScaler -> 3-NN imputer -> random-forest feature
# selector -> random-forest classifier pipeline. The fitted search object is
# cached on disk under FILENAME, so the fit only runs when no cache file exists.
FILENAME = "MinMax-RF_v3"
if os.path.exists(FILENAME):
    # Cache file exists: load the fitted search instead of refitting.
    # NOTE(review): joblib.load unpickles arbitrary objects -- only load cache
    # files that were produced by this notebook.
    search_results = joblib.load(FILENAME)
else:
    pipeline = [
                ("scaler", MinMaxScaler()),
                ("imputer", KNNImputer(n_neighbors=3)),
                # threshold=-np.inf: the number of kept features is driven by
                # max_features, which is tuned in the grid below.
                ("selector", SelectFromModel(
                    estimator = RandomForestClassifier(
                        random_state=0
                    ),
                    threshold=-np.inf,
                )),
                # NOTE(review): this classifier has no random_state, so repeated
                # fits are presumably not bit-for-bit reproducible -- confirm
                # whether that is intended.
                ("classifier", RandomForestClassifier(
                        min_samples_leaf= 1,
                ))
               ]

    model = Pipeline(pipeline)

    grid = {
        # Keys follow the pattern {pipeline_step_name}__{parameter_name} : [ ... ]
        "selector__max_features": range(22, 37+1, 1),
        "classifier__criterion": ["gini", "entropy"],
        "classifier__max_depth" : range(25, 35+1, 1),
        # Not searched at the moment: classifier__min_samples_leaf : range(1,10)
    } 

    # 9-fold cross-validated search scored with the Matthews correlation coefficient.
    search_results = GridSearchCV(estimator = model,
                                  param_grid=grid,
                                  cv = 9,
                                  scoring=make_scorer(matthews_corrcoef),
                                  n_jobs=-1, # number of parallel processes; -1 --> use all
                                  verbose=10, # print progress information
                                  return_train_score=True
                                 )
    search_results.fit(X_Train, y_Train)
    # Save the fitted search so later runs can load it from FILENAME.
    joblib.dump(search_results, FILENAME)

In [None]:
search_results.best_params_

In [None]:
# Highest mean cross-validated score among all evaluated grid points.
print("Best Parameters Test Score:", max(search_results.cv_results_["mean_test_score"]))

In [None]:
#
# Search results
#
# For a single split criterion, extract the hyper-parameter values and the mean
# cross-validated score of every evaluated grid point, ready for plotting.
method = "gini"
cv_results = search_results.cv_results_

# Pick the grid points by inspecting their parameters instead of assuming which
# half of cv_results_ belongs to which criterion, so that parameters and scores
# are always paired correctly.
selected = [
    position
    for position, params in enumerate(cv_results["params"])
    if params["classifier__criterion"] == method
]
data = [cv_results["params"][position] for position in selected]

# list of max depth values
x_max_depth = [params["classifier__max_depth"] for params in data]

# min samples leaf list: when the parameter is not part of the searched grid,
# fall back to the value fixed on the pipeline's classifier step
fixed_min_samples_leaf = search_results.estimator.get_params()["classifier__min_samples_leaf"]
y_min_samples_leaf = [
    params.get("classifier__min_samples_leaf", fixed_min_samples_leaf)
    for params in data
]

# max features list
z_max_features = [params["selector__max_features"] for params in data]

# mean test scores of exactly the selected grid points
scores = np.asarray(cv_results["mean_test_score"])[selected]

In [None]:
get_sorted_labels(X_Train.columns, search_results.best_estimator_)

In [None]:
# Features whose importance score is above 0.02, most important first.
# (No trailing semicolon: it would suppress the only output of this cell.)
[
    (name, importance)
    for name, importance in get_sorted_labels(X_Train.columns, search_results.best_estimator_)
    if importance > 0.02
]

In [None]:
%matplotlib inline
im = plt.scatter(x_max_depth, z_max_features, c=scores)
plt.colorbar(im)

In [None]:
%matplotlib widget
#plt.figure(figsize=(12,12))

f, ax = plt.subplots(figsize=(8,8), subplot_kw={"projection":"3d"})
im = ax.scatter(x_max_depth, 
           y_min_samples_leaf, 
           z_max_features, 
           c = scores,
           cmap=plt.viridis(),
            #s=50
          )
ax.set_xlabel("Max Depth")
ax.set_ylabel("Min Samples Leaf")
ax.set_zlabel("Max Nº of Features")

cbar = plt.colorbar(im)
cbar.ax.set_ylabel('MCC Score', rotation=0)
plt.tight_layout()

## Using Standard Scaler

In [None]:

# Number of features, used to turn the boolean support mask into column indices.
# Only M is bound here: also binding N (the row count) would overwrite the
# global mask count N that the imputation benchmarks above rely on.
M = X_Train_imputed.shape[1]

rfr = RandomForestRegressor(random_state=0)
# threshold is minus infinity, so the selection is driven by max_features only
sel = SelectFromModel(estimator=rfr, threshold=-np.inf, max_features=5)

sel.fit(X_Train_imputed_powerT, y_Train)

print("Importances: ", sel.estimator_.feature_importances_)

print("Default threshold: ", sel.threshold_)

# Boolean support mask -> positional indices of the selected columns.
features = sel.get_support()
Features_selected = np.arange(M)[features]
print("The features selected are columns: ", Features_selected)



## Using Power Transformer

### Another way to do the same. choose later

In [None]:
# Seeded like the forest in the SelectFromModel cell, so the ranking below is
# reproducible.
rfr = RandomForestRegressor(random_state=0)
rfr.fit(X_Train_imputed_powerT, y_Train)

# dictionary mapping each feature name to its importance score
feature_imp = dict(zip(X_Train.columns, rfr.feature_importances_))

# number of variables to keep
num_vars = 5

# feature names sorted by decreasing importance; keep the first num_vars
ranked_features = sorted(feature_imp, key=feature_imp.get, reverse=True)
selected_features = ranked_features[:num_vars]

selected_features
