# Imports

In [1]:
# Standard Library Imports
import re

# Third-party Library Imports
import pandas as pd
from scipy.stats import ttest_rel as t_test, wilcoxon

# Local Imports
import MLE as mle

# Data

In [2]:
# Load the dataset; the path is relative to the notebook's working directory.
df = pd.read_csv('hemofilia.csv')

shape = df.shape
# Count ROWS that contain at least one missing value so the figure matches the
# "linhas nulas" (null rows) label printed below. The previous expression,
# df.isna().sum().sum(), counted individual missing cells instead of rows.
nan = df.isna().any(axis=1).sum()
# Rows that are exact duplicates of an earlier row.
dup = df.duplicated().sum()
cols = df.columns.tolist()

print(f'Linhas x Colunas: {shape}')
print(f'Qnt. linhas nulas: {nan}')
print(f'Qnt. linhas duplicadas: {dup}\n')
df.info()

Linhas x Colunas: (415, 19)
Qnt. linhas nulas: 0
Qnt. linhas duplicadas: 0

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 415 entries, 0 to 414
Data columns (total 19 columns):
 #   Column               Non-Null Count  Dtype  
---  ------               --------------  -----  
 0   AA_HGVS              415 non-null    int64  
 1   AA_Legacy            415 non-null    int64  
 2   aa1                  415 non-null    object 
 3   AA_dist              415 non-null    float64
 4   psi                  415 non-null    float64
 5   phi                  415 non-null    float64
 6   bfactor              415 non-null    float64
 7   areaSAS              415 non-null    float64
 8   areaSES              415 non-null    float64
 9   kdHydrophobicity     415 non-null    float64
 10  ConsurfDB            415 non-null    float64
 11  degree               415 non-null    float64
 12  betweenness          415 non-null    float64
 13  closeness            415 non-null    float64
 14  burts         

In [3]:
# Label vector: the 'Calculated_Severity' column of the loaded frame.
target = df.loc[:, 'Calculated_Severity']
target

0        Severe
1      Moderate
2        Severe
3      Moderate
4          Mild
         ...   
410    Moderate
411        Mild
412    Moderate
413      Severe
414        Mild
Name: Calculated_Severity, Length: 415, dtype: object

In [4]:
# Feature matrix: every column except the label column.
features = df.drop(columns=['Calculated_Severity'])
features

Unnamed: 0,AA_HGVS,AA_Legacy,aa1,AA_dist,psi,phi,bfactor,areaSAS,areaSES,kdHydrophobicity,ConsurfDB,degree,betweenness,closeness,burts,pr,auth,kcore
0,26,7,Leu,1.622514,150.866429,-107.867773,146.53,4.380023,20.015846,3.8,-793.0,8.0,0.010313,0.115715,0.225146,0.000859,0.074260,4.0
1,28,9,Ala,1.523528,18.239183,168.251651,139452.00,0.000000,0.000000,1.8,-1019.0,10.0,0.004677,0.107229,0.179909,0.001077,0.081676,4.0
2,29,10,Val,1.568551,116.506984,-35.703606,193867.00,10.603600,41.534812,4.2,-746.0,7.0,0.002008,0.101614,0.239194,0.000837,0.043540,4.0
3,31,12,Leu,1.641493,177.755055,-104.656175,154735.00,45.264135,75.880436,3.8,-134.0,7.0,0.002169,0.097581,0.263617,0.000879,0.034803,4.0
4,34,15,Asp,1.602138,-66.129605,-99.559307,195175.00,27.504607,44.515711,-3.5,-925.0,4.0,0.001263,0.098256,0.373760,0.000559,0.004828,4.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
410,2322,2303,Thr,1.486620,156.667100,-145.542531,137589.00,9.031416,14.374980,-0.7,-608.0,10.0,0.009058,0.096300,0.195930,0.001149,0.001092,4.0
411,2324,2305,Tyr,1.540734,136.859439,-146.304906,186622.00,39.680483,36.738038,-1.3,-243.0,9.0,0.027182,0.093559,0.188421,0.001049,0.000591,4.0
412,2327,2308,Ile,1.531856,138.332710,-148.494916,169267.00,0.000000,0.000000,4.5,-703.0,7.0,0.000740,0.080816,0.255136,0.000778,0.000115,4.0
413,2330,2311,Gln,1.445797,-63.539770,-138.920341,176917.00,52.479163,63.099512,-3.5,666.0,7.0,0.005663,0.077490,0.230786,0.000863,0.000043,4.0


# Get Top 3 Classifiers Without Tuning

In [5]:
# Build the project-local MLE helper from the feature matrix and the labels.
# NOTE(review): the constructor appears to print the prepared feature frame
# (including one-hot aa1_* columns) — confirm against MLE.py.
worker = mle.MLE(features, target)

     AA_HGVS  AA_Legacy   AA_dist         psi         phi    bfactor  \
0         26          7  1.622514  150.866429 -107.867773     146.53   
1         28          9  1.523528   18.239183  168.251651  139452.00   
2         29         10  1.568551  116.506984  -35.703606  193867.00   
3         31         12  1.641493  177.755055 -104.656175  154735.00   
4         34         15  1.602138  -66.129605  -99.559307  195175.00   
..       ...        ...       ...         ...         ...        ...   
410     2322       2303  1.486620  156.667100 -145.542531  137589.00   
411     2324       2305  1.540734  136.859439 -146.304906  186622.00   
412     2327       2308  1.531856  138.332710 -148.494916  169267.00   
413     2330       2311  1.445797  -63.539770 -138.920341  176917.00   
414     2334       2315  1.436106  -55.755947   63.847554  161833.00   

        areaSAS     areaSES  kdHydrophobicity  ConsurfDB  ...  aa1_Leu  \
0      4.380023   20.015846               3.8     -793.0  ...

In [6]:
# Echo the object to confirm the MLE instance exists (shows its default repr).
worker

<MLE.MLE at 0x7e2d3c736da0>

In [None]:
# Pair every classifier registered on the worker with every scaler and collect
# one result per pairing from estimator_validate_without_param_search.
# Classifiers form the outer loop, scalers the inner one.
list_all_default_results = [
    worker.estimator_validate_without_param_search(scaler, classifier)
    for classifier in worker.classifiers
    for scaler in worker.scalers
]

None Dummy {}
Standard Dummy {}
None GaussianNB {}
Standard GaussianNB {}
None KNeighbors {}
Standard KNeighbors {}
None DecisionTree {}
Standard DecisionTree {}
None RandomForest {}
Standard RandomForest {}
None AdaBoost {}
Standard AdaBoost {}
None Bagging {}
Standard Bagging {}
None GradientBoosting {}
Standard GradientBoosting {}


In [None]:
# Tabulate the default-parameter results and order them in descending order of
# the column(s) named by worker.scoring, renumbering the index from zero.
df_all_default_results = (
    pd.DataFrame(list_all_default_results)
    .sort_values(by=worker.scoring, ascending=False)
    .reset_index(drop=True)
)
df_all_default_results

# Adjust Hyperparameters

In [None]:
# Run estimator_validate for every classifier registered on the worker, always
# with the 'Standard' scaler, and keep one result per classifier.
list_all_search_results = [
    worker.estimator_validate('Standard', classifier)
    for classifier in worker.classifiers
]

In [None]:
# Tabulate the search results and order them in descending order of the
# column(s) named by worker.scoring, renumbering the index from zero.
df_all_search_results = (
    pd.DataFrame(list_all_search_results)
    .sort_values(by=worker.scoring, ascending=False)
    .reset_index(drop=True)
)
df_all_search_results