# Training models

## MLP

### Heart

In [7]:
import pandas as pd
import numpy as np
from datetime import timedelta, date
import datetime as dt


# diabetes = pd.read_csv('data/datasets_228_482_diabetes.csv', sep = ',')
heart = pd.read_csv('data/datasets_33180_43520_heart.csv', sep = ',')
# wine = pd.read_csv('data/datasets_794161_1363233_wine.csv', sep = ',')


#https://scikit-learn.org/stable/auto_examples/model_selection/plot_grid_search_digits.html#sphx-glr-auto-examples-model-selection-plot-grid-search-digits-py
import sys
sys.path.append('./ml_helper')
import ml_helper_train_model as model_train
import ml_helper_serialize_model as serializer

from sklearn.neural_network import MLPClassifier


X = heart.loc[:, heart.columns != 'target']
y = heart.target
    

tuned_parameters = [
{'solver':['sgd'], 'hidden_layer_sizes':[10, 25, 50, 100, 150],
 'learning_rate_init':[0.001], 'alpha':[0.0001, 0.001, 0.01, 0.1], 
 'momentum':[0.1, 0.3, 0.5, 0.7], 'activation':['identity', 'logistic', 'tanh', 'relu'],
 'verbose':[False], 'learning_rate':['constant', 'invscaling', 'adaptive']
},
{'solver':['lbfgs'], 'hidden_layer_sizes':[10, 25, 50, 100, 150],
 'learning_rate_init':[0.001], 'alpha':[0.0001, 0.001, 0.01, 0.1], 
 'activation':['identity', 'logistic', 'tanh', 'relu'],
 'verbose':[False], 'learning_rate':['constant', 'invscaling', 'adaptive']
},
{'solver':['adam'], 'hidden_layer_sizes':[10, 25, 50, 100, 150],
 'learning_rate_init':[0.001], 'alpha':[0.0001, 0.001, 0.01, 0.1], 
 'activation':['identity', 'logistic', 'tanh', 'relu'],
 'verbose':[False], 'learning_rate':['constant', 'invscaling', 'adaptive'],
 'epsilon':[1e-8, 1e-7, 1e-3, 1e-1]
}
                   ]


model = model_train.train_model(predictors = X,
                                target = y,
                                folds = 10,
                                param_to_be_tunned = tuned_parameters,
                                estimator = MLPClassifier())

serializer.serialize_model(model = model, file_name = 'serialized_model/MLP_heart.pkl')


# Tuning hyper-parameters for f1

Best parameters set found on development set:

{'activation': 'tanh', 'alpha': 0.1, 'hidden_layer_sizes': 10, 'learning_rate': 'adaptive', 'learning_rate_init': 0.001, 'solver': 'lbfgs', 'verbose': False}

Detailed classification report:

The model is trained on the full development set.
The scores are computed on the full evaluation set.

              precision    recall  f1-score   support

           0       0.70      0.21      0.33        33
           1       0.61      0.93      0.73        43

    accuracy                           0.62        76
   macro avg       0.65      0.57      0.53        76
weighted avg       0.65      0.62      0.56        76




### wine

In [11]:
import pandas as pd
import numpy as np
from datetime import timedelta, date
import datetime as dt

#https://scikit-learn.org/stable/auto_examples/model_selection/plot_grid_search_digits.html#sphx-glr-auto-examples-model-selection-plot-grid-search-digits-py
import sys
sys.path.append('./ml_helper')
import ml_helper_train_model as model_train
import ml_helper_serialize_model as serializer

from sklearn.neural_network import MLPClassifier

wine = pd.read_csv('data/wine_limpo.csv', sep = ',')


X = wine.loc[:, wine.columns != 'target']
y = wine.target
    

tuned_parameters = [
{'solver':['sgd'], 'hidden_layer_sizes':[10, 25, 50, 100, 150],
 'learning_rate_init':[0.001], 'alpha':[0.0001, 0.001, 0.01, 0.1], 
 'momentum':[0.1, 0.3, 0.5, 0.7], 'activation':['identity', 'logistic', 'tanh', 'relu'],
 'verbose':[False], 'learning_rate':['constant', 'invscaling', 'adaptive']
},
{'solver':['lbfgs'], 'hidden_layer_sizes':[10, 25, 50, 100, 150],
 'learning_rate_init':[0.001], 'alpha':[0.0001, 0.001, 0.01, 0.1], 
 'activation':['identity', 'logistic', 'tanh', 'relu'],
 'verbose':[False], 'learning_rate':['constant', 'invscaling', 'adaptive']
},
{'solver':['adam'], 'hidden_layer_sizes':[10, 25, 50, 100, 150],
 'learning_rate_init':[0.001], 'alpha':[0.0001, 0.001, 0.01, 0.1], 
 'activation':['identity', 'logistic', 'tanh', 'relu'],
 'verbose':[False], 'learning_rate':['constant', 'invscaling', 'adaptive'],
 'epsilon':[1e-8, 1e-7, 1e-3, 1e-1]
}
                   ]


model = model_train.train_model(predictors = X,
                                target = y,
                                folds = 10,
                                param_to_be_tunned = tuned_parameters,
                                estimator = MLPClassifier())

serializer.serialize_model(model = model, file_name = 'serialized_model/MLP_wine.pkl')


# Tuning hyper-parameters for f1

Best parameters set found on development set:

{'activation': 'logistic', 'alpha': 0.001, 'epsilon': 1e-08, 'hidden_layer_sizes': 150, 'learning_rate': 'adaptive', 'learning_rate_init': 0.001, 'solver': 'adam', 'verbose': False}

Detailed classification report:

The model is trained on the full development set.
The scores are computed on the full evaluation set.

              precision    recall  f1-score   support

           0       0.73      0.87      0.79       118
           1       0.70      0.47      0.56        74

    accuracy                           0.72       192
   macro avg       0.71      0.67      0.68       192
weighted avg       0.72      0.72      0.70       192




### diabetes

In [12]:
import pandas as pd
import numpy as np
from datetime import timedelta, date
import datetime as dt

#https://scikit-learn.org/stable/auto_examples/model_selection/plot_grid_search_digits.html#sphx-glr-auto-examples-model-selection-plot-grid-search-digits-py
import sys
sys.path.append('./ml_helper')
import ml_helper_train_model as model_train
import ml_helper_serialize_model as serializer

from sklearn.neural_network import MLPClassifier

diabetes = pd.read_csv('data/diabetes_limpo.csv', sep = ',')


X = diabetes.loc[:, diabetes.columns != 'target']
y = diabetes.target
    

tuned_parameters = [
{'solver':['sgd'], 'hidden_layer_sizes':[10, 25, 50, 100, 150],
 'learning_rate_init':[0.001], 'alpha':[0.0001, 0.001, 0.01, 0.1], 
 'momentum':[0.1, 0.3, 0.5, 0.7], 'activation':['identity', 'logistic', 'tanh', 'relu'],
 'verbose':[False], 'learning_rate':['constant', 'invscaling', 'adaptive']
},
{'solver':['lbfgs'], 'hidden_layer_sizes':[10, 25, 50, 100, 150],
 'learning_rate_init':[0.001], 'alpha':[0.0001, 0.001, 0.01, 0.1], 
 'activation':['identity', 'logistic', 'tanh', 'relu'],
 'verbose':[False], 'learning_rate':['constant', 'invscaling', 'adaptive']
},
{'solver':['adam'], 'hidden_layer_sizes':[10, 25, 50, 100, 150],
 'learning_rate_init':[0.001], 'alpha':[0.0001, 0.001, 0.01, 0.1], 
 'activation':['identity', 'logistic', 'tanh', 'relu'],
 'verbose':[False], 'learning_rate':['constant', 'invscaling', 'adaptive'],
 'epsilon':[1e-8, 1e-7, 1e-3, 1e-1]
}
                   ]


model = model_train.train_model(predictors = X,
                                target = y,
                                folds = 10,
                                param_to_be_tunned = tuned_parameters,
                                estimator = MLPClassifier())

serializer.serialize_model(model = model, file_name = 'serialized_model/MLP_diabetes.pkl')


# Tuning hyper-parameters for f1

Best parameters set found on development set:

{'activation': 'logistic', 'alpha': 0.01, 'epsilon': 1e-07, 'hidden_layer_sizes': 100, 'learning_rate': 'invscaling', 'learning_rate_init': 0.001, 'solver': 'adam', 'verbose': False}

Detailed classification report:

The model is trained on the full development set.
The scores are computed on the full evaluation set.

              precision    recall  f1-score   support

           0       0.69      0.65      0.67       189
           1       0.70      0.74      0.72       211

    accuracy                           0.69       400
   macro avg       0.69      0.69      0.69       400
weighted avg       0.69      0.69      0.69       400




## Random Forest

### heart

In [1]:
import pandas as pd
import numpy as np
from datetime import timedelta, date
import datetime as dt

heart = pd.read_csv('data/datasets_33180_43520_heart.csv', sep = ',')

#https://scikit-learn.org/stable/auto_examples/model_selection/plot_grid_search_digits.html#sphx-glr-auto-examples-model-selection-plot-grid-search-digits-py
import sys
sys.path.append('./ml_helper')
import ml_helper_train_model as model_train
import ml_helper_serialize_model as serializer

from sklearn.ensemble import RandomForestClassifier


X = heart.loc[:, heart.columns != 'target']
y = heart.target


# n_estimators=100, *, criterion='gini', max_depth=None, min_samples_split=2, min_samples_leaf=1, 
# min_weight_fraction_leaf=0.0, max_features='auto', max_leaf_nodes=None, min_impurity_decrease=0.0,
# min_impurity_split=None, bootstrap=True, oob_score=False, n_jobs=None, random_state=None, 
# verbose=0, warm_start=False, class_weight=None, ccp_alpha=0.0, max_samples=None)[source]
tuned_parameters = [
                    {'criterion':['gini'], 'random_state':[6411994], 
                     'max_depth':[10, 25, 50, 100], 'bootstrap':[True],
                     'min_samples_split':[0.1, 0.3, 0.5, 0.7], 'max_features':['auto', 'sqrt', 'log2'],
                     'min_impurity_decrease':[0, 0.1,0.01, 0.001, 1, 2, 5, 10, 100], 
                     'min_impurity_split':[0.0, 0.1,0.01, 0.001, 1.0, 2.0, 5, 10, 100], 'max_leaf_nodes':[None],
                     'oob_score':[True, False]
                    },
    
                    {'criterion':['entropy'], 'random_state':[6411994], 
                     'max_depth':[10, 25, 50, 100], 'bootstrap':[True],
                     'min_samples_split':[0.1, 0.3, 0.5, 0.7], 'max_features':['auto', 'sqrt', 'log2'],
                     'min_impurity_decrease':[0, 0.1,0.01, 0.001, 1, 2, 5, 10, 100], 
                     'min_impurity_split':[0, 0.1,0.01, 0.001, 1, 2, 5, 10, 100], 'max_leaf_nodes':[None],
                     'oob_score':[True, False] }
                   ]


model = model_train.train_model(predictors = X,
                                target = y,
                                folds = 10,
                                param_to_be_tunned = tuned_parameters,
                                estimator = RandomForestClassifier())

serializer.serialize_model(model = model, file_name = 'serialized_model/RF_heart.pkl')


# Tuning hyper-parameters for f1

Best parameters set found on development set:

{'bootstrap': True, 'criterion': 'gini', 'max_depth': 10, 'max_features': 'auto', 'max_leaf_nodes': None, 'min_impurity_decrease': 0.001, 'min_impurity_split': 0.1, 'min_samples_split': 0.1, 'oob_score': True, 'random_state': 6411994}

Detailed classification report:

The model is trained on the full development set.
The scores are computed on the full evaluation set.

              precision    recall  f1-score   support

           0       0.86      0.76      0.81        33
           1       0.83      0.91      0.87        43

    accuracy                           0.84        76
   macro avg       0.85      0.83      0.84        76
weighted avg       0.84      0.84      0.84        76




### wine dataset

In [13]:
import pandas as pd
import numpy as np
from datetime import timedelta, date
import datetime as dt

# heart = pd.read_csv('data/datasets_33180_43520_heart.csv', sep = ',')

#https://scikit-learn.org/stable/auto_examples/model_selection/plot_grid_search_digits.html#sphx-glr-auto-examples-model-selection-plot-grid-search-digits-py
import sys
sys.path.append('./ml_helper')
import ml_helper_train_model as model_train
import ml_helper_serialize_model as serializer

from sklearn.ensemble import RandomForestClassifier


wine = pd.read_csv('data/wine_limpo.csv', sep = ',')


X = wine.loc[:, wine.columns != 'target']
y = wine.target

# n_estimators=100, *, criterion='gini', max_depth=None, min_samples_split=2, min_samples_leaf=1, 
# min_weight_fraction_leaf=0.0, max_features='auto', max_leaf_nodes=None, min_impurity_decrease=0.0,
# min_impurity_split=None, bootstrap=True, oob_score=False, n_jobs=None, random_state=None, 
# verbose=0, warm_start=False, class_weight=None, ccp_alpha=0.0, max_samples=None)[source]
tuned_parameters = [
                    {'criterion':['gini'], 'random_state':[6411994], 
                     'max_depth':[10, 25, 50, 100], 'bootstrap':[True],
                     'min_samples_split':[0.1, 0.3, 0.5, 0.7], 'max_features':['auto', 'sqrt', 'log2'],
                     'min_impurity_decrease':[0, 0.1,0.01, 0.001, 1, 2, 5, 10, 100], 
                     'min_impurity_split':[0.0, 0.1,0.01, 0.001, 1.0, 2.0, 5, 10, 100], 'max_leaf_nodes':[None],
                     'oob_score':[True, False]
                    },
    
                    {'criterion':['entropy'], 'random_state':[6411994], 
                     'max_depth':[10, 25, 50, 100], 'bootstrap':[True],
                     'min_samples_split':[0.1, 0.3, 0.5, 0.7], 'max_features':['auto', 'sqrt', 'log2'],
                     'min_impurity_decrease':[0, 0.1,0.01, 0.001, 1, 2, 5, 10, 100], 
                     'min_impurity_split':[0, 0.1,0.01, 0.001, 1, 2, 5, 10, 100], 'max_leaf_nodes':[None],
                     'oob_score':[True, False] }
                   ]


model = model_train.train_model(predictors = X,
                                target = y,
                                folds = 10,
                                param_to_be_tunned = tuned_parameters,
                                estimator = RandomForestClassifier())

serializer.serialize_model(model = model, file_name = 'serialized_model/RF_wine.pkl')


# Tuning hyper-parameters for f1

Best parameters set found on development set:

{'bootstrap': True, 'criterion': 'entropy', 'max_depth': 10, 'max_features': 'auto', 'max_leaf_nodes': None, 'min_impurity_decrease': 0, 'min_impurity_split': 0, 'min_samples_split': 0.1, 'oob_score': True, 'random_state': 6411994}

Detailed classification report:

The model is trained on the full development set.
The scores are computed on the full evaluation set.

              precision    recall  f1-score   support

           0       0.73      0.91      0.81       118
           1       0.76      0.46      0.57        74

    accuracy                           0.73       192
   macro avg       0.74      0.68      0.69       192
weighted avg       0.74      0.73      0.72       192




### diabetes

In [14]:
import pandas as pd
import numpy as np
from datetime import timedelta, date
import datetime as dt

diabetes = pd.read_csv('data/diabetes_limpo.csv', sep = ',')

#https://scikit-learn.org/stable/auto_examples/model_selection/plot_grid_search_digits.html#sphx-glr-auto-examples-model-selection-plot-grid-search-digits-py
import sys
sys.path.append('./ml_helper')
import ml_helper_train_model as model_train
import ml_helper_serialize_model as serializer

from sklearn.ensemble import RandomForestClassifier


X = diabetes.loc[:, diabetes.columns != 'target']
y = diabetes.target


# n_estimators=100, *, criterion='gini', max_depth=None, min_samples_split=2, min_samples_leaf=1, 
# min_weight_fraction_leaf=0.0, max_features='auto', max_leaf_nodes=None, min_impurity_decrease=0.0,
# min_impurity_split=None, bootstrap=True, oob_score=False, n_jobs=None, random_state=None, 
# verbose=0, warm_start=False, class_weight=None, ccp_alpha=0.0, max_samples=None)[source]
tuned_parameters = [
                    {'criterion':['gini'], 'random_state':[6411994], 
                     'max_depth':[10, 25, 50, 100], 'bootstrap':[True],
                     'min_samples_split':[0.1, 0.3, 0.5, 0.7], 'max_features':['auto', 'sqrt', 'log2'],
                     'min_impurity_decrease':[0, 0.1,0.01, 0.001, 1, 2, 5, 10, 100], 
                     'min_impurity_split':[0.0, 0.1,0.01, 0.001, 1.0, 2.0, 5, 10, 100], 'max_leaf_nodes':[None],
                     'oob_score':[True, False]
                    },
    
                    {'criterion':['entropy'], 'random_state':[6411994], 
                     'max_depth':[10, 25, 50, 100], 'bootstrap':[True],
                     'min_samples_split':[0.1, 0.3, 0.5, 0.7], 'max_features':['auto', 'sqrt', 'log2'],
                     'min_impurity_decrease':[0, 0.1,0.01, 0.001, 1, 2, 5, 10, 100], 
                     'min_impurity_split':[0, 0.1,0.01, 0.001, 1, 2, 5, 10, 100], 'max_leaf_nodes':[None],
                     'oob_score':[True, False] }
                   ]


model = model_train.train_model(predictors = X,
                                target = y,
                                folds = 10,
                                param_to_be_tunned = tuned_parameters,
                                estimator = RandomForestClassifier())

serializer.serialize_model(model = model, file_name = 'serialized_model/RF_diabetes.pkl')


# Tuning hyper-parameters for f1

Best parameters set found on development set:

{'bootstrap': True, 'criterion': 'entropy', 'max_depth': 10, 'max_features': 'auto', 'max_leaf_nodes': None, 'min_impurity_decrease': 0, 'min_impurity_split': 0, 'min_samples_split': 0.1, 'oob_score': True, 'random_state': 6411994}

Detailed classification report:

The model is trained on the full development set.
The scores are computed on the full evaluation set.

              precision    recall  f1-score   support

           0       0.73      0.68      0.70       189
           1       0.73      0.78      0.75       211

    accuracy                           0.73       400
   macro avg       0.73      0.73      0.73       400
weighted avg       0.73      0.73      0.73       400




## Naive Bayes

### heart

In [27]:
import pandas as pd
import numpy as np
from datetime import timedelta, date
import datetime as dt

heart = pd.read_csv('data/datasets_33180_43520_heart.csv', sep = ',')

#https://scikit-learn.org/stable/auto_examples/model_selection/plot_grid_search_digits.html#sphx-glr-auto-examples-model-selection-plot-grid-search-digits-py
import sys
sys.path.append('./ml_helper')
import ml_helper_train_model as model_train
import ml_helper_serialize_model as serializer
from sklearn.naive_bayes import GaussianNB



X = heart.loc[:, heart.columns != 'target']
y = heart.target


tuned_parameters = [
                    {'var_smoothing':[1e-9, 1e-3, 1e-4]}
                   ]


model = model_train.train_model(predictors = X,
                                target = y,
                                folds = 10,
                                param_to_be_tunned = tuned_parameters,
                                estimator = GaussianNB())

serializer.serialize_model(model = model, file_name = 'serialized_model/NaiveBayes_heart.pkl')



# Tuning hyper-parameters for f1

Best parameters set found on development set:

{'var_smoothing': 1e-09}

Grid scores on development set:

0.775 (+/-0.199) for {'var_smoothing': 1e-09}
0.710 (+/-0.115) for {'var_smoothing': 0.001}
0.763 (+/-0.130) for {'var_smoothing': 0.0001}

Detailed classification report:

The model is trained on the full development set.
The scores are computed on the full evaluation set.

              precision    recall  f1-score   support

           0       0.88      0.91      0.90        33
           1       0.93      0.91      0.92        43

    accuracy                           0.91        76
   macro avg       0.91      0.91      0.91        76
weighted avg       0.91      0.91      0.91        76




### wine

In [15]:
import pandas as pd
import numpy as np
from datetime import timedelta, date
import datetime as dt

# heart = pd.read_csv('data/datasets_33180_43520_heart.csv', sep = ',')

#https://scikit-learn.org/stable/auto_examples/model_selection/plot_grid_search_digits.html#sphx-glr-auto-examples-model-selection-plot-grid-search-digits-py
import sys
sys.path.append('./ml_helper')
import ml_helper_train_model as model_train
import ml_helper_serialize_model as serializer
from sklearn.naive_bayes import GaussianNB

wine = pd.read_csv('data/wine_limpo.csv', sep = ',')

X = wine.loc[:, wine.columns != 'target']
y = wine.target


tuned_parameters = [
                    {'var_smoothing':[1e-9, 1e-3, 1e-4]}
                   ]


model = model_train.train_model(predictors = X,
                                target = y,
                                folds = 10,
                                param_to_be_tunned = tuned_parameters,
                                estimator = GaussianNB())

serializer.serialize_model(model = model, file_name = 'serialized_model/NaiveBayes_wine.pkl')



# Tuning hyper-parameters for f1

Best parameters set found on development set:

{'var_smoothing': 1e-09}

Detailed classification report:

The model is trained on the full development set.
The scores are computed on the full evaluation set.

              precision    recall  f1-score   support

           0       0.72      0.80      0.76       118
           1       0.61      0.51      0.56        74

    accuracy                           0.69       192
   macro avg       0.67      0.66      0.66       192
weighted avg       0.68      0.69      0.68       192




### diabetes

In [16]:
import pandas as pd
import numpy as np
from datetime import timedelta, date
import datetime as dt



#https://scikit-learn.org/stable/auto_examples/model_selection/plot_grid_search_digits.html#sphx-glr-auto-examples-model-selection-plot-grid-search-digits-py
import sys
sys.path.append('./ml_helper')
import ml_helper_train_model as model_train
import ml_helper_serialize_model as serializer
from sklearn.naive_bayes import GaussianNB

diabetes = pd.read_csv('data/diabetes_limpo.csv', sep = ',')

X = diabetes.loc[:, diabetes.columns != 'target']
y = diabetes.target


tuned_parameters = [
                    {'var_smoothing':[1e-9, 1e-3, 1e-4]}
                   ]


model = model_train.train_model(predictors = X,
                                target = y,
                                folds = 10,
                                param_to_be_tunned = tuned_parameters,
                                estimator = GaussianNB())

serializer.serialize_model(model = model, file_name = 'serialized_model/NaiveBayes_diabetes.pkl')



# Tuning hyper-parameters for f1

Best parameters set found on development set:

{'var_smoothing': 1e-09}

Detailed classification report:

The model is trained on the full development set.
The scores are computed on the full evaluation set.

              precision    recall  f1-score   support

           0       0.73      0.68      0.70       189
           1       0.73      0.78      0.75       211

    accuracy                           0.73       400
   macro avg       0.73      0.73      0.73       400
weighted avg       0.73      0.73      0.73       400




## Decision Tree

### base classifier

In [67]:
import numpy as np
import lime
import lime.lime_tabular
import sys
sys.path.append('./ml_helper')
import ml_helper_train_model as model_train
import ml_helper_serialize_model as serializer
from sklearn.model_selection import train_test_split

np.random.seed(1)

heart = pd.read_csv('data/datasets_33180_43520_heart.csv', sep = ',')

#my svm to be explained
model = serializer.load_model(file_name = 'serialized_model/DT_heart.pkl')

predictors = heart.loc[:, heart.columns != 'target']
target = heart.target    
    
X_train, X_test, y_train, y_test = train_test_split(predictors, 
                                                    target, 
                                                    test_size = 0.25, 
                                                    random_state = 6411994)



predicoes = model.predict(X_test)
saida = pd.DataFrame(index = y_test.index)

saida['real_label'] = y_test
saida['pred_label'] = predicoes
saida['mat_conf_mod'] = np.select([
                                    ((saida['real_label'] == 1) & (saida['pred_label'] == 1)),
                                    ((saida['real_label'] == 0) & (saida['pred_label'] == 1)), 
                                    ((saida['real_label'] == 0) & (saida['pred_label'] == 0)), 
                                    ((saida['real_label'] == 1) & (saida['pred_label'] == 0)) 
                                  ], 
                                  [
                                    'TP',
                                    'FP',
                                    'TN',
                                    'FN'
                                  ], 
                                    default='Unknown'
                                )


#random sample by each possible value of confusion matrix
grouped_data = saida.groupby(['mat_conf_mod'])
sample_by_group = grouped_data.sample(n = 1, random_state = 6411994)

print(sample_by_group)
print(sample_by_group.index)


     real_label  pred_label mat_conf_mod
113           1           0           FN
267           0           1           FP
168           0           0           TN
154           1           1           TP
Int64Index([113, 267, 168, 154], dtype='int64')


In [37]:
from sklearn.metrics import confusion_matrix

y_true = [1, 0, 1, 0, 0, 1]
y_pred = [0, 0, 1, 1, 0, 0]

tn, fp, fn, tp = confusion_matrix(y_true, y_pred).ravel()
print(tn)
print(fp)
print(fn)
print(tp)


2
1
2
1


### Heart

In [5]:
import pandas as pd
import numpy as np
from datetime import timedelta, date
import datetime as dt

heart = pd.read_csv('data/datasets_33180_43520_heart.csv', sep = ',')

#https://scikit-learn.org/stable/auto_examples/model_selection/plot_grid_search_digits.html#sphx-glr-auto-examples-model-selection-plot-grid-search-digits-py
import sys
sys.path.append('./ml_helper')
import ml_helper_train_model as model_train
import ml_helper_serialize_model as serializer
from sklearn import tree


X = heart.loc[:, heart.columns != 'target']
y = heart.target


tuned_parameters = [
{'criterion':['gini'], 'splitter':['best', 'random'], 'max_features':['auto', 'sqrt', 'log2'],
 'max_depth':[None], 'min_samples_split':[2, 4, 8, 10], 'min_samples_leaf':[1, 5, 10, 15],
 'min_impurity_decrease':[0, 0.1,0.01, 0.001, 1, 2, 5, 10, 100],
 'min_impurity_split':[0.0, 0.1,0.01, 0.001, 1.0, 2.0, 5, 10, 100]},
{'criterion':['entropy'], 'splitter':['best', 'random'], 'max_features':['auto', 'sqrt', 'log2'],
 'max_depth':[None], 'min_samples_split':[2, 4, 8, 10], 'min_samples_leaf':[1, 5, 10, 15],
 'min_impurity_decrease':[0, 0.1,0.01, 0.001, 1, 2, 5, 10, 100],
 'min_impurity_split':[0.0, 0.1,0.01, 0.001, 1.0, 2.0, 5, 10, 100]}
]

model = model_train.train_model(predictors = X,
                                target = y,
                                folds = 10,
                                param_to_be_tunned = tuned_parameters,
                                estimator = tree.DecisionTreeClassifier())

serializer.serialize_model(model = model, file_name = 'serialized_model/DT_heart.pkl')

# Tuning hyper-parameters for f1

Best parameters set found on development set:

{'criterion': 'entropy', 'max_depth': None, 'max_features': 'sqrt', 'min_impurity_decrease': 0, 'min_impurity_split': 0.1, 'min_samples_leaf': 5, 'min_samples_split': 8, 'splitter': 'best'}

Detailed classification report:

The model is trained on the full development set.
The scores are computed on the full evaluation set.

              precision    recall  f1-score   support

           0       0.70      0.85      0.77        33
           1       0.86      0.72      0.78        43

    accuracy                           0.78        76
   macro avg       0.78      0.78      0.78        76
weighted avg       0.79      0.78      0.78        76




### wine

In [18]:
import pandas as pd
import numpy as np
from datetime import timedelta, date
import datetime as dt

# heart = pd.read_csv('data/datasets_33180_43520_heart.csv', sep = ',')

#https://scikit-learn.org/stable/auto_examples/model_selection/plot_grid_search_digits.html#sphx-glr-auto-examples-model-selection-plot-grid-search-digits-py
import sys
sys.path.append('./ml_helper')
import ml_helper_train_model as model_train
import ml_helper_serialize_model as serializer
from sklearn import tree

wine = pd.read_csv('data/wine_limpo.csv', sep = ',')

X = wine.loc[:, wine.columns != 'target']
y = wine.target

tuned_parameters = [
{'criterion':['gini'], 'splitter':['best', 'random'], 'max_features':['auto', 'sqrt', 'log2'],
 'max_depth':[None], 'min_samples_split':[2, 4, 8, 10], 'min_samples_leaf':[1, 5, 10, 15],
 'min_impurity_decrease':[0, 0.1,0.01, 0.001, 1, 2, 5, 10, 100],
 'min_impurity_split':[0.0, 0.1,0.01, 0.001, 1.0, 2.0, 5, 10, 100]},
{'criterion':['entropy'], 'splitter':['best', 'random'], 'max_features':['auto', 'sqrt', 'log2'],
 'max_depth':[None], 'min_samples_split':[2, 4, 8, 10], 'min_samples_leaf':[1, 5, 10, 15],
 'min_impurity_decrease':[0, 0.1,0.01, 0.001, 1, 2, 5, 10, 100],
 'min_impurity_split':[0.0, 0.1,0.01, 0.001, 1.0, 2.0, 5, 10, 100]}
]

model = model_train.train_model(predictors = X,
                                target = y,
                                folds = 10,
                                param_to_be_tunned = tuned_parameters,
                                estimator = tree.DecisionTreeClassifier())

serializer.serialize_model(model = model, file_name = 'serialized_model/DT_wine.pkl')

# Tuning hyper-parameters for f1

Best parameters set found on development set:

{'criterion': 'entropy', 'max_depth': None, 'max_features': 'log2', 'min_impurity_decrease': 0, 'min_impurity_split': 0.1, 'min_samples_leaf': 15, 'min_samples_split': 8, 'splitter': 'best'}

Detailed classification report:

The model is trained on the full development set.
The scores are computed on the full evaluation set.

              precision    recall  f1-score   support

           0       0.75      0.77      0.76       118
           1       0.62      0.59      0.61        74

    accuracy                           0.70       192
   macro avg       0.69      0.68      0.68       192
weighted avg       0.70      0.70      0.70       192




### diabetes

In [19]:
import pandas as pd
import numpy as np
from datetime import timedelta, date
import datetime as dt

# heart = pd.read_csv('data/datasets_33180_43520_heart.csv', sep = ',')

#https://scikit-learn.org/stable/auto_examples/model_selection/plot_grid_search_digits.html#sphx-glr-auto-examples-model-selection-plot-grid-search-digits-py
import sys
sys.path.append('./ml_helper')
import ml_helper_train_model as model_train
import ml_helper_serialize_model as serializer
from sklearn import tree


diabetes = pd.read_csv('data/diabetes_limpo.csv', sep = ',')

X = diabetes.loc[:, diabetes.columns != 'target']
y = diabetes.target



tuned_parameters = [
{'criterion':['gini'], 'splitter':['best', 'random'], 'max_features':['auto', 'sqrt', 'log2'],
 'max_depth':[None], 'min_samples_split':[2, 4, 8, 10], 'min_samples_leaf':[1, 5, 10, 15],
 'min_impurity_decrease':[0, 0.1,0.01, 0.001, 1, 2, 5, 10, 100],
 'min_impurity_split':[0.0, 0.1,0.01, 0.001, 1.0, 2.0, 5, 10, 100]},
{'criterion':['entropy'], 'splitter':['best', 'random'], 'max_features':['auto', 'sqrt', 'log2'],
 'max_depth':[None], 'min_samples_split':[2, 4, 8, 10], 'min_samples_leaf':[1, 5, 10, 15],
 'min_impurity_decrease':[0, 0.1,0.01, 0.001, 1, 2, 5, 10, 100],
 'min_impurity_split':[0.0, 0.1,0.01, 0.001, 1.0, 2.0, 5, 10, 100]}
]

model = model_train.train_model(predictors = X,
                                target = y,
                                folds = 10,
                                param_to_be_tunned = tuned_parameters,
                                estimator = tree.DecisionTreeClassifier())

serializer.serialize_model(model = model, file_name = 'serialized_model/DT_diabetes.pkl')

# Tuning hyper-parameters for f1

Best parameters set found on development set:

{'criterion': 'gini', 'max_depth': None, 'max_features': 'log2', 'min_impurity_decrease': 0.001, 'min_impurity_split': 0.1, 'min_samples_leaf': 1, 'min_samples_split': 2, 'splitter': 'best'}

Detailed classification report:

The model is trained on the full development set.
The scores are computed on the full evaluation set.

              precision    recall  f1-score   support

           0       0.70      0.71      0.71       189
           1       0.74      0.73      0.74       211

    accuracy                           0.72       400
   macro avg       0.72      0.72      0.72       400
weighted avg       0.72      0.72      0.72       400




## KNN

### Heart

In [6]:
import pandas as pd
import numpy as np
from datetime import timedelta, date
import datetime as dt

heart = pd.read_csv('data/datasets_33180_43520_heart.csv', sep = ',')

#https://scikit-learn.org/stable/auto_examples/model_selection/plot_grid_search_digits.html#sphx-glr-auto-examples-model-selection-plot-grid-search-digits-py
import sys
sys.path.append('./ml_helper')
import ml_helper_train_model as model_train
import ml_helper_serialize_model as serializer
from sklearn import neighbors



X = heart.loc[:, heart.columns != 'target']
y = heart.target


tuned_parameters = [
{'algorithm':['auto'], 'n_neighbors':[3, 5, 10, 20], 
 'weights':['uniform', 'distance'], 'metric':['minkowski', 'chebyshev', 'manhattan', 'euclidean']  },

{'algorithm':['ball_tree'], 'n_neighbors':[3, 5, 10, 20], 
 'metric':['minkowski', 'chebyshev', 'manhattan', 'euclidean'],
 'weights':['uniform', 'distance'], 'leaf_size':[3, 10, 30, 30, 50] },
    
{'algorithm':['kd_tree'], 'n_neighbors':[3, 5, 10, 20], 
 'metric':['minkowski', 'chebyshev', 'manhattan', 'euclidean'],
 'weights':['uniform', 'distance'], 'leaf_size':[3, 10, 30, 30, 50] },    
    
{'algorithm':['brute'], 'n_neighbors':[3, 5, 10, 20], 
 'weights':['uniform', 'distance'], 'metric':['minkowski', 'chebyshev', 'manhattan', 'euclidean']  },
                    ]


model = model_train.train_model(predictors = X,
                                target = y,
                                folds = 10,
                                param_to_be_tunned = tuned_parameters,
                                estimator = neighbors.KNeighborsClassifier())

serializer.serialize_model(model = model, file_name = 'serialized_model/KNN_heart.pkl')



# Tuning hyper-parameters for f1

Best parameters set found on development set:

{'algorithm': 'ball_tree', 'leaf_size': 10, 'metric': 'manhattan', 'n_neighbors': 10, 'weights': 'distance'}

Detailed classification report:

The model is trained on the full development set.
The scores are computed on the full evaluation set.

              precision    recall  f1-score   support

           0       0.73      0.73      0.73        33
           1       0.79      0.79      0.79        43

    accuracy                           0.76        76
   macro avg       0.76      0.76      0.76        76
weighted avg       0.76      0.76      0.76        76




### Wine

In [1]:
import pandas as pd
import numpy as np
from datetime import timedelta, date
import datetime as dt


import sys
sys.path.append('./ml_helper')
import ml_helper_train_model as model_train
import ml_helper_serialize_model as serializer
from sklearn import neighbors


wine = pd.read_csv('data/wine_limpo.csv', sep = ',')

X = wine.loc[:, wine.columns != 'target']
y = wine.target


tuned_parameters = [
{'algorithm':['auto'], 'n_neighbors':[3, 5, 10, 20], 
 'weights':['uniform', 'distance'], 'metric':['minkowski', 'chebyshev', 'manhattan', 'euclidean']  },

{'algorithm':['ball_tree'], 'n_neighbors':[3, 5, 10, 20], 
 'metric':['minkowski', 'chebyshev', 'manhattan', 'euclidean'],
 'weights':['uniform', 'distance'], 'leaf_size':[3, 10, 30, 30, 50] },
    
{'algorithm':['kd_tree'], 'n_neighbors':[3, 5, 10, 20], 
 'metric':['minkowski', 'chebyshev', 'manhattan', 'euclidean'],
 'weights':['uniform', 'distance'], 'leaf_size':[3, 10, 30, 30, 50] },    
    
{'algorithm':['brute'], 'n_neighbors':[3, 5, 10, 20], 
 'weights':['uniform', 'distance'], 'metric':['minkowski', 'chebyshev', 'manhattan', 'euclidean']  },
                    ]


model = model_train.train_model(predictors = X,
                                target = y,
                                folds = 10,
                                param_to_be_tunned = tuned_parameters,
                                estimator = neighbors.KNeighborsClassifier())

serializer.serialize_model(model = model, file_name = 'serialized_model/KNN_wine.pkl')



# Tuning hyper-parameters for f1

Best parameters set found on development set:

{'algorithm': 'auto', 'metric': 'manhattan', 'n_neighbors': 5, 'weights': 'distance'}
Detailed classification report:

The model is trained on the full development set.
The scores are computed on the full evaluation set.

              precision    recall  f1-score   support

           0       0.71      0.66      0.69       189
           1       0.72      0.76      0.74       211

    accuracy                           0.71       400
   macro avg       0.71      0.71      0.71       400
weighted avg       0.71      0.71      0.71       400




### Diabetes

In [2]:
import pandas as pd
import numpy as np
from datetime import timedelta, date
import datetime as dt


import sys
sys.path.append('./ml_helper')
import ml_helper_train_model as model_train
import ml_helper_serialize_model as serializer
from sklearn import neighbors


diabetes = pd.read_csv('data/diabetes_limpo.csv', sep = ',')
X = diabetes.loc[:, diabetes.columns != 'target']
y = diabetes.target


tuned_parameters = [
{'algorithm':['auto'], 'n_neighbors':[3, 5, 10, 20], 
 'weights':['uniform', 'distance'], 'metric':['minkowski', 'chebyshev', 'manhattan', 'euclidean']  },

{'algorithm':['ball_tree'], 'n_neighbors':[3, 5, 10, 20], 
 'metric':['minkowski', 'chebyshev', 'manhattan', 'euclidean'],
 'weights':['uniform', 'distance'], 'leaf_size':[3, 10, 30, 30, 50] },
    
{'algorithm':['kd_tree'], 'n_neighbors':[3, 5, 10, 20], 
 'metric':['minkowski', 'chebyshev', 'manhattan', 'euclidean'],
 'weights':['uniform', 'distance'], 'leaf_size':[3, 10, 30, 30, 50] },    
    
{'algorithm':['brute'], 'n_neighbors':[3, 5, 10, 20], 
 'weights':['uniform', 'distance'], 'metric':['minkowski', 'chebyshev', 'manhattan', 'euclidean']  },
                    ]


model = model_train.train_model(predictors = X,
                                target = y,
                                folds = 10,
                                param_to_be_tunned = tuned_parameters,
                                estimator = neighbors.KNeighborsClassifier())

serializer.serialize_model(model = model, file_name = 'serialized_model/KNN_diabetes.pkl')



# Tuning hyper-parameters for f1

Best parameters set found on development set:

{'algorithm': 'auto', 'metric': 'chebyshev', 'n_neighbors': 5, 'weights': 'distance'}
Detailed classification report:

The model is trained on the full development set.
The scores are computed on the full evaluation set.

              precision    recall  f1-score   support

           0       0.69      0.84      0.76       118
           1       0.60      0.39      0.48        74

    accuracy                           0.67       192
   macro avg       0.65      0.62      0.62       192
weighted avg       0.66      0.67      0.65       192




## Gaussian Process

### Heart

In [9]:
import pandas as pd
import numpy as np
from datetime import timedelta, date
import datetime as dt

heart = pd.read_csv('data/datasets_33180_43520_heart.csv', sep = ',')

#https://scikit-learn.org/stable/auto_examples/model_selection/plot_grid_search_digits.html#sphx-glr-auto-examples-model-selection-plot-grid-search-digits-py
import sys
sys.path.append('./ml_helper')
import ml_helper_train_model as model_train
import ml_helper_serialize_model as serializer
from sklearn.gaussian_process import GaussianProcessClassifier
from sklearn.gaussian_process.kernels import RBF

X = heart.loc[:, heart.columns != 'target']
y = heart.target

kernel = 1.0 * RBF(1.0)
tuned_parameters = [
                    {'kernel':[kernel], 
                     'optimizer': ['fmin_l_bfgs_b'], 
                     'n_restarts_optimizer':[5, 10, 50], 
                     'max_iter_predict':[100, 200]
                    }
                   ]


model = model_train.train_model(predictors = X,
                                target = y,
                                folds = 10,
                                param_to_be_tunned = tuned_parameters,
                                estimator = GaussianProcessClassifier())

serializer.serialize_model(model = model, file_name = 'serialized_model/GaussianProcessClassifier_heart.pkl')

# Tuning hyper-parameters for f1

Best parameters set found on development set:

{'kernel': 1**2 * RBF(length_scale=1), 'max_iter_predict': 100, 'n_restarts_optimizer': 50, 'optimizer': 'fmin_l_bfgs_b'}

Detailed classification report:

The model is trained on the full development set.
The scores are computed on the full evaluation set.

              precision    recall  f1-score   support

           0       0.90      0.82      0.86        33
           1       0.87      0.93      0.90        43

    accuracy                           0.88        76
   macro avg       0.88      0.87      0.88        76
weighted avg       0.88      0.88      0.88        76




### Wine

In [23]:
import pandas as pd
import numpy as np
from datetime import timedelta, date
import datetime as dt

# heart = pd.read_csv('data/datasets_33180_43520_heart.csv', sep = ',')

#https://scikit-learn.org/stable/auto_examples/model_selection/plot_grid_search_digits.html#sphx-glr-auto-examples-model-selection-plot-grid-search-digits-py
import sys
sys.path.append('./ml_helper')
import ml_helper_train_model as model_train
import ml_helper_serialize_model as serializer
from sklearn.gaussian_process import GaussianProcessClassifier
from sklearn.gaussian_process.kernels import RBF


wine = pd.read_csv('data/wine_limpo.csv', sep = ',')

X = wine.loc[:, wine.columns != 'target']
y = wine.target

kernel = 1.0 * RBF(1.0)
tuned_parameters = [
                    {'kernel':[kernel], 
                     'optimizer': ['fmin_l_bfgs_b'], 
                     'n_restarts_optimizer':[5, 10, 50], 
                     'max_iter_predict':[100, 200]
                    }
                   ]


model = model_train.train_model(predictors = X,
                                target = y,
                                folds = 10,
                                param_to_be_tunned = tuned_parameters,
                                estimator = GaussianProcessClassifier())

serializer.serialize_model(model = model, file_name = 'serialized_model/GaussianProcessClassifier_wine.pkl')

# Tuning hyper-parameters for f1

Best parameters set found on development set:

{'kernel': 1**2 * RBF(length_scale=1), 'max_iter_predict': 200, 'n_restarts_optimizer': 10, 'optimizer': 'fmin_l_bfgs_b'}

Detailed classification report:

The model is trained on the full development set.
The scores are computed on the full evaluation set.

              precision    recall  f1-score   support

           0       0.74      0.86      0.80       118
           1       0.70      0.51      0.59        74

    accuracy                           0.73       192
   macro avg       0.72      0.69      0.70       192
weighted avg       0.73      0.73      0.72       192




### Diabetes

In [24]:
import pandas as pd
import numpy as np
from datetime import timedelta, date
import datetime as dt

# heart = pd.read_csv('data/datasets_33180_43520_heart.csv', sep = ',')

#https://scikit-learn.org/stable/auto_examples/model_selection/plot_grid_search_digits.html#sphx-glr-auto-examples-model-selection-plot-grid-search-digits-py
import sys
sys.path.append('./ml_helper')
import ml_helper_train_model as model_train
import ml_helper_serialize_model as serializer
from sklearn.gaussian_process import GaussianProcessClassifier
from sklearn.gaussian_process.kernels import RBF

diabetes = pd.read_csv('data/diabetes_limpo.csv', sep = ',')

X = diabetes.loc[:, diabetes.columns != 'target']
y = diabetes.target


kernel = 1.0 * RBF(1.0)
tuned_parameters = [
                    {'kernel':[kernel], 
                     'optimizer': ['fmin_l_bfgs_b'], 
                     'n_restarts_optimizer':[5, 10, 50], 
                     'max_iter_predict':[100, 200]
                    }
                   ]


model = model_train.train_model(predictors = X,
                                target = y,
                                folds = 10,
                                param_to_be_tunned = tuned_parameters,
                                estimator = GaussianProcessClassifier())

serializer.serialize_model(model = model, file_name = 'serialized_model/GaussianProcessClassifier_diabetes.pkl')

# Tuning hyper-parameters for f1

Best parameters set found on development set:

{'kernel': 1**2 * RBF(length_scale=1), 'max_iter_predict': 200, 'n_restarts_optimizer': 10, 'optimizer': 'fmin_l_bfgs_b'}

Detailed classification report:

The model is trained on the full development set.
The scores are computed on the full evaluation set.

              precision    recall  f1-score   support

           0       0.73      0.71      0.72       189
           1       0.75      0.77      0.76       211

    accuracy                           0.74       400
   macro avg       0.74      0.74      0.74       400
weighted avg       0.74      0.74      0.74       400




## SVM

### heart

In [3]:
#https://scikit-learn.org/stable/auto_examples/model_selection/plot_grid_search_digits.html#sphx-glr-auto-examples-model-selection-plot-grid-search-digits-py
import sys
sys.path.append('./ml_helper')
import ml_helper_train_model as model_train
import ml_helper_serialize_model as serializer
from sklearn.svm import SVC

X = heart.loc[:, heart.columns != 'target']
y = heart.target
    
tuned_parameters = [
                    #kernel rbf
                    {'kernel': ['rbf'], 'gamma': [1e-3, 1e-4],'C': [1, 10, 100, 1000], 'probability':[True]},
                    #kernel linear 
                    {'kernel': ['linear'], 'C': [1, 10, 100, 1000], 'probability':[True]}
                    #kernel sigmoid
#                     {'kernel': ['sigmoid'], 'gamma': [1e-3, 1e-4], 'C': [1, 10, 100, 1000], 
#                      'coef0':[1e-3, 1e-4, 0, 1.0, 2.0, 10], 'probability':[True]},
#                     #kernel poly
#                     {'kernel': ['poly'], 'gamma': [1e-3, 1e-4], 'C': [1, 10, 100, 1000] , 
#                      'degree':[1,2,3,4], 'coef0':[1e-3, 1e-4, 0, 1.0, 2.0, 10], 'probability':[True]}                    
                   ]


model = model_train.train_model(predictors = X, 
                                target = y, 
                                folds = 10, 
                                param_to_be_tunned = tuned_parameters, 
                                estimator = SVC())

serializer.serialize_model(model = model, file_name = 'serialized_model/svm_heart.pkl')

# Tuning hyper-parameters for f1
# Best parameters set found on development set:
# {'C': 1, 'coef0': 0.001, 'degree': 3, 'gamma': 0.0001, 'kernel': 'poly'}

# Tuning hyper-parameters for f1

Best parameters set found on development set:

{'C': 100, 'kernel': 'linear', 'probability': True}

Detailed classification report:

The model is trained on the full development set.
The scores are computed on the full evaluation set.

              precision    recall  f1-score   support

           0       0.88      0.85      0.86        33
           1       0.89      0.91      0.90        43

    accuracy                           0.88        76
   macro avg       0.88      0.88      0.88        76
weighted avg       0.88      0.88      0.88        76




### wine

In [4]:
#https://scikit-learn.org/stable/auto_examples/model_selection/plot_grid_search_digits.html#sphx-glr-auto-examples-model-selection-plot-grid-search-digits-py
import sys
sys.path.append('./ml_helper')
import ml_helper_train_model as model_train
import ml_helper_serialize_model as serializer
from sklearn.svm import SVC

wine = pd.read_csv('data/wine_limpo.csv', sep = ',')

X = wine.loc[:, wine.columns != 'target']
y = wine.target
    
tuned_parameters = [
                    #kernel rbf
                    {'kernel': ['rbf'], 'gamma': [1e-3, 1e-4],'C': [1, 10, 100, 1000], 'probability':[True]},
                    #kernel linear 
                    {'kernel': ['linear'], 'C': [1, 10, 100, 1000], 'probability':[True]}
                    #kernel sigmoid
#                     {'kernel': ['sigmoid'], 'gamma': [1e-3, 1e-4], 'C': [1, 10, 100, 1000], 
#                      'coef0':[1e-3, 1e-4, 0, 1.0, 2.0, 10], 'probability':[True]},
#                     #kernel poly
#                     {'kernel': ['poly'], 'gamma': [1e-3, 1e-4], 'C': [1, 10, 100, 1000] , 
#                      'degree':[1,2,3,4], 'coef0':[1e-3, 1e-4, 0, 1.0, 2.0, 10], 'probability':[True]}                    
                   ]


model = model_train.train_model(predictors = X, 
                                target = y, 
                                folds = 10, 
                                param_to_be_tunned = tuned_parameters, 
                                estimator = SVC())

serializer.serialize_model(model = model, file_name = 'serialized_model/svm_wine.pkl')

# Tuning hyper-parameters for f1
# Best parameters set found on development set:
# {'C': 1, 'coef0': 0.001, 'degree': 3, 'gamma': 0.0001, 'kernel': 'poly'}

# Tuning hyper-parameters for f1

Best parameters set found on development set:

{'C': 1, 'kernel': 'linear', 'probability': True}

Detailed classification report:

The model is trained on the full development set.
The scores are computed on the full evaluation set.

              precision    recall  f1-score   support

           0       0.74      0.87      0.80       118
           1       0.72      0.51      0.60        74

    accuracy                           0.73       192
   macro avg       0.73      0.69      0.70       192
weighted avg       0.73      0.73      0.72       192




### diabetes

In [2]:
#https://scikit-learn.org/stable/auto_examples/model_selection/plot_grid_search_digits.html#sphx-glr-auto-examples-model-selection-plot-grid-search-digits-py
import sys
sys.path.append('./ml_helper')
import ml_helper_train_model as model_train
import ml_helper_serialize_model as serializer
from sklearn.svm import SVC
import pandas as pd

diabetes = pd.read_csv('data/diabetes_limpo.csv', sep = ',')

X = diabetes.loc[:, diabetes.columns != 'target']
y = diabetes.target
    
tuned_parameters = [
                    #kernel rbf
                    {'kernel': ['rbf'], 'gamma': [1e-3, 1e-4],'C': [1, 10, 100, 1000], 'probability':[True]},
                    #kernel linear 
                    {'kernel': ['linear'], 'C': [1, 10, 100, 1000], 'probability':[True]}
                    #kernel sigmoid
#                     {'kernel': ['sigmoid'], 'gamma': [1e-3, 1e-4], 'C': [1, 10, 100, 1000], 
#                      'coef0':[1e-3, 1e-4, 0, 1.0, 2.0, 10], 'probability':[True]},
#                     #kernel poly
#                     {'kernel': ['poly'], 'gamma': [1e-3, 1e-4], 'C': [1, 10, 100, 1000] , 
#                      'degree':[1,2,3,4], 'coef0':[1e-3, 1e-4, 0, 1.0, 2.0, 10], 'probability':[True]}                    
                   ]


model = model_train.train_model(predictors = X, 
                                target = y, 
                                folds = 10, 
                                param_to_be_tunned = tuned_parameters, 
                                estimator = SVC())

serializer.serialize_model(model = model, file_name = 'serialized_model/svm_diabetes.pkl')

# Tuning hyper-parameters for f1 
# Best parameters set found on development set:
# {'C': 1, 'coef0': 0.001, 'degree': 3, 'gamma': 0.0001, 'kernel': 'poly'}b

# Tuning hyper-parameters for f1

# Best parameters set found on development set:

# {'C': 1, 'kernel': 'linear', 'probability': True}
# Detailed classification report:

# The model is trained on the full development set.
# The scores are computed on the full evaluation set.

#               precision    recall  f1-score   support

#            0       0.75      0.78      0.76       189
#            1       0.79      0.77      0.78       211

#     accuracy                           0.77       400
#    macro avg       0.77      0.77      0.77       400
# weighted avg       0.77      0.77      0.77       400


# executed in 5h 53m 26s, finished 14:16:22 2020-08-18

# Tuning hyper-parameters for f1

Best parameters set found on development set:

{'C': 1, 'kernel': 'linear', 'probability': True}
Detailed classification report:

The model is trained on the full development set.
The scores are computed on the full evaluation set.

              precision    recall  f1-score   support

           0       0.75      0.78      0.76       189
           1       0.79      0.77      0.78       211

    accuracy                           0.77       400
   macro avg       0.77      0.77      0.77       400
weighted avg       0.77      0.77      0.77       400


