# Multiple Classifiers Ensemble System (MCS)

#### Iury Zanonni de Faria

### Imports

#### General imports

In [58]:
import pandas as pd
import numpy as np
import datetime as dt
import statistics as st
import matplotlib.pyplot as plt
import warnings
import optuna

#### Feature Selection imports

In [59]:
from sklearn.feature_selection import mutual_info_classif
# Info gain - weka

#### Diversity imports

In [60]:
from sklearn.model_selection import KFold
from sklearn.ensemble import BaggingClassifier
from sklearn.ensemble import AdaBoostClassifier
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.model_selection import cross_val_score

#### Classifiers imports

In [61]:
import xgboost
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier
from sklearn.naive_bayes import BernoulliNB
from sklearn.naive_bayes import GaussianNB
from sklearn.neural_network import MLPClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier


#### Metrics

In [62]:
from sklearn.metrics import precision_score
from sklearn.metrics import accuracy_score

### Settings

In [63]:
DATA = ['Date', 'Current Ratio','Quick Ratio','Current Assets', 'Long-term debt to equity ratio', 'Share Holder Equity','Debt to Equity Ratio', 'Percentage of net profit to sale',
'Percentage of operating profit to sale','Percentage of Gross profit to sale','ROA','ROE','EPS','P/E','P/S','Stock book value','Stock Price','ROI','MarketReturn', 'Company']

DATA = ['Unnamed: 0', 'revenue','cost-goods-sold','gross-profit','research-development-expenses','selling-general-administrative-expenses','operating-expenses',
'operating-income','total-non-operating-income-expense','pre-tax-income','total-provision-income-taxes','income-after-taxes','income-from-continuous-operations',
'income-from-discontinued-operations','net-income','ebitda','ebit','basic-shares-outstanding','shares-outstanding','eps-basic-net-earnings-per-share',
'eps-earnings-per-share-diluted','cash-on-hand','receivables-total','inventory','other-current-assets','total-current-assets','net-property-plant-equipment',
'long-term-investments','goodwill-intangible-assets-total','other-long-term-assets','total-long-term-assets','total-assets','total-current-liabilities','long-term-debt',
'other-non-current-liabilities','total-long-term-liabilities','total-liabilities','common-stock-net','retained-earnings-accumulated-deficit','comprehensive-income',
'total-share-holder-equity','total-liabilities-share-holders-equity','net-income-loss','total-depreciation-amortization-cash-flow','other-non-cash-items','total-non-cash-items',
'change-in-accounts-receivable','change-in-inventories','change-in-accounts-payable','change-in-assets-liabilities','total-change-in-assets-liabilities',
'cash-flow-from-operating-activities','net-change-in-property-plant-equipment','net-change-in-intangible-assets','net-acquisitions-divestitures','investing-activities-other',
'cash-flow-from-investing-activities','net-long-term-debt','net-current-debt','debt-issuance-retirement-net-total','net-common-equity-issued-repurchased',
'net-total-equity-issued-repurchased','total-common-preferred-stock-dividends-paid','financial-activities-other','cash-flow-from-financial-activities',
'net-cash-flow','stock-based-compensation','common-stock-dividends-paid','current-ratio','long-term-debt-capital','debt-equity-ratio','gross-margin',
'operating-margin','ebit-margin','pre-tax-profit-margin','net-profit-margin','asset-turnover','inventory-turnover','receiveable-turnover','days-sales-in-receivables',
'roe','return-on-tangible-equity','roa','roi','book-value-per-share','operating-cash-flow-per-share','free-cash-flow-per-share','net-change-in-short-term-investments',
'net-change-in-long-term-investments','net-change-in-investments-total','other-operating-income-expenses','pre-paid-expenses','other-share-holders-equity','other-income',
'ebitda-margin']

REAL_RETURN_CLASS = "RealReturnClass"
REAL_RETURN = "RealReturn"
RISK_CLASS = 'RiskClass'
RISK = "Risk"

HIGH = 'high'
MEDIUM = 'medium'
LOW = 'low'

DATE = 'Unnamed: 0'

N_PERIODS = 2
N_FEATURES = 20

DATASET_PATH = 'new_dataset/process_final_{}.csv'.format(N_PERIODS)

N_TRIALS = 100

#Remove os warnings do notebook
warnings.filterwarnings('ignore')

### Import dataset

In [64]:
dataset = pd.read_csv(DATASET_PATH)

### Feature Selection

In [65]:
dataset = dataset.replace(to_replace=[HIGH], value=2.0)
dataset = dataset.replace(to_replace=[MEDIUM], value=1.0)
dataset = dataset.replace(to_replace=[LOW], value=0.0)

dataset = dataset.replace(to_replace=[np.NaN], value=0.0)

dataset_X = dataset.drop(columns=[REAL_RETURN_CLASS, REAL_RETURN, RISK_CLASS, RISK, DATE])
dataset_y = dataset.drop(columns=DATA)

#### Real Return

In [66]:
rank_real_return = mutual_info_classif(dataset_X, dataset_y[REAL_RETURN_CLASS], discrete_features=True)

result_real_return = {}

for i in range(0, len(dataset_X.columns)):
    result_real_return[dataset_X.columns[i]] = rank_real_return[i]

final_ranking_real_return = sorted(result_real_return.items(), key=lambda x: x[1])
final_ranking_real_return.reverse()
final_ranking_real_return

[('book-value-per-share', 1.0781738725762178),
 ('total-liabilities-share-holders-equity', 1.0730975504432838),
 ('total-assets', 1.0730975504432838),
 ('return-on-tangible-equity', 1.0720236307826683),
 ('pre-tax-profit-margin', 1.0715423436623848),
 ('net-profit-margin', 1.0655457595048283),
 ('total-long-term-assets', 1.063440098960802),
 ('total-liabilities', 1.060025221503162),
 ('roe', 1.037837312201647),
 ('total-long-term-liabilities', 1.0371848786605442),
 ('roi', 1.0332105579822386),
 ('total-share-holder-equity', 1.0330730639322019),
 ('ebit-margin', 1.0308664906950786),
 ('operating-margin', 1.030584461918695),
 ('retained-earnings-accumulated-deficit', 1.0204939670787345),
 ('total-current-assets', 1.007819172959037),
 ('total-current-liabilities', 0.9908536271554558),
 ('roa', 0.9704110154244339),
 ('long-term-debt', 0.9535283875008169),
 ('net-property-plant-equipment', 0.9396579621468436),
 ('days-sales-in-receivables', 0.9325970570255593),
 ('revenue', 0.92858714383506

#### Risk

In [67]:
rank_risk = mutual_info_classif(dataset_X, dataset_y[RISK_CLASS], discrete_features=True)

result_risk = {}

for i in range(0, len(dataset_X.columns)):
    result_risk[dataset_X.columns[i]] = rank_risk[i]

final_ranking_risk = sorted(result_risk.items(), key=lambda x: x[1])
final_ranking_risk.reverse()
final_ranking_risk

[('book-value-per-share', 1.0784866809287685),
 ('total-liabilities-share-holders-equity', 1.0716668597392707),
 ('total-assets', 1.0716668597392707),
 ('pre-tax-profit-margin', 1.0713654930019336),
 ('net-profit-margin', 1.0648272532186995),
 ('return-on-tangible-equity', 1.0641284969009375),
 ('total-long-term-assets', 1.0614202187725474),
 ('total-liabilities', 1.0599517591343064),
 ('total-long-term-liabilities', 1.0361093780620672),
 ('total-share-holder-equity', 1.0352116158920783),
 ('ebit-margin', 1.0318595511234556),
 ('operating-margin', 1.0318226107738098),
 ('retained-earnings-accumulated-deficit', 1.0216322493402616),
 ('roe', 1.0153269192164784),
 ('total-current-assets', 1.006928473411139),
 ('total-current-liabilities', 0.9881403013559005),
 ('roi', 0.971769896256639),
 ('net-property-plant-equipment', 0.9307835159126874),
 ('revenue', 0.926023829434609),
 ('days-sales-in-receivables', 0.9203868010526002),
 ('long-term-debt', 0.9137434746678487),
 ('roa', 0.912712498968

In [68]:
def getColumnsRank(rank: list):
  ranking = []
  for column in rank:
    ranking.append(column[0])
    
  return ranking

In [69]:
features_real_return = getColumnsRank(final_ranking_real_return)[:N_FEATURES]
features_risk = getColumnsRank(final_ranking_risk)[:N_FEATURES]

### Divisão do dataset

In [70]:
columns_dataset = DATA
columns_dataset.append(REAL_RETURN)
columns_dataset.append(RISK)
columns_dataset.append(REAL_RETURN_CLASS)
columns_dataset.append(RISK_CLASS)

df_train = None
df_test = None

df_train = pd.DataFrame(columns=columns_dataset)
df_test = pd.DataFrame(columns=columns_dataset)

In [71]:
TRAINING_START_DATE =  dt.datetime.strptime('2009-03-31', "%Y-%m-%d")
TRAINING_END_DATE =  dt.datetime.strptime('2018-03-31', "%Y-%m-%d")

TEST_START_DATE =  dt.datetime.strptime('2018-06-30', "%Y-%m-%d")
TEST_END_DATE =  dt.datetime.strptime('2022-03-31', "%Y-%m-%d")

dataset_sort = dataset.sort_values(by=DATE)
count_train = 0
count_test = 0

for index, row in dataset_sort.iterrows():
  date = dt.datetime.strptime(row[DATE], "%Y-%m-%d")
  if date.year < TEST_START_DATE.year:
    df_train = df_train.append(row)
    count_train +=1
  elif date.year == TEST_START_DATE.year and date.month < TEST_START_DATE.month:
    df_train = df_train.append(row)
    count_train +=1
  else:
    df_test = df_test.append(row)
    count_test += 1

print(count_train)
print(count_test)

df_train = df_train.drop(columns=[REAL_RETURN, RISK, DATE])
df_test = df_test.drop(columns=[REAL_RETURN, RISK, DATE])

17006
8299


##### Real Return

In [72]:
X_real_return_train = df_train[features_real_return]
y_real_return_train = df_train[REAL_RETURN_CLASS]

X_real_return_test = df_test[features_real_return]
y_real_return_test = df_test[REAL_RETURN_CLASS]

##### Risk

In [73]:
X_risk_train = df_train[features_risk]
y_risk_train = df_train[RISK_CLASS]

X_risk_test = df_test[features_risk]
y_risk_test = df_test[RISK_CLASS]

### Classificadores Únicos

In [74]:
classifiers_real_return = {}
classifiers_risk = {}

### Optuna

In [75]:
def print_best_result(study, classifier, type):
	print(f'{classifier} - {type}')
	print('Melhor pontuação:', study.best_value)
	print('Melhores hiperparâmetros:', study.best_params)

### Random Forest

##### Real Return

In [76]:
def objective_random_forest_return(trial):
    params = {
        'n_estimators': trial.suggest_int('n_estimators', 10, 1000),
        'max_depth': trial.suggest_int('max_depth', 2, 200),
        'min_samples_split': trial.suggest_int('min_samples_split', 2, 10),
        'min_samples_leaf': trial.suggest_int('min_samples_leaf', 1, 10),
        'max_features': trial.suggest_categorical('max_features', ['sqrt', 'log2']),
        'n_jobs': trial.suggest_int('n_jobs', 3, 3)
    }
    model = RandomForestClassifier(**params)
    model.fit(X_real_return_train, y_real_return_train)
    preds = model.predict(X_real_return_test)
    accuracy = accuracy_score(y_real_return_test, preds)

    return accuracy

In [77]:
study = optuna.create_study(direction='maximize')
study.optimize(objective_random_forest_return, n_trials=N_TRIALS, n_jobs=3)

[32m[I 2023-04-18 03:18:10,085][0m A new study created in memory with name: no-name-b26d3784-3635-42bc-b6b9-e12706a454fd[0m
[32m[I 2023-04-18 03:18:55,128][0m Trial 2 finished with value: 0.841065188576937 and parameters: {'n_estimators': 475, 'max_depth': 110, 'min_samples_split': 7, 'min_samples_leaf': 9, 'max_features': 'log2', 'n_jobs': 3}. Best is trial 2 with value: 0.841065188576937.[0m
[32m[I 2023-04-18 03:19:15,080][0m Trial 0 finished with value: 0.8399807205687432 and parameters: {'n_estimators': 693, 'max_depth': 78, 'min_samples_split': 9, 'min_samples_leaf': 10, 'max_features': 'log2', 'n_jobs': 3}. Best is trial 2 with value: 0.841065188576937.[0m
[32m[I 2023-04-18 03:19:28,897][0m Trial 1 finished with value: 0.8386552596698398 and parameters: {'n_estimators': 675, 'max_depth': 94, 'min_samples_split': 10, 'min_samples_leaf': 1, 'max_features': 'log2', 'n_jobs': 3}. Best is trial 2 with value: 0.841065188576937.[0m
[32m[I 2023-04-18 03:19:30,261][0m Trial 

In [78]:
print_best_result(study, 'Random Forest', 'Real Return')


Random Forest - Real Return
Melhor pontuação: 0.841065188576937
Melhores hiperparâmetros: {'n_estimators': 475, 'max_depth': 110, 'min_samples_split': 7, 'min_samples_leaf': 9, 'max_features': 'log2', 'n_jobs': 3}


##### Risk

In [79]:
def objective_random_forest_risk(trial):
    params = {
        'n_estimators': trial.suggest_int('n_estimators', 10, 1000),
        'max_depth': trial.suggest_int('max_depth', 2, 200),
        'min_samples_split': trial.suggest_int('min_samples_split', 2, 10),
        'min_samples_leaf': trial.suggest_int('min_samples_leaf', 1, 10),
        'max_features': trial.suggest_categorical('max_features', ['sqrt', 'log2']),
        'n_jobs': trial.suggest_int('n_jobs', 3, 3)
    }
    model = RandomForestClassifier(**params)
    model.fit(X_risk_train, y_risk_train)
    preds = model.predict(X_risk_test)
    accuracy = accuracy_score(y_risk_test, preds)

    return accuracy

In [80]:
study = optuna.create_study(direction='maximize')
study.optimize(objective_random_forest_risk, n_trials=N_TRIALS, n_jobs=3)

[32m[I 2023-04-18 03:21:41,759][0m A new study created in memory with name: no-name-9d7920e7-5e22-4e12-ae5b-d9f7dad9c4ca[0m
[32m[I 2023-04-18 03:22:14,387][0m Trial 0 finished with value: 0.47909386673093146 and parameters: {'n_estimators': 595, 'max_depth': 5, 'min_samples_split': 5, 'min_samples_leaf': 5, 'max_features': 'log2', 'n_jobs': 3}. Best is trial 0 with value: 0.47909386673093146.[0m
[32m[I 2023-04-18 03:22:27,272][0m Trial 1 finished with value: 0.5133148572117122 and parameters: {'n_estimators': 311, 'max_depth': 177, 'min_samples_split': 9, 'min_samples_leaf': 6, 'max_features': 'log2', 'n_jobs': 3}. Best is trial 1 with value: 0.5133148572117122.[0m
[32m[I 2023-04-18 03:22:49,874][0m Trial 2 finished with value: 0.5098204602964213 and parameters: {'n_estimators': 502, 'max_depth': 124, 'min_samples_split': 3, 'min_samples_leaf': 10, 'max_features': 'sqrt', 'n_jobs': 3}. Best is trial 1 with value: 0.5133148572117122.[0m
[32m[I 2023-04-18 03:23:00,175][0m T

In [81]:
print_best_result(study, 'Random Forest', 'Risk')

Random Forest - Risk
Melhor pontuação: 0.5154837932280998
Melhores hiperparâmetros: {'n_estimators': 326, 'max_depth': 142, 'min_samples_split': 2, 'min_samples_leaf': 8, 'max_features': 'sqrt', 'n_jobs': 3}


### SVM

##### Real Return

In [82]:
def objective_svc_return(trial):
    params = {
        'kernel': trial.suggest_categorical('kernel', ['poly', 'rbf', 'sigmoid']),
        'C':trial.suggest_float('C', 1, 100),
        'class_weight': trial.suggest_categorical('class_weight', ['balanced', None])
	}
    model = SVC(**params)
    model.fit(X_real_return_train, y_real_return_train)
    preds = model.predict(X_real_return_test)
    accuracy = accuracy_score(y_real_return_test, preds)

    return accuracy

In [83]:
study = optuna.create_study(direction='maximize')
study.optimize(objective_svc_return, n_trials=N_TRIALS, n_jobs=3)

[32m[I 2023-04-18 03:26:01,704][0m A new study created in memory with name: no-name-d8513562-fab3-4e28-9951-857b59ccafcc[0m
[32m[I 2023-04-18 03:27:31,520][0m Trial 0 finished with value: 0.5858537173153392 and parameters: {'kernel': 'rbf', 'C': 94.96385981118057, 'class_weight': None}. Best is trial 0 with value: 0.5858537173153392.[0m
[32m[I 2023-04-18 03:28:11,938][0m Trial 2 finished with value: 0.41330280756717674 and parameters: {'kernel': 'poly', 'C': 58.528619774990176, 'class_weight': 'balanced'}. Best is trial 0 with value: 0.5858537173153392.[0m
[32m[I 2023-04-18 03:28:49,414][0m Trial 1 finished with value: 0.33172671406193516 and parameters: {'kernel': 'poly', 'C': 99.05302594611123, 'class_weight': None}. Best is trial 0 with value: 0.5858537173153392.[0m
[32m[I 2023-04-18 03:29:15,195][0m Trial 3 finished with value: 0.5824798168454031 and parameters: {'kernel': 'rbf', 'C': 67.70078768552747, 'class_weight': None}. Best is trial 0 with value: 0.585853717315

In [84]:
print_best_result(study, 'SVM', 'Real Return')

SVM - Real Return
Melhor pontuação: 0.5858537173153392
Melhores hiperparâmetros: {'kernel': 'rbf', 'C': 94.96385981118057, 'class_weight': None}


##### Risk

In [85]:
def objective_svc_risk(trial):
    params = {
        'kernel': trial.suggest_categorical('kernel', ['poly', 'rbf', 'sigmoid']),
        'C':trial.suggest_float('C', 1, 100),
        'class_weight': trial.suggest_categorical('class_weight', ['balanced', None])
	}
    model = SVC(**params)
    model.fit(X_risk_train, y_risk_train)
    preds = model.predict(X_risk_test)
    accuracy = accuracy_score(y_risk_test, preds)

    return accuracy

In [86]:
study = optuna.create_study(direction='maximize')
study.optimize(objective_svc_risk, n_trials=N_TRIALS, n_jobs=3)

[32m[I 2023-04-18 03:31:11,227][0m A new study created in memory with name: no-name-cb7028aa-8548-4502-ac26-a87d6f9eca3c[0m
[32m[I 2023-04-18 03:31:45,928][0m Trial 2 finished with value: 0.351970116881552 and parameters: {'kernel': 'sigmoid', 'C': 64.88949125851607, 'class_weight': None}. Best is trial 2 with value: 0.351970116881552.[0m
[32m[I 2023-04-18 03:32:17,743][0m Trial 3 finished with value: 0.3487167128569707 and parameters: {'kernel': 'sigmoid', 'C': 78.13130403277016, 'class_weight': 'balanced'}. Best is trial 2 with value: 0.351970116881552.[0m
[32m[I 2023-04-18 03:32:48,515][0m Trial 4 finished with value: 0.35209061332690683 and parameters: {'kernel': 'sigmoid', 'C': 36.33849931874259, 'class_weight': None}. Best is trial 4 with value: 0.35209061332690683.[0m
[32m[I 2023-04-18 03:33:22,576][0m Trial 5 finished with value: 0.3487167128569707 and parameters: {'kernel': 'sigmoid', 'C': 95.7619611405676, 'class_weight': 'balanced'}. Best is trial 4 with value:

In [87]:
print_best_result(study, 'SVM', 'Risk')

SVM - Risk
Melhor pontuação: 0.41450777202072536
Melhores hiperparâmetros: {'kernel': 'rbf', 'C': 13.642159048041334, 'class_weight': None}


### Decision Tree

##### Real Return

In [88]:
def objective_decision_tree_return(trial):
    params = {
        'criterion': trial.suggest_categorical('criterion', ['gini', 'entropy', 'log_loss']),
        'splitter': trial.suggest_categorical('splitter', ['best', 'random']),
        'max_depth': trial.suggest_int('max_depth', 2, 200),
        'min_samples_split': trial.suggest_int('min_samples_split', 2, 2000),
	}
    model = DecisionTreeClassifier(**params)
    model.fit(X_real_return_train, y_real_return_train)
    preds = model.predict(X_real_return_test)
    accuracy = accuracy_score(y_real_return_test, preds)

    return accuracy

In [89]:
study = optuna.create_study(direction='maximize')
study.optimize(objective_decision_tree_return, n_trials=N_TRIALS, n_jobs=3)

[32m[I 2023-04-18 03:37:58,708][0m A new study created in memory with name: no-name-b65705ac-cbf7-477b-9839-953ff7743ed6[0m
[32m[I 2023-04-18 03:37:58,746][0m Trial 0 finished with value: 0.5253645017471985 and parameters: {'criterion': 'log_loss', 'splitter': 'random', 'max_depth': 138, 'min_samples_split': 1302}. Best is trial 0 with value: 0.5253645017471985.[0m
[32m[I 2023-04-18 03:37:58,773][0m Trial 1 finished with value: 0.5882636462224364 and parameters: {'criterion': 'log_loss', 'splitter': 'random', 'max_depth': 200, 'min_samples_split': 1833}. Best is trial 1 with value: 0.5882636462224364.[0m
[32m[I 2023-04-18 03:37:58,825][0m Trial 3 finished with value: 0.6136883961923123 and parameters: {'criterion': 'log_loss', 'splitter': 'random', 'max_depth': 194, 'min_samples_split': 1945}. Best is trial 3 with value: 0.6136883961923123.[0m
[32m[I 2023-04-18 03:37:58,840][0m Trial 2 finished with value: 0.7232196650198819 and parameters: {'criterion': 'entropy', 'split

In [90]:
print_best_result(study, 'DecisionTreeClassifier', 'Real Return')

DecisionTreeClassifier - Real Return
Melhor pontuação: 0.8341968911917098
Melhores hiperparâmetros: {'criterion': 'log_loss', 'splitter': 'best', 'max_depth': 169, 'min_samples_split': 1524}


##### Risk

In [91]:
def objective_decision_tree_risk(trial):
    params = {
        'criterion': trial.suggest_categorical('criterion', ['gini', 'entropy', 'log_loss']),
        'splitter': trial.suggest_categorical('splitter', ['best', 'random']),
        'max_depth': trial.suggest_int('max_depth', 2, 200),
        'min_samples_split': trial.suggest_int('min_samples_split', 2, 2000),
	}
    model = DecisionTreeClassifier(**params)
    model.fit(X_risk_train, y_risk_train)
    preds = model.predict(X_risk_test)
    accuracy = accuracy_score(y_risk_test, preds)

    return accuracy

In [92]:
study = optuna.create_study(direction='maximize')
study.optimize(objective_decision_tree_risk, n_trials=N_TRIALS, n_jobs=3)

[32m[I 2023-04-18 03:38:00,421][0m A new study created in memory with name: no-name-957a0916-6455-4195-a532-95b8c159da5a[0m
[32m[I 2023-04-18 03:38:00,515][0m Trial 0 finished with value: 0.4582479816845403 and parameters: {'criterion': 'gini', 'splitter': 'random', 'max_depth': 62, 'min_samples_split': 586}. Best is trial 0 with value: 0.4582479816845403.[0m
[32m[I 2023-04-18 03:38:00,523][0m Trial 2 finished with value: 0.4464393300397638 and parameters: {'criterion': 'gini', 'splitter': 'random', 'max_depth': 74, 'min_samples_split': 323}. Best is trial 0 with value: 0.4582479816845403.[0m
[32m[I 2023-04-18 03:38:00,537][0m Trial 1 finished with value: 0.4440294011326666 and parameters: {'criterion': 'log_loss', 'splitter': 'random', 'max_depth': 170, 'min_samples_split': 251}. Best is trial 0 with value: 0.4582479816845403.[0m
[32m[I 2023-04-18 03:38:00,571][0m Trial 3 finished with value: 0.4433064224605374 and parameters: {'criterion': 'entropy', 'splitter': 'random

In [93]:
print_best_result(study, 'DecisionTreeClassifier', 'Risk')


DecisionTreeClassifier - Risk
Melhor pontuação: 0.4757199662609953
Melhores hiperparâmetros: {'criterion': 'entropy', 'splitter': 'best', 'max_depth': 182, 'min_samples_split': 1361}


### Naive Bayes

##### Real Return

In [94]:
def objective_naive_bayes_return(trial):
    params = {
        'var_smoothing': trial.suggest_loguniform('var_smoothing', 1e-12, 1e-5)
	}
    model = GaussianNB(**params)
    model.fit(X_real_return_train, y_real_return_train)
    preds = model.predict(X_real_return_test)
    accuracy = accuracy_score(y_real_return_test, preds)

    return accuracy

In [95]:
study = optuna.create_study(direction='maximize')
study.optimize(objective_naive_bayes_return, n_trials=N_TRIALS, n_jobs=3)

[32m[I 2023-04-18 03:38:01,634][0m A new study created in memory with name: no-name-8c5e7011-c219-4a81-95bd-5dd2536059ad[0m
[32m[I 2023-04-18 03:38:01,686][0m Trial 1 finished with value: 0.3341366429690324 and parameters: {'var_smoothing': 4.45987846648015e-09}. Best is trial 1 with value: 0.3341366429690324.[0m
[32m[I 2023-04-18 03:38:01,694][0m Trial 0 finished with value: 0.3352211109772262 and parameters: {'var_smoothing': 2.940685475296123e-09}. Best is trial 0 with value: 0.3352211109772262.[0m
[32m[I 2023-04-18 03:38:01,703][0m Trial 2 finished with value: 0.33485962164116156 and parameters: {'var_smoothing': 3.416049213033788e-09}. Best is trial 0 with value: 0.3352211109772262.[0m
[32m[I 2023-04-18 03:38:01,744][0m Trial 3 finished with value: 0.33883600433787203 and parameters: {'var_smoothing': 7.178957811254408e-10}. Best is trial 3 with value: 0.33883600433787203.[0m
[32m[I 2023-04-18 03:38:01,750][0m Trial 4 finished with value: 0.3316062176165803 and pa

In [96]:
print_best_result(study, 'Naive Bayes', 'Real Return')

Naive Bayes - Real Return
Melhor pontuação: 0.4451138691408603
Melhores hiperparâmetros: {'var_smoothing': 1.9075529453658458e-12}


##### Risk

In [97]:
def objective_naive_bayes_risk(trial):
    params = {
        'var_smoothing': trial.suggest_loguniform('var_smoothing', 1e-12, 1e-5)
	}
    model = GaussianNB(**params)
    model.fit(X_risk_train, y_risk_train)
    preds = model.predict(X_risk_test)
    accuracy = accuracy_score(y_risk_test, preds)

    return accuracy

In [98]:
study = optuna.create_study(direction='maximize')
study.optimize(objective_naive_bayes_risk, n_trials=N_TRIALS, n_jobs=3)

[32m[I 2023-04-18 03:38:02,031][0m A new study created in memory with name: no-name-aafeba3e-c8d8-4009-ba4e-fd73bbba4e43[0m
[32m[I 2023-04-18 03:38:02,081][0m Trial 1 finished with value: 0.38462465357271963 and parameters: {'var_smoothing': 2.8163997323343977e-07}. Best is trial 1 with value: 0.38462465357271963.[0m
[32m[I 2023-04-18 03:38:02,094][0m Trial 2 finished with value: 0.33184721050729005 and parameters: {'var_smoothing': 7.465270436657749e-09}. Best is trial 1 with value: 0.38462465357271963.[0m
[32m[I 2023-04-18 03:38:02,101][0m Trial 0 finished with value: 0.3323291962887095 and parameters: {'var_smoothing': 5.4428435220092545e-09}. Best is trial 1 with value: 0.38462465357271963.[0m
[32m[I 2023-04-18 03:38:02,138][0m Trial 3 finished with value: 0.38378117845523557 and parameters: {'var_smoothing': 1.0504676989088914e-06}. Best is trial 1 with value: 0.38462465357271963.[0m
[32m[I 2023-04-18 03:38:02,154][0m Trial 5 finished with value: 0.338836004337872

In [99]:
print_best_result(study, 'Naive Bayes', 'Risk')

Naive Bayes - Risk
Melhor pontuação: 0.3849861429087842
Melhores hiperparâmetros: {'var_smoothing': 3.3259108903304924e-07}


### Rede Neural

##### Real Return

In [100]:
def objective_neural_network_return(trial):
	params = {
		'activation': trial.suggest_categorical('activation', ['identity', 'logistic', 'tanh', 'relu']),
		'solver': trial.suggest_categorical('solver', ['lbfgs', 'sgd', 'adam']),
		'max_iter': trial.suggest_int('max_iter', 200, 2000),
		'hidden_layer_sizes': trial.suggest_categorical('hidden_layer_sizes', [(100,), (200,), (300,), (400,), (500,), (600,), (700,), (800,), (900,), (1000,)]),
		'learning_rate': trial.suggest_categorical('learning_rate', ['constant', 'invscaling', 'adaptive']), 
	}

	model = MLPClassifier(**params)
	model.fit(X_real_return_train, y_real_return_train)
	preds = model.predict(X_real_return_test)
	accuracy = accuracy_score(y_real_return_test, preds)

	return accuracy

In [101]:
study = optuna.create_study(direction='maximize')
study.optimize(objective_neural_network_return, n_trials=N_TRIALS, n_jobs=3)

[32m[I 2023-04-18 03:38:02,420][0m A new study created in memory with name: no-name-ea051408-afb9-441d-88ff-87dd304eb0f6[0m
[32m[I 2023-04-18 03:38:10,034][0m Trial 2 finished with value: 0.39305940474755996 and parameters: {'activation': 'tanh', 'solver': 'sgd', 'max_iter': 1018, 'hidden_layer_sizes': (400,), 'learning_rate': 'constant'}. Best is trial 2 with value: 0.39305940474755996.[0m
[32m[I 2023-04-18 03:38:12,668][0m Trial 3 finished with value: 0.3129292685865767 and parameters: {'activation': 'relu', 'solver': 'sgd', 'max_iter': 209, 'hidden_layer_sizes': (300,), 'learning_rate': 'constant'}. Best is trial 2 with value: 0.39305940474755996.[0m
[32m[I 2023-04-18 03:38:24,921][0m Trial 0 finished with value: 0.5359681889384263 and parameters: {'activation': 'tanh', 'solver': 'adam', 'max_iter': 529, 'hidden_layer_sizes': (1000,), 'learning_rate': 'constant'}. Best is trial 0 with value: 0.5359681889384263.[0m
[32m[I 2023-04-18 03:38:40,530][0m Trial 5 finished wit

In [102]:
print_best_result(study, 'Rede Neural', 'Real Return')

Rede Neural - Real Return
Melhor pontuação: 0.5828413061814677
Melhores hiperparâmetros: {'activation': 'relu', 'solver': 'adam', 'max_iter': 1684, 'hidden_layer_sizes': (700,), 'learning_rate': 'invscaling'}


##### Risk

In [103]:
def objective_neural_network_risk(trial):
	params = {
		'activation': trial.suggest_categorical('activation', ['identity', 'logistic', 'tanh', 'relu']),
		'solver': trial.suggest_categorical('solver', ['lbfgs', 'sgd', 'adam']),
		'max_iter': trial.suggest_int('max_iter', 200, 2000),
		'hidden_layer_sizes': trial.suggest_categorical('hidden_layer_sizes', [(100,), (200,), (300,), (400,), (500,), (600,), (700,), (800,), (900,), (1000,)]),
		'learning_rate': trial.suggest_categorical('learning_rate', ['constant', 'invscaling', 'adaptive']), 
	}

	model = MLPClassifier(**params)
	model.fit(X_risk_train, y_risk_train)
	preds = model.predict(X_risk_test)
	accuracy = accuracy_score(y_risk_test, preds)

	return accuracy

In [104]:
study = optuna.create_study(direction='maximize')
study.optimize(objective_neural_network_risk, n_trials=N_TRIALS, n_jobs=3)

[32m[I 2023-04-18 03:47:45,852][0m A new study created in memory with name: no-name-10e2cdd4-98d9-4fcd-9c61-cd4321f3120c[0m
[32m[I 2023-04-18 03:47:54,469][0m Trial 0 finished with value: 0.38474515001807447 and parameters: {'activation': 'identity', 'solver': 'adam', 'max_iter': 1033, 'hidden_layer_sizes': (300,), 'learning_rate': 'invscaling'}. Best is trial 0 with value: 0.38474515001807447.[0m
[32m[I 2023-04-18 03:48:02,177][0m Trial 2 finished with value: 0.31027834678876975 and parameters: {'activation': 'logistic', 'solver': 'adam', 'max_iter': 772, 'hidden_layer_sizes': (900,), 'learning_rate': 'invscaling'}. Best is trial 0 with value: 0.38474515001807447.[0m
[32m[I 2023-04-18 03:48:04,010][0m Trial 1 finished with value: 0.38378117845523557 and parameters: {'activation': 'identity', 'solver': 'adam', 'max_iter': 587, 'hidden_layer_sizes': (500,), 'learning_rate': 'adaptive'}. Best is trial 0 with value: 0.38474515001807447.[0m
[32m[I 2023-04-18 03:48:13,832][0m 

In [105]:
print_best_result(study, 'Rede Neural', 'Risk')

Rede Neural - Risk
Melhor pontuação: 0.3992047234606579
Melhores hiperparâmetros: {'activation': 'relu', 'solver': 'adam', 'max_iter': 1375, 'hidden_layer_sizes': (900,), 'learning_rate': 'adaptive'}


### Regressão Logistica

##### Real Return

In [106]:
def objective_logistic_regression_return(trial):
    params = {
        'penalty': trial.suggest_categorical('penalty', ['l2']),
        'C':trial.suggest_float('C', 1, 100),
        'class_weight': trial.suggest_categorical('class_weight', ['balanced', None]),
        'solver': trial.suggest_categorical('solver', ['lbfgs', 'liblinear', 'newton-cg', 'newton-cholesky', 'sag', 'saga']),
        'max_iter': trial.suggest_int('max_iter', 100, 10000),
        'n_jobs': trial.suggest_int('n_jobs', 3, 3)

	}
    model = LogisticRegression(**params)
    model.fit(X_real_return_train, y_real_return_train)
    preds = model.predict(X_real_return_test)
    accuracy = accuracy_score(y_real_return_test, preds)

    return accuracy

In [107]:
study = optuna.create_study(direction='maximize')
study.optimize(objective_logistic_regression_return, n_trials=N_TRIALS, n_jobs=3)

[32m[I 2023-04-18 03:50:11,471][0m A new study created in memory with name: no-name-2dc3bfd3-75f6-4677-b614-181af340ae2f[0m
[32m[I 2023-04-18 03:50:20,708][0m Trial 0 finished with value: 0.7845523557055066 and parameters: {'penalty': 'l2', 'C': 83.90422586374864, 'class_weight': None, 'solver': 'newton-cholesky', 'max_iter': 2279, 'n_jobs': 3}. Best is trial 0 with value: 0.7845523557055066.[0m
[32m[I 2023-04-18 03:50:29,461][0m Trial 1 finished with value: 0.6247740691649596 and parameters: {'penalty': 'l2', 'C': 62.09677659440283, 'class_weight': None, 'solver': 'liblinear', 'max_iter': 266, 'n_jobs': 3}. Best is trial 0 with value: 0.7845523557055066.[0m
[32m[I 2023-04-18 03:50:36,353][0m Trial 3 finished with value: 0.7878057597300879 and parameters: {'penalty': 'l2', 'C': 33.764382182969335, 'class_weight': 'balanced', 'solver': 'newton-cholesky', 'max_iter': 6022, 'n_jobs': 3}. Best is trial 3 with value: 0.7878057597300879.[0m
[32m[I 2023-04-18 03:50:49,517][0m Tr

In [108]:
print_best_result(study, 'Logistic Regression', 'Real Return')

Logistic Regression - Real Return
Melhor pontuação: 0.8167249066152549
Melhores hiperparâmetros: {'penalty': 'l2', 'C': 83.99221686719915, 'class_weight': None, 'solver': 'newton-cg', 'max_iter': 4782, 'n_jobs': 3}


##### Risk

In [109]:
def objective_logistic_regression_risk(trial):
    params = {
        'penalty': trial.suggest_categorical('penalty', ['l2']),
        'C':trial.suggest_float('C', 1, 100),
        'class_weight': trial.suggest_categorical('class_weight', ['balanced', None]),
        'solver': trial.suggest_categorical('solver', ['lbfgs', 'liblinear', 'newton-cg', 'newton-cholesky', 'sag', 'saga']),
        'max_iter': trial.suggest_int('max_iter', 100, 10000),
        'n_jobs': trial.suggest_int('n_jobs', 3, 3)

	}
    model = LogisticRegression(**params)
    model.fit(X_risk_train, y_risk_train)
    preds = model.predict(X_risk_test)
    accuracy = accuracy_score(y_risk_test, preds)

    return accuracy

In [110]:
study = optuna.create_study(direction='maximize')
study.optimize(objective_logistic_regression_risk, n_trials=N_TRIALS, n_jobs=3)

[32m[I 2023-04-18 03:52:54,239][0m A new study created in memory with name: no-name-182599f3-49c2-4ead-bb3e-7b7b59666a17[0m
[32m[I 2023-04-18 03:52:55,870][0m Trial 2 finished with value: 0.43246174237859986 and parameters: {'penalty': 'l2', 'C': 83.79503120547326, 'class_weight': 'balanced', 'solver': 'newton-cholesky', 'max_iter': 1250, 'n_jobs': 3}. Best is trial 2 with value: 0.43246174237859986.[0m
[32m[I 2023-04-18 03:53:07,324][0m Trial 0 finished with value: 0.3953488372093023 and parameters: {'penalty': 'l2', 'C': 39.50166431528555, 'class_weight': None, 'solver': 'lbfgs', 'max_iter': 8745, 'n_jobs': 3}. Best is trial 2 with value: 0.43246174237859986.[0m
[32m[I 2023-04-18 03:53:07,688][0m Trial 4 finished with value: 0.429690324135438 and parameters: {'penalty': 'l2', 'C': 69.359069192544, 'class_weight': None, 'solver': 'newton-cholesky', 'max_iter': 4753, 'n_jobs': 3}. Best is trial 2 with value: 0.43246174237859986.[0m
[32m[I 2023-04-18 03:53:13,712][0m Trial

In [111]:
print_best_result(study, 'Logistic Regression', 'Risk')

Logistic Regression - Risk
Melhor pontuação: 0.43631762862995543
Melhores hiperparâmetros: {'penalty': 'l2', 'C': 22.449917989570746, 'class_weight': 'balanced', 'solver': 'newton-cg', 'max_iter': 7233, 'n_jobs': 3}


### KNeighborsClassifier

##### Real Return

In [112]:
def objective_kn_return(trial):
    params = {
		'n_neighbors': trial.suggest_int('n_neighbors', 5, 200),
		'weights': trial.suggest_categorical('weights', ['uniform', 'distance']),
		'algorithm': trial.suggest_categorical('algorithm', ['ball_tree', 'kd_tree', 'brute']),
		'leaf_size': trial.suggest_int('leaf_size', 30, 100),
		'p': trial.suggest_int('p', 1, 3),
    	'n_jobs': trial.suggest_int('n_jobs', 3, 3)
	}
    
    model = KNeighborsClassifier(**params)
    model.fit(X_real_return_train, y_real_return_train)
    preds = model.predict(X_real_return_test)
    accuracy = accuracy_score(y_real_return_test, preds)

    return accuracy

In [113]:
study = optuna.create_study(direction='maximize')
study.optimize(objective_kn_return, n_trials=N_TRIALS, n_jobs=3)

[32m[I 2023-04-18 03:55:18,061][0m A new study created in memory with name: no-name-027c76ad-2368-44c9-a095-3219ae3b82d5[0m
[32m[I 2023-04-18 03:55:21,522][0m Trial 1 finished with value: 0.5457284010121701 and parameters: {'n_neighbors': 44, 'weights': 'uniform', 'algorithm': 'ball_tree', 'leaf_size': 86, 'p': 1, 'n_jobs': 3}. Best is trial 1 with value: 0.5457284010121701.[0m
[32m[I 2023-04-18 03:55:25,297][0m Trial 3 finished with value: 0.5457284010121701 and parameters: {'n_neighbors': 161, 'weights': 'uniform', 'algorithm': 'brute', 'leaf_size': 80, 'p': 1, 'n_jobs': 3}. Best is trial 1 with value: 0.5457284010121701.[0m
[32m[I 2023-04-18 03:55:28,248][0m Trial 4 finished with value: 0.5474153512471382 and parameters: {'n_neighbors': 51, 'weights': 'uniform', 'algorithm': 'kd_tree', 'leaf_size': 63, 'p': 1, 'n_jobs': 3}. Best is trial 4 with value: 0.5474153512471382.[0m
[32m[I 2023-04-18 03:56:11,800][0m Trial 0 finished with value: 0.5577780455476563 and parameter

In [114]:
print_best_result(study, 'KNeighborsClassifier', 'Real Return')

KNeighborsClassifier - Real Return
Melhor pontuação: 0.5621159175804313
Melhores hiperparâmetros: {'n_neighbors': 96, 'weights': 'distance', 'algorithm': 'ball_tree', 'leaf_size': 86, 'p': 2, 'n_jobs': 3}


##### Risk

In [121]:
def objective_kn_risk(trial):
    params = {
		'n_neighbors': trial.suggest_int('n_neighbors', 5, 200),
		'weights': trial.suggest_categorical('weights', ['uniform', 'distance']),
		'algorithm': trial.suggest_categorical('algorithm', ['ball_tree', 'kd_tree', 'brute']),
		'leaf_size': trial.suggest_int('leaf_size', 30, 100),
		'p': trial.suggest_int('p', 1, 3),
    	'n_jobs': trial.suggest_int('n_jobs', 3, 3)
	}
    
    model = KNeighborsClassifier(**params)
    model.fit(X_risk_train, y_risk_train)
    preds = model.predict(X_risk_test)
    accuracy = accuracy_score(y_risk_test, preds)

    return accuracy

In [122]:
study = optuna.create_study(direction='maximize')
study.optimize(objective_kn_risk, n_trials=N_TRIALS, n_jobs=3)

[32m[I 2023-04-18 03:58:11,144][0m A new study created in memory with name: no-name-39a27b32-5d7b-4169-a806-0f0bc05b3b22[0m
[32m[I 2023-04-18 03:58:13,233][0m Trial 2 finished with value: 0.4151102542474997 and parameters: {'n_neighbors': 84, 'weights': 'uniform', 'algorithm': 'brute', 'leaf_size': 90, 'p': 2, 'n_jobs': 3}. Best is trial 2 with value: 0.4151102542474997.[0m
[32m[I 2023-04-18 03:58:13,740][0m Trial 0 finished with value: 0.4205325942884685 and parameters: {'n_neighbors': 45, 'weights': 'uniform', 'algorithm': 'kd_tree', 'leaf_size': 84, 'p': 2, 'n_jobs': 3}. Best is trial 0 with value: 0.4205325942884685.[0m
[32m[I 2023-04-18 03:58:13,955][0m Trial 1 finished with value: 0.4096879142065309 and parameters: {'n_neighbors': 12, 'weights': 'uniform', 'algorithm': 'kd_tree', 'leaf_size': 36, 'p': 2, 'n_jobs': 3}. Best is trial 0 with value: 0.4205325942884685.[0m
[32m[I 2023-04-18 03:58:16,277][0m Trial 3 finished with value: 0.4186046511627907 and parameters: 

In [123]:
print_best_result(study, 'KNeighborsClassifier', 'Risk')

KNeighborsClassifier - Risk
Melhor pontuação: 0.4230630196409206
Melhores hiperparâmetros: {'n_neighbors': 79, 'weights': 'distance', 'algorithm': 'ball_tree', 'leaf_size': 88, 'p': 1, 'n_jobs': 3}
