# Multiple Classifiers Ensemble System (MCS)

#### Iury Zanonni de Faria

### Imports

#### General imports

In [1]:
import pandas as pd
import numpy as np
import datetime as dt
import statistics as st
import matplotlib.pyplot as plt
import warnings
import optuna

  from .autonotebook import tqdm as notebook_tqdm


#### Feature Selection imports

In [2]:
from sklearn.feature_selection import mutual_info_classif
# Info gain - weka

#### Diversity imports

In [3]:
from sklearn.model_selection import KFold
from sklearn.ensemble import BaggingClassifier
from sklearn.ensemble import AdaBoostClassifier
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.model_selection import cross_val_score

#### Classifiers imports

In [4]:
import xgboost
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier
from sklearn.naive_bayes import BernoulliNB
from sklearn.naive_bayes import GaussianNB
from sklearn.neural_network import MLPClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier


#### Metrics

In [5]:
from sklearn.metrics import precision_score
from sklearn.metrics import accuracy_score

### Settings

In [6]:
DATA = ['Date', 'Current Ratio','Quick Ratio','Current Assets', 'Long-term debt to equity ratio', 'Share Holder Equity','Debt to Equity Ratio', 'Percentage of net profit to sale',
'Percentage of operating profit to sale','Percentage of Gross profit to sale','ROA','ROE','EPS','P/E','P/S','Stock book value','Stock Price','ROI','MarketReturn', 'Company']

DATA = ['Unnamed: 0', 'revenue','cost-goods-sold','gross-profit','research-development-expenses','selling-general-administrative-expenses','operating-expenses',
'operating-income','total-non-operating-income-expense','pre-tax-income','total-provision-income-taxes','income-after-taxes','income-from-continuous-operations',
'income-from-discontinued-operations','net-income','ebitda','ebit','basic-shares-outstanding','shares-outstanding','eps-basic-net-earnings-per-share',
'eps-earnings-per-share-diluted','cash-on-hand','receivables-total','inventory','other-current-assets','total-current-assets','net-property-plant-equipment',
'long-term-investments','goodwill-intangible-assets-total','other-long-term-assets','total-long-term-assets','total-assets','total-current-liabilities','long-term-debt',
'other-non-current-liabilities','total-long-term-liabilities','total-liabilities','common-stock-net','retained-earnings-accumulated-deficit','comprehensive-income',
'total-share-holder-equity','total-liabilities-share-holders-equity','net-income-loss','total-depreciation-amortization-cash-flow','other-non-cash-items','total-non-cash-items',
'change-in-accounts-receivable','change-in-inventories','change-in-accounts-payable','change-in-assets-liabilities','total-change-in-assets-liabilities',
'cash-flow-from-operating-activities','net-change-in-property-plant-equipment','net-change-in-intangible-assets','net-acquisitions-divestitures','investing-activities-other',
'cash-flow-from-investing-activities','net-long-term-debt','net-current-debt','debt-issuance-retirement-net-total','net-common-equity-issued-repurchased',
'net-total-equity-issued-repurchased','total-common-preferred-stock-dividends-paid','financial-activities-other','cash-flow-from-financial-activities',
'net-cash-flow','stock-based-compensation','common-stock-dividends-paid','current-ratio','long-term-debt-capital','debt-equity-ratio','gross-margin',
'operating-margin','ebit-margin','pre-tax-profit-margin','net-profit-margin','asset-turnover','inventory-turnover','receiveable-turnover','days-sales-in-receivables',
'roe','return-on-tangible-equity','roa','roi','book-value-per-share','operating-cash-flow-per-share','free-cash-flow-per-share','net-change-in-short-term-investments',
'net-change-in-long-term-investments','net-change-in-investments-total','other-operating-income-expenses','pre-paid-expenses','other-share-holders-equity','other-income',
'ebitda-margin']

REAL_RETURN_CLASS = "RealReturnClass"
REAL_RETURN = "RealReturn"
RISK_CLASS = 'RiskClass'
RISK = "Risk"

HIGH = 'high'
MEDIUM = 'medium'
LOW = 'low'

DATE = 'Unnamed: 0'

N_PERIODS = 2
N_FEATURES = 20

DATASET_PATH = 'new_dataset/process_final_{}.csv'.format(N_PERIODS)

N_TRIALS = 100

N_JOBS = -1

MUTUAL_INFORMATION = "MUTUAL_INFORMATION"

#Remove os warnings do notebook
warnings.filterwarnings('ignore')

### Import dataset

In [7]:
dataset = pd.read_csv(DATASET_PATH)

In [8]:
dataset = dataset.replace(to_replace=[HIGH], value=2.0)
dataset = dataset.replace(to_replace=[MEDIUM], value=1.0)
dataset = dataset.replace(to_replace=[LOW], value=0.0)

dataset = dataset.replace(to_replace=[np.NaN], value=0.0)

dataset_X = dataset.drop(columns=[REAL_RETURN_CLASS, REAL_RETURN, RISK_CLASS, RISK, DATE])
dataset_y = dataset.drop(columns=DATA)

### Feature Selection

In [9]:
def readFeatures(typeFeature, typeClass, num):
	file = open(f'./feature_selection/files/{typeFeature}_{typeClass}.txt', 'r')
	result = []
	
	for feature in file:
		result.append(eval(str(feature)))
	    
	file.close()
    
	return result[:num]

#### Real Return

In [10]:
final_ranking_real_return = readFeatures(MUTUAL_INFORMATION, REAL_RETURN, N_FEATURES)

#### Risk

In [11]:
final_ranking_risk  = readFeatures(MUTUAL_INFORMATION, RISK, N_FEATURES)

In [12]:
def getColumnsRank(rank: list):
  ranking = []
  for column in rank:
    ranking.append(column[0])
    
  return ranking

In [13]:
features_real_return = getColumnsRank(final_ranking_real_return)[:N_FEATURES]
features_risk = getColumnsRank(final_ranking_risk)[:N_FEATURES]

### Divisão do dataset

In [14]:
columns_dataset = DATA
columns_dataset.append(REAL_RETURN)
columns_dataset.append(RISK)
columns_dataset.append(REAL_RETURN_CLASS)
columns_dataset.append(RISK_CLASS)

df_train = None
df_test = None

df_train = pd.DataFrame(columns=columns_dataset)
df_test = pd.DataFrame(columns=columns_dataset)

In [15]:
# TRAINING_START_DATE =  dt.datetime.strptime('2009-03-31', "%Y-%m-%d")
# TRAINING_END_DATE =  dt.datetime.strptime('2018-03-31', "%Y-%m-%d")

# TEST_START_DATE =  dt.datetime.strptime('2018-06-30', "%Y-%m-%d")
# TEST_END_DATE =  dt.datetime.strptime('2022-03-31', "%Y-%m-%d")

# dataset_sort = dataset.sort_values(by=DATE)
# count_train = 0
# count_test = 0

# for index, row in dataset_sort.iterrows():
#   date = dt.datetime.strptime(row[DATE], "%Y-%m-%d")
#   if date.year < TEST_START_DATE.year:
#     df_train = df_train.append(row)
#     count_train +=1
#   elif date.year == TEST_START_DATE.year and date.month < TEST_START_DATE.month:
#     df_train = df_train.append(row)
#     count_train +=1
#   else:
#     df_test = df_test.append(row)
#     count_test += 1

# print(count_train)
# print(count_test)

# df_train = df_train.drop(columns=[REAL_RETURN, RISK, DATE])
# df_test = df_test.drop(columns=[REAL_RETURN, RISK, DATE])

In [16]:
df_train = pd.read_csv('./util/dataset_train.csv')
df_test = pd.read_csv('./util/dataset_test.csv')

##### Real Return

In [17]:
X_real_return_train = df_train[features_real_return]
y_real_return_train = df_train[REAL_RETURN_CLASS]

X_real_return_test = df_test[features_real_return]
y_real_return_test = df_test[REAL_RETURN_CLASS]

##### Risk

In [18]:
X_risk_train = df_train[features_risk]
y_risk_train = df_train[RISK_CLASS]

X_risk_test = df_test[features_risk]
y_risk_test = df_test[RISK_CLASS]

### Classificadores Únicos

In [19]:
classifiers_real_return = {}
classifiers_risk = {}

### Optuna

In [20]:
def print_best_result(study, classifier, type):
	print(f'{classifier} - {type}')
	print('Melhor pontuação:', study.best_value)
	print('Melhores hiperparâmetros:', study.best_params)

### Random Forest

##### Real Return

In [21]:
def objective_random_forest_return(trial):
    params = {
        'n_estimators': trial.suggest_int('n_estimators', 10, 1000),
        'max_depth': trial.suggest_int('max_depth', 2, 200),
        'min_samples_split': trial.suggest_int('min_samples_split', 2, 10),
        'min_samples_leaf': trial.suggest_int('min_samples_leaf', 1, 10),
        'max_features': trial.suggest_categorical('max_features', ['sqrt', 'log2']),
        'n_jobs': N_JOBS
    }
    model = RandomForestClassifier(**params)
    model.fit(X_real_return_train, y_real_return_train)
    preds = model.predict(X_real_return_test)
    accuracy = accuracy_score(y_real_return_test, preds)

    return accuracy

In [22]:
study = optuna.create_study(direction='maximize')
study.optimize(objective_random_forest_return, n_trials=N_TRIALS, n_jobs=N_JOBS)

[32m[I 2023-04-21 14:43:29,322][0m A new study created in memory with name: no-name-35cb4aaf-80d4-4a80-a686-ae24ae54b5fb[0m
[32m[I 2023-04-21 14:43:34,564][0m Trial 10 finished with value: 0.8376912881070009 and parameters: {'n_estimators': 13, 'max_depth': 53, 'min_samples_split': 6, 'min_samples_leaf': 9, 'max_features': 'log2'}. Best is trial 10 with value: 0.8376912881070009.[0m
[32m[I 2023-04-21 14:43:38,305][0m Trial 2 finished with value: 0.8363658272080974 and parameters: {'n_estimators': 51, 'max_depth': 195, 'min_samples_split': 7, 'min_samples_leaf': 10, 'max_features': 'sqrt'}. Best is trial 10 with value: 0.8376912881070009.[0m
[32m[I 2023-04-21 14:44:03,188][0m Trial 7 finished with value: 0.8390167490059043 and parameters: {'n_estimators': 318, 'max_depth': 107, 'min_samples_split': 3, 'min_samples_leaf': 8, 'max_features': 'sqrt'}. Best is trial 7 with value: 0.8390167490059043.[0m
[32m[I 2023-04-21 14:44:07,560][0m Trial 0 finished with value: 0.836486323

In [23]:
print_best_result(study, 'Random Forest', 'Real Return')


Random Forest - Real Return
Melhor pontuação: 0.8405832027955176
Melhores hiperparâmetros: {'n_estimators': 840, 'max_depth': 178, 'min_samples_split': 4, 'min_samples_leaf': 6, 'max_features': 'sqrt'}


##### Risk

In [24]:
def objective_random_forest_risk(trial):
    params = {
        'n_estimators': trial.suggest_int('n_estimators', 10, 1000),
        'max_depth': trial.suggest_int('max_depth', 2, 200),
        'min_samples_split': trial.suggest_int('min_samples_split', 2, 10),
        'min_samples_leaf': trial.suggest_int('min_samples_leaf', 1, 10),
        'max_features': trial.suggest_categorical('max_features', ['sqrt', 'log2']),
        'n_jobs': N_JOBS
    }
    model = RandomForestClassifier(**params)
    model.fit(X_risk_train, y_risk_train)
    preds = model.predict(X_risk_test)
    accuracy = accuracy_score(y_risk_test, preds)

    return accuracy

In [25]:
study = optuna.create_study(direction='maximize')
study.optimize(objective_random_forest_risk, n_trials=N_TRIALS, n_jobs=N_JOBS)

[32m[I 2023-04-21 14:53:25,214][0m A new study created in memory with name: no-name-521767eb-a1f3-4560-b7ef-b26cd6e10aaa[0m
[32m[I 2023-04-21 14:53:49,292][0m Trial 9 finished with value: 0.4986142908784191 and parameters: {'n_estimators': 175, 'max_depth': 65, 'min_samples_split': 9, 'min_samples_leaf': 4, 'max_features': 'sqrt'}. Best is trial 9 with value: 0.4986142908784191.[0m
[32m[I 2023-04-21 14:53:51,681][0m Trial 11 finished with value: 0.4988552837691288 and parameters: {'n_estimators': 181, 'max_depth': 28, 'min_samples_split': 5, 'min_samples_leaf': 3, 'max_features': 'sqrt'}. Best is trial 11 with value: 0.4988552837691288.[0m
[32m[I 2023-04-21 14:53:58,202][0m Trial 6 finished with value: 0.5000602482226775 and parameters: {'n_estimators': 285, 'max_depth': 114, 'min_samples_split': 9, 'min_samples_leaf': 10, 'max_features': 'log2'}. Best is trial 6 with value: 0.5000602482226775.[0m
[32m[I 2023-04-21 14:54:16,205][0m Trial 1 finished with value: 0.498132305

In [26]:
print_best_result(study, 'Random Forest', 'Risk')

Random Forest - Risk
Melhor pontuação: 0.5052415953729364
Melhores hiperparâmetros: {'n_estimators': 622, 'max_depth': 70, 'min_samples_split': 3, 'min_samples_leaf': 9, 'max_features': 'log2'}


### SVM

##### Real Return

In [27]:
def objective_svc_return(trial):
    params = {
        'kernel': trial.suggest_categorical('kernel', ['poly', 'rbf', 'sigmoid']),
        'C':trial.suggest_float('C', 1, 100),
        'class_weight': trial.suggest_categorical('class_weight', ['balanced', None])
	}
    model = SVC(**params)
    model.fit(X_real_return_train, y_real_return_train)
    preds = model.predict(X_real_return_test)
    accuracy = accuracy_score(y_real_return_test, preds)

    return accuracy

In [28]:
study = optuna.create_study(direction='maximize')
study.optimize(objective_svc_return, n_trials=N_TRIALS, n_jobs=N_JOBS)

[32m[I 2023-04-21 15:04:08,175][0m A new study created in memory with name: no-name-3e33e691-c1a9-432f-a158-803810329a38[0m
[32m[I 2023-04-21 15:05:15,133][0m Trial 1 finished with value: 0.38149174599349317 and parameters: {'kernel': 'sigmoid', 'C': 43.27231690712184, 'class_weight': None}. Best is trial 1 with value: 0.38149174599349317.[0m
[32m[I 2023-04-21 15:05:15,918][0m Trial 5 finished with value: 0.38161224243884806 and parameters: {'kernel': 'sigmoid', 'C': 69.7530951812302, 'class_weight': None}. Best is trial 5 with value: 0.38161224243884806.[0m
[32m[I 2023-04-21 15:05:19,689][0m Trial 11 finished with value: 0.38799855404265576 and parameters: {'kernel': 'sigmoid', 'C': 48.424966953003626, 'class_weight': 'balanced'}. Best is trial 11 with value: 0.38799855404265576.[0m
[32m[I 2023-04-21 15:05:42,182][0m Trial 9 finished with value: 0.6941800216893602 and parameters: {'kernel': 'rbf', 'C': 87.96849210269706, 'class_weight': None}. Best is trial 9 with value:

In [29]:
print_best_result(study, 'SVM', 'Real Return')

SVM - Real Return
Melhor pontuação: 0.6983973972767803
Melhores hiperparâmetros: {'kernel': 'rbf', 'C': 99.94849891435051, 'class_weight': 'balanced'}


##### Risk

In [30]:
def objective_svc_risk(trial):
    params = {
        'kernel': trial.suggest_categorical('kernel', ['poly', 'rbf', 'sigmoid']),
        'C':trial.suggest_float('C', 1, 100),
        'class_weight': trial.suggest_categorical('class_weight', ['balanced', None])
	}
    model = SVC(**params)
    model.fit(X_risk_train, y_risk_train)
    preds = model.predict(X_risk_test)
    accuracy = accuracy_score(y_risk_test, preds)

    return accuracy

In [31]:
study = optuna.create_study(direction='maximize')
study.optimize(objective_svc_risk, n_trials=N_TRIALS, n_jobs=N_JOBS)

[32m[I 2023-04-21 15:18:13,342][0m A new study created in memory with name: no-name-6ae1c79e-3e63-4aec-b275-42276bafa628[0m
[32m[I 2023-04-21 15:19:16,390][0m Trial 10 finished with value: 0.320520544643933 and parameters: {'kernel': 'sigmoid', 'C': 83.7901007847199, 'class_weight': None}. Best is trial 10 with value: 0.320520544643933.[0m
[32m[I 2023-04-21 15:19:17,214][0m Trial 0 finished with value: 0.320520544643933 and parameters: {'kernel': 'sigmoid', 'C': 27.022476186365395, 'class_weight': None}. Best is trial 10 with value: 0.320520544643933.[0m
[32m[I 2023-04-21 15:19:21,357][0m Trial 5 finished with value: 0.31811061573683574 and parameters: {'kernel': 'sigmoid', 'C': 47.869830531643885, 'class_weight': 'balanced'}. Best is trial 10 with value: 0.320520544643933.[0m
[32m[I 2023-04-21 15:19:21,404][0m Trial 1 finished with value: 0.31811061573683574 and parameters: {'kernel': 'sigmoid', 'C': 62.25071138792972, 'class_weight': 'balanced'}. Best is trial 10 with v

In [32]:
print_best_result(study, 'SVM', 'Risk')

SVM - Risk
Melhor pontuação: 0.39558983010001203
Melhores hiperparâmetros: {'kernel': 'rbf', 'C': 99.89489576327396, 'class_weight': 'balanced'}


### Decision Tree

##### Real Return

In [33]:
def objective_decision_tree_return(trial):
    params = {
        'criterion': trial.suggest_categorical('criterion', ['gini', 'entropy', 'log_loss']),
        'splitter': trial.suggest_categorical('splitter', ['best', 'random']),
        'max_depth': trial.suggest_int('max_depth', 2, 200),
        'min_samples_split': trial.suggest_int('min_samples_split', 2, 2000),
	}
    model = DecisionTreeClassifier(**params)
    model.fit(X_real_return_train, y_real_return_train)
    preds = model.predict(X_real_return_test)
    accuracy = accuracy_score(y_real_return_test, preds)

    return accuracy

In [34]:
study = optuna.create_study(direction='maximize')
study.optimize(objective_decision_tree_return, n_trials=N_TRIALS, n_jobs=N_JOBS)

[32m[I 2023-04-21 18:03:47,102][0m A new study created in memory with name: no-name-42b6efd6-17b7-4002-8897-d8d16720259d[0m
[32m[I 2023-04-21 18:03:47,210][0m Trial 8 finished with value: 0.7043017230991686 and parameters: {'criterion': 'log_loss', 'splitter': 'random', 'max_depth': 67, 'min_samples_split': 1629}. Best is trial 8 with value: 0.7043017230991686.[0m
[32m[I 2023-04-21 18:03:47,218][0m Trial 1 finished with value: 0.6642969032413544 and parameters: {'criterion': 'entropy', 'splitter': 'random', 'max_depth': 170, 'min_samples_split': 1865}. Best is trial 8 with value: 0.7043017230991686.[0m
[32m[I 2023-04-21 18:03:47,230][0m Trial 0 finished with value: 0.7409326424870466 and parameters: {'criterion': 'gini', 'splitter': 'random', 'max_depth': 105, 'min_samples_split': 739}. Best is trial 0 with value: 0.7409326424870466.[0m
[32m[I 2023-04-21 18:03:47,237][0m Trial 5 finished with value: 0.6629714423424509 and parameters: {'criterion': 'gini', 'splitter': 'ran

In [35]:
print_best_result(study, 'DecisionTreeClassifier', 'Real Return')

DecisionTreeClassifier - Real Return
Melhor pontuação: 0.8350403663091939
Melhores hiperparâmetros: {'criterion': 'entropy', 'splitter': 'best', 'max_depth': 10, 'min_samples_split': 1097}


##### Risk

In [36]:
def objective_decision_tree_risk(trial):
    params = {
        'criterion': trial.suggest_categorical('criterion', ['gini', 'entropy', 'log_loss']),
        'splitter': trial.suggest_categorical('splitter', ['best', 'random']),
        'max_depth': trial.suggest_int('max_depth', 2, 200),
        'min_samples_split': trial.suggest_int('min_samples_split', 2, 2000),
	}
    model = DecisionTreeClassifier(**params)
    model.fit(X_risk_train, y_risk_train)
    preds = model.predict(X_risk_test)
    accuracy = accuracy_score(y_risk_test, preds)

    return accuracy

In [37]:
study = optuna.create_study(direction='maximize')
study.optimize(objective_decision_tree_risk, n_trials=N_TRIALS, n_jobs=N_JOBS)

[32m[I 2023-04-21 18:03:52,129][0m A new study created in memory with name: no-name-3ad0daf5-b316-4d92-bbdb-b225b6213851[0m
[32m[I 2023-04-21 18:03:52,203][0m Trial 0 finished with value: 0.4328232317146644 and parameters: {'criterion': 'log_loss', 'splitter': 'random', 'max_depth': 9, 'min_samples_split': 542}. Best is trial 0 with value: 0.4328232317146644.[0m
[32m[I 2023-04-21 18:03:52,217][0m Trial 2 finished with value: 0.4598144354741535 and parameters: {'criterion': 'log_loss', 'splitter': 'random', 'max_depth': 30, 'min_samples_split': 1058}. Best is trial 2 with value: 0.4598144354741535.[0m
[32m[I 2023-04-21 18:03:52,243][0m Trial 5 finished with value: 0.45704301723099167 and parameters: {'criterion': 'log_loss', 'splitter': 'random', 'max_depth': 27, 'min_samples_split': 1962}. Best is trial 2 with value: 0.4598144354741535.[0m
[32m[I 2023-04-21 18:03:52,247][0m Trial 10 finished with value: 0.4472828051572479 and parameters: {'criterion': 'gini', 'splitter': 

In [38]:
print_best_result(study, 'DecisionTreeClassifier', 'Risk')


DecisionTreeClassifier - Risk
Melhor pontuação: 0.4805398240751898
Melhores hiperparâmetros: {'criterion': 'entropy', 'splitter': 'best', 'max_depth': 133, 'min_samples_split': 919}


### Naive Bayes

##### Real Return

In [39]:
def objective_naive_bayes_return(trial):
    params = {
        'var_smoothing': trial.suggest_loguniform('var_smoothing', 1e-12, 1e-5)
	}
    model = GaussianNB(**params)
    model.fit(X_real_return_train, y_real_return_train)
    preds = model.predict(X_real_return_test)
    accuracy = accuracy_score(y_real_return_test, preds)

    return accuracy

In [40]:
study = optuna.create_study(direction='maximize')
study.optimize(objective_naive_bayes_return, n_trials=N_TRIALS, n_jobs=N_JOBS)

[32m[I 2023-04-21 18:03:58,846][0m A new study created in memory with name: no-name-28889485-c2a3-41f9-898d-4092f2ee26ab[0m
[32m[I 2023-04-21 18:03:58,958][0m Trial 0 finished with value: 0.37341848415471746 and parameters: {'var_smoothing': 1.8557146704786043e-06}. Best is trial 0 with value: 0.37341848415471746.[0m
[32m[I 2023-04-21 18:03:58,959][0m Trial 2 finished with value: 0.37341848415471746 and parameters: {'var_smoothing': 1.847122980665886e-06}. Best is trial 0 with value: 0.37341848415471746.[0m
[32m[I 2023-04-21 18:03:58,968][0m Trial 7 finished with value: 0.3779973490782022 and parameters: {'var_smoothing': 8.335200190981194e-06}. Best is trial 7 with value: 0.3779973490782022.[0m
[32m[I 2023-04-21 18:03:58,980][0m Trial 4 finished with value: 0.3811302566574286 and parameters: {'var_smoothing': 1.3935991982085752e-08}. Best is trial 4 with value: 0.3811302566574286.[0m
[32m[I 2023-04-21 18:03:58,985][0m Trial 1 finished with value: 0.38125075310278345 a

In [41]:
print_best_result(study, 'Naive Bayes', 'Real Return')

Naive Bayes - Real Return
Melhor pontuação: 0.429690324135438
Melhores hiperparâmetros: {'var_smoothing': 1.1218244619811e-12}


##### Risk

In [42]:
def objective_naive_bayes_risk(trial):
    params = {
        'var_smoothing': trial.suggest_loguniform('var_smoothing', 1e-12, 1e-5)
	}
    model = GaussianNB(**params)
    model.fit(X_risk_train, y_risk_train)
    preds = model.predict(X_risk_test)
    accuracy = accuracy_score(y_risk_test, preds)

    return accuracy

In [43]:
study = optuna.create_study(direction='maximize')
study.optimize(objective_naive_bayes_risk, n_trials=N_TRIALS, n_jobs=N_JOBS)

[32m[I 2023-04-21 18:04:00,856][0m A new study created in memory with name: no-name-bd42f733-73c2-4a3e-93fd-cbb5676c5892[0m
[32m[I 2023-04-21 18:04:00,964][0m Trial 2 finished with value: 0.356307988914327 and parameters: {'var_smoothing': 2.179134213612953e-09}. Best is trial 2 with value: 0.356307988914327.[0m
[32m[I 2023-04-21 18:04:00,967][0m Trial 0 finished with value: 0.3195565730810941 and parameters: {'var_smoothing': 9.654162864784708e-07}. Best is trial 2 with value: 0.356307988914327.[0m
[32m[I 2023-04-21 18:04:00,972][0m Trial 1 finished with value: 0.31521870104831906 and parameters: {'var_smoothing': 5.004721403902181e-06}. Best is trial 2 with value: 0.356307988914327.[0m
[32m[I 2023-04-21 18:04:00,979][0m Trial 7 finished with value: 0.35871791782142426 and parameters: {'var_smoothing': 1.9701785382290422e-11}. Best is trial 7 with value: 0.35871791782142426.[0m
[32m[I 2023-04-21 18:04:00,982][0m Trial 4 finished with value: 0.3354621038679359 and para

In [44]:
print_best_result(study, 'Naive Bayes', 'Risk')

Naive Bayes - Risk
Melhor pontuação: 0.3590794071574889
Melhores hiperparâmetros: {'var_smoothing': 1.013375533407592e-12}


### Rede Neural

##### Real Return

In [45]:
def objective_neural_network_return(trial):
	params = {
		'activation': trial.suggest_categorical('activation', ['identity', 'logistic', 'tanh', 'relu']),
		'solver': trial.suggest_categorical('solver', ['lbfgs', 'sgd', 'adam']),
		'max_iter': trial.suggest_int('max_iter', 200, 2000),
		'hidden_layer_sizes': trial.suggest_categorical('hidden_layer_sizes', [(100,), (200,), (300,), (400,), (500,), (600,), (700,), (800,), (900,), (1000,)]),
		'learning_rate': trial.suggest_categorical('learning_rate', ['constant', 'invscaling', 'adaptive']), 
	}

	model = MLPClassifier(**params)
	model.fit(X_real_return_train, y_real_return_train)
	preds = model.predict(X_real_return_test)
	accuracy = accuracy_score(y_real_return_test, preds)

	return accuracy

In [46]:
study = optuna.create_study(direction='maximize')
study.optimize(objective_neural_network_return, n_trials=N_TRIALS, n_jobs=N_JOBS)

[32m[I 2023-04-21 18:04:02,689][0m A new study created in memory with name: no-name-0819c187-d1d8-447c-a2d3-27889cab3951[0m
[32m[I 2023-04-21 18:06:37,112][0m Trial 4 finished with value: 0.710929027593686 and parameters: {'activation': 'tanh', 'solver': 'adam', 'max_iter': 829, 'hidden_layer_sizes': (100,), 'learning_rate': 'invscaling'}. Best is trial 4 with value: 0.710929027593686.[0m
[32m[I 2023-04-21 18:07:08,694][0m Trial 3 finished with value: 0.6986383901674901 and parameters: {'activation': 'tanh', 'solver': 'lbfgs', 'max_iter': 266, 'hidden_layer_sizes': (500,), 'learning_rate': 'adaptive'}. Best is trial 4 with value: 0.710929027593686.[0m
[32m[I 2023-04-21 18:07:19,987][0m Trial 8 finished with value: 0.6880347029762622 and parameters: {'activation': 'tanh', 'solver': 'adam', 'max_iter': 1846, 'hidden_layer_sizes': (300,), 'learning_rate': 'adaptive'}. Best is trial 4 with value: 0.710929027593686.[0m
[32m[I 2023-04-21 18:07:25,606][0m Trial 7 finished with v

In [47]:
print_best_result(study, 'Rede Neural', 'Real Return')

Rede Neural - Real Return
Melhor pontuação: 0.7322568984214965
Melhores hiperparâmetros: {'activation': 'logistic', 'solver': 'lbfgs', 'max_iter': 1471, 'hidden_layer_sizes': (800,), 'learning_rate': 'constant'}


##### Risk

In [48]:
def objective_neural_network_risk(trial):
	params = {
		'activation': trial.suggest_categorical('activation', ['identity', 'logistic', 'tanh', 'relu']),
		'solver': trial.suggest_categorical('solver', ['lbfgs', 'sgd', 'adam']),
		'max_iter': trial.suggest_int('max_iter', 200, 2000),
		'hidden_layer_sizes': trial.suggest_categorical('hidden_layer_sizes', [(100,), (200,), (300,), (400,), (500,), (600,), (700,), (800,), (900,), (1000,)]),
		'learning_rate': trial.suggest_categorical('learning_rate', ['constant', 'invscaling', 'adaptive']), 
	}

	model = MLPClassifier(**params)
	model.fit(X_risk_train, y_risk_train)
	preds = model.predict(X_risk_test)
	accuracy = accuracy_score(y_risk_test, preds)

	return accuracy

In [49]:
study = optuna.create_study(direction='maximize')
study.optimize(objective_neural_network_risk, n_trials=N_TRIALS, n_jobs=N_JOBS)

[32m[I 2023-04-21 20:44:18,308][0m A new study created in memory with name: no-name-0bc2ea2b-3032-43e1-b768-9d72314e5e59[0m
[32m[I 2023-04-21 20:48:51,882][0m Trial 3 finished with value: 0.4055910350644656 and parameters: {'activation': 'identity', 'solver': 'lbfgs', 'max_iter': 1940, 'hidden_layer_sizes': (700,), 'learning_rate': 'invscaling'}. Best is trial 3 with value: 0.4055910350644656.[0m
[32m[I 2023-04-21 20:51:39,607][0m Trial 9 finished with value: 0.37149054102903967 and parameters: {'activation': 'logistic', 'solver': 'sgd', 'max_iter': 1508, 'hidden_layer_sizes': (100,), 'learning_rate': 'invscaling'}. Best is trial 3 with value: 0.4055910350644656.[0m
[32m[I 2023-04-21 20:54:25,308][0m Trial 5 finished with value: 0.376430895288589 and parameters: {'activation': 'identity', 'solver': 'adam', 'max_iter': 727, 'hidden_layer_sizes': (400,), 'learning_rate': 'invscaling'}. Best is trial 3 with value: 0.4055910350644656.[0m
[32m[I 2023-04-21 20:57:55,743][0m Tri

ValueError: Solver produced non-finite parameter weights. The input data may contain large values and need to be preprocessed.

In [None]:
print_best_result(study, 'Rede Neural', 'Risk')

### Regressão Logistica

##### Real Return

In [50]:
def objective_logistic_regression_return(trial):
    params = {
        'penalty': trial.suggest_categorical('penalty', ['l2']),
        'C':trial.suggest_float('C', 1, 100),
        'class_weight': trial.suggest_categorical('class_weight', ['balanced', None]),
        'solver': trial.suggest_categorical('solver', ['lbfgs', 'liblinear', 'newton-cg', 'newton-cholesky', 'sag', 'saga']),
        'max_iter': trial.suggest_int('max_iter', 100, 10000),
        'n_jobs': N_JOBS

	}
    model = LogisticRegression(**params)
    model.fit(X_real_return_train, y_real_return_train)
    preds = model.predict(X_real_return_test)
    accuracy = accuracy_score(y_real_return_test, preds)

    return accuracy

In [51]:
study = optuna.create_study(direction='maximize')
study.optimize(objective_logistic_regression_return, n_trials=N_TRIALS, n_jobs=N_JOBS)

[32m[I 2023-04-22 01:15:52,001][0m A new study created in memory with name: no-name-33d91aac-bc0f-48fd-b75e-6efb86872be6[0m
Further options are to use another solver or to avoid such situation in the first place. Possible remedies are removing collinear features of X or increasing the penalization strengths.
The original Linear Algebra message was:
Matrix is singular.
Further options are to use another solver or to avoid such situation in the first place. Possible remedies are removing collinear features of X or increasing the penalization strengths.
The original Linear Algebra message was:
Matrix is singular.
Further options are to use another solver or to avoid such situation in the first place. Possible remedies are removing collinear features of X or increasing the penalization strengths.
The original Linear Algebra message was:
Matrix is singular.
Further options are to use another solver or to avoid such situation in the first place. Possible remedies are removing collinear fe

In [52]:
print_best_result(study, 'Logistic Regression', 'Real Return')

Logistic Regression - Real Return
Melhor pontuação: 0.8208217857573201
Melhores hiperparâmetros: {'penalty': 'l2', 'C': 8.82516085005323, 'class_weight': None, 'solver': 'newton-cg', 'max_iter': 5411}


##### Risk

In [53]:
def objective_logistic_regression_risk(trial):
    params = {
        'penalty': trial.suggest_categorical('penalty', ['l2']),
        'C':trial.suggest_float('C', 1, 100),
        'class_weight': trial.suggest_categorical('class_weight', ['balanced', None]),
        'solver': trial.suggest_categorical('solver', ['lbfgs', 'liblinear', 'newton-cg', 'newton-cholesky', 'sag', 'saga']),
        'max_iter': trial.suggest_int('max_iter', 100, 10000),
        'n_jobs': N_JOBS

	}
    model = LogisticRegression(**params)
    model.fit(X_risk_train, y_risk_train)
    preds = model.predict(X_risk_test)
    accuracy = accuracy_score(y_risk_test, preds)

    return accuracy

In [54]:
study = optuna.create_study(direction='maximize')
study.optimize(objective_logistic_regression_risk, n_trials=N_TRIALS, n_jobs=N_JOBS)

[32m[I 2023-04-22 01:54:24,919][0m A new study created in memory with name: no-name-32e50b4d-440c-425b-b0c5-318054062d60[0m
[32m[I 2023-04-22 01:54:25,203][0m Trial 0 finished with value: 0.40426557416556214 and parameters: {'penalty': 'l2', 'C': 4.327319582199049, 'class_weight': None, 'solver': 'newton-cholesky', 'max_iter': 7696}. Best is trial 0 with value: 0.40426557416556214.[0m
[32m[I 2023-04-22 01:54:25,267][0m Trial 8 finished with value: 0.4040245812748524 and parameters: {'penalty': 'l2', 'C': 33.78414884741859, 'class_weight': None, 'solver': 'newton-cholesky', 'max_iter': 399}. Best is trial 0 with value: 0.40426557416556214.[0m
[32m[I 2023-04-22 01:54:25,318][0m Trial 12 finished with value: 0.4040245812748524 and parameters: {'penalty': 'l2', 'C': 32.2822339929061, 'class_weight': None, 'solver': 'newton-cholesky', 'max_iter': 7981}. Best is trial 0 with value: 0.40426557416556214.[0m
[32m[I 2023-04-22 01:54:25,329][0m Trial 10 finished with value: 0.418243

In [55]:
print_best_result(study, 'Logistic Regression', 'Risk')

Logistic Regression - Risk
Melhor pontuação: 0.43246174237859986
Melhores hiperparâmetros: {'penalty': 'l2', 'C': 12.784017261261628, 'class_weight': 'balanced', 'solver': 'liblinear', 'max_iter': 2936}


### KNeighborsClassifier

##### Real Return

In [56]:
def objective_kn_return(trial):
    params = {
		'n_neighbors': trial.suggest_int('n_neighbors', 5, 200),
		'weights': trial.suggest_categorical('weights', ['uniform', 'distance']),
		'algorithm': trial.suggest_categorical('algorithm', ['ball_tree', 'kd_tree', 'brute']),
		'leaf_size': trial.suggest_int('leaf_size', 30, 100),
		'p': trial.suggest_int('p', 1, 3),
    	'n_jobs': N_JOBS
	}
    
    model = KNeighborsClassifier(**params)
    model.fit(X_real_return_train, y_real_return_train)
    preds = model.predict(X_real_return_test)
    accuracy = accuracy_score(y_real_return_test, preds)

    return accuracy

In [57]:
study = optuna.create_study(direction='maximize')
study.optimize(objective_kn_return, n_trials=N_TRIALS, n_jobs=N_JOBS)

[32m[I 2023-04-22 02:03:07,694][0m A new study created in memory with name: no-name-09e1b074-7376-4e99-98c3-332d620db329[0m
[32m[I 2023-04-22 02:03:09,903][0m Trial 4 finished with value: 0.6046511627906976 and parameters: {'n_neighbors': 73, 'weights': 'distance', 'algorithm': 'kd_tree', 'leaf_size': 62, 'p': 2}. Best is trial 4 with value: 0.6046511627906976.[0m
[32m[I 2023-04-22 02:03:11,518][0m Trial 5 finished with value: 0.616580310880829 and parameters: {'n_neighbors': 23, 'weights': 'uniform', 'algorithm': 'kd_tree', 'leaf_size': 44, 'p': 2}. Best is trial 5 with value: 0.616580310880829.[0m
[32m[I 2023-04-22 02:03:12,553][0m Trial 6 finished with value: 0.5717556332088204 and parameters: {'n_neighbors': 139, 'weights': 'uniform', 'algorithm': 'kd_tree', 'leaf_size': 65, 'p': 2}. Best is trial 5 with value: 0.616580310880829.[0m
[32m[I 2023-04-22 02:03:13,448][0m Trial 12 finished with value: 0.6697192432823231 and parameters: {'n_neighbors': 187, 'weights': 'dist

In [58]:
print_best_result(study, 'KNeighborsClassifier', 'Real Return')

KNeighborsClassifier - Real Return
Melhor pontuação: 0.6911676105554886
Melhores hiperparâmetros: {'n_neighbors': 47, 'weights': 'distance', 'algorithm': 'kd_tree', 'leaf_size': 37, 'p': 1}


##### Risk

In [59]:
def objective_kn_risk(trial):
    params = {
		'n_neighbors': trial.suggest_int('n_neighbors', 5, 200),
		'weights': trial.suggest_categorical('weights', ['uniform', 'distance']),
		'algorithm': trial.suggest_categorical('algorithm', ['ball_tree', 'kd_tree', 'brute']),
		'leaf_size': trial.suggest_int('leaf_size', 30, 100),
		'p': trial.suggest_int('p', 1, 3),
    	'n_jobs': N_JOBS
	}
    
    model = KNeighborsClassifier(**params)
    model.fit(X_risk_train, y_risk_train)
    preds = model.predict(X_risk_test)
    accuracy = accuracy_score(y_risk_test, preds)

    return accuracy

In [60]:
study = optuna.create_study(direction='maximize')
study.optimize(objective_kn_risk, n_trials=N_TRIALS, n_jobs=N_JOBS)

[32m[I 2023-04-22 02:05:00,479][0m A new study created in memory with name: no-name-d7e6d596-071b-4a08-9a36-5224c40e1b31[0m
[32m[I 2023-04-22 02:05:06,292][0m Trial 8 finished with value: 0.4075189781901434 and parameters: {'n_neighbors': 80, 'weights': 'uniform', 'algorithm': 'kd_tree', 'leaf_size': 98, 'p': 2}. Best is trial 8 with value: 0.4075189781901434.[0m
[32m[I 2023-04-22 02:05:08,837][0m Trial 1 finished with value: 0.4119773466682733 and parameters: {'n_neighbors': 96, 'weights': 'distance', 'algorithm': 'ball_tree', 'leaf_size': 62, 'p': 2}. Best is trial 1 with value: 0.4119773466682733.[0m
[32m[I 2023-04-22 02:05:09,058][0m Trial 10 finished with value: 0.4205325942884685 and parameters: {'n_neighbors': 81, 'weights': 'distance', 'algorithm': 'ball_tree', 'leaf_size': 58, 'p': 1}. Best is trial 10 with value: 0.4205325942884685.[0m
[32m[I 2023-04-22 02:05:09,123][0m Trial 0 finished with value: 0.4161947222556935 and parameters: {'n_neighbors': 142, 'weights

In [61]:
print_best_result(study, 'KNeighborsClassifier', 'Risk')

KNeighborsClassifier - Risk
Melhor pontuação: 0.4230630196409206
Melhores hiperparâmetros: {'n_neighbors': 104, 'weights': 'distance', 'algorithm': 'ball_tree', 'leaf_size': 69, 'p': 1}
