# Multiple Classifiers Ensemble System (MCS)

#### Iury Zanonni de Faria

### Imports

#### General imports

In [None]:
import pandas as pd
import numpy as np
import datetime as dt
import statistics as st
import matplotlib.pyplot as plt
import warnings

#### Feature Selection imports

In [None]:
from sklearn.feature_selection import mutual_info_classif

#### Diversity imports

In [None]:
from sklearn.model_selection import KFold
from sklearn.ensemble import BaggingClassifier
from sklearn.ensemble import AdaBoostClassifier
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.model_selection import cross_val_score

#### Classifiers imports

In [None]:
import xgboost
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier
from sklearn.naive_bayes import BernoulliNB
from sklearn.naive_bayes import GaussianNB
from sklearn.neural_network import MLPClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier


#### k-means

In [None]:
from sklearn.metrics import silhouette_score, silhouette_samples
from sklearn.cluster import KMeans

#### Metrics

In [None]:
from sklearn.metrics import precision_score

In [None]:
from sklearn.model_selection import train_test_split

### Settings

In [None]:
# Names of the non-target columns of the dataset, starting with the
# 'Unnamed: 0' column (the same name bound to DATE below).
# `dataset.drop(columns=DATA)` is used later to keep only the target columns.
DATA = ['Unnamed: 0', 'revenue','cost-goods-sold','gross-profit','research-development-expenses','selling-general-administrative-expenses','operating-expenses',
'operating-income','total-non-operating-income-expense','pre-tax-income','total-provision-income-taxes','income-after-taxes','income-from-continuous-operations',
'income-from-discontinued-operations','net-income','ebitda','ebit','basic-shares-outstanding','shares-outstanding','eps-basic-net-earnings-per-share',
'eps-earnings-per-share-diluted','cash-on-hand','receivables-total','inventory','other-current-assets','total-current-assets','net-property-plant-equipment',
'long-term-investments','goodwill-intangible-assets-total','other-long-term-assets','total-long-term-assets','total-assets','total-current-liabilities','long-term-debt',
'other-non-current-liabilities','total-long-term-liabilities','total-liabilities','common-stock-net','retained-earnings-accumulated-deficit','comprehensive-income',
'total-share-holder-equity','total-liabilities-share-holders-equity','net-income-loss','total-depreciation-amortization-cash-flow','other-non-cash-items','total-non-cash-items',
'change-in-accounts-receivable','change-in-inventories','change-in-accounts-payable','change-in-assets-liabilities','total-change-in-assets-liabilities',
'cash-flow-from-operating-activities','net-change-in-property-plant-equipment','net-change-in-intangible-assets','net-acquisitions-divestitures','investing-activities-other',
'cash-flow-from-investing-activities','net-long-term-debt','net-current-debt','debt-issuance-retirement-net-total','net-common-equity-issued-repurchased',
'net-total-equity-issued-repurchased','total-common-preferred-stock-dividends-paid','financial-activities-other','cash-flow-from-financial-activities',
'net-cash-flow','stock-based-compensation','common-stock-dividends-paid','current-ratio','long-term-debt-capital','debt-equity-ratio','gross-margin',
'operating-margin','ebit-margin','pre-tax-profit-margin','net-profit-margin','asset-turnover','inventory-turnover','receiveable-turnover','days-sales-in-receivables',
'roe','return-on-tangible-equity','roa','roi','book-value-per-share','operating-cash-flow-per-share','free-cash-flow-per-share','net-change-in-short-term-investments',
'net-change-in-long-term-investments','net-change-in-investments-total','other-operating-income-expenses','pre-paid-expenses','other-share-holders-equity','other-income',
'ebitda-margin']

# Target column names: the two class labels and their underlying raw values.
REAL_RETURN_CLASS = "RealReturnClass"
REAL_RETURN = "RealReturn"
RISK_CLASS = 'RiskClass'
RISK = "Risk"

# Textual class labels found in the dataset; they are replaced by numbers later.
HIGH = 'high'
MEDIUM = 'medium'
LOW = 'low'

# Name of the date column (also the first entry of DATA).
DATE = 'Unnamed: 0'

# N_PERIODS selects which processed dataset file is loaded;
# N_FEATURES is how many top-ranked features are kept per target.
N_PERIODS = 2
N_FEATURES = 60

DATASET_PATH = 'new_dataset/process_final_{}.csv'.format(N_PERIODS)

# Identifiers of the available feature-ranking methods. They are used in the
# names of the ranking files read by readFeatures and of the results log.
MUTUAL_INFORMATION = "MUTUAL_INFORMATION"

SPEARMAN = "SPEARMAN"

ONE_R = "ONE_R"

# Ranking method used for this run.
FEATURE_SELECTION = MUTUAL_INFORMATION

# Suppress all warnings in the notebook output.
warnings.filterwarnings('ignore')

In [None]:
# Results log for this run, named after the ranking method and feature count.
# 'w+' truncates any existing file. The handle stays open across the notebook
# and is closed by the final cell.
file = open(f'files/{FEATURE_SELECTION}_{N_FEATURES}.txt', 'w+')

### Import dataset

In [None]:
# Load the processed dataset selected by N_PERIODS.
dataset = pd.read_csv(DATASET_PATH)

### Data Frequency

In [None]:
def plotResults(dataset:pd.DataFrame, title_1:str, title_2:str):
    """Draw side-by-side bar charts of the class frequencies of both targets.

    Args:
        dataset: frame containing the REAL_RETURN_CLASS and RISK_CLASS columns.
        title_1: title of the left chart (real-return class counts).
        title_2: title of the right chart (risk class counts).
    """
    fig, axes = plt.subplots(nrows=1, ncols=2, figsize=(15, 5))

    # Left panel: real-return classes; right panel: risk classes.
    panels = ((REAL_RETURN_CLASS, title_1), (RISK_CLASS, title_2))
    for axis, (column, title) in zip(axes, panels):
        dataset[column].value_counts().plot.bar(ax=axis)
        axis.set_title(title)

plotResults(dataset, "Real Return", "Risk")

In [None]:
# Encode the textual class labels as floats: low -> 0.0, medium -> 1.0, high -> 2.0.
# The replacement runs over every column of the frame, not only the class columns.
dataset = dataset.replace(to_replace=[HIGH], value=2.0)
dataset = dataset.replace(to_replace=[MEDIUM], value=1.0)
dataset = dataset.replace(to_replace=[LOW], value=0.0)

# Missing values become 0.0. `np.nan` is the canonical spelling; the `np.NaN`
# alias was removed in NumPy 2.0 and raises AttributeError there.
dataset = dataset.replace(to_replace=[np.nan], value=0.0)

# Features: every column except the four target columns and the date column.
dataset_X = dataset.drop(columns=[REAL_RETURN_CLASS, REAL_RETURN, RISK_CLASS, RISK, DATE])
# Targets: whatever remains after dropping the columns listed in DATA.
dataset_y = dataset.drop(columns=DATA)

### Feature Selection

In [None]:
# Section header of the feature-selection part of the results log.
file.write("######## FEATURES ########")
file.write("\n")

In [None]:
# Record when the run started; the final cell subtracts this from the end time.
run_start_date = dt.datetime.now()
file.write(f"\nSTART: {run_start_date}\n")

In [None]:
def readFeatures(typeFeature, typeClass, num):
    """Load the first ``num`` entries of a stored feature ranking.

    The ranking is read from
    ``./feature_selection/files/{typeFeature}_{typeClass}.txt``; every
    non-blank line holds one Python expression, which is evaluated and
    collected in file order.

    Args:
        typeFeature: ranking-method identifier used in the file name.
        typeClass: target identifier used in the file name.
        num: maximum number of entries to return.

    Returns:
        A list with at most ``num`` evaluated entries.

    Raises:
        FileNotFoundError: if the ranking file does not exist.
    """
    path = f'./feature_selection/files/{typeFeature}_{typeClass}.txt'
    result = []

    # The context manager closes the file even when a line fails to evaluate.
    with open(path, 'r') as ranking_file:
        for line in ranking_file:
            # Blank lines (e.g. a trailing newline) carry no entry.
            if not line.strip():
                continue
            # SECURITY NOTE(review): eval() executes whatever the file contains.
            # Only use ranking files produced by this project; consider
            # ast.literal_eval if the entries are guaranteed to be plain literals.
            result.append(eval(line))

    return result[:num]

#### Real Return

In [None]:
# Top N_FEATURES entries of the stored ranking for the real-return target.
final_ranking_real_return = readFeatures(FEATURE_SELECTION, REAL_RETURN, N_FEATURES)

#### Risk

In [None]:
# Top N_FEATURES entries of the stored ranking for the risk target.
final_ranking_risk  = readFeatures(FEATURE_SELECTION, RISK, N_FEATURES)

In [None]:
def getColumnsRank(rank: list):
    """Return the first element of every ranking entry, in ranking order.

    Args:
        rank: list of indexable entries whose item 0 is taken.

    Returns:
        A new list holding ``entry[0]`` for each entry of ``rank``.
    """
    return [entry[0] for entry in rank]

In [None]:
# Column names of the selected features for each target. The rankings were
# already truncated to N_FEATURES by readFeatures, so the slice is only a safeguard.
features_real_return = getColumnsRank(final_ranking_real_return)[:N_FEATURES]
features_risk = getColumnsRank(final_ranking_risk)[:N_FEATURES]

In [None]:
# Log the selected feature names of both targets.
file.write("REAL RETURN\n")
file.write(str(features_real_return))
file.write("\nRISK\n")
file.write(str(features_risk))


### SSCA

O cálculo será feito com o número total de features ou somente com as 15 mais bem ranqueadas?

In [None]:

# Collects one (n_clusters, SSCA) tuple per cluster count evaluated by the loop below.
SSCAS = []

features_return = getColumnsRank(final_ranking_real_return)[:N_FEATURES]

# Clustering input: only the selected real-return features.
dataset_X = dataset[features_return]
dataset_y = dataset.drop(columns=DATA)

dataset_X

In [None]:
# For every cluster count from 2 to 6: cluster the data with k-means, average
# the silhouette values inside each cluster, then average those per-cluster
# means. The result (rounded to 2 decimals) is stored as (n_clusters, SSCA).
for cluster in range(2, 7):
    # Fixed random_state so repeated runs use the same initialisation seed.
    clusterer = KMeans(n_clusters=cluster, random_state=10)
    cluster_labels = clusterer.fit_predict(dataset_X)
    silhouette_values = silhouette_samples(dataset_X, cluster_labels)

    # One pass each: per-cluster sum of silhouette values and per-cluster size.
    silhouette_sums = np.bincount(cluster_labels, weights=silhouette_values, minlength=cluster)
    cluster_sizes = np.bincount(cluster_labels, minlength=cluster)

    # An empty cluster has no mean; fail with an explicit message instead of
    # an anonymous division error.
    if (cluster_sizes == 0).any():
        raise ZeroDivisionError(f"KMeans produced an empty cluster for n_clusters={cluster}")

    # Inner sum: mean silhouette per cluster. Outer sum: mean over clusters.
    cluster_means = silhouette_sums / cluster_sizes
    SSCA = cluster_means.sum() / cluster
    SSCAS.append((cluster, round(SSCA, 2)))

SSCAS

In [None]:
# Log the (n_clusters, SSCA) pairs computed above.
file.write("\n\n######## SSCAS ########\n\n")
file.write(str(SSCAS))
file.write("\n")

### Divisão do dataset

In [None]:
# Build a NEW list. Binding `columns_dataset = DATA` and appending to it would
# also grow the DATA constant, so a later `dataset.drop(columns=DATA)` (e.g.
# when cells are re-run) would drop the target columns as well.
columns_dataset = [*DATA, REAL_RETURN, RISK, REAL_RETURN_CLASS, RISK_CLASS]

# Empty frames with the full column layout of the dataset.
df_train = pd.DataFrame(columns=columns_dataset)
df_test = pd.DataFrame(columns=columns_dataset)


In [None]:
# TRAINING_START_DATE =  dt.datetime.strptime('2009-03-31', "%Y-%m-%d")
# TRAINING_END_DATE =  dt.datetime.strptime('2018-03-31', "%Y-%m-%d")

# TEST_START_DATE =  dt.datetime.strptime('2018-06-30', "%Y-%m-%d")
# TEST_END_DATE =  dt.datetime.strptime('2022-03-31', "%Y-%m-%d")

# dataset_sort = dataset.sort_values(by=DATE)
# count_train = 0
# count_test = 0

# for index, row in dataset_sort.iterrows():
#   date = dt.datetime.strptime(row[DATE], "%Y-%m-%d")
#   if date.year < TEST_START_DATE.year:
#     df_train = df_train.append(row)
#     count_train +=1
#   elif date.year == TEST_START_DATE.year and date.month < TEST_START_DATE.month:
#     df_train = df_train.append(row)
#     count_train +=1
#   else:
#     df_test = df_test.append(row)
#     count_test += 1

# print(count_train)
# print(count_test)

# df_train = df_train.drop(columns=[REAL_RETURN, RISK, DATE])
# df_test = df_test.drop(columns=[REAL_RETURN, RISK, DATE])

# plotResults(df_train, "Real Return", "Risk")

# plotResults(df_test, "Real Return", "Risk")

# df_train.to_csv('./util/dataset_train.csv')
# df_test.to_csv('./util/dataset_test.csv')

In [None]:
# Load the train/test split from disk. These are the files the commented-out
# cell above would write with to_csv.
df_train = pd.read_csv('./util/dataset_train.csv')
df_test = pd.read_csv('./util/dataset_test.csv')

##### Real Return

In [None]:
# Real-return task: selected feature columns as inputs, class column as target.
X_real_return_train, y_real_return_train = df_train[features_real_return], df_train[REAL_RETURN_CLASS]
X_real_return_test, y_real_return_test = df_test[features_real_return], df_test[REAL_RETURN_CLASS]

##### Risk

In [None]:
# Risk task: selected feature columns as inputs, class column as target.
X_risk_train, y_risk_train = df_train[features_risk], df_train[RISK_CLASS]
X_risk_test, y_risk_test = df_test[features_risk], df_test[RISK_CLASS]

### Classificadores Únicos

In [None]:
# Section header for the individual (non-ensemble) classifier results.
file.write("\n######## CLASSIFICADORES UNICOS ########\n")

In [None]:
# Registries mapping a classifier key to its estimator, one per target.
classifiers_real_return = {}
classifiers_risk = {}

#### Random Forest

##### Real Return

In [None]:
# Key under which the random forest is stored in both classifier registries.
RANDOM_FOREST = 'RANDOM_FOREST'

# Random forest for the real-return target with fixed hyperparameters.
randon_forest_return = RandomForestClassifier(
    n_estimators=840,
    max_depth=178,
    min_samples_split=4,
    min_samples_leaf=6,
    max_features='sqrt',
)
classifiers_real_return[RANDOM_FOREST] = randon_forest_return

# Fit on the training split, then score on the test split.
randon_forest_return.fit(X_real_return_train, y_real_return_train)
result_randon_forest_return = randon_forest_return.score(X_real_return_test, y_real_return_test)
result_randon_forest_return

##### Risk

In [None]:
# Random forest for the risk target with fixed hyperparameters.
randon_forest_risk = RandomForestClassifier(
    n_estimators=622,
    max_depth=70,
    min_samples_split=3,
    min_samples_leaf=9,
    max_features='log2',
)
classifiers_risk[RANDOM_FOREST] = randon_forest_risk

# Fit on the training split, then score on the test split.
randon_forest_risk.fit(X_risk_train, y_risk_train)
result_randon_forest_risk = randon_forest_risk.score(X_risk_test, y_risk_test)
result_randon_forest_risk

In [None]:
# Log the test scores as (real return, risk).
file.write(f"RANDOM FOREST:({result_randon_forest_return},{result_randon_forest_risk})\n")

#### SVM

##### Real Return

In [None]:
# Key under which the SVM is stored in both classifier registries.
SVM = 'SVM'

# RBF-kernel SVM for the real-return target with balanced class weights.
svm_real_return = SVC(
    kernel='rbf',
    C=99.94849891435051,
    class_weight='balanced',
)
classifiers_real_return[SVM] = svm_real_return

# Fit on the training split, then score on the test split.
svm_real_return.fit(X_real_return_train, y_real_return_train)
result_svm_return = svm_real_return.score(X_real_return_test, y_real_return_test)
result_svm_return

##### Risk

In [None]:
# RBF-kernel SVM for the risk target with balanced class weights.
svm_risk = SVC(
    kernel='rbf',
    C=99.89489576327396,
    class_weight='balanced',
)
classifiers_risk[SVM] = svm_risk

# Fit on the training split, then score on the test split.
svm_risk.fit(X_risk_train, y_risk_train)
result_svm_risk = svm_risk.score(X_risk_test, y_risk_test)
result_svm_risk

In [None]:
# Log the test scores as (real return, risk).
file.write(f"SVM:({result_svm_return},{result_svm_risk})\n")

#### Decision Tree

##### Real Return

In [None]:
# Key under which the decision tree is stored in both classifier registries.
DECISION_TREE = 'DECISION_TREE'

# Entropy-based decision tree for the real-return target.
decision_tree_real_return = DecisionTreeClassifier(
    criterion='entropy',
    splitter='best',
    max_depth=10,
    min_samples_split=1097,
)
classifiers_real_return[DECISION_TREE] = decision_tree_real_return

# Fit on the training split, then score on the test split.
decision_tree_real_return.fit(X_real_return_train, y_real_return_train)
result_decision_return = decision_tree_real_return.score(X_real_return_test, y_real_return_test)
result_decision_return

##### Risk

In [None]:
# Entropy-based decision tree for the risk target.
decision_tree_risk = DecisionTreeClassifier(
    criterion='entropy',
    splitter='best',
    max_depth=133,
    min_samples_split=919,
)
classifiers_risk[DECISION_TREE] = decision_tree_risk

# Fit on the training split, then score on the test split.
decision_tree_risk.fit(X_risk_train, y_risk_train)
result_decision_risk = decision_tree_risk.score(X_risk_test, y_risk_test)
result_decision_risk

In [None]:
# Log the test scores as (real return, risk).
file.write(f"DECISION TREE:({result_decision_return},{result_decision_risk})\n")

#### Gaussian Naive Bayes

##### Real Return

In [None]:
# Key under which the naive Bayes model is stored in both classifier registries.
NAIVE_BAYES = 'NAIVE_BAYES'

# Gaussian naive Bayes for the real-return target.
nb_real_return = GaussianNB(
    var_smoothing=1.1218244619811e-12,
)
classifiers_real_return[NAIVE_BAYES] = nb_real_return

# Fit on the training split, then score on the test split.
nb_real_return.fit(X_real_return_train, y_real_return_train)
result_nb_return = nb_real_return.score(X_real_return_test, y_real_return_test)
result_nb_return

##### Risk

In [None]:
# Gaussian naive Bayes for the risk target.
nb_risk = GaussianNB(
    var_smoothing=1.013375533407592e-12,
)
classifiers_risk[NAIVE_BAYES] = nb_risk

# Fit on the training split, then score on the test split.
nb_risk.fit(X_risk_train, y_risk_train)
result_nb_risk = nb_risk.score(X_risk_test, y_risk_test)
result_nb_risk

In [None]:
# Log the test scores as (real return, risk).
file.write(f"NAIVE BAYES:({result_nb_return},{result_nb_risk})\n")

#### Rede Neural

##### Real Return

In [None]:
# Key under which the neural network is stored in both classifier registries.
NEURAL_NETWORK = 'NEURAL_NETWORK'

# Multi-layer perceptron with one hidden layer of 300 units for the real-return target.
neural_return = MLPClassifier(
    activation='logistic',
    solver='lbfgs',
    max_iter=500,
    hidden_layer_sizes=(300,),
    learning_rate='constant',
)
classifiers_real_return[NEURAL_NETWORK] = neural_return

# Fit on the training split, then score on the test split.
neural_return.fit(X_real_return_train, y_real_return_train)
result_neural_return = neural_return.score(X_real_return_test, y_real_return_test)
result_neural_return

##### Risk

In [None]:
# Multi-layer perceptron with one hidden layer of 300 units for the risk target.
neural_risk = MLPClassifier(
    activation='logistic',
    solver='lbfgs',
    max_iter=500,
    hidden_layer_sizes=(300,),
    learning_rate='constant',
)
classifiers_risk[NEURAL_NETWORK] = neural_risk

# Fit on the training split, then score on the test split.
neural_risk.fit(X_risk_train, y_risk_train)
result_neural_risk = neural_risk.score(X_risk_test, y_risk_test)
result_neural_risk

In [None]:
# Log the test scores as (real return, risk). The risk value must come from the
# neural network (`result_neural_risk`), not from the decision tree.
file.write(f"NEURAL NETWORK:({result_neural_return},{result_neural_risk})\n")

#### Regressão Logística

##### Real Return

In [None]:
# Key under which logistic regression is stored in both classifier registries.
LOGISTIC_REGRESSION ='LOGISTIC_REGRESSION'

# L2-penalised logistic regression for the real-return target.
rl_return = LogisticRegression(
    penalty='l2',
    C=8.82516085005323,
    class_weight=None,
    solver='newton-cg',
    max_iter=5411,
)
classifiers_real_return[LOGISTIC_REGRESSION] = rl_return

# Fit on the training split, then score on the test split.
rl_return.fit(X_real_return_train, y_real_return_train)
result_rl_return = rl_return.score(X_real_return_test, y_real_return_test)
result_rl_return

##### Risk

In [None]:
# L2-penalised logistic regression for the risk target with balanced class weights.
rl_risk = LogisticRegression(
    penalty='l2',
    C=12.784017261261628,
    class_weight='balanced',
    solver='liblinear',
    max_iter=2936,
)
classifiers_risk[LOGISTIC_REGRESSION] = rl_risk

# Fit on the training split, then score on the test split.
rl_risk.fit(X_risk_train, y_risk_train)
result_rl_risk = rl_risk.score(X_risk_test, y_risk_test)
result_rl_risk

In [None]:
# Log the test scores as (real return, risk).
file.write(f"LOGISTIC REGRESSION:({result_rl_return},{result_rl_risk})\n")

#### KNeighborsClassifier

##### Real Return

In [None]:
# Key under which k-nearest-neighbours is stored in both classifier registries.
K_NEIGHBORS ='KNeighborsClassifier'

# Distance-weighted 47-nearest-neighbours classifier (p=1) for the real-return target.
knn_return = KNeighborsClassifier(
    n_neighbors=47,
    weights='distance',
    algorithm='kd_tree',
    leaf_size=37,
    p=1,
)
classifiers_real_return[K_NEIGHBORS] = knn_return

# Fit on the training split, then score on the test split.
knn_return.fit(X_real_return_train, y_real_return_train)
result_knn_result = knn_return.score(X_real_return_test, y_real_return_test)
result_knn_result

##### Risk

In [None]:
# Distance-weighted 47-nearest-neighbours classifier (p=1) for the risk target.
knn_risk = KNeighborsClassifier(
    n_neighbors=47,
    weights='distance',
    algorithm='kd_tree',
    leaf_size=37,
    p=1,
)
classifiers_risk[K_NEIGHBORS] = knn_risk

# Fit on the training split, then score on the test split.
knn_risk.fit(X_risk_train, y_risk_train)
result_knn_risk = knn_risk.score(X_risk_test, y_risk_test)
result_knn_risk

In [None]:
# Log the test scores as (real return, risk).
file.write(f"KNEIGHBORS:({result_knn_result},{result_knn_risk})\n")

#### XGboost

##### Real Return

In [None]:
# Key under which XGBoost is stored in both classifier registries.
XG_BOOST = 'XG_BOOST'

# XGBoost classifier with library-default hyperparameters for the real-return target.
xg_boost_return = xgboost.XGBClassifier()
classifiers_real_return[XG_BOOST] = xg_boost_return

# Fit on the training split.
xg_boost_return.fit(X_real_return_train, y_real_return_train)

# Score on the test split.
result_xgboost_result = xg_boost_return.score(X_real_return_test, y_real_return_test)
result_xgboost_result

##### Risk

In [None]:
# XGBoost classifier with library-default hyperparameters for the risk target.
xg_boost_risk = xgboost.XGBClassifier()
classifiers_risk[XG_BOOST] = xg_boost_risk

# Fit on the training split.
xg_boost_risk.fit(X_risk_train, y_risk_train)

# Score on the test split.
result_xgboost_risk = xg_boost_risk.score(X_risk_test, y_risk_test)
result_xgboost_risk

In [None]:
# Log the test scores as (real return, risk).
file.write(f"XGBOOST:({result_xgboost_result},{result_xgboost_risk})\n")

### Cross-validation

In [None]:
# Number of cross-validation folds, and the per-target result tables
# (classifier key -> mean fold score).
CV = 10
result_cv_real_return, result_cv_risk = {}, {}

# Cross-validation runs on the FULL dataset, not on the train/test split.
X_dataset_real_return, y_dataset_real_return = dataset[features_real_return], dataset[REAL_RETURN_CLASS]
X_dataset_risk, y_dataset_risk = dataset[features_risk], dataset[RISK_CLASS]

##### Real Return

In [None]:
# Cross-validate every real-return classifier and store its mean fold score.
# The explicit key order fixes the insertion order of the result dictionary.
for classifier_key in (RANDOM_FOREST, SVM, DECISION_TREE, NAIVE_BAYES,
                       NEURAL_NETWORK, LOGISTIC_REGRESSION, K_NEIGHBORS, XG_BOOST):
    cv_result = cross_val_score(
        classifiers_real_return[classifier_key],
        X_dataset_real_return,
        y_dataset_real_return,
        cv=CV,
        n_jobs=-1,
    )
    result_cv_real_return[classifier_key] = cv_result.mean()

result_cv_real_return

##### Risk

In [None]:
# Cross-validate every risk classifier and store its mean fold score.
# The explicit key order fixes the insertion order of the result dictionary.
for classifier_key in (RANDOM_FOREST, SVM, DECISION_TREE, NAIVE_BAYES,
                       NEURAL_NETWORK, LOGISTIC_REGRESSION, K_NEIGHBORS, XG_BOOST):
    cv_result = cross_val_score(
        classifiers_risk[classifier_key],
        X_dataset_risk,
        y_dataset_risk,
        cv=CV,
        n_jobs=-1,
    )
    result_cv_risk[classifier_key] = cv_result.mean()

result_cv_risk

In [None]:
# Log the mean cross-validation score of every classifier, per target.
file.write("\n######## CV ########\n")
file.write(f"Real Return: {result_cv_real_return}\n")
file.write(f"Risk: {result_cv_risk}\n")

### Selection of Classifiers

#### List of Sets

In [None]:
# Every unordered pair of classifier keys, in the order the keys were
# cross-validated: each key is paired with all keys that come after it.
classifiers = list(result_cv_real_return.keys())
list_sets = [
    (first_key, second_key)
    for position, first_key in enumerate(classifiers)
    for second_key in classifiers[position + 1:]
]

list_sets


#### Real Return Classifiers

In [None]:
# Keep every classifier that belongs to at least one pair whose mean
# cross-validation score reaches 0.75; order of first appearance is preserved.
real_return_classifiers = []

for first_key, second_key in list_sets:
    pair_mean = (result_cv_real_return[first_key] + result_cv_real_return[second_key]) / 2
    if pair_mean >= 0.75:
        for member in (first_key, second_key):
            if member not in real_return_classifiers:
                real_return_classifiers.append(member)

real_return_classifiers


#### Risk Classifiers

In [None]:
# Keep every classifier that belongs to at least one pair whose mean
# cross-validation score reaches 0.45; order of first appearance is preserved.
risk_classifiers = []

for first_key, second_key in list_sets:
    pair_mean = (result_cv_risk[first_key] + result_cv_risk[second_key]) / 2
    if pair_mean >= 0.45:
        for member in (first_key, second_key):
            if member not in risk_classifiers:
                risk_classifiers.append(member)

risk_classifiers

In [None]:
# Log which classifiers were selected for each target.
file.write("\n######## CLASSIFIERS SELECTION ########\n")
file.write(f"Real Return: {real_return_classifiers}\n")
file.write(f"Risk: {risk_classifiers}\n")

### Fusion of Classifiers

#### Training with diversification

##### Without diversity

In [None]:
# Section header for the selected classifiers trained without a diversity method.
file.write("\n######## WHITOUT DIVERSITY ########\n")

Real Return

In [None]:
# Selected real-return classifiers, refitted on the training split as they are.
# The dictionary holds the same estimator objects as classifiers_real_return.
real_return_whitout_diversity = {}

for classifier in real_return_classifiers:
    model = classifiers_real_return[classifier]
    real_return_whitout_diversity[classifier] = model
    model.fit(X_real_return_train, y_real_return_train)

    # Score once and reuse the value for both the log file and the console.
    test_score = model.score(X_real_return_test, y_real_return_test)
    file.write(f"Real Return: {classifier}, {test_score}\n")
    print(classifier, test_score)

Risk

In [None]:
# Selected risk classifiers, refitted on the training split as they are.
# The dictionary holds the same estimator objects as classifiers_risk.
risk_whitout_diversity = {}

for classifier in risk_classifiers:
    model = classifiers_risk[classifier]
    risk_whitout_diversity[classifier] = model
    model.fit(X_risk_train, y_risk_train)

    # Score once and reuse the value for both the log file and the console.
    test_score = model.score(X_risk_test, y_risk_test)
    file.write(f"Risk: {classifier}, {test_score}\n")
    print(classifier, test_score)

##### Bagging

In [None]:
# Section header for the bagging results.
file.write("\n######## BAGGING ########\n")

Real Return

In [None]:
# Wrap every selected real-return classifier in a bagging ensemble, fit it on
# the training split and record its test score.
real_return_bagging = {}

for classifier in real_return_classifiers:
    estimator = classifiers_real_return[classifier]
    bagged_model = BaggingClassifier(estimator=estimator, n_jobs=-1)
    real_return_bagging[classifier] = bagged_model
    bagged_model.fit(X_real_return_train, y_real_return_train)

    # Score once and reuse the value for both the log file and the console.
    test_score = bagged_model.score(X_real_return_test, y_real_return_test)
    file.write(f"Real Return: {classifier}, {test_score}\n")
    print(classifier, test_score)
  

Risk

In [None]:
# Wrap every selected risk classifier in a bagging ensemble, fit it on the
# training split and record its test score.
risk_bagging = {}

for classifier in risk_classifiers:
    estimator = classifiers_risk[classifier]
    bagged_model = BaggingClassifier(estimator=estimator, n_jobs=-1)
    risk_bagging[classifier] = bagged_model
    bagged_model.fit(X_risk_train, y_risk_train)

    # Score once and reuse the value for both the log file and the console.
    test_score = bagged_model.score(X_risk_test, y_risk_test)
    file.write(f"Risk: {classifier}, {test_score}\n")
    print(classifier, test_score)

##### AdaBoost

In [None]:
# Section header for the AdaBoost results.
file.write("\n######## ADABOOST ########\n")

Real Return

In [None]:
# Boost every selected real-return classifier with AdaBoost. Estimators that
# AdaBoost fails to fit or score are skipped (best effort) and reported.
real_return_adaboost = {}

for classifier in real_return_classifiers:
    estimator = classifiers_real_return[classifier]
    boosted_model = AdaBoostClassifier(estimator=estimator)
    try:
        boosted_model.fit(X_real_return_train, y_real_return_train)
        test_score = boosted_model.score(X_real_return_test, y_real_return_test)
    except Exception as error:
        # `Exception` (not a bare except) so Ctrl-C / SystemExit still stop the run.
        # NOTE(review): presumably raised for base estimators AdaBoost cannot
        # use (e.g. no sample_weight support) -- confirm from the printed reason.
        print(classifier, "Não utilizado", f"({type(error).__name__}: {error})")
        continue

    # Only successfully trained models are kept and logged; a failing log
    # write now surfaces instead of being reported as an unusable estimator.
    real_return_adaboost[classifier] = boosted_model
    file.write(f"RealReturn: {classifier}, {test_score}\n")
    print(classifier, test_score)


Risk

In [None]:
# Boost every selected risk classifier with AdaBoost. Estimators that AdaBoost
# fails to fit or score are skipped (best effort) and reported.
risk_adaboost = {}

for classifier in risk_classifiers:
    estimator = classifiers_risk[classifier]
    boosted_model = AdaBoostClassifier(estimator=estimator)
    try:
        boosted_model.fit(X_risk_train, y_risk_train)
        test_score = boosted_model.score(X_risk_test, y_risk_test)
    except Exception as error:
        # `Exception` (not a bare except) so Ctrl-C / SystemExit still stop the run.
        # NOTE(review): presumably raised for base estimators AdaBoost cannot
        # use (e.g. no sample_weight support) -- confirm from the printed reason.
        print(classifier, "Não utilizado", f"({type(error).__name__}: {error})")
        continue

    # Only successfully trained models are kept and logged; a failing log
    # write now surfaces instead of being reported as an unusable estimator.
    risk_adaboost[classifier] = boosted_model
    file.write(f"Risk: {classifier}, {test_score}\n")
    print(classifier, test_score)

#### Fusion

In [None]:
# Meta-classifier used to combine the base-model predictions, and the label
# under which its results are logged.
model_fusion = xgboost.XGBClassifier()
model_fusion_name = XG_BOOST

In [None]:
# Section header for the fusion (meta-classifier) results.
file.write("\n######## FUSION ########\n")

In [None]:
def fusion(fusion_model, models, X_data, y_data, test_size=0.3, random_state=42):
    """Train a meta-classifier on base-model predictions and return its score.

    Every model in ``models`` predicts on ``X_data``; the predictions become
    the columns of a new feature frame (one column per model key). That frame
    is split into train and test parts, ``fusion_model`` is fitted on the
    train part and scored on the test part.

    Args:
        fusion_model: estimator with ``fit`` and ``score``; it is fitted in
            place, so passing the same instance again refits it.
        models: mapping of model key -> fitted estimator with ``predict``.
        X_data: feature frame handed to every base model.
        y_data: target values aligned with ``X_data``.
        test_size: fraction of rows held out for scoring (default 0.3).
        random_state: seed of the train/test split (default 42).

    Returns:
        The value of ``fusion_model.score`` on the held-out part.

    Raises:
        ValueError: if ``models`` is empty, i.e. there is nothing to fuse.
    """
    # An empty mapping (e.g. every estimator rejected in a previous step)
    # would otherwise fail later with an unrelated sample-count error.
    if not models:
        raise ValueError("fusion() requires at least one base model, got an empty mapping")

    # One column of predictions per base model, in mapping order.
    df_fusion = pd.DataFrame({name: model.predict(X_data) for name, model in models.items()})

    X_train, X_test, y_train, y_test = train_test_split(
        df_fusion, y_data, test_size=test_size, random_state=random_state
    )

    # Show the meta-features the fusion model is trained on.
    print(X_train)
    fusion_model.fit(X_train, y_train)

    return fusion_model.score(X_test, y_test)

##### Without diversity

Real Return

In [None]:
# Fuse the non-diversified real-return models; predictions are made on the full dataset.
# Note: fusion() refits the shared model_fusion instance on every call.
whiout_return = fusion(model_fusion, real_return_whitout_diversity, X_dataset_real_return, y_dataset_real_return)
whiout_return

Risk

In [None]:
# Fuse the non-diversified risk models; predictions are made on the full dataset.
whiout_risk = fusion(model_fusion, risk_whitout_diversity, X_dataset_risk, y_dataset_risk)
whiout_risk

In [None]:
# Log the fusion scores obtained without a diversity method.
file.write(f"[WHITOUT] [{model_fusion_name}] RealReturn: {whiout_return}\n")
file.write(f"[WHITOUT] [{model_fusion_name}] Risk: {whiout_risk}\n")

##### Bagging

Real Return

In [None]:
# Fuse the bagged real-return models; predictions are made on the full dataset.
bagging_return = fusion(model_fusion, real_return_bagging, X_dataset_real_return, y_dataset_real_return)
bagging_return

Risk

In [None]:
# Fuse the bagged risk models; predictions are made on the full dataset.
bagging_risk = fusion(model_fusion, risk_bagging, X_dataset_risk, y_dataset_risk)
bagging_risk

In [None]:
# Log the fusion scores obtained with bagging.
file.write(f"[BAGGING] [{model_fusion_name}] RealReturn: {bagging_return}\n")
file.write(f"[BAGGING] [{model_fusion_name}] Risk: {bagging_risk}\n")

##### AdaBoost

Real Return

In [None]:
# Fuse the AdaBoost real-return models that were trained successfully.
ada_return = fusion(model_fusion, real_return_adaboost, X_dataset_real_return, y_dataset_real_return)
ada_return

Risk

In [None]:
# Fuse the AdaBoost risk models that were trained successfully.
ada_risk = fusion(model_fusion, risk_adaboost, X_dataset_risk, y_dataset_risk)
ada_risk

In [None]:
# Log the fusion scores obtained with AdaBoost.
file.write(f"[ADABOOST] [{model_fusion_name}] RealReturn: {ada_return}\n")
file.write(f"[ADABOOST] [{model_fusion_name}] Risk: {ada_risk}\n")

In [None]:
# Record the end time and the total duration since run_start_date.
run_end_date = dt.datetime.now()
file.write(f"\nEND: {run_end_date}")
file.write(f"\nTOTAL EXEC: {run_end_date-run_start_date}")

# Close the results log opened at the top of the notebook.
file.close()