# Multiple Classifiers Ensemble System (MCS)

#### Iury Zanonni de Faria

### Imports

#### General imports

In [None]:
import pandas as pd
import numpy as np
import datetime as dt
import statistics as st
import matplotlib.pyplot as plt
import warnings
import optuna

#### Feature Selection imports

In [None]:
from sklearn.feature_selection import mutual_info_classif
# Info gain - weka

#### Diversity imports

In [None]:
from sklearn.model_selection import KFold
from sklearn.ensemble import BaggingClassifier
from sklearn.ensemble import AdaBoostClassifier
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.model_selection import cross_val_score

#### Classifiers imports

In [None]:
import xgboost
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier
from sklearn.naive_bayes import BernoulliNB
from sklearn.naive_bayes import GaussianNB
from sklearn.neural_network import MLPClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier


#### Metrics

In [None]:
from sklearn.metrics import precision_score
from sklearn.metrics import accuracy_score

### Settings

In [None]:
# Non-target columns of the processed CSV ('Unnamed: 0' is the column the
# DATE constant points at). Dropping these columns from the full frame leaves
# only the target columns.
DATA = ['Unnamed: 0', 'revenue','cost-goods-sold','gross-profit','research-development-expenses','selling-general-administrative-expenses','operating-expenses',
'operating-income','total-non-operating-income-expense','pre-tax-income','total-provision-income-taxes','income-after-taxes','income-from-continuous-operations',
'income-from-discontinued-operations','net-income','ebitda','ebit','basic-shares-outstanding','shares-outstanding','eps-basic-net-earnings-per-share',
'eps-earnings-per-share-diluted','cash-on-hand','receivables-total','inventory','other-current-assets','total-current-assets','net-property-plant-equipment',
'long-term-investments','goodwill-intangible-assets-total','other-long-term-assets','total-long-term-assets','total-assets','total-current-liabilities','long-term-debt',
'other-non-current-liabilities','total-long-term-liabilities','total-liabilities','common-stock-net','retained-earnings-accumulated-deficit','comprehensive-income',
'total-share-holder-equity','total-liabilities-share-holders-equity','net-income-loss','total-depreciation-amortization-cash-flow','other-non-cash-items','total-non-cash-items',
'change-in-accounts-receivable','change-in-inventories','change-in-accounts-payable','change-in-assets-liabilities','total-change-in-assets-liabilities',
'cash-flow-from-operating-activities','net-change-in-property-plant-equipment','net-change-in-intangible-assets','net-acquisitions-divestitures','investing-activities-other',
'cash-flow-from-investing-activities','net-long-term-debt','net-current-debt','debt-issuance-retirement-net-total','net-common-equity-issued-repurchased',
'net-total-equity-issued-repurchased','total-common-preferred-stock-dividends-paid','financial-activities-other','cash-flow-from-financial-activities',
'net-cash-flow','stock-based-compensation','common-stock-dividends-paid','current-ratio','long-term-debt-capital','debt-equity-ratio','gross-margin',
'operating-margin','ebit-margin','pre-tax-profit-margin','net-profit-margin','asset-turnover','inventory-turnover','receiveable-turnover','days-sales-in-receivables',
'roe','return-on-tangible-equity','roa','roi','book-value-per-share','operating-cash-flow-per-share','free-cash-flow-per-share','net-change-in-short-term-investments',
'net-change-in-long-term-investments','net-change-in-investments-total','other-operating-income-expenses','pre-paid-expenses','other-share-holders-equity','other-income',
'ebitda-margin']

# Target column names: raw value and class-label column for each target.
REAL_RETURN = "RealReturn"
REAL_RETURN_CLASS = "RealReturnClass"
RISK = "Risk"
RISK_CLASS = 'RiskClass'

# Class labels as they are spelled in the dataset.
LOW, MEDIUM, HIGH = 'low', 'medium', 'high'

# Name of the date column in the CSV.
DATE = 'Unnamed: 0'

# Experiment sizes: periods encoded in the dataset file name, number of
# top-ranked features to keep, and Optuna trials per study.
N_PERIODS = 2
N_FEATURES = 20
N_TRIALS = 5

DATASET_PATH = f'new_dataset/process_final_{N_PERIODS}.csv'

# Feature-ranking method identifier (used to build the ranking file name).
MUTUAL_INFORMATION = "MUTUAL_INFORMATION"

# Suppress all warnings in the notebook output.
warnings.filterwarnings('ignore')

### Import dataset

In [None]:
# Load the processed dataset (file name is derived from N_PERIODS).
dataset = pd.read_csv(DATASET_PATH)

In [None]:
# Encode the class labels as floats: high=2.0, medium=1.0, low=0.0.
dataset = dataset.replace(to_replace=[HIGH], value=2.0)
dataset = dataset.replace(to_replace=[MEDIUM], value=1.0)
dataset = dataset.replace(to_replace=[LOW], value=0.0)

# Missing values become 0.0. `np.nan` is spelled in lower case because the
# `np.NaN` alias was removed in NumPy 2.0.
dataset = dataset.replace(to_replace=[np.nan], value=0.0)

# Features: everything except the four target columns and the date column.
dataset_X = dataset.drop(columns=[REAL_RETURN_CLASS, REAL_RETURN, RISK_CLASS, RISK, DATE])
# Targets: whatever remains once every column listed in DATA is dropped.
dataset_y = dataset.drop(columns=DATA)

### Feature Selection

In [None]:
def readFeatures(typeFeature, typeClass, num):
    """Load the first ``num`` entries of a saved feature ranking.

    The ranking is read from
    ``./feature_selection/files/{typeFeature}_{typeClass}.txt``. Every
    non-blank line is evaluated as a Python expression and the results are
    collected in file order.

    Args:
        typeFeature: ranking method name; first part of the file name.
        typeClass: target name; second part of the file name.
        num: number of leading entries to keep (applied as ``[:num]``).

    Returns:
        list: the evaluated entries, truncated to ``[:num]``.

    Raises:
        OSError: if the ranking file cannot be opened.
    """
    path = f'./feature_selection/files/{typeFeature}_{typeClass}.txt'
    result = []

    # The context manager closes the file even when a line fails to evaluate.
    with open(path, 'r') as file:
        for line in file:
            # Blank lines (e.g. a trailing empty line) carry no entry and
            # would make eval() raise a SyntaxError.
            if not line.strip():
                continue
            # SECURITY NOTE(review): eval() executes arbitrary code found in
            # the file. Only use with ranking files you generated yourself;
            # consider ast.literal_eval if every line is a plain literal.
            result.append(eval(line))

    return result[:num]

#### Real Return

In [None]:
# Top N_FEATURES entries of the mutual-information ranking file for real return.
final_ranking_real_return = readFeatures(MUTUAL_INFORMATION, REAL_RETURN, N_FEATURES)

#### Risk

In [None]:
# Top N_FEATURES entries of the mutual-information ranking file for risk.
final_ranking_risk  = readFeatures(MUTUAL_INFORMATION, RISK, N_FEATURES)

In [None]:
def getColumnsRank(rank: list):
    """Return the first element of every entry in ``rank``, in order.

    Args:
        rank: sequence of indexable entries (presumably ``(column, score)``
            pairs — confirm against the ranking files).

    Returns:
        list: ``entry[0]`` for each entry of ``rank``.
    """
    return [entry[0] for entry in rank]

In [None]:
# Keep only the first element of each ranking entry. The [:N_FEATURES] slice
# changes nothing here: both rankings were already loaded with num=N_FEATURES.
features_real_return = getColumnsRank(final_ranking_real_return)[:N_FEATURES]
features_risk = getColumnsRank(final_ranking_risk)[:N_FEATURES]

### Divisão do dataset

In [None]:
# Column layout for the train and test frames: the DATA columns followed by
# the raw targets and their class labels. A new list is built on purpose:
# aliasing DATA and appending to it would mutate the DATA constant and make
# it grow every time this cell is re-run.
columns_dataset = [*DATA, REAL_RETURN, RISK, REAL_RETURN_CLASS, RISK_CLASS]

# Empty frames that only carry the expected columns.
df_train = pd.DataFrame(columns=columns_dataset)
df_test = pd.DataFrame(columns=columns_dataset)

In [None]:
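# NOTE: legacy date-based train/test split, disabled. The split that is
# actually used is loaded from CSV files in the next cell. This code relies
# on DataFrame.append, which was removed in pandas 2.0.
# TRAINING_START_DATE =  dt.datetime.strptime('2009-03-31', "%Y-%m-%d")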
# TRAINING_END_DATE =  dt.datetime.strptime('2018-03-31', "%Y-%m-%d")

# TEST_START_DATE =  dt.datetime.strptime('2018-06-30', "%Y-%m-%d")
# TEST_END_DATE =  dt.datetime.strptime('2022-03-31', "%Y-%m-%d")

# dataset_sort = dataset.sort_values(by=DATE)
# count_train = 0
# count_test = 0

# for index, row in dataset_sort.iterrows():
#   date = dt.datetime.strptime(row[DATE], "%Y-%m-%d")
#   if date.year < TEST_START_DATE.year:
#     df_train = df_train.append(row)
#     count_train +=1
#   elif date.year == TEST_START_DATE.year and date.month < TEST_START_DATE.month:
#     df_train = df_train.append(row)
#     count_train +=1
#   else:
#     df_test = df_test.append(row)
#     count_test += 1

# print(count_train)
# print(count_test)

# df_train = df_train.drop(columns=[REAL_RETURN, RISK, DATE])
# df_test = df_test.drop(columns=[REAL_RETURN, RISK, DATE])

In [None]:
# Load the train and test sets from CSV files (replaces the empty frames
# created earlier).
df_train = pd.read_csv('./util/dataset_train.csv')
df_test = pd.read_csv('./util/dataset_test.csv')

##### Real Return

In [None]:
# Real-return task: selected feature columns as X, class label column as y.
X_real_return_train = df_train[features_real_return]
y_real_return_train = df_train[REAL_RETURN_CLASS]

X_real_return_test = df_test[features_real_return]
y_real_return_test = df_test[REAL_RETURN_CLASS]

##### Risk

In [None]:
# Risk task: selected feature columns as X, class label column as y.
X_risk_train = df_train[features_risk]
y_risk_train = df_train[RISK_CLASS]

X_risk_test = df_test[features_risk]
y_risk_test = df_test[RISK_CLASS]

### Classificadores Únicos

In [None]:
# One registry per target for the single classifiers. Both start empty;
# NOTE(review): nothing in the visible part of the notebook fills them.
classifiers_real_return = {}
classifiers_risk = {}

### Optuna

In [None]:
def print_best_result(study, classifier, type):
    """Print the best score and hyperparameters found by a study.

    Args:
        study: object exposing ``best_value`` and ``best_params``
            (an Optuna study in this notebook).
        classifier: classifier name shown in the header line.
        type: target name shown in the header line.
    """
    header = f'{classifier} - {type}'
    print(header)

    best_score = study.best_value
    print('Melhor pontuação:', best_score)

    best_hyperparameters = study.best_params
    print('Melhores hiperparâmetros:', best_hyperparameters)

### Random Forest

##### Real Return

In [None]:
def objective_random_forest_return(trial):
    """Optuna objective: accuracy of a random forest on the real-return task.

    Samples the forest hyperparameters from ``trial``, fits on the real-return
    training split and returns the accuracy on the real-return test split.

    NOTE(review): hyperparameters are scored on the test split; if that split
    is also used for the final evaluation, the reported score is optimistic.
    """
    forest = RandomForestClassifier(
        n_estimators=trial.suggest_int('n_estimators', 10, 1000),
        max_depth=trial.suggest_int('max_depth', 2, 200),
        min_samples_split=trial.suggest_int('min_samples_split', 2, 10),
        min_samples_leaf=trial.suggest_int('min_samples_leaf', 1, 10),
        max_features=trial.suggest_categorical('max_features', ['sqrt', 'log2']),
        # Degenerate range: always 3, but still recorded in best_params.
        n_jobs=trial.suggest_int('n_jobs', 3, 3),
    )
    forest.fit(X_real_return_train, y_real_return_train)
    predicted = forest.predict(X_real_return_test)
    return accuracy_score(y_real_return_test, predicted)

In [None]:
# Random forest / real return: maximise the objective over N_TRIALS trials,
# with up to 3 trials running in parallel.
study = optuna.create_study(direction='maximize')
study.optimize(objective_random_forest_return, n_trials=N_TRIALS, n_jobs=3)

In [None]:
# Report the best score and hyperparameters of the random-forest real-return study.
print_best_result(study, 'Random Forest', 'Real Return')


##### Risk

In [None]:
def objective_random_forest_risk(trial):
    """Optuna objective: accuracy of a random forest on the risk task.

    Samples the forest hyperparameters from ``trial``, fits on the risk
    training split and returns the accuracy on the risk test split.

    NOTE(review): hyperparameters are scored on the test split; if that split
    is also used for the final evaluation, the reported score is optimistic.
    """
    forest = RandomForestClassifier(
        n_estimators=trial.suggest_int('n_estimators', 10, 1000),
        max_depth=trial.suggest_int('max_depth', 2, 200),
        min_samples_split=trial.suggest_int('min_samples_split', 2, 10),
        min_samples_leaf=trial.suggest_int('min_samples_leaf', 1, 10),
        max_features=trial.suggest_categorical('max_features', ['sqrt', 'log2']),
        # Degenerate range: always 3, but still recorded in best_params.
        n_jobs=trial.suggest_int('n_jobs', 3, 3),
    )
    forest.fit(X_risk_train, y_risk_train)
    predicted = forest.predict(X_risk_test)
    return accuracy_score(y_risk_test, predicted)

In [None]:
# Random forest / risk: maximise the objective over N_TRIALS trials, with up
# to 3 trials running in parallel.
study = optuna.create_study(direction='maximize')
study.optimize(objective_random_forest_risk, n_trials=N_TRIALS, n_jobs=3)

In [None]:
# Report the best score and hyperparameters of the random-forest risk study.
print_best_result(study, 'Random Forest', 'Risk')

### SVM

##### Real Return

In [None]:
def objective_svc_return(trial):
    """Optuna objective: accuracy of an SVC on the real-return task.

    Samples kernel, ``C`` and class weighting from ``trial``, fits on the
    real-return training split and returns the accuracy on the test split.

    NOTE(review): hyperparameters are scored on the test split; if that split
    is also used for the final evaluation, the reported score is optimistic.
    """
    svm = SVC(
        kernel=trial.suggest_categorical('kernel', ['poly', 'rbf', 'sigmoid']),
        C=trial.suggest_float('C', 1, 100),
        class_weight=trial.suggest_categorical('class_weight', ['balanced', None]),
    )
    svm.fit(X_real_return_train, y_real_return_train)
    predicted = svm.predict(X_real_return_test)
    return accuracy_score(y_real_return_test, predicted)

In [None]:
# SVM / real return: maximise the objective over N_TRIALS trials, with up to
# 3 trials running in parallel.
study = optuna.create_study(direction='maximize')
study.optimize(objective_svc_return, n_trials=N_TRIALS, n_jobs=3)

In [None]:
# Report the best score and hyperparameters of the SVM real-return study.
print_best_result(study, 'SVM', 'Real Return')

##### Risk

In [None]:
def objective_svc_risk(trial):
    """Optuna objective: accuracy of an SVC on the risk task.

    Samples kernel, ``C`` and class weighting from ``trial``, fits on the
    risk training split and returns the accuracy on the risk test split.

    NOTE(review): hyperparameters are scored on the test split; if that split
    is also used for the final evaluation, the reported score is optimistic.
    """
    svm = SVC(
        kernel=trial.suggest_categorical('kernel', ['poly', 'rbf', 'sigmoid']),
        C=trial.suggest_float('C', 1, 100),
        class_weight=trial.suggest_categorical('class_weight', ['balanced', None]),
    )
    svm.fit(X_risk_train, y_risk_train)
    predicted = svm.predict(X_risk_test)
    return accuracy_score(y_risk_test, predicted)

In [None]:
# SVM / risk: maximise the objective over N_TRIALS trials, with up to 3
# trials running in parallel.
study = optuna.create_study(direction='maximize')
study.optimize(objective_svc_risk, n_trials=N_TRIALS, n_jobs=3)

In [None]:
# Report the best score and hyperparameters of the SVM risk study.
print_best_result(study, 'SVM', 'Risk')

### Decision Tree

##### Real Return

In [None]:
def objective_decision_tree_return(trial):
    """Optuna objective: accuracy of a decision tree on the real-return task.

    Samples criterion, splitter, depth and minimum split size from ``trial``,
    fits on the real-return training split and returns the accuracy on the
    real-return test split.

    NOTE(review): hyperparameters are scored on the test split; if that split
    is also used for the final evaluation, the reported score is optimistic.
    """
    tree = DecisionTreeClassifier(
        criterion=trial.suggest_categorical('criterion', ['gini', 'entropy', 'log_loss']),
        splitter=trial.suggest_categorical('splitter', ['best', 'random']),
        max_depth=trial.suggest_int('max_depth', 2, 200),
        min_samples_split=trial.suggest_int('min_samples_split', 2, 2000),
    )
    tree.fit(X_real_return_train, y_real_return_train)
    predicted = tree.predict(X_real_return_test)
    return accuracy_score(y_real_return_test, predicted)

In [None]:
# Decision tree / real return: maximise the objective over N_TRIALS trials,
# with up to 3 trials running in parallel.
study = optuna.create_study(direction='maximize')
study.optimize(objective_decision_tree_return, n_trials=N_TRIALS, n_jobs=3)

In [None]:
# Report the best score and hyperparameters of the decision-tree real-return study.
print_best_result(study, 'DecisionTreeClassifier', 'Real Return')

##### Risk

In [None]:
def objective_decision_tree_risk(trial):
    """Optuna objective: accuracy of a decision tree on the risk task.

    Samples criterion, splitter, depth and minimum split size from ``trial``,
    fits on the risk training split and returns the accuracy on the risk test
    split.

    NOTE(review): hyperparameters are scored on the test split; if that split
    is also used for the final evaluation, the reported score is optimistic.
    """
    tree = DecisionTreeClassifier(
        criterion=trial.suggest_categorical('criterion', ['gini', 'entropy', 'log_loss']),
        splitter=trial.suggest_categorical('splitter', ['best', 'random']),
        max_depth=trial.suggest_int('max_depth', 2, 200),
        min_samples_split=trial.suggest_int('min_samples_split', 2, 2000),
    )
    tree.fit(X_risk_train, y_risk_train)
    predicted = tree.predict(X_risk_test)
    return accuracy_score(y_risk_test, predicted)

In [None]:
# Decision tree / risk: maximise the objective over N_TRIALS trials, with up
# to 3 trials running in parallel.
study = optuna.create_study(direction='maximize')
study.optimize(objective_decision_tree_risk, n_trials=N_TRIALS, n_jobs=3)

In [None]:
# Report the best score and hyperparameters of the decision-tree risk study.
print_best_result(study, 'DecisionTreeClassifier', 'Risk')


### Naive Bayes

##### Real Return

In [None]:
def objective_naive_bayes_return(trial):
    """Optuna objective: accuracy of Gaussian naive Bayes on the real-return task.

    Samples ``var_smoothing`` on a log scale from ``trial``, fits on the
    real-return training split and returns the accuracy on the test split.

    NOTE(review): hyperparameters are scored on the test split; if that split
    is also used for the final evaluation, the reported score is optimistic.
    """
    params = {
        # suggest_float(..., log=True) samples the same log-uniform range as
        # suggest_loguniform, which is deprecated since Optuna 3.0.
        'var_smoothing': trial.suggest_float('var_smoothing', 1e-12, 1e-5, log=True)
    }
    model = GaussianNB(**params)
    model.fit(X_real_return_train, y_real_return_train)
    preds = model.predict(X_real_return_test)

    return accuracy_score(y_real_return_test, preds)

In [None]:
# Naive Bayes / real return: maximise the objective over N_TRIALS trials,
# with up to 3 trials running in parallel.
study = optuna.create_study(direction='maximize')
study.optimize(objective_naive_bayes_return, n_trials=N_TRIALS, n_jobs=3)

In [None]:
# Report the best score and hyperparameters of the naive-Bayes real-return study.
print_best_result(study, 'Naive Bayes', 'Real Return')

##### Risk

In [None]:
def objective_naive_bayes_risk(trial):
    """Optuna objective: accuracy of Gaussian naive Bayes on the risk task.

    Samples ``var_smoothing`` on a log scale from ``trial``, fits on the risk
    training split and returns the accuracy on the risk test split.

    NOTE(review): hyperparameters are scored on the test split; if that split
    is also used for the final evaluation, the reported score is optimistic.
    """
    params = {
        # suggest_float(..., log=True) samples the same log-uniform range as
        # suggest_loguniform, which is deprecated since Optuna 3.0.
        'var_smoothing': trial.suggest_float('var_smoothing', 1e-12, 1e-5, log=True)
    }
    model = GaussianNB(**params)
    model.fit(X_risk_train, y_risk_train)
    preds = model.predict(X_risk_test)

    return accuracy_score(y_risk_test, preds)

In [None]:
# Naive Bayes / risk: maximise the objective over N_TRIALS trials, with up to
# 3 trials running in parallel.
study = optuna.create_study(direction='maximize')
study.optimize(objective_naive_bayes_risk, n_trials=N_TRIALS, n_jobs=3)

In [None]:
# Report the best score and hyperparameters of the naive-Bayes risk study.
print_best_result(study, 'Naive Bayes', 'Risk')

### Rede Neural

##### Real Return

In [None]:
def objective_neural_network_return(trial):
    """Optuna objective: accuracy of an MLP on the real-return task.

    Samples activation, solver, iteration budget, hidden-layer width and
    learning-rate schedule from ``trial``, fits on the real-return training
    split and returns the accuracy on the real-return test split.

    NOTE(review): hyperparameters are scored on the test split; if that split
    is also used for the final evaluation, the reported score is optimistic.
    """
    params = {
        'activation': trial.suggest_categorical('activation', ['identity', 'logistic', 'tanh', 'relu']),
        'solver': trial.suggest_categorical('solver', ['lbfgs', 'sgd', 'adam']),
        'max_iter': trial.suggest_int('max_iter', 200, 2000),
        # Optuna only supports None/bool/int/float/str as categorical choices;
        # tuples trigger a warning and cannot be persisted. The same ten
        # widths (100, 200, ..., 1000) are sampled as a stepped integer and
        # wrapped into a one-hidden-layer tuple for scikit-learn.
        'hidden_layer_sizes': (trial.suggest_int('hidden_layer_sizes', 100, 1000, step=100),),
        'learning_rate': trial.suggest_categorical('learning_rate', ['constant', 'invscaling', 'adaptive']),
    }

    model = MLPClassifier(**params)
    model.fit(X_real_return_train, y_real_return_train)
    preds = model.predict(X_real_return_test)

    return accuracy_score(y_real_return_test, preds)

In [None]:
# Neural network / real return: maximise the objective over N_TRIALS trials,
# with up to 3 trials running in parallel.
study = optuna.create_study(direction='maximize')
study.optimize(objective_neural_network_return, n_trials=N_TRIALS, n_jobs=3)

In [None]:
# Report the best score and hyperparameters of the neural-network real-return study.
print_best_result(study, 'Rede Neural', 'Real Return')

##### Risk

In [None]:
def objective_neural_network_risk(trial):
    """Optuna objective: accuracy of an MLP on the risk task.

    Samples activation, solver, iteration budget, hidden-layer width and
    learning-rate schedule from ``trial``, fits on the risk training split and
    returns the accuracy on the risk test split.

    NOTE(review): hyperparameters are scored on the test split; if that split
    is also used for the final evaluation, the reported score is optimistic.
    """
    params = {
        'activation': trial.suggest_categorical('activation', ['identity', 'logistic', 'tanh', 'relu']),
        'solver': trial.suggest_categorical('solver', ['lbfgs', 'sgd', 'adam']),
        'max_iter': trial.suggest_int('max_iter', 200, 2000),
        # Optuna only supports None/bool/int/float/str as categorical choices;
        # tuples trigger a warning and cannot be persisted. The same ten
        # widths (100, 200, ..., 1000) are sampled as a stepped integer and
        # wrapped into a one-hidden-layer tuple for scikit-learn.
        'hidden_layer_sizes': (trial.suggest_int('hidden_layer_sizes', 100, 1000, step=100),),
        'learning_rate': trial.suggest_categorical('learning_rate', ['constant', 'invscaling', 'adaptive']),
    }

    model = MLPClassifier(**params)
    model.fit(X_risk_train, y_risk_train)
    preds = model.predict(X_risk_test)

    return accuracy_score(y_risk_test, preds)

In [None]:
# Neural network / risk: maximise the objective over N_TRIALS trials, with up
# to 3 trials running in parallel.
study = optuna.create_study(direction='maximize')
study.optimize(objective_neural_network_risk, n_trials=N_TRIALS, n_jobs=3)

In [None]:
# Report the best score and hyperparameters of the neural-network risk study.
print_best_result(study, 'Rede Neural', 'Risk')

### Regressão Logística

##### Real Return

In [None]:
def objective_logistic_regression_return(trial):
    """Optuna objective: accuracy of logistic regression on the real-return task.

    Samples penalty, ``C``, class weighting, solver and iteration budget from
    ``trial``, fits on the real-return training split and returns the accuracy
    on the real-return test split.

    NOTE(review): hyperparameters are scored on the test split; if that split
    is also used for the final evaluation, the reported score is optimistic.
    """
    regression = LogisticRegression(
        penalty=trial.suggest_categorical('penalty', ['l2']),
        C=trial.suggest_float('C', 1, 100),
        class_weight=trial.suggest_categorical('class_weight', ['balanced', None]),
        solver=trial.suggest_categorical('solver', ['lbfgs', 'liblinear', 'newton-cg', 'newton-cholesky', 'sag', 'saga']),
        max_iter=trial.suggest_int('max_iter', 100, 10000),
        # Degenerate range: always 3, but still recorded in best_params.
        n_jobs=trial.suggest_int('n_jobs', 3, 3),
    )
    regression.fit(X_real_return_train, y_real_return_train)
    predicted = regression.predict(X_real_return_test)
    return accuracy_score(y_real_return_test, predicted)

In [None]:
# Logistic regression / real return: maximise the objective over N_TRIALS
# trials, with up to 3 trials running in parallel.
study = optuna.create_study(direction='maximize')
study.optimize(objective_logistic_regression_return, n_trials=N_TRIALS, n_jobs=3)

In [None]:
# Report the best score and hyperparameters of the logistic-regression real-return study.
print_best_result(study, 'Logistic Regression', 'Real Return')

##### Risk

In [None]:
def objective_logistic_regression_risk(trial):
    """Optuna objective: accuracy of logistic regression on the risk task.

    Samples penalty, ``C``, class weighting, solver and iteration budget from
    ``trial``, fits on the risk training split and returns the accuracy on the
    risk test split.

    NOTE(review): hyperparameters are scored on the test split; if that split
    is also used for the final evaluation, the reported score is optimistic.
    """
    regression = LogisticRegression(
        penalty=trial.suggest_categorical('penalty', ['l2']),
        C=trial.suggest_float('C', 1, 100),
        class_weight=trial.suggest_categorical('class_weight', ['balanced', None]),
        solver=trial.suggest_categorical('solver', ['lbfgs', 'liblinear', 'newton-cg', 'newton-cholesky', 'sag', 'saga']),
        max_iter=trial.suggest_int('max_iter', 100, 10000),
        # Degenerate range: always 3, but still recorded in best_params.
        n_jobs=trial.suggest_int('n_jobs', 3, 3),
    )
    regression.fit(X_risk_train, y_risk_train)
    predicted = regression.predict(X_risk_test)
    return accuracy_score(y_risk_test, predicted)

In [None]:
# Logistic regression / risk: maximise the objective over N_TRIALS trials,
# with up to 3 trials running in parallel.
study = optuna.create_study(direction='maximize')
study.optimize(objective_logistic_regression_risk, n_trials=N_TRIALS, n_jobs=3)

In [None]:
# Report the best score and hyperparameters of the logistic-regression risk study.
print_best_result(study, 'Logistic Regression', 'Risk')

### KNeighborsClassifier

##### Real Return

In [None]:
def objective_kn_return(trial):
    """Optuna objective: accuracy of k-nearest neighbours on the real-return task.

    Samples neighbour count, weighting, search algorithm, leaf size and
    Minkowski power ``p`` from ``trial``, fits on the real-return training
    split and returns the accuracy on the real-return test split.

    NOTE(review): hyperparameters are scored on the test split; if that split
    is also used for the final evaluation, the reported score is optimistic.
    """
    knn = KNeighborsClassifier(
        n_neighbors=trial.suggest_int('n_neighbors', 5, 200),
        weights=trial.suggest_categorical('weights', ['uniform', 'distance']),
        algorithm=trial.suggest_categorical('algorithm', ['ball_tree', 'kd_tree', 'brute']),
        leaf_size=trial.suggest_int('leaf_size', 30, 100),
        p=trial.suggest_int('p', 1, 3),
        # Degenerate range: always 3, but still recorded in best_params.
        n_jobs=trial.suggest_int('n_jobs', 3, 3),
    )
    knn.fit(X_real_return_train, y_real_return_train)
    predicted = knn.predict(X_real_return_test)
    return accuracy_score(y_real_return_test, predicted)

In [None]:
# k-nearest neighbours / real return: maximise the objective over N_TRIALS
# trials, with up to 3 trials running in parallel.
study = optuna.create_study(direction='maximize')
study.optimize(objective_kn_return, n_trials=N_TRIALS, n_jobs=3)

In [None]:
# Report the best score and hyperparameters of the k-nearest-neighbours real-return study.
print_best_result(study, 'KNeighborsClassifier', 'Real Return')

##### Risk

In [None]:
def objective_kn_risk(trial):
    """Optuna objective: accuracy of k-nearest neighbours on the risk task.

    Samples neighbour count, weighting, search algorithm, leaf size and
    Minkowski power ``p`` from ``trial``, fits on the risk training split and
    returns the accuracy on the risk test split.

    NOTE(review): hyperparameters are scored on the test split; if that split
    is also used for the final evaluation, the reported score is optimistic.
    """
    knn = KNeighborsClassifier(
        n_neighbors=trial.suggest_int('n_neighbors', 5, 200),
        weights=trial.suggest_categorical('weights', ['uniform', 'distance']),
        algorithm=trial.suggest_categorical('algorithm', ['ball_tree', 'kd_tree', 'brute']),
        leaf_size=trial.suggest_int('leaf_size', 30, 100),
        p=trial.suggest_int('p', 1, 3),
        # Degenerate range: always 3, but still recorded in best_params.
        n_jobs=trial.suggest_int('n_jobs', 3, 3),
    )
    knn.fit(X_risk_train, y_risk_train)
    predicted = knn.predict(X_risk_test)
    return accuracy_score(y_risk_test, predicted)

In [None]:
# k-nearest neighbours / risk: maximise the objective over N_TRIALS trials,
# with up to 3 trials running in parallel.
study = optuna.create_study(direction='maximize')
study.optimize(objective_kn_risk, n_trials=N_TRIALS, n_jobs=3)

In [None]:
# Report the best score and hyperparameters of the k-nearest-neighbours risk study.
print_best_result(study, 'KNeighborsClassifier', 'Risk')