# Multiple Classifiers Ensemble System (MCS)

#### Iury Zanonni de Faria

### Imports

#### General imports

In [None]:
import pandas as pd
import numpy as np
import datetime as dt
import statistics as st
import matplotlib.pyplot as plt
import warnings

#### Feature Selection imports

In [None]:
from sklearn.feature_selection import mutual_info_classif
# Info gain - App do passarinho

#### Diversity imports

In [None]:
from sklearn.model_selection import KFold
from sklearn.ensemble import BaggingClassifier
from sklearn.ensemble import AdaBoostClassifier
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.model_selection import cross_val_score

#### Classifiers imports

In [None]:
import xgboost
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.neural_network import MLPClassifier


#### k-means

In [None]:
from sklearn.metrics import silhouette_score, silhouette_samples
from sklearn.cluster import KMeans

### Settings

In [None]:
# Non-target columns of the dataset: the date, the financial indicators,
# the market return and the company identifier.
DATA = [
    'Date',
    'Current Ratio',
    'Quick Ratio',
    'Current Assets',
    'Long-term debt to equity ratio',
    'Share Holder Equity',
    'Debt to Equity Ratio',
    'Percentage of net profit to sale',
    'Percentage of operating profit to sale',
    'Percentage of Gross profit to sale',
    'ROA',
    'ROE',
    'EPS',
    'P/E',
    'P/S',
    'Stock book value',
    'Stock Price',
    'ROI',
    'MarketReturn',
    'Company',
]

# Names of the target columns (continuous value and its class label).
REAL_RETURN_CLASS = "RealReturnClass"
REAL_RETURN = "RealReturn"
RISK_CLASS = 'RiskClass'
RISK = "Risk"

# Class labels as they appear in the dataset.
HIGH = 'high'
MEDIUM = 'medium'
LOW = 'low'

N_PERIODS = 2
# Number of top-ranked features kept after feature selection.
N_FEATURES = 15

# The dataset file name is parameterised by N_PERIODS.
DATASET_PATH = f'dataset/process_final_reverse_{N_PERIODS}.csv'

# Silence all warnings in the notebook output.
warnings.filterwarnings('ignore')

### Import dataset

In [None]:
# Load the dataset from the CSV file at DATASET_PATH into a DataFrame.
dataset = pd.read_csv(DATASET_PATH)

### Data Frequency

In [None]:
def plotResults(dataset:pd.DataFrame, title_1:str, title_2:str):
  """Draw two side-by-side bar charts of the class frequencies.

  The left chart shows the value counts of the REAL_RETURN_CLASS column and
  the right chart the value counts of the RISK_CLASS column.

  Args:
    dataset: frame containing the REAL_RETURN_CLASS and RISK_CLASS columns.
    title_1: title of the left (real return) chart.
    title_2: title of the right (risk) chart.
  """
  fig, axes = plt.subplots(nrows=1, ncols=2)
  fig.set_size_inches(15, 5)

  # One (column, title) pair per subplot, left to right.
  panels = ((REAL_RETURN_CLASS, title_1), (RISK_CLASS, title_2))
  for axis, (column, title) in zip(axes, panels):
    dataset[column].value_counts().plot.bar(ax=axis)
    axis.set_title(title)

plotResults(dataset, "Real Return", "Risk")

### Feature Selection

In [None]:
# Encode the class labels as floats (high=2.0, medium=1.0, low=0.0).
# The replacement is applied to the whole frame, one label at a time.
for label, code in ((HIGH, 2.0), (MEDIUM, 1.0), (LOW, 0.0)):
  dataset = dataset.replace(to_replace=[label], value=code)

# Features: everything except the targets, the date and the company.
non_feature_columns = [REAL_RETURN_CLASS, REAL_RETURN, RISK_CLASS, RISK, "Date", "Company"]
dataset_X = dataset.drop(columns=non_feature_columns)
# Targets: whatever remains once the DATA columns are removed.
dataset_y = dataset.drop(columns=DATA)

#### Real Return

In [None]:
# Mutual information between every feature and the real-return class.
# The features are treated as continuous (discrete_features=False): with
# discrete_features=True every distinct float becomes its own category, which
# makes the scores uninformative for ratio-like columns.
# NOTE(review): assumes the feature columns are continuous — confirm against
# the dataset. random_state fixes the estimator's random noise so the
# ranking is reproducible between runs.
rank_real_return = mutual_info_classif(
    dataset_X,
    dataset_y[REAL_RETURN_CLASS],
    discrete_features=False,
    random_state=42,
)

# Map each feature name to its score.
result_real_return = dict(zip(dataset_X.columns, rank_real_return))

# Sort ascending by score, then reverse to get the best features first.
final_ranking_real_return = sorted(result_real_return.items(), key=lambda x: x[1])
final_ranking_real_return.reverse()
final_ranking_real_return

#### Risk

In [None]:
# Mutual information between every feature and the risk class.
# The features are treated as continuous (discrete_features=False): with
# discrete_features=True every distinct float becomes its own category, which
# makes the scores uninformative for ratio-like columns.
# NOTE(review): assumes the feature columns are continuous — confirm against
# the dataset. random_state fixes the estimator's random noise so the
# ranking is reproducible between runs.
rank_risk = mutual_info_classif(
    dataset_X,
    dataset_y[RISK_CLASS],
    discrete_features=False,
    random_state=42,
)

# Map each feature name to its score.
result_risk = dict(zip(dataset_X.columns, rank_risk))

# Sort ascending by score, then reverse to get the best features first.
final_ranking_risk = sorted(result_risk.items(), key=lambda x: x[1])
final_ranking_risk.reverse()
final_ranking_risk

In [None]:
def getColumnsRank(rank: list):
  """Return the first element of every entry in *rank*, in the same order.

  Args:
    rank: sequence of indexable entries, e.g. (column name, score) pairs.

  Returns:
    A list with the first item of each entry.
  """
  return [entry[0] for entry in rank]

### SSCA

O cálculo será feito com o número total de features ou somente com as 15 mais bem ranqueadas?

In [None]:

# Empty list of SSCA results; the loop that would fill it is commented out below.
SSCAS = []

# Names of the N_FEATURES best-ranked features for the real-return class.
features_return = getColumnsRank(final_ranking_real_return[:N_FEATURES])

#dataset_X = dataset.drop(columns=[REAL_RETURN_CLASS, REAL_RETURN, RISK_CLASS, RISK, "Date", "Company"])
# Restrict the feature matrix to the selected columns.
dataset_X = dataset.loc[:, features_return]
dataset_y = dataset.drop(DATA, axis=1)

dataset_X

In [None]:
# for cluster in range(2, 7):
#   clusterer = KMeans(n_clusters=cluster, random_state=10)

#   cluster_labels = clusterer.fit_predict(dataset_X)

#   silhouette_values = silhouette_samples(dataset_X, cluster_labels)
#   #silhouette_avg = silhouette_score(dataset_X, cluster_labels)

#   sum_count = 0
#   count_2 = 0

#   # First summation (over the clusters)
#   for k in range(cluster):
#     count = 0
#     n_j = 0

#     # Second summation (over the samples of cluster k)
#     for j in range(len(cluster_labels)):
#       if cluster_labels[j] == k:
#         n_j += 1
#         count += silhouette_values[j]

#     count_2 = (count/n_j)
#     sum_count += count_2

#   SSCA = (sum_count/cluster)
#   SSCAS.append((cluster, round(SSCA, 2)))

# SSCAS

### Classificadores Únicos

<s>RandomForestClassifier</s>

SVC

DecisionTreeClassifier

GaussianNB

MLPClassifier

xgboost

#### Divisão do dataset

In [None]:
# Column layout of the train/test frames: the DATA columns followed by the
# continuous targets and their class labels.
# A new list is built on purpose: `columns_dataset = DATA` followed by
# append() would mutate DATA itself, so re-running this cell would duplicate
# the target columns and `dataset.drop(columns=DATA)` would start dropping
# the targets as well.
columns_dataset = [*DATA, REAL_RETURN, RISK, REAL_RETURN_CLASS, RISK_CLASS]

# Empty frames that fix the column order of the split.
df_train = pd.DataFrame(columns=columns_dataset)
df_test = pd.DataFrame(columns=columns_dataset)

In [None]:
# Date boundaries of the chronological train/test split.
TRAINING_START_DATE =  dt.datetime.strptime('2009-03-31', "%Y-%m-%d")
TRAINING_END_DATE =  dt.datetime.strptime('2018-03-31', "%Y-%m-%d")

TEST_START_DATE =  dt.datetime.strptime('2018-06-30', "%Y-%m-%d")
TEST_END_DATE =  dt.datetime.strptime('2022-03-31', "%Y-%m-%d")

dataset_sort = dataset.sort_values(by='Date')

# A row is training data when its (year, month) is strictly before the
# (year, month) of TEST_START_DATE, i.e. when its date is before the first
# day of the test start month. Everything else is test data.
# The split uses a boolean mask instead of row-by-row DataFrame.append:
# append was removed in pandas 2.0, copies the frame on every call, and
# duplicated rows when the cell was re-run.
row_dates = pd.to_datetime(dataset_sort['Date'], format="%Y-%m-%d")
is_train = row_dates < TEST_START_DATE.replace(day=1)

# Keep the column order of the pre-built df_train frame; dataset columns not
# listed there go at the end (the same layout append() produced).
column_order = list(dict.fromkeys([*df_train.columns, *dataset_sort.columns]))
df_train = dataset_sort.loc[is_train].reindex(columns=column_order)
df_test = dataset_sort.loc[~is_train].reindex(columns=column_order)

count_train = len(df_train)
count_test = len(df_test)

print(count_train)
print(count_test)

# Drop the continuous targets and the identifier columns; only the features
# and the class labels remain.
df_train = df_train.drop(columns=[REAL_RETURN, RISK, "Date", "Company"])
df_test = df_test.drop(columns=[REAL_RETURN, RISK, "Date", "Company"])

#### Random Forest

In [None]:
# Separate the training frame into the two class targets and the features.
y_real_return_train = df_train[REAL_RETURN_CLASS]
y_risk_train = df_train[RISK_CLASS]
X_train = df_train.drop([REAL_RETURN_CLASS, RISK_CLASS], axis=1)

# Class frequencies of the training set.
plotResults(df_train, "Real Return", "Risk")


In [None]:
# Separate the test frame into the two class targets and the features.
y_real_return_test = df_test[REAL_RETURN_CLASS]
y_risk_test = df_test[RISK_CLASS]
X_test = df_test.drop([REAL_RETURN_CLASS, RISK_CLASS], axis=1)

# Class frequencies of the test set.
plotResults(df_test, "Real Return", "Risk")

##### Real Return

In [None]:
# Random forest for the real-return class; fit() returns the fitted estimator.
randon_forest_return = RandomForestClassifier(random_state=42, max_depth=10).fit(
    X_train, y_real_return_train
)

# Score on the test split (shown as the cell output).
randon_forest_return.score(X_test, y_real_return_test)

##### Risk

In [None]:
# Random forest for the risk class; fit() returns the fitted estimator.
randon_forest_risk = RandomForestClassifier(random_state=10, max_depth=100).fit(
    X_train, y_risk_train
)

# Score on the test split (shown as the cell output).
randon_forest_risk.score(X_test, y_risk_test)

#### SVM

##### Real Return

In [None]:
# Fit one SVC per kernel on the real-return class and print its test score.
for kernel in ("linear", "rbf", "poly"):
  svm = SVC(gamma=10, kernel=kernel)
  svm.fit(X_train, y_real_return_train)
  kernel_score = svm.score(X_test, y_real_return_test)

  print(kernel, kernel_score)

##### Risk

In [None]:
# Fit one SVC per kernel on the risk class and print its test score.
for kernel in ("linear", "rbf", "poly"):
  svm = SVC(gamma=10, kernel=kernel)
  svm.fit(X_train, y_risk_train)
  kernel_score = svm.score(X_test, y_risk_test)

  print(kernel, kernel_score)