In [1]:
# Numpy como dependência do Scikit-learn e Pandas
# Scikit-learn para os algoritmos de ML e funções cômodas
# Pandas para visualização e manipulação rápida e fácil de dados
import sklearn
import numpy as np
import pandas as pd
import sklearn.datasets as datasets
import sklearn.preprocessing as preprocessing
from sklearn.model_selection import train_test_split
from sklearn.metrics import confusion_matrix

In [2]:
# Versões que estou usando
print(sklearn.__version__ + ' ' + np.__version__ +  ' ' + pd.__version__)

0.20.3 1.16.2 0.24.2


In [3]:
# Criação de um DataFrame do pandas
def sklearn_to_df(sklearn_dataset):
    df = pd.DataFrame(sklearn_dataset.data, columns=sklearn_dataset.feature_names)
    df['target'] = pd.Series(sklearn_dataset.target)
    return df

In [4]:
# Dataset Iris
#dataset = datasets.load_iris()

# Dataset Breast Cancer Wisconsin
#dataset = datasets.load_breast_cancer()

# Dataset Wine
dataset = datasets.load_wine()

# Dataset com 1000 imagens de 10 categorias
# csv_dataset = pd.read_csv('https://raw.githubusercontent.com/jgckruger/machine-learning/master/image_classifiers/mono_feat.csv')
# feature_names = list(set(dataset.iloc[::,-1]))
# target = dataset.iloc[::,-1]
# data = dataset.iloc[0::,0:827:]


In [5]:
# Estratificação dos dados, split em teste e treino
X_train, X_test, y_train, y_test = train_test_split(dataset.data, dataset.target, stratify=dataset.target, test_size = 0.3, random_state=4)
#train_df = pd.DataFrame(X_train)
#train_df['target'] = pd.Series(y_train)
#train_df.head()

In [6]:
# Criação de um DataFrame do Panda a partir do dataset
df_dataset = sklearn_to_df(dataset)
print(df_dataset.shape)
df_dataset.head()

(178, 14)


Unnamed: 0,alcohol,malic_acid,ash,alcalinity_of_ash,magnesium,total_phenols,flavanoids,nonflavanoid_phenols,proanthocyanins,color_intensity,hue,od280/od315_of_diluted_wines,proline,target
0,14.23,1.71,2.43,15.6,127.0,2.8,3.06,0.28,2.29,5.64,1.04,3.92,1065.0,0
1,13.2,1.78,2.14,11.2,100.0,2.65,2.76,0.26,1.28,4.38,1.05,3.4,1050.0,0
2,13.16,2.36,2.67,18.6,101.0,2.8,3.24,0.3,2.81,5.68,1.03,3.17,1185.0,0
3,14.37,1.95,2.5,16.8,113.0,3.85,3.49,0.24,2.18,7.8,0.86,3.45,1480.0,0
4,13.24,2.59,2.87,21.0,118.0,2.8,2.69,0.39,1.82,4.32,1.04,2.93,735.0,0


In [7]:
# Normalização dos dados
scaler = preprocessing.StandardScaler()

# Sem trapaças, calibrar o scaler apenas nos dados de treino
scaler.fit(X_train)

S_train = scaler.transform(X_train)
S_test = scaler.transform(X_test)

In [8]:
# KNN
from sklearn.neighbors import KNeighborsClassifier as KNN

knn = KNN(n_neighbors=5)
knn.fit(S_train, y_train)
knn_predict = knn.predict(S_test)
print(knn.score(S_test, y_test))

knn2 = KNN(n_neighbors=5)
knn2.fit(X_train, y_train)
print(knn2.score(X_test, y_test))

pd.DataFrame(confusion_matrix(y_test, knn_predict),columns = dataset.target_names, index = dataset.target_names)

0.9444444444444444
0.7407407407407407


Unnamed: 0,class_0,class_1,class_2
class_0,18,0,0
class_1,2,18,1
class_2,0,0,15


In [9]:
# Árvore de decisão
from sklearn.tree import DecisionTreeClassifier as DT

dt = DT(criterion="entropy")
dt.fit(S_train, y_train)
dt_predict = dt.predict(S_test)
print(dt.score(S_test, y_test))

pd.DataFrame(confusion_matrix(y_test, dt_predict),columns = dataset.target_names, index = dataset.target_names)

0.9074074074074074


Unnamed: 0,class_0,class_1,class_2
class_0,18,0,0
class_1,1,18,2
class_2,1,1,13


In [10]:
# Máquina de vetor de suporte
from sklearn.svm import LinearSVC as LSVC, NuSVC as NSVC, SVC as SVC

svm = NSVC(kernel='rbf', degree=1)
svm.fit(S_train, y_train)
svm_predict = svm.predict(S_test)
print(svm.score(S_test, y_test))

pd.DataFrame(confusion_matrix(y_test, svm_predict),columns = dataset.target_names, index = dataset.target_names)

0.9814814814814815


Unnamed: 0,class_0,class_1,class_2
class_0,18,0,0
class_1,0,21,0
class_2,0,1,14


In [11]:
# Rede neural
from sklearn.neural_network import MLPClassifier as MLP

mlp = MLP(alpha = 0.001, activation='relu', max_iter=1000)
mlp.fit(S_train, y_train)
mlp_predict = mlp.predict(S_test)
print(svm.score(S_test, y_test))

pd.DataFrame(confusion_matrix(y_test, mlp_predict),columns = dataset.target_names, index = dataset.target_names)

0.9814814814814815


Unnamed: 0,class_0,class_1,class_2
class_0,18,0,0
class_1,0,21,0
class_2,0,0,15


In [12]:
# Processo gaussiano
from sklearn.gaussian_process import GaussianProcessClassifier as GPC

gpc = GPC()
gpc.fit(S_train, y_train)
gpc_predict = gpc.predict(S_test)
print(gpc.score(S_test, y_test))

pd.DataFrame(confusion_matrix(y_test, gpc_predict),columns = dataset.target_names, index = dataset.target_names)

0.9814814814814815


Unnamed: 0,class_0,class_1,class_2
class_0,18,0,0
class_1,0,20,1
class_2,0,0,15


In [13]:
# Ensemble por voto
from sklearn.ensemble import VotingClassifier as EVC

evc = EVC(estimators = [('dt',dt),('svm', svm), ('mlp', mlp)], voting='hard')
evc.fit(S_train, y_train)
evc_predict = evc.predict(S_test)
print(evc.score(S_test, y_test))

pd.DataFrame(confusion_matrix(y_test, evc_predict),columns = dataset.target_names, index = dataset.target_names)

1.0


Unnamed: 0,class_0,class_1,class_2
class_0,18,0,0
class_1,0,21,0
class_2,0,0,15


In [14]:
# Ensemble por bagging
from sklearn.ensemble import BaggingClassifier as BAG

bag = BAG(base_estimator = svm)
bag.fit(S_train, y_train)
bag_predict = bag.predict(S_test)
print(evc.score(S_test, y_test))

pd.DataFrame(confusion_matrix(y_test, bag_predict),columns = dataset.target_names, index = dataset.target_names)

1.0


Unnamed: 0,class_0,class_1,class_2
class_0,18,0,0
class_1,0,21,0
class_2,0,1,14
