In [40]:
# import de bibliotecas
import pandas as pd 
from sklearn.model_selection import train_test_split
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier
#from sklearn.model_selection import cross_val_score
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC
from sklearn.neural_network import MLPClassifier

In [26]:
# Leitura do arquivo txt
colunas = ['variance', 'skewness', 'curtosis', 'entropy', 'clazz']
bank_note_df = pd.read_csv('./data/data_banknote_authentication.txt', sep=',', header=None, names=colunas)
bank_note_df.head()

Unnamed: 0,variance,skewness,curtosis,entropy,clazz
0,3.6216,8.6661,-2.8073,-0.44699,0
1,4.5459,8.1674,-2.4586,-1.4621,0
2,3.866,-2.6383,1.9242,0.10645,0
3,3.4566,9.5228,-4.0112,-3.5944,0
4,0.32924,-4.4552,4.5718,-0.9888,0


In [27]:
bank_note_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1372 entries, 0 to 1371
Data columns (total 5 columns):
 #   Column    Non-Null Count  Dtype  
---  ------    --------------  -----  
 0   variance  1372 non-null   float64
 1   skewness  1372 non-null   float64
 2   curtosis  1372 non-null   float64
 3   entropy   1372 non-null   float64
 4   clazz     1372 non-null   int64  
dtypes: float64(4), int64(1)
memory usage: 53.7 KB


In [28]:
bank_note_df.describe()

Unnamed: 0,variance,skewness,curtosis,entropy,clazz
count,1372.0,1372.0,1372.0,1372.0,1372.0
mean,0.433735,1.922353,1.397627,-1.191657,0.444606
std,2.842763,5.869047,4.31003,2.101013,0.497103
min,-7.0421,-13.7731,-5.2861,-8.5482,0.0
25%,-1.773,-1.7082,-1.574975,-2.41345,0.0
50%,0.49618,2.31965,0.61663,-0.58665,0.0
75%,2.821475,6.814625,3.17925,0.39481,1.0
max,6.8248,12.9516,17.9274,2.4495,1.0


In [37]:
# Porcentagem de cedulas falsas no dataset
(bank_note_df.query('clazz==1').clazz.sum() / len(bank_note_df.clazz)) * 100

44.460641399416915

In [38]:
# Qual o valor da correlação de Pearson entre as variáveis skewness e curtosis
from scipy import stats
stats.pearsonr(bank_note_df['skewness'],bank_note_df['curtosis'])

(-0.7868952243065794, 1.6277753903657872e-289)

In [45]:
# divide em dados de entrada e saida
X = bank_note_df[['variance', 'skewness', 'curtosis', 'entropy']]
y = bank_note_df[['clazz']]

In [46]:
# divide os dados em treinamento e teste
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.30, random_state=1)

In [53]:
# a. Algoritmo KNN:
clf_KNN = KNeighborsClassifier(n_neighbors=5)
clf_KNN.fit(X_train, y_train)
y_pred = clf_KNN.predict(X_test)

In [56]:
# Qual a acurácia do KNN no conjunto de teste? 
from sklearn.metrics import accuracy_score
accuracy_score(y_test, y_pred)

0.9975728155339806

In [57]:
# b. Algoritmo Árvore de Decisão (Decision Tree):
clf_arvore = DecisionTreeClassifier(random_state=1)
clf_arvore.fit(X_train, y_train)
y_pred_arvore = clf_arvore.predict(X_test)

In [58]:
# Qual a acurácia da Árvore de Decisão no conjunto de teste? 
accuracy_score(y_test, y_pred_arvore)

0.9878640776699029

In [59]:
# c. Algoritmo Floresta Aleatória (Random Forest):
clf_floresta = RandomForestClassifier(max_depth= 8, random_state=1)
clf_floresta.fit(X_train, y_train)
y_pred_rand = clf_floresta.predict(X_test)

In [60]:
# Qual a acurácia do Random Forest no conjunto de teste? 
accuracy_score(y_test, y_pred_rand)

0.9951456310679612

In [61]:
# Analisando o valor da importância relativa das features do Random Forest (atributo feature_importances_), qual feature melhor contribuiu para a predição de class? (['variance', 'skewness', 'curtosis', 'entropy'])
clf_floresta.feature_importances_

array([0.56066718, 0.24058589, 0.14022782, 0.05851911])

In [62]:
# d. Algoritmo SVM:
clf_svm = SVC(gamma='auto',kernel='rbf', random_state=1)
clf_svm.fit(X_train, y_train)
y_pred_svc = clf_svm.predict(X_test)

In [63]:
# Qual a acurácia do SVM no conjunto de teste? 
accuracy_score(y_test, y_pred_svc)

1.0

In [64]:
# e. Algoritmo Rede MLP:
clf_mlp = MLPClassifier(hidden_layer_sizes=(2,), solver='lbfgs', random_state=1)
clf_mlp.fit(X_train, y_train)
y_pred_mlp = clf_mlp.predict(X_test)

In [65]:
# Qual a acurácia da rede MLP no conjunto de teste? 
accuracy_score(y_test, y_pred_mlp)

1.0