# Carregamento da Base

In [None]:
import numpy as np
import pandas as pd

In [None]:
# Load the dataset and preview its first rows.
# header=0    -> the first line of the file supplies the column names
# index_col=0 -> the first column becomes the row index
df1 = pd.read_csv('kddCup.txt', sep=' ', header=0, index_col=0)
df1.head()

In [None]:
# Dataset dimensions as (rows, columns)
df1.shape

# Converte atributos categóricos em binário

In [None]:
!pip install category-encoders
import category_encoders as ce
encoder = ce.BinaryEncoder()
df_binary = encoder.fit_transform(df1.loc[:,['flag', 'protocol_type']])
df_binary.head()

# Divisão em treino e teste

In [None]:
# Rebuild the working frame: binary-encoded categorical columns first,
# followed by every original column from 'src_bytes' through 'out'.
df1 = pd.concat(
    [df_binary, df1.loc[:, 'src_bytes':'out']],
    axis='columns',
)
df1.head()

In [None]:
# Seed NumPy's global random generator, then record the number of rows.
np.random.seed(1)
nlinhas = len(df1)
nlinhas

In [None]:
from sklearn.model_selection import train_test_split

# Hold out 20% of the rows for testing. Features are the columns from
# 'flag_0' through 'dst_host_rerror_rate'; the target is the 'out' column.
# random_state pins the shuffle to this call, so the partition no longer
# depends on the global NumPy seed cell having been run immediately before.
x_train, x_test, y_train, y_test = train_test_split(
    df1.loc[:, 'flag_0':'dst_host_rerror_rate'],
    df1.loc[:, 'out'],
    test_size=0.2,
    random_state=1,
)

In [None]:
# Shapes of the train/test partitions, one per line.
print(x_train.shape, y_train.shape, x_test.shape, y_test.shape, sep='\n')

# Normalização

In [None]:
from sklearn import preprocessing
# Fit a StandardScaler on the training features only; the same fitted
# scaler is applied to both partitions afterwards.
preprocessParams = preprocessing.StandardScaler().fit(x_train)

In [None]:
# Scale both partitions with the scaler that was fitted on the training set.
X_train_normalized, X_test_normalized = map(
    preprocessParams.transform, (x_train, x_test)
)

# Treinamento da Rede Neural

In [None]:
from keras import Sequential
from keras.layers import Dense

In [None]:
# Feed-forward network: one hidden layer of 10 sigmoid units followed by
# a 3-unit sigmoid output layer.
RN = Sequential([
    Dense(10, input_shape=X_train_normalized.shape[1:], activation='sigmoid'),
    Dense(3, activation='sigmoid'),
])
RN.summary()

In [None]:
# Class frequencies in the training labels
y_train.value_counts()

In [None]:
# Class frequencies in the test labels
y_test.value_counts()

In [None]:
# One-hot ("dummy") encoding of the class labels
from sklearn.preprocessing import OneHotEncoder

# Categories are learned from the training labels only; with
# handle_unknown='ignore', a label unseen during fit encodes as all zeros.
encoder = OneHotEncoder(handle_unknown='ignore').fit(pd.DataFrame(y_train))

# Replace both label vectors with their dense one-hot matrices.
y_train, y_test = (
    encoder.transform(pd.DataFrame(labels)).toarray()
    for labels in (y_train, y_test)
)

In [None]:
# First five one-hot encoded training labels
y_train[:5]

In [None]:
# Train with the 'sgd' optimizer on a mean-squared-error loss, tracking accuracy.
RN.compile(
    loss='mean_squared_error',
    optimizer='sgd',
    metrics=['accuracy'],
)
# Deliberately few epochs (10); 20% of the training data is held out
# for validation.
history = RN.fit(
    X_train_normalized,
    y_train,
    epochs=10,
    validation_split=0.2,
)

In [None]:
# Training curves: loss on the training and validation data per epoch
import matplotlib.pyplot as plt

plt.title('Loss de treino e validação')
plt.xlabel('época')
plt.ylabel('loss')
plt.plot(history.history['loss'], label='train')
plt.plot(history.history['val_loss'], label='validation')
plt.legend();

# Avaliando resultados

In [None]:
# Evaluate on the held-out test set; score[0] is the loss and score[1]
# the accuracy metric requested at compile time.
score = RN.evaluate(X_test_normalized, y_test, verbose=0)
print(f'Test score: {score[0]}')
print(f'Test accuracy: {score[1]}')

In [None]:
# Run the network on the test set
from sklearn.metrics import confusion_matrix
# Raw network outputs for every test row (one value per output unit)
y_test_predicted = RN.predict(X_test_normalized)
y_test_predicted

In [None]:
# Collapse network outputs and one-hot targets to class indices
# (position of the largest value in each row).
y_test_predicted_indexes = y_test_predicted.argmax(axis=1)
y_test_indexes = y_test.argmax(axis=1)

In [None]:
# Predicted class index for each test row
y_test_predicted_indexes

In [None]:
# Confusion matrix.
# NOTE: the predictions are passed first, so rows index the predicted class
# and columns the real class (the transpose of scikit-learn's usual layout).
# labels=[0, 1, 2] forces a 3x3 result even when a class is missing from both
# vectors, so the shape always matches the three hard-coded names below.
confMatrix = confusion_matrix(
    y_test_predicted_indexes,
    y_test_indexes,
    labels=[0, 1, 2],
)
pd.DataFrame(
    confMatrix,
    index=['Pred Neptune', 'Pred Normal', 'Pred Smurf'],
    columns=['Real Neptune', 'Real Normal', 'Real Smurf'],
)

In [None]:
import seaborn as sns

# Heat-map view of the confusion matrix (rows: predicted, columns: real)
ax = plt.subplot()
sns.heatmap(confMatrix, annot=True, fmt=".0f", cmap=plt.cm.Blues, ax=ax)
ax.set_title('Matriz de Confusão')
ax.set_xlabel('Real')
ax.set_ylabel('Previsto')

# Name the class ticks on both axes
ax.set_xticklabels(['Neptune', 'Normal', 'Smurf'])
ax.set_yticklabels(['Neptune', 'Normal', 'Smurf'])
plt.show()