## Classes e funções auxiliares

In [14]:
import numpy as np
import pandas as pd

#Classes que englobam os ataques que estao nos dados de treinamento e nos dados de teste
dosAtq = ['back', 'land','neptune', 'pod','smurf', 'teardrop', 'mailbomb', 'processtable', 'udpstorm', 'apache2', 'worm']
probeAtq = ['satan', 'ipsweep', 'nmap', 'portsweep', 'mscan', 'saint']
u2rAtq = ['buffer_overflow', 'loadmodule', 'rootkit', 'perl', 'sqlattack', 'xterm', 'ps']
r2lAtq = ['guess_passwd', 'ftp_write', 'imap', 'phf', 'multihop', 'warezmaster', 'xlock', 'xsnoop', 'snmpguess', 'snmpgetattack', 'httptunnel', 'sendmail', 'named','spy','warezclient']

def findClass(lista):
    '''
    Classifica os 40 tipos de eventos em uma das 5 classes de eventos:
    normal, probe, u2r e r2l.
    '''
    eventClass = []
    for x in lista:
        if x in dosAtq: eventClass.append('dos')
        elif x in probeAtq: eventClass.append('probe')
        elif x in u2rAtq: eventClass.append('u2r')
        elif x in r2lAtq: eventClass.append('r2l')
        elif x == 'normal': eventClass.append('normal')
        else:
            eventClass.append('erro')
            print('Erro no ataque de tipo: ',x)
    return eventClass

def findExclusives(lista1,lista2):
    '''Retorna os elementos exclusivos da lista1'''
    exclusivos = []
    for elto in lista1:
        if elto not in lista2:
            exclusivos.append(elto)
    return exclusivos

class Filtro:
    def __init__(self,lista):
        self.lista = lista
        
    def contar(self):
        tipos = {}
        for elto in self.lista:
            if not elto in tipos:
                tipos[elto] = 1
            else:
                tipos[elto] += 1
        return tipos
    
class FiltroDataFrame:
    def __init__(self,df):
        self.df = df
        self.headers = df.columns
    
    def contar(self,coluna):
        filtro = Filtro(self.df[coluna])
        return filtro.contar()
    
    def tamanho(self):
        return len(dados)
    
    def listar(self,coluna):
        tipos = []
        for elto in self.df[coluna]:
            if not elto in tipos:
                tipos.append(elto)
        return tipos
    
    
class PrepDados:
    def __init__(self,dados):
        self.dados = dados
        self.y = self.findY()
        self.x = self.findX()
        
    def findX(self):
        dados = self.normalize()
        return dados
    
    def normalize(self):
        dados = self.dados.drop(columns=['event_type','event_score','event_class','protocol_type', 'service', 'flag'])
        #removemos essa coluna pois todos os valores sao iguais a zero
        dados = dados.drop(columns=['num_outbound_cmds'])
        std = dados.std()
        dados = (dados - dados.mean())/std
        return dados
        
    def findY(self):
        return self.dados['event_class']
    
    def getFeatures(self):
        return self.x.columns           
    

## Importando os Dados

Os nomes de cada coluna (feauture) não vieram diretamente no arquivo dos dados, então eles foram inseridos manualmente.

In [15]:
pd.set_option('display.max_columns', None)
pd.reset_option('max_rows')

headers = ['duration', 'protocol_type', 'service', 'flag', 'src_bytes', 'dst_bytes', 'land', 'wrong_fragment', 
           'urgent', 'hot', 'num_failed_logins', 'logged_in', 'num_compromised', 'root_shell', 'su_attempted',
           'num_root', 'num_file_creations', 'num_shells', 'num_access_files', 'num_outbound_cmds', 'is_host_login',
           'is_guest_login', 'count', 'srv_count', 'serror_rate', 'srv_serror_rate', 'rerror_rate', 'srv_rerror_rate',
           'same_srv_rate', 'diff_srv_rate', 'srv_diff_host_rate', 'dst_host_count', 'dst_host_srv_count',
           'dst_host_same_srv_rate', 'dst_host_diff_srv_rate', 'dst_host_same_src_port_rate',
           'dst_host_srv_diff_host_rate', 'dst_host_serror_rate', 'dst_host_srv_serror_rate', 'dst_host_rerror_rate',
           'dst_host_srv_rerror_rate','event_type','event_score']

dados_train = pd.read_csv("../NSL-KDD/KDDTrain+.txt",names=headers)
dados_test = pd.read_csv("../NSL-KDD/KDDTest+.txt",names=headers)

Nos <b>dados de treinamento</b> existem  <b>23</b> tipos de ataque diferentes e nos <b>dados de teste</b> são <b>38</b> tipos de ataques diferentes.

Os <b>dados de treinamento</b> possuem <b>2</b> tipos de ataque exclusivos, enquanto que os <b>dados de teste</b> possuem <b>17</b> tipos de ataque exclusivos.

Assim, somando os dados de treinamento com os de teste, temos um total de <b>40 tipos de eventos</b> (estado normal e 39 ataques).

In [51]:
filtro_dados_test = FiltroDataFrame(dados_test)
filtro_dados_train = FiltroDataFrame(dados_train)

coluna = 'event_type'
(atq_train, atq_test) = (filtro_dados_train.listar(coluna), filtro_dados_test.listar(coluna))
atq_total = atq_test + findExclusives(atq_train,atq_test)
print('Ataques no treinamento: ',len(atq_train))
print('Ataques no teste: ', len(atq_test))
print("Exclusivos do treinamento (%d): " % len(findExclusives(atq_train,atq_test)),findExclusives(atq_train,atq_test))
print("Exclusivos do teste (%d): " % len(findExclusives(atq_test,atq_train)), findExclusives(atq_test,atq_train))

Ataques no treinamento:  23
Ataques no teste:  38
Exclusivos do treinamento (2):  ['warezclient', 'spy']
Exclusivos do teste (17):  ['saint', 'mscan', 'apache2', 'snmpgetattack', 'processtable', 'httptunnel', 'ps', 'snmpguess', 'mailbomb', 'named', 'sendmail', 'xterm', 'worm', 'xlock', 'xsnoop', 'sqlattack', 'udpstorm']


### Agrupando os ataques em classes

Agrupamos os 40 tipos de eventos em 5 classes: dos, probe, u2r, r2l e normal.

In [52]:
eventClass_train = pd.DataFrame(findClass(dados_train['event_type']))
eventClass_test = pd.DataFrame(findClass(dados_test['event_type']))
dados_train['event_class'] = eventClass_train
dados_test['event_class'] = eventClass_test
dados_train

Unnamed: 0,duration,protocol_type,service,flag,src_bytes,dst_bytes,land,wrong_fragment,urgent,hot,num_failed_logins,logged_in,num_compromised,root_shell,su_attempted,num_root,num_file_creations,num_shells,num_access_files,num_outbound_cmds,is_host_login,is_guest_login,count,srv_count,serror_rate,srv_serror_rate,rerror_rate,srv_rerror_rate,same_srv_rate,diff_srv_rate,srv_diff_host_rate,dst_host_count,dst_host_srv_count,dst_host_same_srv_rate,dst_host_diff_srv_rate,dst_host_same_src_port_rate,dst_host_srv_diff_host_rate,dst_host_serror_rate,dst_host_srv_serror_rate,dst_host_rerror_rate,dst_host_srv_rerror_rate,event_type,event_score,event_class
0,0,tcp,ftp_data,SF,491,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,2,2,0.0,0.0,0.0,0.0,1.00,0.00,0.00,150,25,0.17,0.03,0.17,0.00,0.00,0.00,0.05,0.00,normal,20,normal
1,0,udp,other,SF,146,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,13,1,0.0,0.0,0.0,0.0,0.08,0.15,0.00,255,1,0.00,0.60,0.88,0.00,0.00,0.00,0.00,0.00,normal,15,normal
2,0,tcp,private,S0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,123,6,1.0,1.0,0.0,0.0,0.05,0.07,0.00,255,26,0.10,0.05,0.00,0.00,1.00,1.00,0.00,0.00,neptune,19,dos
3,0,tcp,http,SF,232,8153,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,5,5,0.2,0.2,0.0,0.0,1.00,0.00,0.00,30,255,1.00,0.00,0.03,0.04,0.03,0.01,0.00,0.01,normal,21,normal
4,0,tcp,http,SF,199,420,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,30,32,0.0,0.0,0.0,0.0,1.00,0.00,0.09,255,255,1.00,0.00,0.00,0.00,0.00,0.00,0.00,0.00,normal,21,normal
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
125968,0,tcp,private,S0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,184,25,1.0,1.0,0.0,0.0,0.14,0.06,0.00,255,25,0.10,0.06,0.00,0.00,1.00,1.00,0.00,0.00,neptune,20,dos
125969,8,udp,private,SF,105,145,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,2,2,0.0,0.0,0.0,0.0,1.00,0.00,0.00,255,244,0.96,0.01,0.01,0.00,0.00,0.00,0.00,0.00,normal,21,normal
125970,0,tcp,smtp,SF,2231,384,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,1,1,0.0,0.0,0.0,0.0,1.00,0.00,0.00,255,30,0.12,0.06,0.00,0.00,0.72,0.00,0.01,0.00,normal,18,normal
125971,0,tcp,klogin,S0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,144,8,1.0,1.0,0.0,0.0,0.06,0.05,0.00,255,8,0.03,0.05,0.00,0.00,1.00,1.00,0.00,0.00,neptune,20,dos


## Preparação dos Dados

Nessa parte é realizada uma codificação one-hot nas colunas com valores string e os valores foram normalizados.
Nos dados de treinamento, agora, existem 6 features que não existem nos dados de teste. Isso aconteceu porque quando foi feito o one-hot encoding, alguns labels não existiam nos dados de test. São os labels mostrados abaixo (todos da coluna service).

In [56]:
treinamento = PrepDados(dados_train)
teste = PrepDados(dados_test)

#Vetores x e y de treinamento prontos para uso
#Colunas com strings passaram poum one-hot encoding e valores foram normalizados
x_train, y_train = (np.array(treinamento.x), np.array(treinamento.y))

#dados para teste ainda receberao tratamento adicional
x_test, y_test = (np.array(teste.x), np.array(teste.y))

A tabela abaixo mostra os dados depois da remoção das três features categóricas e pela normalização Z.

In [57]:
treinamento.x

Unnamed: 0,duration,src_bytes,dst_bytes,land,wrong_fragment,urgent,hot,num_failed_logins,logged_in,num_compromised,root_shell,su_attempted,num_root,num_file_creations,num_shells,num_access_files,is_host_login,is_guest_login,count,srv_count,serror_rate,srv_serror_rate,rerror_rate,srv_rerror_rate,same_srv_rate,diff_srv_rate,srv_diff_host_rate,dst_host_count,dst_host_srv_count,dst_host_same_srv_rate,dst_host_diff_srv_rate,dst_host_same_src_port_rate,dst_host_srv_diff_host_rate,dst_host_serror_rate,dst_host_srv_serror_rate,dst_host_rerror_rate,dst_host_srv_rerror_rate
0,-0.110249,-0.007679,-0.004919,-0.014089,-0.089486,-0.007736,-0.095075,-0.027023,-0.809259,-0.011664,-0.036652,-0.024436,-0.012385,-0.02618,-0.01861,-0.041221,-0.002817,-0.097531,-0.717043,-0.354341,-0.637207,-0.631927,-0.374361,-0.37443,0.771280,-0.349682,-0.374558,-0.324062,-0.818887,-0.782364,-0.280281,0.069972,-0.289102,-0.639529,-0.624868,-0.224532,-0.376386
1,-0.110249,-0.007737,-0.004919,-0.014089,-0.089486,-0.007736,-0.095075,-0.027023,-0.809259,-0.011664,-0.036652,-0.024436,-0.012385,-0.02618,-0.01861,-0.041221,-0.002817,-0.097531,-0.620980,-0.368109,-0.637207,-0.631927,-0.374361,-0.37443,-1.321423,0.482199,-0.374558,0.734340,-1.035684,-1.161026,2.736841,2.367728,-0.289102,-0.639529,-0.624868,-0.387633,-0.376386
2,-0.110249,-0.007762,-0.004919,-0.014089,-0.089486,-0.007736,-0.095075,-0.027023,-0.809259,-0.011664,-0.036652,-0.024436,-0.012385,-0.02618,-0.01861,-0.041221,-0.002817,-0.097531,0.339646,-0.299272,1.602658,1.605097,-0.374361,-0.37443,-1.389663,0.038529,-0.374558,0.734340,-0.809854,-0.938283,-0.174417,-0.480195,-0.289102,1.608753,1.618949,-0.387633,-0.376386
3,-0.110249,-0.007723,-0.002891,-0.014089,-0.089486,-0.007736,-0.095075,-0.027023,1.235689,-0.011664,-0.036652,-0.024436,-0.012385,-0.02618,-0.01861,-0.041221,-0.002817,-0.097531,-0.690844,-0.313040,-0.189234,-0.184522,-0.374361,-0.37443,0.771280,-0.349682,-0.374558,-1.533663,1.258749,1.066397,-0.439076,-0.383107,0.066252,-0.572081,-0.602430,-0.387633,-0.345083
4,-0.110249,-0.007728,-0.004814,-0.014089,-0.089486,-0.007736,-0.095075,-0.027023,1.235689,-0.011664,-0.036652,-0.024436,-0.012385,-0.02618,-0.01861,-0.041221,-0.002817,-0.097531,-0.472520,0.058678,-0.637207,-0.631927,-0.374361,-0.37443,0.771280,-0.349682,-0.028179,0.734340,1.258749,1.066397,-0.439076,-0.480195,-0.289102,-0.639529,-0.624868,-0.387633,-0.376386
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
125968,-0.110249,-0.007762,-0.004919,-0.014089,-0.089486,-0.007736,-0.095075,-0.027023,-0.809259,-0.011664,-0.036652,-0.024436,-0.012385,-0.02618,-0.01861,-0.041221,-0.002817,-0.097531,0.872358,-0.037693,1.602658,1.605097,-0.374361,-0.37443,-1.184942,-0.016930,-0.374558,0.734340,-0.818887,-0.938283,-0.121485,-0.480195,-0.289102,1.608753,1.618949,-0.387633,-0.376386
125969,-0.107177,-0.007744,-0.004883,-0.014089,-0.089486,-0.007736,-0.095075,-0.027023,-0.809259,-0.011664,-0.036652,-0.024436,-0.012385,-0.02618,-0.01861,-0.041221,-0.002817,-0.097531,-0.717043,-0.354341,-0.637207,-0.631927,-0.374361,-0.37443,0.771280,-0.349682,-0.374558,0.734340,1.159384,0.977300,-0.386144,-0.447832,-0.289102,-0.639529,-0.624868,-0.387633,-0.376386
125970,-0.110249,-0.007382,-0.004823,-0.014089,-0.089486,-0.007736,-0.095075,-0.027023,1.235689,-0.011664,-0.036652,-0.024436,-0.012385,-0.02618,-0.01861,-0.041221,-0.002817,-0.097531,-0.725776,-0.368109,-0.637207,-0.631927,-0.374361,-0.37443,0.771280,-0.349682,-0.374558,0.734340,-0.773721,-0.893735,-0.121485,-0.480195,-0.289102,0.979234,-0.624868,-0.355013,-0.376386
125971,-0.110249,-0.007762,-0.004919,-0.014089,-0.089486,-0.007736,-0.095075,-0.027023,-0.809259,-0.011664,-0.036652,-0.024436,-0.012385,-0.02618,-0.01861,-0.041221,-0.002817,-0.097531,0.523039,-0.271738,1.602658,1.605097,-0.374361,-0.37443,-1.366916,-0.072388,-0.374558,0.734340,-0.972451,-1.094203,-0.174417,-0.480195,-0.289102,1.608753,1.618949,-0.387633,-0.376386


## Classificação

### Logistic Regression

In [58]:
from sklearn.linear_model import LogisticRegression
import time

t1 = time.time()

lrclass = LogisticRegression(C=0.5)
lrclass.fit(x_train,y_train)

print("Tempo: %.1f s" % (time.time() - t1))

Tempo: 11.0 s


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


In [59]:
lrclass.score(x_train,y_train)

0.953355084025942

### SVM

In [48]:
from sklearn import svm

t1 = time.time()

svmclass = svm.SVC(C=0.5,kernel="linear",verbose=True)
svmclass.fit(x_train,y_train)

print("Tempo: %.1f s" % (time.time()-t1))

[LibSVM]Tempo: 217.9 s


In [49]:
svmclass.score(x_train,y_train)

0.9635001151040302

## Avaliação

In [47]:
from sklearn.metrics import f1_score

eventClasses = ['normal','dos','probe','u2r','r2l']
print("=========== Logistic Regression ===========")
print("Acurácia: ", lrclass.score(x_test,y_test) * 100,' %')
f1_lr = f1_score(y_test, lrclass.predict(x_test), average=None, labels=eventClasses)
print(f1_lr)

Acurácia:  74.4499645138396  %
[0.78387339 0.83559967 0.73545416 0.17721519 0.00620262]


In [50]:
print("\n=========== SVM ===========")
print("Acurácia: ", svmclass.score(x_test,y_test) * 100, '%')
f1_svm = f1_score(y_test, svmclass.predict(x_test), average=None, labels=eventClasses)
print(f1_svm)


#Nos casos em que um determinado tipo de ataque NUNCA é "predicted", não e possivel calcular nem a precisao nem a revocacao,
#pois surge uma divisao por zero: precisao = (TP/(TP + FP)) e revocacao = (TP/(TP + FN)).
#Assim, surge um warning vermelho embaixo.


Acurácia:  74.06405251951739 %
[0.78100053 0.83052674 0.73331912 0.10666667 0.00204918]


## Coisas a fazer

- Mexer nos parâmetros das funções de classficação