In [None]:
import zipfile

def descompactar_zip(arquivo_zip, caminho_destino):
    """Extract every member of a ZIP archive into a destination directory.

    Args:
        arquivo_zip: The archive to read; passed straight to zipfile.ZipFile.
        caminho_destino: Directory handed to ``extractall`` as the target path.

    Prints a confirmation message once extraction has finished. Any error
    raised while opening or extracting propagates and no message is printed.
    """
    archive = zipfile.ZipFile(arquivo_zip, mode='r')
    try:
        archive.extractall(path=caminho_destino)
    finally:
        # Always release the file handle, even when extraction fails.
        archive.close()
    print("Arquivos descompactados com sucesso!")

# Variable declarations
arquivo_zip = '/content/KDD.zip'  # Replace with the path to your ZIP file
# The loading cell reads '/content/KDD/KDD/...'. Colab's filesystem is case-sensitive,
# so extracting into a relative 'kdd' folder can never produce those paths.
# NOTE(review): assumes the archive contains a top-level 'KDD/' folder - confirm.
caminho_destino = '/content/KDD'  # Replace with the folder to extract into

# Example usage
descompactar_zip(arquivo_zip, caminho_destino)


In [None]:
# Importando bibliotecas necessárias
from google.colab import drive
import pandas as pd
import numpy as np
from sklearn.preprocessing import (StandardScaler, OrdinalEncoder, LabelEncoder, MinMaxScaler, OneHotEncoder)
from keras.utils import to_categorical
from sklearn.naive_bayes import GaussianNB
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.preprocessing import Normalizer, MaxAbsScaler, RobustScaler, PowerTransformer
import matplotlib.pyplot as plt
import seaborn as sns

# Locations of the dataset files (absolute Colab paths).
train = '/content/KDD/KDD/KDDTrain+.txt'
test = '/content/KDD/KDD/KDDTest+.txt'
test21 = '/content/KDD/KDD/KDDTest-21.arff'

# Column names handed to read_csv(names=...), so their order must match the column
# order of the files: 41 feature columns followed by 'label' and 'difficulty'.
featureV = ["duration", "protocol_type", "service", "flag", "src_bytes", "dst_bytes", "land", "wrong_fragment", "urgent", "hot",
            "num_failed_logins", "logged_in", "num_compromised", "root_shell", "su_attempted", "num_root", "num_file_creations", "num_shells",
            "num_access_files", "num_outbound_cmds", "is_host_login", "is_guest_login", "count", "srv_count", "serror_rate", "srv_serror_rate",
            "rerror_rate", "srv_rerror_rate", "same_srv_rate", "diff_srv_rate", "srv_diff_host_rate", "dst_host_count", "dst_host_srv_count",
            "dst_host_same_srv_rate", "dst_host_diff_srv_rate", "dst_host_same_src_port_rate", "dst_host_srv_diff_host_rate", "dst_host_serror_rate",
            "dst_host_srv_serror_rate", "dst_host_rerror_rate", "dst_host_srv_rerror_rate", "label", "difficulty"]

# Value lists for the 'flag', 'protocol_type' and 'service' columns.
# Nothing in the visible code reads these three lists.
flagV = ['OTH', 'RSTOS0', 'SF', 'SH', 'RSTO', 'S2', 'S1', 'REJ', 'S3', 'RSTR', 'S0']
protocol_typeV = ['tcp', 'udp', 'icmp']
serviceV = ['http', 'smtp', 'finger', 'domain_u', 'auth', 'telnet', 'ftp', 'eco_i', 'ntp_u', 'ecr_i', 'other', 'private', 'pop_3', 'ftp_data',
            'rje', 'time', 'mtp', 'link', 'remote_job', 'gopher', 'ssh', 'name', 'whois', 'domain', 'login', 'imap4', 'daytime', 'ctf', 'nntp',
            'shell', 'IRC', 'nnsp', 'http_443', 'exec', 'printer', 'efs', 'courier', 'uucp', 'klogin', 'kshell', 'echo', 'discard', 'systat',
            'supdup', 'iso_tsap', 'hostnames', 'csnet_ns', 'pop_2', 'sunrpc', 'uucp_path', 'netbios_ns', 'netbios_ssn', 'netbios_dgm',
            'sql_net', 'vmnet', 'bgp', 'Z39_50', 'ldap', 'netstat', 'urh_i', 'X11', 'urp_i', 'pm_dump', 'tftp_u', 'tim_i', 'red_i', 'icmp',
            'http_2784', 'harvest', 'aol', 'http_8001']

# Flat list of label names: 'normal' first, then the same attack names that
# multiclass_attack groups below, written here WITHOUT a trailing dot.
binary_attack = ['normal', 'ipsweep', 'nmap', 'portsweep', 'satan', 'saint', 'mscan', 'back', 'land', 'neptune', 'pod', 'smurf',
                 'teardrop', 'apache2', 'udpstorm', 'processtable', 'mailbomb', 'buffer_overflow', 'loadmodule', 'perl', 'rootkit',
                 'xterm', 'ps', 'sqlattack', 'ftp_write', 'guess_passwd', 'imap', 'multihop', 'phf', 'spy', 'warezclient',
                 'warezmaster', 'snmpgetattack', 'named', 'xlock', 'xsnoop', 'sendmail', 'httptunnel', 'worm', 'snmpguess']

# Attack names grouped by category (probe / dos / u2r / r2l).
# NOTE(review): these names carry a trailing '.', unlike binary_attack, and 'normal'
# maps to a plain string instead of a list. Nothing in the visible code reads this
# dict - confirm the intended label format before relying on it.
multiclass_attack = { 'normal': 'normal',
                      'probe': ['ipsweep.', 'nmap.', 'portsweep.', 'satan.', 'saint.', 'mscan.'],
                      'dos': ['back.', 'land.', 'neptune.', 'pod.', 'smurf.', 'teardrop.', 'apache2.', 'udpstorm.', 'processtable.', 'mailbomb.'],
                      'u2r': ['buffer_overflow.', 'loadmodule.', 'perl.', 'rootkit.', 'xterm.', 'ps.', 'sqlattack.'],
                      'r2l': ['ftp_write.', 'guess_passwd.', 'imap.', 'multihop.', 'phf.', 'spy.', 'warezclient.', 'warezmaster.', 'snmpgetattack.',
                              'named.', 'xlock.', 'xsnoop.', 'sendmail.', 'httptunnel.', 'worm.', 'snmpguess.']}

# Load the data. names=featureV supplies the column names, so the first line of each
# file is parsed as data; on_bad_lines='skip' drops lines with too many fields.
train_data = pd.read_csv(train, names=featureV, sep=',', on_bad_lines='skip')
test_data = pd.read_csv(test, names=featureV, sep=',', on_bad_lines='skip')

# KDDTest-21 is read from an ARFF file, whose header lines (@relation, @attribute,
# @data) all start with '@'. comment='@' makes pandas ignore every such line, so header
# lines beyond the first three are no longer parsed as data rows. skiprows=3 is kept so
# a file that loads correctly with it keeps loading identically.
test_21 = pd.read_csv(test21, names=featureV, sep=',', on_bad_lines='skip', skiprows=3, comment='@')

# Remove unwanted services from the training and test data.
# NOTE(review): presumably these rows are dropped so that train and test expose the
# same set of 'service' categories - confirm. test_21 is not filtered.
services_dropped_from_test = ['printer', 'rje']
services_dropped_from_train = ['aol', 'harvest', 'http_2784', 'http_8001', 'red_i', 'urh_i'] + services_dropped_from_test

# .copy() makes the filtered frames independent objects, so later column assignments
# on them do not trigger pandas' SettingWithCopyWarning.
train_data = train_data[~train_data['service'].isin(services_dropped_from_train)].copy()
test_data = test_data[~test_data['service'].isin(services_dropped_from_test)].copy()

# Helper that draws bar charts for data exploration
def bar_graph(feature, data=None):
    """Draw a bar chart of the value counts of one column.

    Args:
        feature: Name of the column to summarise.
        data: DataFrame to read the column from. Defaults to the module-level
            ``train_data``.

    Every call draws on its own axes. If the current figure is still empty
    (for example one the caller just created with ``plt.figure(figsize=...)``)
    it is used; otherwise a new figure is opened. Without this, pandas plots
    onto the current axes and consecutive calls in the same cell overlap.
    """
    frame = train_data if data is None else data

    fig = plt.gcf()  # creates a figure when none exists yet
    if fig.axes:
        # The current figure already holds a chart; start a fresh one.
        fig = plt.figure()
    ax = fig.add_subplot(111)

    frame[feature].value_counts().plot(kind="bar", ax=ax)
    ax.set_title(feature)
    ax.set_ylabel("count")

# Visualise the features. A new figure is opened before every chart so the bar
# charts do not end up drawn on top of each other on a shared set of axes.
# 'service' has many categories, hence the wider figure; None means default size.
for column, figure_size in [('protocol_type', None),
                            ('service', (15, 3)),
                            ('flag', None),
                            ('logged_in', None),
                            ('label', None)]:
    plt.figure(figsize=figure_size)
    bar_graph(column)

# Data pre-processing function
def preprocessing(data, cls, df):
    """Turn a raw frame into min-max scaled features and one-hot labels.

    Args:
        data: DataFrame with a 'label' column and, optionally, a 'difficulty'
            column. It is not modified.
        cls: 'binary' maps normal traffic ('normal', 'normal.' or 0) to class 0
            and every other label to class 1. Any other value only replaces
            the normal labels with 0 and one-hot encodes the label values as
            they are.
        df: 'train' fits the feature encoding (dummy columns and MinMaxScaler)
            on ``data`` and remembers it on the function object. Any other
            value reuses the encoding of the most recent 'train' call, so the
            result has the same columns and scaling as the training matrix.
            Categories unseen in training are dropped and missing ones are
            filled with 0. If no 'train' call has happened yet, the encoding
            is fitted on ``data`` itself.

    Returns:
        (x, y): ``x`` is the array produced by MinMaxScaler; ``y`` is a
        float32 one-hot DataFrame. For cls == 'binary' it always has the two
        columns 0 and 1, even when one class is absent from ``data``.
    """
    # --------- Categorise the labels ---------
    if cls == 'binary':
        # Everything that is not normal traffic is an attack, so label spellings
        # missing from a lookup list can never be counted as normal by accident.
        classes = data['label'].map(lambda value: 0 if value in (0, 'normal', 'normal.') else 1)
        y = pd.DataFrame({0: classes == 0, 1: classes == 1}).astype('float32')
    else:
        y = pd.get_dummies(data['label'].replace(['normal.', 'normal'], 0), dtype='float32')

    # --------- Encode the features ---------
    x = data.drop(columns=['label', 'difficulty'], errors='ignore')

    # One-hot encode every non-numeric column.
    categorical_columns = x.select_dtypes(exclude=['number', 'bool']).columns
    x = pd.get_dummies(x, columns=categorical_columns, dtype=float)

    # --------- Scale the features ---------
    fitted = getattr(preprocessing, '_fitted', None)
    if df == 'train' or fitted is None:
        scaler = MinMaxScaler(feature_range=(0, 1))
        feature_columns = x.columns
        x = scaler.fit_transform(x)
        if df == 'train':
            # Remember the training encoding for later non-train calls.
            preprocessing._fitted = (feature_columns, scaler)
    else:
        # Reuse the training columns and scaler: fitting them again here would
        # give a different column layout and leak test statistics into scaling.
        feature_columns, scaler = fitted
        x = scaler.transform(x.reindex(columns=feature_columns, fill_value=0))

    return x, y



# Pre-process the data
x_train, Y_train = preprocessing(train_data, cls='binary', df='train')
x_test, Y_test = preprocessing(test_data, cls='binary', df='test')
x_21_test, y_21_test = preprocessing(test_21, cls='binary', df='test21')

# Fail fast on a feature-count mismatch: the network's input width is taken from
# x_train, so a differently shaped x_test would otherwise only fail at predict(),
# after the whole training run.
n_features = x_train.shape[1]
if x_test.shape[1] != n_features:
    raise ValueError(f"x_test has {x_test.shape[1]} feature columns but x_train has {n_features}")
if x_21_test.shape[1] != n_features:
    print(f"Warning: x_21_test has {x_21_test.shape[1]} feature columns but x_train has {n_features}")

# Add a trailing channel axis of size 1 so each row becomes (features, 1) for the CNN
x_train = np.expand_dims(x_train, axis=-1)
x_test = np.expand_dims(x_test, axis=-1)
x_21_test = np.expand_dims(x_21_test, axis=-1)

# Build the CNN model
from keras.models import Sequential
from keras.layers import Convolution1D, Dense, Dropout, Flatten, MaxPooling1D

model = Sequential([
    # First convolution stage; the input is one channel per feature column
    Convolution1D(32, 3, padding="same", activation="relu", input_shape=(x_train.shape[1], 1)),
    MaxPooling1D(pool_size=4),
    Dropout(0.5),

    # Second convolution stage
    Convolution1D(64, 3, padding="same", activation="relu"),
    MaxPooling1D(pool_size=2),
    Dropout(0.5),

    # Classifier head: two softmax outputs, one per one-hot label column
    Flatten(),
    Dense(256, activation="relu"),
    Dropout(0.5),
    Dense(2, activation="softmax"),
])

# Compile the model
model.compile(optimizer='adam', loss='categorical_crossentropy', metrics=['accuracy'])

# Train the model
model.fit(x_train, Y_train, epochs=100, batch_size=128)

# Evaluate the model: turn the softmax outputs into class indices and compare them
# with the true classes (argmax over the one-hot label columns of Y_test).
pred = model.predict(x_test)
y_pred = np.argmax(pred, axis=1)
y_true = np.argmax(np.asarray(Y_test), axis=1)
test_accuracy = float(np.mean(y_pred == y_true))
print(f"Test accuracy: {test_accuracy:.4f}")

