In [18]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.impute import SimpleImputer
from sklearn.compose import ColumnTransformer
from sklearn.metrics import roc_auc_score, roc_curve, confusion_matrix
from sklearn.pipeline import Pipeline
from sklearn.metrics import classification_report

In [19]:
import kagglehub

# Download latest version
path = kagglehub.dataset_download("azalhowaide/iot-dataset-for-intrusion-detection-systems-ids")

print("Path to dataset files:", path)

Path to dataset files: /root/.cache/kagglehub/datasets/azalhowaide/iot-dataset-for-intrusion-detection-systems-ids/versions/2


In [20]:
import os

# Supondo que o arquivo CSV esteja diretamente em `path`
csv_file = os.path.join(path, 'BoTNeTIoT-L01-v2.csv')
df = pd.read_csv(csv_file)

# Exiba as primeiras linhas do DataFrame
print(df.head())

   MI_dir_L0.1_weight  MI_dir_L0.1_mean  MI_dir_L0.1_variance  H_L0.1_weight  \
0            1.000000         98.000000          0.000000e+00       1.000000   
1            1.931640         98.000000          1.818989e-12       1.931640   
2            2.904273         86.981750          2.311822e+02       2.904273   
3            3.902546         83.655268          2.040614e+02       3.902546   
4            4.902545         81.685828          1.775746e+02       4.902545   

   H_L0.1_mean  H_L0.1_variance  HH_L0.1_weight  HH_L0.1_mean   HH_L0.1_std  \
0    98.000000     0.000000e+00         1.00000          98.0  0.000000e+00   
1    98.000000     1.818989e-12         1.93164          98.0  1.348699e-06   
2    86.981750     2.311822e+02         1.00000          66.0  0.000000e+00   
3    83.655268     2.040614e+02         1.00000          74.0  0.000000e+00   
4    81.685828     1.775746e+02         2.00000          74.0  9.536743e-07   

   HH_L0.1_magnitude  ...  HpHp_L0.1_mean  H

In [21]:
df = df.loc[df['Attack'] != 'gafgyt']
df = df.drop(columns=["Attack", 'Attack_subType'])

In [22]:
categorical_cols = ['Device_Name']
numeric_cols = ['MI_dir_L0.1_weight', 'MI_dir_L0.1_mean', 'MI_dir_L0.1_variance',
       'H_L0.1_weight', 'H_L0.1_mean', 'H_L0.1_variance', 'HH_L0.1_weight',
       'HH_L0.1_mean', 'HH_L0.1_std', 'HH_L0.1_magnitude', 'HH_L0.1_radius',
       'HH_L0.1_covariance', 'HH_L0.1_pcc', 'HH_jit_L0.1_weight',
       'HH_jit_L0.1_mean', 'HH_jit_L0.1_variance', 'HpHp_L0.1_weight',
       'HpHp_L0.1_mean', 'HpHp_L0.1_std', 'HpHp_L0.1_magnitude',
       'HpHp_L0.1_radius', 'HpHp_L0.1_covariance', 'HpHp_L0.1_pcc']

In [23]:
df = df.drop_duplicates()
df = df.reset_index(drop=True)

In [24]:
# Número de amostras desejadas no conjunto balanceado
n_samples = 2000

# Calcule o número de amostras por classe
num_classes = df['label'].nunique()  # substitua 'target_column' pelo nome da sua coluna alvo
samples_per_class = n_samples // num_classes

# Amostre cada classe
df = (
    df.groupby('label', group_keys=False)  # substitua 'target_column' pelo nome da coluna alvo
    .apply(lambda x: x.sample(n=samples_per_class, random_state=42))
)

# Verifique o número de amostras e a distribuição
print(df['label'].value_counts())

label
0    1000
1    1000
Name: count, dtype: int64


  .apply(lambda x: x.sample(n=samples_per_class, random_state=42))


In [25]:
X = df.drop(['label'], axis=1)
y = df['label']

In [26]:
preprocessor = ColumnTransformer(
    transformers=[
        ('num', StandardScaler(), numeric_cols),
        ('cat', OneHotEncoder(drop='first'), categorical_cols)  # 'drop="first"' evita a armadilha da variável dummy
    ])

In [27]:
# Aplique a transformação usando o preprocessor
X = preprocessor.fit_transform(X)

In [28]:
from sklearn.feature_selection import SelectKBest, f_classif
selector = SelectKBest(score_func=f_classif, k=10)
X_new = selector.fit_transform(X, y)

In [29]:
X_train_val, X_test, y_train_val, y_test = train_test_split(X_new, y, test_size=0.15, random_state=42)

# 2. Dividir o restante em 70% treino e 15% validação
X_train, X_val, y_train, y_val = train_test_split(X_train_val, y_train_val, test_size=0.1765, random_state=42)

# Exibindo os tamanhos dos conjuntos para confirmação
print(f"Tamanho de X_train: {len(X_train)}")   # Aproximadamente 70%
print(f"Tamanho de X_val: {len(X_val)}")       # Aproximadamente 15%
print(f"Tamanho de X_test: {len(X_test)}")     # Aproximadamente 15%

Tamanho de X_train: 1399
Tamanho de X_val: 301
Tamanho de X_test: 300


In [30]:
from sklearn.linear_model import SGDClassifier
def train_val_logistic(X_train, X_test, y_train, y_test):
    logistic = SGDClassifier(max_iter=50, loss='log_loss', learning_rate = 'constant',  eta0 = 0.1, random_state = 0)
    logistic.fit(X_train, y_train)
    y_pred = logistic.predict(X_val)
    report = classification_report(y_val, y_pred, digits=5)
    return report

In [31]:
def train_test_logistic(X_train, X_test, y_train, y_test):
    logistic = SGDClassifier(max_iter=100, loss='log_loss',learning_rate = 'constant', eta0 = 0.1, random_state = 0)
    logistic.fit(X_train, y_train)
    y_pred = logistic.predict(X_test)
    report = classification_report(y_test, y_pred, digits=5)
    return report

In [32]:
# Treinando e testando o modelo de Regressão Logística e calculando o relatório de classificação
report = train_val_logistic(X_train, X_val, y_train, y_val)
print(f"Conjunto de dados: IDS")
print(f"Relatório de Classificação:\n{report}\n")

Conjunto de dados: IDS
Relatório de Classificação:
              precision    recall  f1-score   support

           0    1.00000   1.00000   1.00000       154
           1    1.00000   1.00000   1.00000       147

    accuracy                        1.00000       301
   macro avg    1.00000   1.00000   1.00000       301
weighted avg    1.00000   1.00000   1.00000       301




In [33]:
# Treinando e testando o modelo de Regressão Logística e calculando o relatório de classificação
report = train_test_logistic(X_train, X_test, y_train, y_test)
print(f"Conjunto de dados: IDS")
print(f"Relatório de Classificação:\n{report}\n")

Conjunto de dados: IDS
Relatório de Classificação:
              precision    recall  f1-score   support

           0    1.00000   1.00000   1.00000       149
           1    1.00000   1.00000   1.00000       151

    accuracy                        1.00000       300
   macro avg    1.00000   1.00000   1.00000       300
weighted avg    1.00000   1.00000   1.00000       300




In [34]:
# Função para converter valores numéricos, substituindo '-' por '~' para valores negativos
def convert_negative_to_tilde(value):
    if value < 0:
        return f"~{abs(value)}"
    return f"{value}"

# Função para converter X para o formato CPN, aplicando a transformação nos negativos
def convert_to_cpn_format(X):
    X_list = X.tolist() if isinstance(X, np.ndarray) else X.values.tolist()
    X_cpn = [[convert_negative_to_tilde(val) for val in row] for row in X_list]
    return "1`[" + ", ".join(["[" + ",".join(row) + "]" for row in X_cpn]) + "]"

# Função para converter os rótulos (y) para o formato CPN, aplicando a transformação nos negativos
def convert_labels_to_cpn_format(y):
    y_cpn = [convert_negative_to_tilde(val) for val in y]
    return "1`[" + ",".join(y_cpn) + "]"

# Convertendo os conjuntos para o formato CPN Tools
X_train_cpn = convert_to_cpn_format(X_train)
X_val_cpn = convert_to_cpn_format(X_val)
X_test_cpn = convert_to_cpn_format(X_test)

y_train_cpn = convert_labels_to_cpn_format(y_train)
y_val_cpn = convert_labels_to_cpn_format(y_val)
y_test_cpn = convert_labels_to_cpn_format(y_test)

# Exibindo os conjuntos para verificação
print("\nConjunto de Treino em formato de fichas do CPN Tools:")
print(X_train_cpn)
print("\nConjunto de Validação em formato de fichas do CPN Tools:")
print(X_val_cpn)
print("\nConjunto de Teste em formato de fichas do CPN Tools:")
print(X_test_cpn)

print("\nRótulos do Conjunto de Treino em formato de lista do CPN Tools:")
print(y_train_cpn)
print("\nRótulos do Conjunto de Validação em formato de lista do CPN Tools:")
print(y_val_cpn)
print("\nRótulos do Conjunto de Teste em formato de lista do CPN Tools:")
print(y_test_cpn)

# Salvando os conjuntos em arquivos de texto
with open("X_train_cpn_anomaly.txt", "w") as file:
    file.write(X_train_cpn)

with open("X_val_cpn_anomaly.txt", "w") as file:
    file.write(X_val_cpn)

with open("X_test_cpn_anomaly.txt", "w") as file:
    file.write(X_test_cpn)

with open("y_train_cpn_anomaly.txt", "w") as file:
    file.write(y_train_cpn)

with open("y_val_cpn_anomaly.txt", "w") as file:
    file.write(y_val_cpn)

with open("y_test_cpn_anomaly.txt", "w") as file:
    file.write(y_test_cpn)


Conjunto de Treino em formato de fichas do CPN Tools:
1`[[~0.9067468985958937,~0.3652920033172919,~0.23357221416002205,~0.9067468985958941,~0.3652920033172919,~0.23357221416002943,~0.6013448933134934,~0.3090124355084038,~0.6013448933134934,~0.5259398763379356], [1.3502564021817807,1.2076418634985195,1.2650196894526025,1.350256402181781,1.2076418634985195,1.2650196894526025,1.6014696438394633,1.7420944082218113,1.6014696438394633,~0.5259399001118386], [1.3965184651184355,1.2019253722900842,1.2698826047478298,1.3965184651184355,1.2019253722900842,1.2698826047478298,~0.6020201756541558,~0.6722901507922378,~0.6020201756541558,1.9049208463528244], [0.17817809839608126,0.6837761606794379,1.4747185825724949,0.17817809839608115,0.6837761606794379,1.4747185825724962,~0.6005237569600423,~0.6164517214358033,~0.6005237569600423,~0.46595709870815355], [1.031347092812155,1.321124672436672,1.1574270064228545,1.031347092812155,1.321124672436672,1.1574270064228542,1.3910222017257932,1.7419411335985426