<a href="https://colab.research.google.com/github/jibin1018/-/blob/main/bart.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


# Importing Libraries

In [2]:
import numpy as np  # for array
import pandas as pd  # for csv files and dataframe
import matplotlib.pyplot as plt  # for plotting
import seaborn as sns  # plotting
from scipy import stats

import pickle  # To load data int disk
from prettytable import PrettyTable  # To print in tabular format

import warnings
warnings.filterwarnings("ignore")

from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import LabelEncoder, OneHotEncoder

from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression, SGDClassifier
from sklearn.svm import LinearSVC
from sklearn.naive_bayes import MultinomialNB
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
import xgboost as xgb

from sklearn.metrics import accuracy_score, confusion_matrix, make_scorer
from sklearn.metrics import auc, f1_score, roc_curve
from sklearn.model_selection import GridSearchCV, RandomizedSearchCV
from sklearn.model_selection import cross_validate, cross_val_predict

%matplotlib inline

# Reading data

In [3]:
# Creating a empty dict, where I will save all parameters required for test data transformation

saved_dict = {}

In [4]:
# Reading datasets
dfs = []
for i in range(1,5):
    path = '/content/UNSW-NB15_1.csv'  # There are 4 input csv files
    dfs.append(pd.read_csv(path.format(i), header = None))
all_data = pd.concat(dfs).reset_index(drop=True)  # Concat all to a single df

In [5]:
# This csv file contains names of all the features
df_col = pd.read_csv('/content/NUSW-NB15_features.csv', encoding='ISO-8859-1')

In [6]:
# Making column names lower case, removing spaces
df_col['Name'] = df_col['Name'].apply(lambda x: x.strip().replace(' ', '').lower())

In [7]:
# Renaming our dataframe with proper column names
all_data.columns = df_col['Name']

In [8]:
# Saving useful info, later this will be used to transform raw test data
saved_dict['columns'] = df_col['Name'][df_col['Name']!='label'].tolist()

In [9]:
del df_col

In [10]:
all_data.shape

(2800004, 49)

In [11]:
all_data.head()

Name,srcip,sport,dstip,dsport,proto,state,dur,sbytes,dbytes,sttl,...,ct_ftp_cmd,ct_srv_src,ct_srv_dst,ct_dst_ltm,ct_src_ltm,ct_src_dport_ltm,ct_dst_sport_ltm,ct_dst_src_ltm,attack_cat,label
0,59.166.0.0,1390,149.171.126.6,53,udp,CON,0.001055,132,164,31,...,0,3,7,1,3,1,1,1,,0
1,59.166.0.0,33661,149.171.126.9,1024,udp,CON,0.036133,528,304,31,...,0,2,4,2,3,1,1,2,,0
2,59.166.0.6,1464,149.171.126.7,53,udp,CON,0.001119,146,178,31,...,0,12,8,1,2,2,1,1,,0
3,59.166.0.5,3593,149.171.126.5,53,udp,CON,0.001209,132,164,31,...,0,6,9,1,1,1,1,1,,0
4,59.166.0.3,49664,149.171.126.0,53,udp,CON,0.001169,146,178,31,...,0,7,9,1,1,1,1,1,,0


# 데이터 로드 및 변환

In [12]:
path = '/content/UNSW-NB15_1.csv'
df = pd.read_csv(path)

# 각 행의 모든 열 데이터를 공백으로 구분된 문자열로 변환
train_data = df.apply(lambda row: ' '.join(row.astype(str)), axis=1)

# 변환된 데이터 출력
print("Train Data:")
print(train_data)

# 파일로 저장하고 싶은 경우
train_data.to_csv('train_data.csv', index=False, header=False)

if 'text' not in df.columns:
    df['text'] = df.iloc[:,0]
#데이터 분리
X = df['text']
if 'attack_cat' in df.columns:
    y = df['attack_cat']
else:
  y = pd.Series([0] * len(df), index=df.index)

  # 데이터셋을 훈련 세트와 테스트 세트로 나누기
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)

#데이터 분리
print("Training set size:", len(X_train))
print("Test set size:", len(X_test))

Train Data:
0         59.166.0.0 33661 149.171.126.9 1024 udp CON 0....
1         59.166.0.6 1464 149.171.126.7 53 udp CON 0.001...
2         59.166.0.5 3593 149.171.126.5 53 udp CON 0.001...
3         59.166.0.3 49664 149.171.126.0 53 udp CON 0.00...
4         59.166.0.0 32119 149.171.126.9 111 udp CON 0.0...
                                ...                        
699995    59.166.0.8 12520 149.171.126.6 31010 tcp FIN 0...
699996    59.166.0.0 18895 149.171.126.9 80 tcp FIN 1.40...
699997    59.166.0.0 30103 149.171.126.5 5190 tcp FIN 0....
699998    59.166.0.6 30388 149.171.126.5 111 udp CON 0.0...
699999    59.166.0.0 6055 149.171.126.5 54145 tcp FIN 0....
Length: 700000, dtype: object
Training set size: 490000
Test set size: 210000


# 분류기 설정 및 학습

In [None]:
!pip install transformers
from transformers import AutoTokenizer, AutoModelForSequenceClassification

# Load the BART tokenizer and model
tokenizer = AutoTokenizer.from_pretrained("facebook/bart-base")
model = AutoModelForSequenceClassification.from_pretrained("facebook/bart-base", num_labels=len(y.unique()))

batch_size = 32

def batch_tokenize(text_list, batch_size=32):
    encodings = {'input_ids': [], 'attention_mask': []}
    for i in range(0, len(text_list), batch_size):
        batch_encodings = tokenizer(
            text_list[i:i+batch_size], truncation=True, padding='max_length', max_length=128)
        encodings['input_ids'].extend(batch_encodings['input_ids'])
        encodings['attention_mask'].extend(batch_encodings['attention_mask'])
    return encodings

train_encodings = batch_tokenize(X_train.tolist(), batch_size=batch_size)
test_encodings = batch_tokenize(X_test.tolist(), batch_size=batch_size)



Some weights of BartForSequenceClassification were not initialized from the model checkpoint at facebook/bart-base and are newly initialized: ['classification_head.dense.bias', 'classification_head.dense.weight', 'classification_head.out_proj.bias', 'classification_head.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [None]:
!pip install torch
import torch

# Prepare dataset class
class Dataset(torch.utils.data.Dataset):
    def __init__(self, encodings, labels):
        self.encodings = encodings
        self.labels = labels
    def __getitem__(self, idx):
        item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}
        item['labels'] = torch.tensor(self.labels[idx], dtype=torch.float32)
        return item
    def __len__(self):
        return len(self.labels)

# Create the datasets
train_dataset = Dataset(train_encodings, y_train.tolist())
test_dataset = Dataset(test_encodings, y_test.tolist())

In [None]:
!pip install wandb -qqq
!pip install transformers

import wandb
from transformers import TrainingArguments, Trainer, AutoTokenizer, AutoModelForSequenceClassification
import wandb
import os
import torch



# Set up training arguments with optimizations and disabled wandb logging
training_args = TrainingArguments(
    output_dir='./results',
    evaluation_strategy="epoch",
    learning_rate=2e-5,
    per_device_train_batch_size=1,
    per_device_eval_batch_size=1,
    num_train_epochs=2,
    weight_decay=0.01,
    fp16=True,
    report_to="none"  # wandb 로깅 비활성화
)

import random
from torch.utils.data import Subset

sample_size = int(len(train_dataset) * 0.001)
random_indices = random.sample(range(len(train_dataset)), sample_size)
train_dataset = Subset(train_dataset, random_indices)

# Initialize Trainer with optimized settings
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=test_dataset
)

# Start training
trainer.train()

In [None]:
from sklearn.metrics import accuracy_score, classification_report

# Evaluate the model using the test dataset
eval_results = trainer.evaluate()

# 직접적으로 예측값과 정확도를 계산하려면 다음과 같이 진행하세요
predictions = trainer.predict(test_dataset)
pred_labels = predictions.predictions.argmax(-1)  # 가장 높은 확률을 가진 라벨을 선택
true_labels = y_test.tolist()

# 정확도 계산
accuracy = accuracy_score(true_labels, pred_labels)
print(f"Accuracy: {accuracy:.4f}")

# 분류 리포트를 출력하여 더 많은 평가 지표를 확인할 수 있습니다
print(classification_report(true_labels, pred_labels))
