<a href="https://colab.research.google.com/github/juooo1117/cyber_security_project/blob/main/cyber_security_project_sentencepiece.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# **데이터 로드**

In [1]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [48]:
import pandas as pd
import re
import csv
import numpy as np

In [49]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score,f1_score

# **데이터 전처리**

In [50]:
df = pd.read_csv('/content/drive/MyDrive/A_Track_trainset.csv')

In [51]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 45000 entries, 0 to 44999
Data columns (total 3 columns):
 #   Column        Non-Null Count  Dtype 
---  ------        --------------  ----- 
 0   Log_Number    45000 non-null  int64 
 1   payload       45000 non-null  object
 2   label_action  45000 non-null  object
dtypes: int64(1), object(2)
memory usage: 1.0+ MB


In [52]:
df.head(20)

Unnamed: 0,Log_Number,payload,label_action
0,0,"GET /forum1_professionnel.asp?n=/.\\\""./.\\\""....",System_Cmd_Execution
1,1,POST /owa/auth/logon.aspx?replaceCurrent=1%22%...,System_Cmd_Execution
2,2,GET /goods/goods_search?display_type=list&arr_...,SQL_Injection
3,3,GET / HTTP/1.1\r\n\r\n,HOST_Scan
4,4,GET /sub_04_1_read.php?page=1&id=31%29%3BSELEC...,System_Cmd_Execution
5,5,POST /member/login HTTP/1.1\r\nContent-Length:...,Vulnerability_Scan
6,6,GET /board/?category=&gdviewer=1&goods_seq=69&...,System_Cmd_Execution
7,7,GET /board/board_list?code=welfare&search_type...,SQL_Injection
8,8,GET /vendor/phpunit/phpunit/src/Util/PHP/eval-...,HOST_Scan
9,9,GET / HTTP/1.1\r\n\r\n,HOST_Scan


In [53]:
df.columns

Index(['Log_Number', 'payload', 'label_action'], dtype='object')

In [54]:
# label 종류('label_action') -> 총 9개
df['label_action'].unique()

array(['System_Cmd_Execution', 'SQL_Injection', 'HOST_Scan',
       'Vulnerability_Scan', 'Path_Disclosure', 'Cross_Site_Scripting',
       'Leakage_Through_NW', 'Directory_Indexing',
       'Automatically_Searching_Infor'], dtype=object)

In [55]:
# label category의 각각 값 개수
df['label_action'].value_counts()

Vulnerability_Scan               16867
System_Cmd_Execution              9807
HOST_Scan                         6315
Path_Disclosure                   4775
SQL_Injection                     3395
Cross_Site_Scripting              1348
Automatically_Searching_Infor     1119
Leakage_Through_NW                 956
Directory_Indexing                 418
Name: label_action, dtype: int64

In [56]:
# Log_Number column 삭제
df = df.drop('Log_Number', axis = 1)
df.head(20)

Unnamed: 0,payload,label_action
0,"GET /forum1_professionnel.asp?n=/.\\\""./.\\\""....",System_Cmd_Execution
1,POST /owa/auth/logon.aspx?replaceCurrent=1%22%...,System_Cmd_Execution
2,GET /goods/goods_search?display_type=list&arr_...,SQL_Injection
3,GET / HTTP/1.1\r\n\r\n,HOST_Scan
4,GET /sub_04_1_read.php?page=1&id=31%29%3BSELEC...,System_Cmd_Execution
5,POST /member/login HTTP/1.1\r\nContent-Length:...,Vulnerability_Scan
6,GET /board/?category=&gdviewer=1&goods_seq=69&...,System_Cmd_Execution
7,GET /board/board_list?code=welfare&search_type...,SQL_Injection
8,GET /vendor/phpunit/phpunit/src/Util/PHP/eval-...,HOST_Scan
9,GET / HTTP/1.1\r\n\r\n,HOST_Scan


In [57]:
# 중복값제거
# payload 가 같으나 label_action 이 다른 데이터 -> 해당 payload의 label_action별 카운트 중 빈도가 높은 label_action으로 변경
#                                                                        카운트가 같다면 전체 label_action 별 카운트 빈도가 높은 label_action으로 변경

drop_df = df.drop_duplicates()
x = drop_df[drop_df.duplicated(['payload'], keep=False)].drop_duplicates(subset = ['payload'])['payload']

for i in x:

    v = df[df['payload'] == i]['label_action'].mode().values

    if len(v) == 1:
        df.loc[(df['payload'] == i), 'label_action'] = v[0]

    else:
        fvc = 0
        for j in df[df['payload'] == i]['label_action'].unique():
            vc = df['label_action'].value_counts().loc[j]
            if vc > fvc :
                fvc = vc
                m = j
            else :
                pass
        df.loc[(df['payload'] == i), 'label_action'] = m

In [58]:
# 각 label_action 값 변화된 것 확인 -> 총 개수는 동일(45000개)
df['label_action'].value_counts()

Vulnerability_Scan               16882
System_Cmd_Execution              9797
HOST_Scan                         6315
Path_Disclosure                   4771
SQL_Injection                     3394
Cross_Site_Scripting              1348
Automatically_Searching_Infor     1119
Leakage_Through_NW                 956
Directory_Indexing                 418
Name: label_action, dtype: int64

In [59]:
# label_action: Vulnerability_Scan 인 값들 (취약한 값들 & 제일 값 많음 -> 맨 나중에??)
# 나머지 label_action category 별로도 payload 값을 확인했으나, 기준으로 삼을만한 패턴이 보이지 않음

df.loc[df['label_action'] == 'Vulnerability_Scan', ['label_action','payload']]

Unnamed: 0,label_action,payload
5,Vulnerability_Scan,POST /member/login HTTP/1.1\r\nContent-Length:...
10,Vulnerability_Scan,GET /mod/liens/index.php?config[pathMod]=http:...
16,Vulnerability_Scan,GET /postnuke/html/index.php?name=forums&file=...
19,Vulnerability_Scan,GET /goods/goods_search?display_type=list&arr_...
28,Vulnerability_Scan,GET /library/translation.inc.php?GLOBALS[srcdi...
...,...,...
44984,Vulnerability_Scan,GET /admin/login.php HTTP/1.0\r\nHost: 10.0.17...
44986,Vulnerability_Scan,GET /goods/goods_search?display_type=list&arr_...
44990,Vulnerability_Scan,GET /goods/goods_search?display_type=list&arr_...
44994,Vulnerability_Scan,POST /controller/board.php HTTP/1.1\r\nHost: l...


# **Vocab dictionary 만들기 (sentencepiece 활용)**

In [60]:
!pip3 install sentencepiece



In [62]:
import sentencepiece as spm

In [63]:
# payload 데이터만 .txt 파일로 저장
with open(r'/content/drive/MyDrive/payload.txt', 'w', encoding = 'utf-8') as f:
    for line in df.payload.values:
        try:
            f.write(line+'\n')
        except TypeError as TE:
            print(line, TE)

In [64]:
# payload 전체가 .txt 파일로 저장된 것 확인
with open(r'/content/drive/MyDrive/payload.txt', 'r', encoding = 'utf-8') as f:
    nsmc_text = f.read().split('\n')
print(nsmc_text[0])

GET /forum1_professionnel.asp?n=/.\\\"./.\\\"./.\\\"./.\\\"./.\\\"./boot.ini&amp;nn=100&amp;page=1|234|800a0bcd|Either_BOF_or_EOF_is_True__or_the_current_record_has_been_deleted._Requested_operation_requires_a_current_record. HTTP/1.1\r\nConnection: Keep-Alive\r\nUser-Agent: Mozilla/5.00 (Nikto/2.1.6) (Evasions:None) (Test:002421)\r\nContent-Type: application/x-www-form-urlencoded\r\nContent-Length: 36\r\nHost: 10.0.17.21\r\n\r\n<!--#include virtual=\"/indomainabc063.jsp\"-->


In [65]:
# google sentencepiece 사용하기(토큰화)
input_file = r'/content/drive/MyDrive/payload.txt'

prefix = 'payload'
vocab_size = 32000
model_type = 'bpe'
character_coverage  = 1.0
max_sentence_length = 9999

In [66]:
templates = '--input={} --model_prefix={} --vocab_size={} --model_type={} --character_coverage={} --max_sentence_length={}'
cmd = templates.format(input_file, prefix, vocab_size, model_type, character_coverage, max_sentence_length)
cmd

'--input=/content/drive/MyDrive/payload.txt --model_prefix=payload --vocab_size=32000 --model_type=bpe --character_coverage=1.0 --max_sentence_length=9999'

In [67]:
# SentencePiece를 설정한 값들로(cmd) 훈련시켜서 단어집합 생성
spm.SentencePieceTrainer.Train(cmd)

In [68]:
# vocab 에서 학습된 subwords를 확인
# type(vocab_list) : dataframe

vocab_list = pd.read_csv('payload.vocab', sep='\t', header=None, quoting=csv.QUOTE_NONE)

In [69]:
# vocab에 저장되어 있는 단어는 총 32000개(설정한 값)
len(vocab_list)

32000

In [70]:
# 전체 payload를 32000개로 잘라서 각각의 단어에 고유한 번호를 부여
vocab_list

Unnamed: 0,0,1
0,<unk>,0
1,<s>,0
2,</s>,0
3,..,0
4,....,-1
...,...,...
31995,¢,-31992
31996,Ã,-31993
31997,ê,-31994
31998,í,-31995


In [71]:
# 훈련된 SentencePiece를(payload.model) model로 저장함
sp = spm.SentencePieceProcessor()
vocab_file = "payload.model"
sp.load(vocab_file)

True

In [72]:
# 저장된 모델을 사용했을때, SentencePiece가 잘 적용되는지 확인해보자
lines = ["GET /forum1_professionnel.asp?n=/.\\\"./.\\\"./.\\\"./.\\\"./.\\\"./boot.ini&amp;nn=100&amp;page=1|234|800a0bcd|Either_BOF_or_EOF_is_True__or_the_current_record_has_been_deleted._Requested_operation_requires_a_current_record. HTTP/1.1\r\nConnection: Keep-Alive\r\nUser-Agent: Mozilla/5.00 (Nikto/2.1.6) (Evasions:None) (Test:002421)\r\nContent-Type: application/x-www-form-urlencoded\r\nContent-Length: 36\r\nHost: 10.0.17.21\r\n\r\n<!--#include virtual=\"/indomainabc063.jsp\"-->",
         "POST /owa/auth/logon.aspx?replaceCurrent=1%22%29%20AND%209294%3DUTL_INADDR.GET_HOST_ADDRESS%28CHR%28113%29%7C%7CCHR%28112%29%7C%7CCHR%2898%29%7C%7CCHR%28120%29%7C%7CCHR%28113%29%7C%7C%28SELECT%20%28CASE%20WHEN%20%289294%3D9294%29%20THEN%201%20ELSE%200%20END%29%20FROM%20DUAL%29%7C%7CCHR%28113%29%7C%7CCHR%28122%29%7C%7CCHR%28106%29%7C%7CCHR%28107%29%7C%7CCHR%28113%29%29%20AND%20%28%22tdDX%22%3D%22tdDX&reason=3&url= HTTP/1.1\r\nContent-Length: 0\r\nCache-Control: no-cache\r\nUser-Agent: sqlmap/1.6.7#stable (https://sqlmap.org)\r\nReferer: http://owa.college.school:80/owa/auth/logon.aspx\r\nHost: owa.college.school\r\nAccept: */*\r\nAccept-Encoding: gzip,deflate\r\nContent-Type: application/x-www-form-urlencoded; charset=utf-8\r\nConnection: close\r\n\r\n",
         "GET /goods/goods_search?display_type=list&arr_search_list%5B0%5D%5Bsearch_type%5D=-1839%20or%201%3D2&arr_search_list%5B0%5D%5Bsearch%5D=1&arr_search_list%5B1%5D%5Bsearch_type%5D=Gm.name&arr_search_list%5B1%5D%5Bsearch%5D=1&arr_search_list%5B2%5D%5Bsearch_type%5D=Gm.name&arr_search_list%5B2%5D%5Bsearch%5D=1&search_include=y&search=1&search_type=Gm.name HTTP/1.1\r\nHost: www.college.school\r\nAuthorization: Negotiate TlRMTVNTUAABAAAAt4II4gAAAAAAAAAAAAAAAAAAAAAKAF1YAAAADw==\r\nAccept-Encoding: gzip, deflate\r\nUser-Agent: Mozilla/5.0 (Gecko) Arachni/v10.0.0.80\r\nAccept: text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8\r\nAccept-Language: en-US,en;q=0.8,he;q=0.6\r\nX-Arachni-Scan-Seed: 4818b1cc56353e785f9bfe9b055b7183\r\nCookie: designart_site=cg0q4hq5j1rgo9o2ns7s3nsnjlauq1rp\r\n\r\n"
         ]

# sp.encode_as_pieces() -> 각 payload 잘 토큰화된 것 확인!
for line in lines:
    #print(line)
    print(sp.encode_as_pieces(line))

['▁GET', '▁/', 'forum', '1_', 'professionnel', '.', 'asp', '?', 'n', '=/.', '\\".', '/.', '\\".', '/.', '\\".', '/.', '\\".', '/.', '\\"', './', 'boot', '.', 'ini', '&', 'amp', ';', 'nn', '=100&', 'amp', ';', 'page', '=1|234|800', 'a', '0', 'bcd', '|', 'Either', '_', 'BOF', '_', 'or', '_', 'EOF', '_', 'is', '_', 'True', '__', 'or', '_', 'the', '_', 'current', '_', 'record', '_', 'has', '_', 'been', '_', 'deleted', '._', 'Requested', '_', 'operation', '_', 'requires', '_', 'a', '_', 'current', '_', 'record', '.', '▁HTTP', '/1', '.1', '▁Con', 'nection', ':', '▁Keep', '-', 'Alive', '▁U', 'ser', '-', 'Agent', ':', '▁Mozilla', '/5.00', '▁(', 'Nikto', '/2.1.6)', '▁(', 'Evasions', ':', 'None', ')', '▁(', 'Test', ':00242', '1)', '▁Content', '-', 'Type', ':', '▁application', '/', 'x', '-', 'www', '-', 'form', '-', 'urlencoded', '▁Content', '-', 'Length', ':', '▁36', '▁H', 'ost', ':', '▁10.0.17.21', '▁', '<!--#', 'include', '▁virtual', '=', '"', '/', 'indomainabc', '063.', 'jsp', '"', '-->']
['▁

# **Vocab dictionary 를 적용**

In [73]:
token = []

# df 한줄씩 돌면서 df[payload]를 학습된 sentencepiece 모델을 적용해서 토큰화시킴 -> df[payload_token]을 새로 만들어서 이 토큰들을 저장!
for i in range(len(df)):
  if(i % 5000 == 0):
    print(i, "단계 완료")
  token.append(' '.join(sp.encode_as_pieces(str(df.iloc[i]['payload']))))  # 토큰들은 각각 공백을 기준으로 나열됨

df['payload_token'] = token

0 단계 완료
5000 단계 완료
10000 단계 완료
15000 단계 완료
20000 단계 완료
25000 단계 완료
30000 단계 완료
35000 단계 완료
40000 단계 완료


In [74]:
# df[payload_token] column이 추가된 것 확인!
df.head(5)

Unnamed: 0,payload,label_action,payload_token
0,"GET /forum1_professionnel.asp?n=/.\\\""./.\\\""....",System_Cmd_Execution,▁GET ▁/ forum 1_ professionnel . asp ? n =/.\\...
1,POST /owa/auth/logon.aspx?replaceCurrent=1%22%...,System_Cmd_Execution,▁POST ▁/ owa / auth / logon . aspx ? replaceCu...
2,GET /goods/goods_search?display_type=list&arr_...,SQL_Injection,▁GET ▁/ goods / goods _ search ? display _ typ...
3,GET / HTTP/1.1\r\n\r\n,HOST_Scan,▁GET ▁/ ▁HTTP /1.1\ r \ n \ r \ n
4,GET /sub_04_1_read.php?page=1&id=31%29%3BSELEC...,System_Cmd_Execution,▁GET ▁/ sub _04_1_ read . php ? page =1& id =3...


In [75]:
# df['payload_token'] -> df_payload_token 으로 list (각 line당 공백으로 잘라서 list화)
# df['label_action'] -> df_label로 array

df_payload_token = list(df['payload_token'])
df_label = np.array(df['label_action'])

In [77]:
# len(df_payload_token) -> 45000개
df_payload_token

45000

In [78]:
# CountVectorizer 사용 -> 공백으로 나눔

from sklearn.feature_extraction.text import CountVectorizer
vectorizer=CountVectorizer()

train_features=vectorizer.fit_transform(df_payload_token)

In [79]:
# 45000개의 payload 있는 것 확인!
print(train_features)

  (0, 9447)	1
  (0, 9094)	1
  (0, 2013)	1
  (0, 14142)	1
  (0, 6103)	1
  (0, 6639)	1
  (0, 10610)	1
  (0, 5945)	2
  (0, 13005)	1
  (0, 1631)	1
  (0, 13686)	1
  (0, 2628)	1
  (0, 4782)	1
  (0, 6324)	1
  (0, 8382)	1
  (0, 6624)	1
  (0, 13552)	2
  (0, 8498)	1
  (0, 10719)	1
  (0, 16229)	1
  (0, 5377)	1
  (0, 15985)	1
  (0, 7562)	2
  (0, 14713)	2
  (0, 9859)	1
  :	:
  (44999, 8454)	2
  (44999, 16653)	1
  (44999, 9923)	1
  (44999, 13239)	1
  (44999, 15171)	1
  (44999, 15233)	1
  (44999, 3749)	1
  (44999, 6963)	1
  (44999, 4025)	1
  (44999, 4739)	1
  (44999, 6411)	1
  (44999, 1497)	1
  (44999, 4531)	1
  (44999, 12749)	1
  (44999, 7796)	1
  (44999, 15406)	1
  (44999, 7081)	1
  (44999, 10153)	1
  (44999, 14829)	1
  (44999, 13134)	1
  (44999, 13153)	1
  (44999, 14991)	1
  (44999, 8907)	1
  (44999, 7614)	1
  (44999, 7615)	1


In [80]:
train_features.shape

(45000, 17903)

# **모델링 & 정확도확인**

In [81]:
import tensorflow as tf
from sklearn.model_selection import train_test_split
from tqdm import tqdm

In [82]:
# 훈련 데이터 셋과 검증 데이터 셋으로 분리

TEST_SIZE = 0.3
RANDOM_SEED = 1117

train_x, val_x, train_y, val_y = train_test_split(train_features, df_label, test_size=TEST_SIZE, random_state=RANDOM_SEED)

In [83]:
# RandomForestClassifier (DecisionTreeClassifier는 정확도가 낮아서 사용X)
# n_estimators : boosting을 반복할 횟수(손실함수의 가중치를 몇 번이나 조정할 것인지)
#                값이 클수록 정확도가 증가하나, overfitting의 가능성 또한 증가

forest = RandomForestClassifier(n_estimators=100, random_state = RANDOM_SEED, class_weight = 'balanced')
forest.fit(train_x, train_y)

In [84]:
# val data로 모델 accuracy 검증
forest.score(val_x, val_y)

0.8523703703703703

In [85]:
# crosstab: 각 'label_action' category별로 'pred' & 'real' 값을 교차 확인해서 피팅한 모델로 예측한 값이 얼마나 맞았는지 확인 (diagonal은 맞춘것!)
pred = forest.predict(val_x)
crosstab = pd.crosstab(val_y, pred, rownames=['real'], colnames=['pred'])
crosstab

pred,Automatically_Searching_Infor,Cross_Site_Scripting,Directory_Indexing,HOST_Scan,Leakage_Through_NW,Path_Disclosure,SQL_Injection,System_Cmd_Execution,Vulnerability_Scan
real,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
Automatically_Searching_Infor,360,0,0,0,0,0,0,0,1
Cross_Site_Scripting,0,324,1,3,0,5,0,47,5
Directory_Indexing,0,0,123,0,0,0,0,5,2
HOST_Scan,1,8,2,1572,0,34,0,72,142
Leakage_Through_NW,0,0,0,1,303,0,0,0,0
Path_Disclosure,0,4,0,27,0,1226,0,82,95
SQL_Injection,0,0,2,0,0,0,871,89,30
System_Cmd_Execution,0,46,3,58,0,97,55,2365,339
Vulnerability_Scan,4,8,2,205,0,105,52,361,4363
