## [미니프로젝트] 악성사이트 탐지 머신러닝 모델 개발
<br>

---

## ▣ 데이터 소개
* 웹 크롤링 데이터셋 : Feature_Website.xlsx

## ▣ 웹 크롤링 데이터셋의 변수 소개
* html_code : 크롤링을 활용해 수집한 HTML Code 원본
* repu : 악성사이트 여부 (malicious : 악성사이트, benign : 정상사이트)
<br>

---

## <b>[1단계] 데이터 수집</b>

# <b>Step 0. 본격적인 실습 전 packages 설치
* Beautifulsoup 라이브러리 설치
* openpyxl 라이브러리 설치

In [1]:
pip install bs4

Collecting bs4
  Downloading bs4-0.0.1.tar.gz (1.1 kB)
Building wheels for collected packages: bs4
  Building wheel for bs4 (setup.py): started
  Building wheel for bs4 (setup.py): finished with status 'done'
  Created wheel for bs4: filename=bs4-0.0.1-py3-none-any.whl size=1273 sha256=7f6b22db02ddb5faf628c1719c918ececa8162fd4a0c7f15c27e7f455e11a5e0
  Stored in directory: c:\users\user\appdata\local\pip\cache\wheels\75\78\21\68b124549c9bdc94f822c02fb9aa3578a669843f9767776bca
Successfully built bs4
Installing collected packages: bs4
Successfully installed bs4-0.0.1
Note: you may need to restart the kernel to use updated packages.


* 데이터 프레임 관련 라이브러리 Import

In [2]:
pip install openpyxl

Note: you may need to restart the kernel to use updated packages.


In [3]:
import numpy as np
import pandas as pd
from bs4 import BeautifulSoup

---
## <b>1. 데이터 불러오기
### 정상/악성 HTML Code가 저장된 엑셀파일 불러오기
- 파일명 : Feature Website.xlsx


In [5]:
# 아래에 실습코드를 작성하고 결과를 확인합니다.
df = pd.read_excel('Feature_Website.xlsx')

In [6]:
# 데이터 프레임의 info를 확인합니다.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 40 entries, 0 to 39
Data columns (total 2 columns):
 #   Column     Non-Null Count  Dtype 
---  ------     --------------  ----- 
 0   html_code  40 non-null     object
 1   repu       40 non-null     object
dtypes: object(2)
memory usage: 768.0+ bytes


In [7]:
# 불러온 데이터를 확인합니다.
df.head()

Unnamed: 0,html_code,repu
0,<!DOCTYPE html>\n<!--[if lt IE 7]> <html lang=...,malicious
1,\n\t\n\n\n\t\n\n\t\n\n\n\t\n\n\n\t\n\n\t\n\t\t...,malicious
2,"<!DOCTYPE html>\n<html lang=""en"">\n <head>\...",malicious
3,"<!DOCTYPE html><html lang=""en""><head><style da...",malicious
4,<!DOCTYPE html>\n\n\n \n \n \n \n ...,malicious


---
# <b>Step 1. 데이터 수집

### <span style="color:blue"> Beatuifulsoup 라이브러리를 활용 HTML code를 출력하고 \<title> 태그 길이를 계산 <span>

In [8]:
from bs4 import BeautifulSoup

soup = BeautifulSoup(df['html_code'][0], 'html.parser')

*<span style="color:blue"> html code 출력<span>

In [9]:
print(soup)

<!DOCTYPE html>

<!--[if lt IE 7]> <html lang="en-us" class="a-no-js a-lt-ie9 a-lt-ie8 a-lt-ie7"> <![endif]-->
<!--[if IE 7]>    <html lang="en-us" class="a-no-js a-lt-ie9 a-lt-ie8"> <![endif]-->
<!--[if IE 8]>    <html lang="en-us" class="a-no-js a-lt-ie9"> <![endif]-->
<!--[if gt IE 8]><!-->
<html class="a-no-js" lang="en-us"><!--<![endif]--><head>
<meta content="text/html; charset=utf-8" http-equiv="content-type"/>
<meta charset="utf-8"/>
<meta content="IE=edge,chrome=1" http-equiv="X-UA-Compatible"/>
<title dir="ltr">Amazon.com</title>
<meta content="width=device-width" name="viewport"/>
<link href="https://images-na.ssl-images-amazon.com/images/G/01/AUIClients/AmazonUI-3c913031596ca78a3768f4e934b1cc02ce238101.secure.min._V1_.css" rel="stylesheet"/>
<script>

if (true === true) {
    var ue_t0 = (+ new Date()),
        ue_csm = window,
        ue = { t0: ue_t0, d: function() { return (+new Date() - ue_t0); } },
        ue_furl = "fls-na.amazon.com",
        ue_mid = "ATVPDKIKX0DER"

* <span style="color:blue"> \<title> 태그 출력 및 길이 계산<span>

In [10]:
# <title> 태그 출력
print("* title :",soup.head.title)

# <title> 태그 길이 출력
print("* title 길이 :", len(str(soup.head.title.getText())))

* title : <title dir="ltr">Amazon.com</title>
* title 길이 : 10


In [11]:
def title_length(soup):
    try:
        return(len(str(soup.head.title.getText())))
    except:
        return 0.0

In [12]:
title_len = []

for index, row in df.iterrows():
    soup = BeautifulSoup(row.html_code, 'html.parser')
    title_len.append(title_length(soup))

In [13]:
df['title_length'] = title_len

In [14]:
df

Unnamed: 0,html_code,repu,title_length
0,<!DOCTYPE html>\n<!--[if lt IE 7]> <html lang=...,malicious,10.0
1,\n\t\n\n\n\t\n\n\t\n\n\n\t\n\n\n\t\n\n\t\n\t\t...,malicious,5.0
2,"<!DOCTYPE html>\n<html lang=""en"">\n <head>\...",malicious,71.0
3,"<!DOCTYPE html><html lang=""en""><head><style da...",malicious,0.0
4,<!DOCTYPE html>\n\n\n \n \n \n \n ...,malicious,27.0
5,_x000D_\n_x000D_\n_x000D_\n<!DOCTYPE html>_x00...,malicious,17.0
6,"<!doctype html>\n\n<html data-ytrk-page=""HOME""...",malicious,36.0
7,"\n\t<!DOCTYPE html>\n\t<html class=""no-icon-fo...",malicious,45.0
8,"<!DOCTYPE html>\n<html class=""no-js"">\n<head>\...",malicious,77.0
9,"<!DOCTYPE html>\n<html class=""b-header--bl...",malicious,14.0


---

## <b>2. html 에서 \<script>...\</script> 태그 길이 계산
- BeautifulSoup으로 html소스를 python 객체로 변환
- 함수로 구현하기
- float으로 return 받기

In [None]:
# # Feature(특징) 데이터를 추출는 함수를 작성합니다.
# def script_length(soup):
#     html_len = str(soup.script)
#     return float(len(html_len))

In [None]:
# # 데이터 프레임의 html_code 컬럼에서 Feature(특징) 데이터를 추출합니다.
# script_len = []

# for index, row in df.iterrows():
#     soup = BeautifulSoup(row.html_code, 'html.parser')
    

In [23]:
# Feature(특징) 데이터를 추출는 함수를 작성합니다.

soup = BeautifulSoup(df['html_code'][0], 'html.parser')
print("* script :", soup.script)
print("* script 길이 :", len(str(soup.script)))

* script : <script>

if (true === true) {
    var ue_t0 = (+ new Date()),
        ue_csm = window,
        ue = { t0: ue_t0, d: function() { return (+new Date() - ue_t0); } },
        ue_furl = "fls-na.amazon.com",
        ue_mid = "ATVPDKIKX0DER",
        ue_sid = (document.cookie.match(/session-id=([0-9-]+)/) || [])[1],
        ue_sn = "opfcaptcha.amazon.com",
        ue_id = 'BY6PY7346W2THV0MAFYZ';
}
</script>
* script 길이 : 405


In [17]:
df.head()

Unnamed: 0,html_code,repu,title_length
0,<!DOCTYPE html>\n<!--[if lt IE 7]> <html lang=...,malicious,10.0
1,\n\t\n\n\n\t\n\n\t\n\n\n\t\n\n\n\t\n\n\t\n\t\t...,malicious,5.0
2,"<!DOCTYPE html>\n<html lang=""en"">\n <head>\...",malicious,71.0
3,"<!DOCTYPE html><html lang=""en""><head><style da...",malicious,0.0
4,<!DOCTYPE html>\n\n\n \n \n \n \n ...,malicious,27.0


In [18]:
def script_length(soup):
    try:
        return(len(str(soup.script)))
    except:
        return 0.0

In [19]:
script_len = []

for index, row in df.iterrows():
    soup = BeautifulSoup(row.html_code, 'html.parser')
    script_len.append(script_length(soup))

In [20]:
df['script_length'] = script_len

In [21]:
df

Unnamed: 0,html_code,repu,title_length,script_length
0,<!DOCTYPE html>\n<!--[if lt IE 7]> <html lang=...,malicious,10.0,405
1,\n\t\n\n\n\t\n\n\t\n\n\n\t\n\n\n\t\n\n\t\n\t\t...,malicious,5.0,579
2,"<!DOCTYPE html>\n<html lang=""en"">\n <head>\...",malicious,71.0,817
3,"<!DOCTYPE html><html lang=""en""><head><style da...",malicious,0.0,4
4,<!DOCTYPE html>\n\n\n \n \n \n \n ...,malicious,27.0,425
5,_x000D_\n_x000D_\n_x000D_\n<!DOCTYPE html>_x00...,malicious,17.0,3180
6,"<!doctype html>\n\n<html data-ytrk-page=""HOME""...",malicious,36.0,5056
7,"\n\t<!DOCTYPE html>\n\t<html class=""no-icon-fo...",malicious,45.0,172
8,"<!DOCTYPE html>\n<html class=""no-js"">\n<head>\...",malicious,77.0,1389
9,"<!DOCTYPE html>\n<html class=""b-header--bl...",malicious,14.0,119


---

## <b>3. html에서 공백 수 계산

- BeautifulSoup으로 html소스를 python 객체로 변환
- 함수로 구현하기
- float으로 return 받기

In [39]:
# Feature(특징) 데이터를 추출하는 함수를 작성합니다.

soup = BeautifulSoup(df['html_code'][0], 'html.parser')

print("* <html>공백수 :", str(soup.html).count(' '))

* <html>공백수 : 471


In [40]:
def space_length(soup):
    try:
        return str(soup.html).count(' ')
    except:
        return 0.0

In [41]:
space_len = []

for index, row in df.iterrows():
    soup = BeautifulSoup(row.html_code, 'html.parser')
    space_len.append(space_length(soup))

In [42]:
df['space_length'] = space_len

In [43]:
df

Unnamed: 0,html_code,repu,title_length,script_length,body_length,space_length
0,<!DOCTYPE html>\n<!--[if lt IE 7]> <html lang=...,malicious,10.0,405,3836,471
1,\n\t\n\n\n\t\n\n\t\n\n\n\t\n\n\n\t\n\n\t\n\t\t...,malicious,5.0,579,15268,1318
2,"<!DOCTYPE html>\n<html lang=""en"">\n <head>\...",malicious,71.0,817,16542,2718
3,"<!DOCTYPE html><html lang=""en""><head><style da...",malicious,0.0,4,4,197
4,<!DOCTYPE html>\n\n\n \n \n \n \n ...,malicious,27.0,425,20314,2771
5,_x000D_\n_x000D_\n_x000D_\n<!DOCTYPE html>_x00...,malicious,17.0,3180,4,856
6,"<!doctype html>\n\n<html data-ytrk-page=""HOME""...",malicious,36.0,5056,20018,2467
7,"\n\t<!DOCTYPE html>\n\t<html class=""no-icon-fo...",malicious,45.0,172,4,838
8,"<!DOCTYPE html>\n<html class=""no-js"">\n<head>\...",malicious,77.0,1389,29661,1398
9,"<!DOCTYPE html>\n<html class=""b-header--bl...",malicious,14.0,119,4,1510


---

## <b>4. html 에서 body 길이 계산

- BeautifulSoup으로 html소스를 python 객체로 변환
- 함수로 구현하기
- float으로 return 받기

In [27]:
# Feature(특징) 데이터를 추출하는 함수를 작성합니다.

#soup = BeautifulSoup(df['html_code'][0], 'html.parser')
#print("* body :", soup.body)
print("* body 길이 :", len(str(soup.body)))

* body 길이 : 3836


In [28]:
def body_length(soup):
    try:
        return(len(str(soup.body)))
    except:
        return 0.0

In [29]:
body_len = []

for index, row in df.iterrows():
    soup = BeautifulSoup(row.html_code, 'html.parser')
    body_len.append(body_length(soup))

In [30]:
df['body_length'] = body_len

In [31]:
df

Unnamed: 0,html_code,repu,title_length,script_length,body_length
0,<!DOCTYPE html>\n<!--[if lt IE 7]> <html lang=...,malicious,10.0,405,3836
1,\n\t\n\n\n\t\n\n\t\n\n\n\t\n\n\n\t\n\n\t\n\t\t...,malicious,5.0,579,15268
2,"<!DOCTYPE html>\n<html lang=""en"">\n <head>\...",malicious,71.0,817,16542
3,"<!DOCTYPE html><html lang=""en""><head><style da...",malicious,0.0,4,4
4,<!DOCTYPE html>\n\n\n \n \n \n \n ...,malicious,27.0,425,20314
5,_x000D_\n_x000D_\n_x000D_\n<!DOCTYPE html>_x00...,malicious,17.0,3180,4
6,"<!doctype html>\n\n<html data-ytrk-page=""HOME""...",malicious,36.0,5056,20018
7,"\n\t<!DOCTYPE html>\n\t<html class=""no-icon-fo...",malicious,45.0,172,4
8,"<!DOCTYPE html>\n<html class=""no-js"">\n<head>\...",malicious,77.0,1389,29661
9,"<!DOCTYPE html>\n<html class=""b-header--bl...",malicious,14.0,119,4


---

## <b>5. script 에서 src, href 속성을 가진 태그수

- BeautifulSoup으로 html소스를 python 객체로 변환
- 함수로 구현하기
- float으로 return 받기

In [13]:
# Feature(특징) 데이터를 추출하는 함수를 작성합니다.
def link_in_script(soup):
    Linkcount = len(soup.findAll('script', {"src": True}))
    Linkcount += len(soup.findAll('script', {"href": True}))
    return float(Linkcount)

In [3]:
# 데이터 프레임의 html_code 컬럼에서 Feature(특징) 데이터를 추출합니다
tag_count = []

for index, row in df.iterrows():
    soup = BeautifulSoup(row.html_code, 'html.parser')
    tag_count.append(link_in_script(soup))

df['link_count'] = tag_count

In [1]:
# 추출한 Feature(특징) 데이터를 확인합니다.

df