Skip to content

Commit

Permalink
init project
Browse files Browse the repository at this point in the history
  • Loading branch information
hi-WenR0 committed Feb 2, 2018
0 parents commit 27fe418
Show file tree
Hide file tree
Showing 8 changed files with 283 additions and 0 deletions.
1 change: 1 addition & 0 deletions black-list/readme.txt
@@ -0,0 +1 @@
黑名单存放处
54 changes: 54 additions & 0 deletions check.py
@@ -0,0 +1,54 @@
# coding:utf-8
# author: WenR0

from sklearn.externals import joblib
from sklearn.naive_bayes import GaussianNB
from sklearn.feature_extraction.text import CountVectorizer, TfidfTransformer
from utils import load_php_opcode, recursion_load_php_file_opcode
import sys



def _read_opcode_lines(path):
    """Return the opcode strings stored one per line in *path*."""
    with open(path, 'r') as f:
        return [line.strip('\n') for line in f]


def main(argv):
    """
    Classify a single PHP file as webshell / not webshell.

    :param argv: command-line arguments; argv[1] is the PHP file to check.
    :return: process exit status: 0 when a prediction was made, 2 when the
             file name is missing or no opcode could be extracted.
    """
    if len(argv) < 2:
        print('Usage: python check.py [filename]')
        return 2

    php_file_name = argv[1]
    print('Checking the file {}'.format(php_file_name))

    # Cached training corpus, concatenated white list first and black list
    # second so the fitted vocabulary is built from the same documents in
    # the same order every time.
    white_file_list = _read_opcode_lines('white_opcodes.txt')
    black_file_list = _read_opcode_lines('black_opcodes.txt')
    corpus = white_file_list + black_file_list

    token = load_php_opcode(php_file_name)
    if not token.strip():
        # Nothing to classify: scoring an empty document would print a
        # verdict that says nothing about the file.
        print('Could not extract any opcode from {}'.format(php_file_name))
        return 2

    # Fit the vectorizer and the TF-IDF weights on the cached corpus ONLY and
    # merely transform the file under test. Fitting on corpus + sample (as was
    # done before) adds the sample's unseen 3-grams to the vocabulary, so the
    # feature columns no longer line up with the ones the saved model expects.
    cv = CountVectorizer(ngram_range=(3, 3), decode_error="ignore",
                         token_pattern=r'\b\w+\b', min_df=1, max_df=1.0)
    corpus_counts = cv.fit_transform(corpus)

    transformer = TfidfTransformer(smooth_idf=False)
    transformer.fit(corpus_counts)

    # Only the single sample row is densified; the corpus stays sparse.
    sample = transformer.transform(cv.transform([token])).toarray()

    # NOTE: joblib.load unpickles the file; only load a model you produced.
    gnb = joblib.load('save/gnb.pkl')
    y_p = gnb.predict(sample)

    if y_p[0] == 0:
        print('Not Webshell')
    elif y_p[0] == 1:
        print('Webshell!')
    return 0


if __name__ == '__main__':
    sys.exit(main(sys.argv))
81 changes: 81 additions & 0 deletions readme.md
@@ -0,0 +1,81 @@
# 机器学习检测Webshell

# 简介

提取PHP执行中的opcode,采用 opcode词袋 + tf-idf 进行关键信息提取

采用朴素贝叶斯算法进行训练。

进行 PHP WebShell 的检测。

# 部署

## step 1. Python环境部署

```
pip install -r requirements.txt
```

## step 2. PHP opcode 部署

开启opcode模式

windows环境

```
1. 下载 vld.dll 插件并存放在php ext 目录下
2. 配置 php.ini 激活vld.dll 文件
```
[VLD.dll下载地址](http://pecl.php.net/package/vld/0.14.0/windows)

[PHP.ini 配置参考文章](http://blog.sina.com.cn/s/blog_4c8c58ce0102wi2h.html)

# 第一次进行训练

将白名单的文件放入到 white-list 文件夹中
将黑名单文件放入到 black-list 文件夹中

进行第一次训练
```shell
python train.py
```
Note:

避免每次生成opcode 的时间过长,每次训练完成后,会生成两个文件,black_opcodes.txt & white_opcodes.txt。

如果有新的白名单文件或者黑名单文件加入,先删除掉black_opcodes.txt 和 white_opcodes.txt 文件,然后再次进行训练。

训练完成后,会在save文件夹内,生成一个gnb.pkl文件,这个是训练好的缓存文件。


# 检测

检测单个文件
```
python check.py [filename]
```


# 重复训练

1. 提供训练集
在人工得到结果后,可以在white-list & black-list 文件夹中,添加已知的结果,再按照第一次训练的方法,进行再次训练。得到的结果便会更加准确。


# 数据集(参考)

白名单
- https://github.com/WordPress/WordPress
- https://github.com/typecho/typecho
- https://github.com/phpmyadmin/phpmyadmin
- https://github.com/laravel/laravel
- https://github.com/top-think/framework
- https://github.com/symfony/symfony
- https://github.com/bcit-ci/CodeIgniter
- https://github.com/yiisoft/yii2

黑名单
- https://github.com/tennc/webshell
- https://github.com/ysrc/webshell-sample
- https://github.com/xl7dev/WebShell
3 changes: 3 additions & 0 deletions requirements.txt
@@ -0,0 +1,3 @@
numpy
scipy
scikit-learn
1 change: 1 addition & 0 deletions save/readme.txt
@@ -0,0 +1 @@
持久化pkl存放处
99 changes: 99 additions & 0 deletions train.py
@@ -0,0 +1,99 @@
# coding:utf-8
# author: WenR0

import os
from utils import recursion_load_php_file_opcode, load_php_opcode

from sklearn.feature_extraction.text import CountVectorizer, TfidfTransformer
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import GaussianNB
from sklearn import metrics
from sklearn.neural_network import MLPClassifier
from sklearn.externals import joblib


def _load_opcode_cache(cache_path, source_dir, label):
    """
    Return the opcode strings of one class, generating the cache file if absent.

    :param cache_path: text file holding one opcode string per line.
    :param source_dir: directory scanned for PHP files when the cache is missing.
    :param label: class name used in the progress messages.
    :return: list of opcode strings read back from cache_path.
    """
    if os.path.exists(cache_path):
        print('[Info] {} opcodes exists'.format(label))
    else:
        print('[Info] {} opcodes doesnt exists ... generating opcode ..'.format(label))
        opcodes_list = recursion_load_php_file_opcode(source_dir)
        with open(cache_path, 'w') as f:
            f.writelines(line + '\n' for line in opcodes_list)

    # Always read back from the cache so both code paths return the same data.
    with open(cache_path, 'r') as f:
        return [line.strip('\n') for line in f]


def prepare_data():
    """
    Build the raw training data, caching the generated opcodes on disk.

    The opcode strings are kept in white_opcodes.txt / black_opcodes.txt in the
    current directory; a missing cache file is generated from the white-list /
    black-list directory next to it.

    :return: (X, y) where X is the list of opcode strings (white list first,
             then black list) and y the matching labels, 0 for white-list
             entries and 1 for black-list entries.
    """
    # os.path.join keeps the relative paths usable on every platform instead
    # of hard-coding Windows backslashes.
    white_file_list = _load_opcode_cache(
        'white_opcodes.txt', os.path.join('.', 'white-list'), 'White')
    black_file_list = _load_opcode_cache(
        'black_opcodes.txt', os.path.join('.', 'black-list'), 'black')

    len_white_file_list = len(white_file_list)
    len_black_file_list = len(black_file_list)

    # X: raw data, y: labels
    X = white_file_list + black_file_list
    y = [0] * len_white_file_list + [1] * len_black_file_list

    print('[Data status] ... ↓')
    print('[Data status] X length : {}'.format(len_white_file_list + len_black_file_list))
    print('[Data status] White list length : {}'.format(len_white_file_list))
    print('[Data status] black list length : {}'.format(len_black_file_list))
    return X, y


def method1():
    """
    Train a Gaussian naive Bayes classifier on opcode 3-gram TF-IDF features.

    The data from prepare_data() is vectorized with CountVectorizer (word
    3-grams) and TfidfTransformer, split 60/40 into train and test sets, and
    the classifier fitted on the train part is written to save/gnb.pkl.
    Accuracy and the confusion matrix on the test part are printed.

    :return: None
    """
    X, y = prepare_data()

    cv = CountVectorizer(ngram_range=(3, 3), decode_error="ignore", token_pattern=r'\b\w+\b')
    counts = cv.fit_transform(X)

    # TfidfTransformer accepts the sparse count matrix directly, so only the
    # final matrix is densified (GaussianNB needs a dense array).
    transformer = TfidfTransformer(smooth_idf=False)
    X = transformer.fit_transform(counts).toarray()

    x_train, x_test, y_train, y_test = train_test_split(X, y, test_size=0.4, random_state=0)

    gnb = GaussianNB()
    gnb.fit(x_train, y_train)

    # Only the classifier is persisted; make sure its target directory exists.
    if not os.path.isdir('save'):
        os.makedirs('save')
    joblib.dump(gnb, 'save/gnb.pkl')

    y_pred = gnb.predict(x_test)

    print('Accuracy :{}'.format(metrics.accuracy_score(y_test, y_pred)))
    print(metrics.confusion_matrix(y_test, y_pred))


def main():
    """Script entry point: run the training pipeline implemented by method1()."""
    method1()


if __name__ == '__main__':
    main()
43 changes: 43 additions & 0 deletions utils.py
@@ -0,0 +1,43 @@
# coding:utf-8
# author: WenR0

import os
import re
import subprocess


def load_php_opcode(phpfilename, php_binary='php.exe'):
    """
    Return the opcode names of a PHP file as one space-separated string.

    Runs the PHP command line with -dvld.active=1 -dvld.execute=0 and keeps
    every upper-case/underscore word of the combined stdout/stderr output that
    is delimited by whitespace on both sides.

    :param phpfilename: path of the PHP file to dump.
    :param php_binary: PHP executable to run; defaults to 'php.exe'.
    :return: the extracted names joined by single spaces, or " " when the
             command cannot be started or exits with a non-zero status.
    """
    command = [php_binary, '-dvld.active=1', '-dvld.execute=0', phpfilename]
    try:
        output = subprocess.check_output(command, stderr=subprocess.STDOUT)
    except (subprocess.CalledProcessError, OSError, ValueError):
        # Best effort: a missing binary, a failing PHP run or an unusable
        # file name yields the blank placeholder instead of aborting a scan.
        # KeyboardInterrupt is deliberately no longer swallowed.
        return " "

    # On Python 3 check_output returns bytes; the str pattern below needs text.
    if not isinstance(output, str):
        output = output.decode('utf-8', 'ignore')

    return " ".join(re.findall(r'\s(\b[A-Z_]+\b)\s', output))



def recursion_load_php_file_opcode(dir):
    """
    Recursively collect the opcode string of every .php file under a directory.

    A progress line is printed for each file handed to load_php_opcode().

    :param dir: root directory to walk.
    :return: list with one opcode string per .php file, in os.walk order.
    """
    files_list = []
    for root, _dirs, files in os.walk(dir):
        for filename in files:
            if not filename.endswith('.php'):
                continue
            full_path = os.path.join(root, filename)
            try:
                file_content = load_php_opcode(full_path)
                print("[Gen success] {}".format(full_path))
                print('--' * 20)
            except Exception:
                # Best effort: skip a file that cannot be processed or
                # reported, but let KeyboardInterrupt / SystemExit stop the
                # scan (a bare except used to swallow them).
                continue
            files_list.append(file_content)
    return files_list
1 change: 1 addition & 0 deletions white-list/readme.txt
@@ -0,0 +1 @@
白名单存放处

0 comments on commit 27fe418

Please sign in to comment.