Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
- Loading branch information
0 parents
commit 27fe418
Showing
8 changed files
with
283 additions
and
0 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1 @@ | ||
黑名单存放处 |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,54 @@ | ||
# coding:utf-8 | ||
# author: WenR0 | ||
|
||
from sklearn.externals import joblib | ||
from sklearn.naive_bayes import GaussianNB | ||
from sklearn.feature_extraction.text import CountVectorizer, TfidfTransformer | ||
from utils import load_php_opcode, recursion_load_php_file_opcode | ||
import sys | ||
|
||
|
||
|
||
def _read_opcode_lines(path):
    """
    Read an opcode cache file.

    :param path: path of a text file holding one opcode string per line
    :return: list of the file's lines with newline characters stripped
    """
    with open(path, 'r') as f:
        return [line.strip('\n') for line in f]


def main():
    """
    Classify the PHP file named on the command line with the saved model.

    The CountVectorizer and TfidfTransformer are fitted on the cached
    training corpus only, and the file under test is merely transformed.
    Fitting with the new file included would add its unseen trigrams to the
    vocabulary, changing the number of feature columns and the IDF weights
    relative to what the saved classifier was trained on.

    :return: None
    """
    if len(sys.argv) < 2:
        print('Usage: python check.py [filename]')
        sys.exit(1)

    php_file_name = sys.argv[1]
    print('Checking the file {}'.format(php_file_name))

    # Cached training corpus: white list first, then black list.
    corpus = (_read_opcode_lines('white_opcodes.txt')
              + _read_opcode_lines('black_opcodes.txt'))

    # Opcode string of the file under test.
    token = load_php_opcode(php_file_name)
    if not token.strip():
        # A blank result means no opcodes were extracted; classifying an
        # all-zero vector would only produce a misleading verdict.
        print('[Error] Could not extract opcodes from {}'.format(php_file_name))
        sys.exit(1)

    # Bag of opcode trigrams, vocabulary learned from the corpus only.
    cv = CountVectorizer(ngram_range=(3, 3), decode_error="ignore",
                         token_pattern=r'\b\w+\b', min_df=1, max_df=1.0)
    corpus_counts = cv.fit_transform(corpus)

    # IDF weights learned from the corpus only; counts stay sparse.
    transformer = TfidfTransformer(smooth_idf=False)
    transformer.fit(corpus_counts)

    # Only the single sample row is converted to a dense array.
    sample = transformer.transform(cv.transform([token])).toarray()

    # NOTE(review): joblib.load unpickles the file, which can execute
    # arbitrary code. Only load a gnb.pkl produced by a trusted training run.
    gnb = joblib.load('save/gnb.pkl')
    y_p = gnb.predict(sample)

    if y_p[0] == 0:
        print('Not Webshell')
    elif y_p[0] == 1:
        print('Webshell!')
    else:
        print('[Error] Unexpected prediction: {}'.format(y_p[0]))


if __name__ == '__main__':
    main()
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,81 @@ | ||
# 机器学习检测Webshell | ||
|
||
# 简介 | ||
|
||
提取PHP执行中的opcode,采用 opcode词袋 + tf-idf 进行关键信息提取 | ||
|
||
采用朴素贝叶斯算法进行训练。 | ||
|
||
进行 PHP WebShell 的检测。 | ||
|
||
# 部署 | ||
|
||
## step 1. Python环境部署 | ||
|
||
``` | ||
pip install -r requirements.txt | ||
``` | ||
|
||
## step 2. PHP opcode 部署 | ||
|
||
开启opcode模式 | ||
|
||
windows环境 | ||
|
||
``` | ||
1. 下载 vld.dll 插件并存放在php ext 目录下 | ||
2. 配置 php.ini 激活vld.dll 文件 | ||
``` | ||
[VLD.dll下载地址](http://pecl.php.net/package/vld/0.14.0/windows) | ||
|
||
[PHP.ini 配置参考文章](http://blog.sina.com.cn/s/blog_4c8c58ce0102wi2h.html) | ||
|
||
# 第一次进行训练 | ||
|
||
将白名单文件放入到 white-list 文件夹中, | ||
将黑名单文件放入到 black-list 文件夹中。 | ||
|
||
进行第一次训练 | ||
```shell | ||
python train.py | ||
``` | ||
Note: | ||
|
||
为避免每次训练都重新生成 opcode 而耗时过长,训练完成后会生成两个缓存文件:black_opcodes.txt 和 white_opcodes.txt。 | ||
|
||
如果有新的白名单文件或者黑名单文件加入,先删除掉black_opcodes.txt 和 white_opcodes.txt 文件,然后再次进行训练。 | ||
|
||
训练完成后,会在save文件夹内,生成一个gnb.pkl文件,这个是训练好的缓存文件。 | ||
|
||
|
||
# 检测 | ||
|
||
检测单个文件 | ||
``` | ||
python check.py [filename] | ||
``` | ||
|
||
|
||
# 重复训练 | ||
|
||
1. 提供训练集 | ||
在人工得到结果后,可以在white-list & black-list 文件夹中,添加已知的结果,再按照第一次训练的方法,进行再次训练。得到的结果便会更加准确。 | ||
|
||
|
||
# 数据集(参考) | ||
|
||
白名单 | ||
- https://github.com/WordPress/WordPress | ||
- https://github.com/typecho/typecho | ||
- https://github.com/phpmyadmin/phpmyadmin | ||
- https://github.com/laravel/laravel | ||
- https://github.com/top-think/framework | ||
- https://github.com/symfony/symfony | ||
- https://github.com/bcit-ci/CodeIgniter | ||
- https://github.com/yiisoft/yii2 | ||
|
||
黑名单 | ||
- https://github.com/tennc/webshell | ||
- https://github.com/ysrc/webshell-sample | ||
- https://github.com/xl7dev/WebShell |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,3 @@ | ||
numpy | ||
scipy | ||
scikit-learn |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1 @@ | ||
持久化pkl存放处 |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,99 @@ | ||
# coding:utf-8 | ||
# author: WenR0 | ||
|
||
import os | ||
from utils import recursion_load_php_file_opcode, load_php_opcode | ||
|
||
from sklearn.feature_extraction.text import CountVectorizer, TfidfTransformer | ||
from sklearn.model_selection import train_test_split | ||
from sklearn.naive_bayes import GaussianNB | ||
from sklearn import metrics | ||
from sklearn.neural_network import MLPClassifier | ||
from sklearn.externals import joblib | ||
|
||
|
||
def _load_or_build_opcode_cache(cache_path, source_dir, label):
    """
    Return the opcode strings stored in a cache file, creating it if absent.

    :param cache_path: text file holding one opcode string per line
    :param source_dir: directory scanned for PHP files when the cache is missing
    :param label: list name used in the progress messages
    :return: list of opcode strings read back from the cache file
    """
    if os.path.exists(cache_path):
        print('[Info] {} opcodes exists'.format(label))
    else:
        print('[Info] {} opcodes doesnt exists ... generating opcode ..'.format(label))
        opcodes_list = recursion_load_php_file_opcode(source_dir)
        with open(cache_path, 'w') as f:
            f.writelines(line + '\n' for line in opcodes_list)

    # Always read back from disk so cached and fresh runs use the same data.
    with open(cache_path, 'r') as f:
        return [line.strip('\n') for line in f]


def prepare_data():
    """
    Build the raw training data, generating the opcode cache files on first use.

    White-list samples are read from (or written to) white_opcodes.txt and
    black-list samples from black_opcodes.txt. The source directories are
    built with os.path.join so the scan is not tied to Windows path separators.

    :return: tuple (X, y); X is the list of opcode strings, white list first,
             and y the matching labels (0 for white list, 1 for black list)
    """
    white_file_list = _load_or_build_opcode_cache(
        'white_opcodes.txt', os.path.join('.', 'white-list'), 'White')
    black_file_list = _load_or_build_opcode_cache(
        'black_opcodes.txt', os.path.join('.', 'black-list'), 'black')

    len_white_file_list = len(white_file_list)
    len_black_file_list = len(black_file_list)

    # X: raw opcode strings, y: labels in the same order.
    X = white_file_list + black_file_list
    y = [0] * len_white_file_list + [1] * len_black_file_list

    print('[Data status] ... ↓')
    print('[Data status] X length : {}'.format(len_white_file_list + len_black_file_list))
    print('[Data status] White list length : {}'.format(len_white_file_list))
    print('[Data status] black list length : {}'.format(len_black_file_list))
    return X, y
|
||
|
||
def method1():
    """
    Train a Gaussian naive Bayes classifier on opcode trigram TF-IDF features.

    The data from prepare_data() is vectorized with CountVectorizer and
    TfidfTransformer, split 60/40 into train and test sets, and the fitted
    model is written to save/gnb.pkl. Accuracy and the confusion matrix on
    the test split are printed.

    :return: None
    """
    X, y = prepare_data()

    # Keep the counts sparse: TfidfTransformer accepts sparse input, so only
    # the final TF-IDF matrix needs to be dense (for GaussianNB).
    cv = CountVectorizer(ngram_range=(3, 3), decode_error="ignore", token_pattern=r'\b\w+\b')
    counts = cv.fit_transform(X)

    transformer = TfidfTransformer(smooth_idf=False)
    X = transformer.fit_transform(counts).toarray()

    x_train, x_test, y_train, y_test = train_test_split(X, y, test_size=0.4, random_state=0)

    gnb = GaussianNB()
    gnb.fit(x_train, y_train)

    # Make sure the output directory exists before dumping the model.
    if not os.path.isdir('save'):
        os.makedirs('save')
    joblib.dump(gnb, 'save/gnb.pkl')

    y_pred = gnb.predict(x_test)

    print('Accuracy :{}'.format(metrics.accuracy_score(y_test, y_pred)))
    print(metrics.confusion_matrix(y_test, y_pred))
|
||
|
||
def main():
    """
    Script entry point: run the training pipeline implemented by method1().

    :return: None
    """
    method1()


if __name__ == "__main__":
    main()
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,43 @@ | ||
# coding:utf-8 | ||
# author: WenR0 | ||
|
||
import os | ||
import re | ||
import subprocess | ||
|
||
|
||
def load_php_opcode(phpfilename, php_binary='php.exe'):
    """
    Extract the opcode names of a PHP file from the interpreter's VLD dump.

    Runs the PHP executable with the ini settings ``vld.active=1`` and
    ``vld.execute=0`` and collects every whitespace-delimited run of
    upper-case letters and underscores from the combined stdout/stderr.

    :param phpfilename: path of the PHP file to dump
    :param php_binary: PHP executable to invoke; defaults to 'php.exe'
    :return: the matched names joined by single spaces, or " " when the
             interpreter could not be started or exited with a non-zero status
    """
    command = [php_binary, '-dvld.active=1', '-dvld.execute=0', phpfilename]
    try:
        output = subprocess.check_output(command, stderr=subprocess.STDOUT)
    except (OSError, subprocess.CalledProcessError):
        # Missing interpreter or failed PHP run: keep the " " sentinel that
        # callers already receive. Anything else (including Ctrl-C) now
        # propagates instead of being silently swallowed.
        return " "

    # check_output returns bytes on Python 3; decode so the str pattern
    # below applies. On Python 2 the output is already a str and is left as is.
    if not isinstance(output, str):
        output = output.decode('utf-8', 'ignore')

    return " ".join(re.findall(r'\s(\b[A-Z_]+\b)\s', output))
|
||
|
||
|
||
def recursion_load_php_file_opcode(dir):
    """
    Recursively collect the opcode string of every ``.php`` file in a tree.

    Files whose opcode extraction fails or yields a blank string are
    reported and skipped, so they do not enter the result as empty samples.

    :param dir: root directory to walk (the name shadows the builtin but is
                kept so existing keyword callers keep working)
    :return: list with one opcode string per successfully processed file,
             in os.walk order
    """
    files_list = []
    for root, _dirs, files in os.walk(dir):
        for filename in files:
            if not filename.endswith('.php'):
                continue
            full_path = os.path.join(root, filename)
            try:
                file_content = load_php_opcode(full_path)
            except Exception:
                # Best effort: one bad file must not abort the whole scan,
                # but Ctrl-C / SystemExit are no longer swallowed.
                file_content = " "
            if not file_content.strip():
                # A blank result means no opcodes were extracted.
                print("[Gen failed] {}".format(full_path))
                continue
            print("[Gen success] {}".format(full_path))
            print('--' * 20)
            files_list.append(file_content)
    return files_list
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1 @@ | ||
白名单存放处 |