# coding:utf-8
# author: WenR0
"""Classify a single PHP file as webshell / not webshell.

Usage::

    python check.py [filename]

The script rebuilds the same bag-of-opcodes (3-gram) + TF-IDF feature space
that train.py builds, by fitting the vectorizers on the cached training
corpus (white_opcodes.txt + black_opcodes.txt), and then projects the file
under test into that space before asking the saved GaussianNB model for a
prediction.
"""

from sklearn.naive_bayes import GaussianNB
from sklearn.feature_extraction.text import CountVectorizer, TfidfTransformer
from utils import load_php_opcode, recursion_load_php_file_opcode
import sys

try:
    from sklearn.externals import joblib
except ImportError:
    # sklearn.externals.joblib is gone in newer scikit-learn releases;
    # the standalone joblib package provides the same load/dump API.
    import joblib


def _read_opcode_lines(path):
    """Return the opcode sequences stored in *path*, one per line."""
    with open(path, 'r') as f:
        return [line.strip('\n') for line in f]


def main(argv):
    """Run the check for ``argv[1]`` and return a process exit code.

    :param argv: command line, ``argv[0]`` being the script name
    :return: 0 after a prediction was printed, 2 on a usage error
    """
    if len(argv) < 2:
        print('Usage: python {} [filename]'.format(argv[0]))
        return 2

    php_file_name = argv[1]
    print('Checking the file {}'.format(php_file_name))

    # Training corpus, in the same order train.py uses (white first, then black).
    corpus = (_read_opcode_lines('white_opcodes.txt')
              + _read_opcode_lines('black_opcodes.txt'))

    # Fit the vectorizers on the training corpus ONLY.  Fitting them on
    # corpus + sample (as was done before) lets an unseen 3-gram in the sample
    # add columns, so the feature width no longer matches the saved model, and
    # it also shifts the IDF weights away from the ones used for training.
    # NOTE(review): this assumes the two cache files are the ones the saved
    # model was trained from - retrain if they have changed.
    cv = CountVectorizer(ngram_range=(3, 3), decode_error="ignore",
                         token_pattern=r'\b\w+\b', min_df=1, max_df=1.0)
    transformer = TfidfTransformer(smooth_idf=False)
    transformer.fit(cv.fit_transform(corpus))

    # Project the file under test into the training feature space.
    token = load_php_opcode(php_file_name)
    sample = transformer.transform(cv.transform([token])).toarray()

    # joblib.load unpickles the file: only load a model you produced yourself.
    gnb = joblib.load('save/gnb.pkl')
    y_p = gnb.predict(sample)

    if y_p[0] == 0:
        print('Not Webshell')
    elif y_p[0] == 1:
        print('Webshell!')
    return 0


if __name__ == '__main__':
    sys.exit(main(sys.argv))
配置 php.ini 激活vld.dll 文件 +``` +[VLD.dll下载地址](http://pecl.php.net/package/vld/0.14.0/windows) + +[PHP.ini 配置参考文章](http://blog.sina.com.cn/s/blog_4c8c58ce0102wi2h.html +) + +# 第一次进行训练 + + 将白名单的文件放入到 white-list 文件夹中 + 将黑名单文件放入到 black-list 文件夹中 + +进行第一次训练 +```shell +python train.py +``` +Note: + + 避免每次生成opcode 的时间过长,每次训练完成后,会生成两个文件,black_opcodes.txt & white_opcodes.txt。 + + 如果有新的白名单文件或者黑名单文件加入,先删除掉black_opcodes.txt 和 white_opcodes.txt 文件,然后再次进行训练。 + + 训练完成后,会在save文件夹内,生成一个gnb.pkl文件,这个是训练好的缓存文件。 + + +# 检测 + +检测单个文件 +``` +python check.py [filename] +``` + + +# 重复训练 + +1. 提供训练集 +在人工得到结果后,可以在white-list & black-list 文件夹中,添加已知的结果,再按照第一次训练的方法,进行再次训练。得到的结果便会更加准确。 + + +# 数据集(参考) + +白名单 +- https://github.com/WordPress/WordPress +- https://github.com/typecho/typecho +- https://github.com/phpmyadmin/phpmyadmin +- https://github.com/laravel/laravel +- https://github.com/top-think/framework +- https://github.com/symfony/symfony +- https://github.com/bcit-ci/CodeIgniter +- https://github.com/yiisoft/yii2 + +黑名单 +- https://github.com/tennc/webshell +- https://github.com/ysrc/webshell-sample +- https://github.com/xl7dev/WebShell \ No newline at end of file diff --git a/requirements.txt b/requirements.txt new file mode 100644 index 0000000..cb5777a --- /dev/null +++ b/requirements.txt @@ -0,0 +1,3 @@ +numpy +scipy +scikit-learn diff --git a/save/readme.txt b/save/readme.txt new file mode 100644 index 0000000..48ec68b --- /dev/null +++ b/save/readme.txt @@ -0,0 +1 @@ +持久化pkl存放处 \ No newline at end of file diff --git a/train.py b/train.py new file mode 100644 index 0000000..95073a0 --- /dev/null +++ b/train.py @@ -0,0 +1,99 @@ +# coding:utf-8 +# author: WenR0 + +import os +from utils import recursion_load_php_file_opcode, load_php_opcode + +from sklearn.feature_extraction.text import CountVectorizer, TfidfTransformer +from sklearn.model_selection import train_test_split +from sklearn.naive_bayes import GaussianNB +from sklearn import metrics +from sklearn.neural_network import 
# --- utils.py: opcode extraction helpers --------------------------------------

import os
import re
import subprocess

# An upper-case mnemonic (letters and underscores) with whitespace on both
# sides, as it appears in the VLD dump printed by the PHP interpreter.
_OPCODE_RE = re.compile(r'\s(\b[A-Z_]+\b)\s')


def load_php_opcode(phpfilename, php_bin='php.exe'):
    """
    Return the opcode mnemonics of a PHP file as one space-separated string.

    The interpreter is started with ``-dvld.active=1 -dvld.execute=0`` and
    its combined stdout/stderr is scanned for upper-case mnemonics.

    :param phpfilename: path of the PHP file to dump
    :param php_bin: interpreter executable to run (default ``php.exe``)
    :return: space-joined mnemonics, or ``" "`` when the interpreter cannot
             be started or exits with a non-zero status
    """
    try:
        output = subprocess.check_output(
            [php_bin, '-dvld.active=1', '-dvld.execute=0', phpfilename],
            stderr=subprocess.STDOUT)
    except (OSError, ValueError, subprocess.CalledProcessError):
        # Best effort: a missing interpreter, an unusable file name or a
        # failing run yields the blank placeholder instead of aborting a scan.
        return " "

    # On Python 3 check_output returns bytes, which a text pattern cannot
    # search; on Python 2 the output already is ``str`` and is left untouched.
    if not isinstance(output, str):
        output = output.decode('utf-8', 'ignore')
    return " ".join(_OPCODE_RE.findall(output))


def recursion_load_php_file_opcode(dir, php_bin='php.exe'):
    """
    Recursively collect the opcode string of every ``.php`` file under a directory.

    :param dir: root directory to walk
    :param php_bin: interpreter executable handed to ``load_php_opcode``
    :return: list with one opcode string per ``.php`` file found; files whose
             extraction failed contribute the blank placeholder ``" "``
    """
    files_list = []
    for root, _dirs, files in os.walk(dir):
        for filename in files:
            if not filename.endswith('.php'):
                continue
            try:
                full_path = os.path.join(root, filename)
                file_content = load_php_opcode(full_path, php_bin)
                # Report honestly: a blank result means extraction failed.
                if file_content.strip():
                    print("[Gen success] {}".format(full_path))
                else:
                    print("[Gen failed] {}".format(full_path))
                print('--' * 20)
            except UnicodeError:
                # A name that cannot be joined or printed in the current
                # encoding is skipped rather than ending the whole walk.
                continue
            files_list.append(file_content)
    return files_list


# --- train.py: data preparation and training ---------------------------------

def _cached_opcodes(cache_path, source_dir, label):
    """Return the opcode strings for *source_dir*, cached in *cache_path*.

    When *cache_path* does not exist it is generated from the ``.php`` files
    under *source_dir* (one opcode string per line); the file is then read back.
    *label* only appears in the progress messages.
    """
    if not os.path.exists(cache_path):
        print('[Info] {} opcodes doesnt exists ... generating opcode ..'.format(label))
        opcodes_list = recursion_load_php_file_opcode(source_dir)
        with open(cache_path, 'w') as f:
            f.writelines(line + '\n' for line in opcodes_list)
    else:
        print('[Info] {} opcodes exists'.format(label))

    with open(cache_path, 'r') as f:
        return [line.strip('\n') for line in f]


def prepare_data():
    """
    Build the raw training data, generating the opcode cache files if needed.

    :return: ``(X, y)`` - X is the list of opcode strings (white-list entries
             first, then black-list entries), y the matching labels
             (0 = white list, 1 = black list)
    """
    white_file_list = _cached_opcodes(
        'white_opcodes.txt', os.path.join('.', 'white-list'), 'White')
    black_file_list = _cached_opcodes(
        'black_opcodes.txt', os.path.join('.', 'black-list'), 'black')

    len_white_file_list = len(white_file_list)
    len_black_file_list = len(black_file_list)

    X = white_file_list + black_file_list
    y = [0] * len_white_file_list + [1] * len_black_file_list

    print('[Data status] ... ↓')
    print('[Data status] X length : {}'.format(len_white_file_list + len_black_file_list))
    print('[Data status] White list length : {}'.format(len_white_file_list))
    print('[Data status] black list length : {}'.format(len_black_file_list))
    # X raw data
    # y label
    return X, y


def method1():
    """
    Train a Gaussian naive Bayes model on CountVectorizer (3-gram) + TF-IDF features.

    The data is split 60/40 with a fixed seed; the fitted model is written to
    ``save/gnb.pkl`` and the accuracy and confusion matrix on the held-out
    40% are printed.

    :return: None
    """
    try:
        from sklearn.externals import joblib
    except ImportError:
        # sklearn.externals.joblib is gone in newer scikit-learn releases.
        import joblib

    X, y = prepare_data()

    cv = CountVectorizer(ngram_range=(3, 3), decode_error="ignore", token_pattern=r'\b\w+\b')
    transformer = TfidfTransformer(smooth_idf=False)
    # The count matrix stays sparse until after TF-IDF; only the final matrix
    # is made dense, which is the form handed to GaussianNB below.
    X = transformer.fit_transform(cv.fit_transform(X)).toarray()

    x_train, x_test, y_train, y_test = train_test_split(X, y, test_size=0.4, random_state=0)

    gnb = GaussianNB()
    gnb.fit(x_train, y_train)

    # Make sure the output directory is there before persisting the model.
    if not os.path.isdir('save'):
        os.makedirs('save')
    joblib.dump(gnb, 'save/gnb.pkl')
    y_pred = gnb.predict(x_test)

    print('Accuracy :{}'.format(metrics.accuracy_score(y_test, y_pred)))
    print(metrics.confusion_matrix(y_test, y_pred))


def main():
    """Entry point: run the single training method."""
    method1()


if __name__ == '__main__':
    main()