init project

hi-WenR0 · Feb 2, 2018 · 27fe418 · 27fe418
commit 27fe418
Show file tree

Hide file tree

Showing 8 changed files with 283 additions and 0 deletions.
diff --git a/black-list/readme.txt b/black-list/readme.txt
@@ -0,0 +1 @@
+黑名单存放处
diff --git a/check.py b/check.py
@@ -0,0 +1,54 @@
+# coding:utf-8
+# author: WenR0
+
+from sklearn.externals import joblib
+from sklearn.naive_bayes import GaussianNB
+from sklearn.feature_extraction.text import CountVectorizer, TfidfTransformer
+from utils import load_php_opcode, recursion_load_php_file_opcode
+import sys
+
+
+
+if __name__ == '__main__':
+    php_file_name = sys.argv[1]
+    print 'Checking the file {}'.format(php_file_name)
+
+    # 之前的数据
+    white_file_list = []
+    black_file_list = []
+
+    with open('black_opcodes.txt', 'r') as f:
+        for line in f:
+            black_file_list.append(line.strip('\n'))
+
+    with open('white_opcodes.txt', 'r') as f:
+        for line in f:
+            white_file_list.append(line.strip('\n'))
+
+    all_token = []
+    all_token = white_file_list + black_file_list
+
+    # 准备数据
+    token = load_php_opcode(php_file_name)
+    all_token.append(token)
+    X = all_token
+
+    # CV 处理
+    cv = CountVectorizer(ngram_range=(3, 3), decode_error="ignore", token_pattern=r'\b\w+\b', min_df=1, max_df=1.0)
+    X = cv.fit_transform(X).toarray()
+
+
+    # tf-idf
+    transformer = TfidfTransformer(smooth_idf=False)
+    x_tfidf = transformer.fit_transform(X)
+    X = x_tfidf.toarray()
+
+    # end 准备数据
+
+    gnb = joblib.load('save/gnb.pkl')
+    y_p = gnb.predict(X[-1:])
+
+    if y_p == [0]:
+        print 'Not Webshell'
+    elif y_p == [1]:
+        print 'Webshell!'
diff --git a/readme.md b/readme.md
@@ -0,0 +1,81 @@
+# 机器学习检测Webshell
+
+# 简介
+
+提取PHP执行中的opcode，采用 opcode词袋 + tf-idf 进行关键信息提取
+
+采用朴素贝叶斯算法进行训练。
+
+进行 PHP WebShell 的检测。
+
+# 部署
+
+## step 1. Python环境部署
+
+```
+pip install -r requirements.txt
+```
+
+## step 2. PHP opcode 部署
+
+    开启opcode模式
+
+windows环境
+
+```
+1. 下载 vld.dll 插件并存放在php ext 目录下
+2. 配置 php.ini 激活vld.dll 文件
+```
+[VLD.dll下载地址](http://pecl.php.net/package/vld/0.14.0/windows)
+
+[PHP.ini 配置参考文章](http://blog.sina.com.cn/s/blog_4c8c58ce0102wi2h.html)
+
+
+# 第一次进行训练
+
+    将白名单的文件放入到 white-list 文件夹中
+    将黑名单文件放入到 black-list 文件夹中
+
+进行第一次训练
+```shell
+python train.py
+```
+Note:
+
+    避免每次生成opcode 的时间过长，每次训练完成后，会生成两个文件，black_opcodes.txt & white_opcodes.txt。
+
+    如果有新的白名单文件或者黑名单文件加入，先删除掉black_opcodes.txt 和 white_opcodes.txt 文件，然后再次进行训练。
+
+    训练完成后，会在save文件夹内，生成一个gnb.pkl文件，这个是训练好的缓存文件。
+
+
+# 检测
+
+检测单个文件
+```
+python check.py [filename]
+```
+
+
+# 重复训练
+
+1. 提供训练集
+在人工得到结果后，可以在white-list & black-list 文件夹中，添加已知的结果，再按照第一次训练的方法，进行再次训练。得到的结果便会更加准确。
+
+
+# 数据集（参考）
+
+白名单
+- https://github.com/WordPress/WordPress
+- https://github.com/typecho/typecho
+- https://github.com/phpmyadmin/phpmyadmin
+- https://github.com/laravel/laravel
+- https://github.com/top-think/framework
+- https://github.com/symfony/symfony
+- https://github.com/bcit-ci/CodeIgniter
+- https://github.com/yiisoft/yii2
+
+黑名单
+- https://github.com/tennc/webshell
+- https://github.com/ysrc/webshell-sample
+- https://github.com/xl7dev/WebShell
diff --git a/requirements.txt b/requirements.txt
@@ -0,0 +1,3 @@
+numpy
+scipy
+scikit-learn
diff --git a/save/readme.txt b/save/readme.txt
@@ -0,0 +1 @@
+持久化pkl存放处
diff --git a/train.py b/train.py
@@ -0,0 +1,99 @@
+# coding:utf-8
+# author: WenR0
+
+import os
+from utils import recursion_load_php_file_opcode, load_php_opcode
+
+from sklearn.feature_extraction.text import CountVectorizer, TfidfTransformer
+from sklearn.model_selection import train_test_split
+from sklearn.naive_bayes import GaussianNB
+from sklearn import metrics
+from sklearn.neural_network import MLPClassifier
+from sklearn.externals import joblib
+
+
+def prepare_data():
+    """
+    生成需要使用的数据，写入文件后，以供后面应用
+    :return:
+    """
+    # 生成数据并写入文件
+    if os.path.exists('white_opcodes.txt') is False:
+        print '[Info] White opcodes doesnt exists ... generating opcode ..'
+        white_opcodes_list = recursion_load_php_file_opcode('.\\white-list\\')
+        with open('white_opcodes.txt', 'w') as f:
+            for line in white_opcodes_list:
+                f.write(line + '\n')
+    else:
+        print '[Info] White opcodes exists'
+
+    if os.path.exists('black_opcodes.txt') is False:
+        black_opcodes_list = recursion_load_php_file_opcode('.\\black-list\\')
+        with open('black_opcodes.txt', 'w') as f:
+            for line in black_opcodes_list:
+                f.write(line + '\n')
+    else:
+        print '[Info] black opcodes exists'
+
+    # 使用数据
+
+    white_file_list = []
+    black_file_list = []
+
+    with open('black_opcodes.txt', 'r') as f:
+        for line in f:
+            black_file_list.append(line.strip('\n'))
+
+    with open('white_opcodes.txt', 'r') as f:
+        for line in f:
+            white_file_list.append(line.strip('\n'))
+
+    len_white_file_list = len(white_file_list)
+    len_black_file_list = len(black_file_list)
+
+    y_white = [0] * len_white_file_list
+    y_black = [1] * len_black_file_list
+
+    X = white_file_list + black_file_list
+    y = y_white + y_black
+
+    print '[Data status] ... ↓'
+    print '[Data status] X length : {}'.format(len_white_file_list + len_black_file_list)
+    print '[Data status] White list length : {}'.format(len_white_file_list)
+    print '[Data status] black list length : {}'.format(len_black_file_list)
+    # X raw data
+    # y label
+    return X, y
+
+
+def method1():
+    """
+    countVectorizer + TF-IDF 整理数据
+    朴素贝叶斯算法生成
+    :return: None
+    """
+    X, y = prepare_data()
+
+    cv = CountVectorizer(ngram_range=(3, 3), decode_error="ignore", token_pattern=r'\b\w+\b')
+    X = cv.fit_transform(X).toarray()
+
+    transformer = TfidfTransformer(smooth_idf=False)
+    X = transformer.fit_transform(X).toarray()
+
+    x_train, x_test, y_train, y_test = train_test_split(X, y, test_size=0.4, random_state=0)
+
+    gnb = GaussianNB()
+    gnb.fit(x_train, y_train)
+    joblib.dump(gnb, 'save/gnb.pkl')
+    y_pred = gnb.predict(x_test)
+
+    print 'Accuracy :{}'.format(metrics.accuracy_score(y_test, y_pred))
+    print metrics.confusion_matrix(y_test, y_pred)
+
+
+def main():
+    """Script entry point: run the training routine method1()."""
+    method1()
+
+
+# Run the training only when this file is executed as a script.
+if __name__ == '__main__':
+    main()
diff --git a/utils.py b/utils.py
@@ -0,0 +1,43 @@
+# coding:utf-8
+# author: WenR0
+
+import os
+import re
+import subprocess
+
+
+def load_php_opcode(phpfilename):
+    """
+    获取php opcode 信息
+    :param phpfilename:
+    :return:
+    """
+    try:
+        output = subprocess.check_output(['php.exe', '-dvld.active=1', '-dvld.execute=0', phpfilename], stderr=subprocess.STDOUT)
+        tokens = re.findall(r'\s(\b[A-Z_]+\b)\s', output)
+        t = " ".join(tokens)
+        return t
+    except:
+        return " "
+
+
+
+def recursion_load_php_file_opcode(dir):
+    """
+    递归获取 php opcde
+    :param dir: 目录文件
+    :return:
+    """
+    files_list = []
+    for root, dirs, files in os.walk(dir):
+        for filename in files:
+            if filename.endswith('.php'):
+                try:
+                    full_path = os.path.join(root, filename)
+                    file_content = load_php_opcode(full_path)
+                    print "[Gen success] {}".format(full_path)
+                    print '--' * 20
+                    files_list.append(file_content)
+                except:
+                    continue
+    return files_list
diff --git a/white-list/readme.txt b/white-list/readme.txt
@@ -0,0 +1 @@
+白名单存放处