# Task 7 : 训练模型

In [10]:
import os,json,sys,logging
sys.path.append("./share")
sys.path.append("./common")
import pandas as pd
import json
from tqdm import tqdm
from IoTCommon import CIoTCommon
from IoTTotalFeature import CIoTTotalFeature
from IoTSample import CIoTSample
from SHSample import CSHSample
from SHDataProcess import CSHDataProcess
from SHFeatureSelect import CSHFeature
from Config import g_data_root
from SHDataEDA import CSHDataDistribution,CSHDataTest
from SHModelClassify import CSHModelClassify
from SHEvaluation import CSHROC
import warnings
import h2o
# Silence every Python warning for the rest of the session.
warnings.simplefilter("ignore")
# Sample directory path. The format string inserts no separator, so g_data_root
# presumably already ends with one - TODO confirm in Config.
g_sample_root = "%ssample"%g_data_root
# Turn off H2O progress bars and restrict the 'h2o' logger to errors so the
# training cells do not flood the notebook output.
h2o.no_progress()
logging.getLogger('h2o').setLevel(logging.ERROR)
# Connect to (or start) the H2O backend; nthreads=-1 asks H2O to use all available cores.
h2o.init(nthreads = -1, verbose=False)

# Shell escape: installs openpyxl with whichever `pip` is first on PATH.
# NOTE(review): no visible cell imports openpyxl directly - presumably a project
# helper needs it for Excel I/O; confirm, and consider a pinned `%pip install` instead.
! pip install openpyxl

# 提取样本

In [11]:
class CIoTModel:
    """Train/evaluate pipeline for a single (attack, protocol) pair.

    Intended call order: ``load_sample()`` -> ``preprocess()`` -> ``train()``
    -> ``test()``. ``load_sample`` and ``preprocess`` print the reason and
    return ``False`` when the data cannot be used for training.
    """

    def __init__(self,attack,protocol):
        """Remember the target attack/protocol and create empty working state.

        Args:
            attack: attack type passed to ``CIoTSample.get_attack_sample``.
            protocol: protocol passed to the sample getters.
        """
        self.m_attack = attack
        self.m_protocol = protocol
        self.m_ioTSample = CIoTSample()
        self.m_col_x = []
        self.m_col_y = 'Label'
        # Defined up front so the attribute exists even before load_sample() runs.
        self.m_sample = pd.DataFrame()
        self.m_train = pd.DataFrame()
        self.m_test = pd.DataFrame()
        self.m_model = CSHModelClassify()

    def get_raw_sample(self,attack,protocol,maxCount = 20000):
        """Build an unfiltered sample of attack rows plus per-sensor normal rows.

        The attack sample is capped at ``maxCount`` rows. Each sensor type then
        contributes at most ``attack_count // number_of_sensor_types`` rows, where
        ``attack_count`` is the number of attack rows whose ``Label`` is non-zero.

        Args:
            attack: attack type to fetch.
            protocol: protocol to fetch for both attack and sensor samples.
            maxCount: upper bound on the number of attack rows kept.

        Returns:
            A DataFrame with the sensor rows followed by the attack rows.
        """
        df_attack = self.m_ioTSample.get_attack_sample(attack,protocol)
        if maxCount < df_attack.shape[0]:
            df_attack = df_attack.sample(maxCount,random_state=42)
        attack_count = df_attack[df_attack['Label'] != 0].shape[0]

        # Query the sensor list once; an empty list must not cause a division by zero.
        sensor_types = list(self.m_ioTSample.get_sensor_type())
        normal_count = attack_count // len(sensor_types) if sensor_types else 0

        # Collect the per-sensor frames and concatenate once instead of growing
        # a DataFrame inside the loop.
        normal_frames = []
        for sensor in sensor_types:
            df_tmp = self.m_ioTSample.get_sensor_sample(sensor,protocol)
            if df_tmp.shape[0] <= 0:
                continue
            if df_tmp.shape[0] > normal_count:
                df_tmp = df_tmp.sample(n=normal_count,random_state=42)
            normal_frames.append(df_tmp)

        if normal_frames:
            df_normal = pd.concat(normal_frames,ignore_index=True)
        else:
            df_normal = pd.DataFrame()
        return pd.concat([df_normal,df_attack],ignore_index=True)

    def get_select_sample(self,df_sample):
        """Keep only the columns usable for modelling.

        Dropped: the bookkeeping columns ``id``, ``frame.time_utc`` and
        ``frame.time_delta``; every column whose dtype is ``object``; and every
        column other than the label that has fewer than two distinct values.

        Args:
            df_sample: raw sample as returned by ``get_raw_sample``.

        Returns:
            ``df_sample`` restricted to the retained columns, original order kept.
        """
        skipped_names = ('id','frame.time_utc','frame.time_delta')
        used_columns = []
        for column_name, dtype in df_sample.dtypes.items():
            if column_name in skipped_names:
                continue
            if dtype == 'object':
                continue
            # The label column is kept even when constant so load_sample() can
            # report the single-class case itself.
            if column_name != self.m_col_y and df_sample[column_name].nunique() < 2:
                continue
            used_columns.append(column_name)

        return df_sample[used_columns]

    def load_sample(self,maxCount = 20000,random_state = 42):
        """Fetch, filter and shuffle the sample into ``self.m_sample``.

        Args:
            maxCount: cap on attack rows, forwarded to ``get_raw_sample``.
            random_state: seed for the row shuffle. Seeded by default so that
                repeated runs yield the same row order, like the other sampling
                calls in this class; pass ``None`` for an unseeded shuffle.

        Returns:
            ``True`` when the sample has at least two columns (label included)
            and at least two label classes, otherwise ``False``.
        """
        df_sample = self.get_raw_sample(self.m_attack,self.m_protocol,maxCount)
        df_data = self.get_select_sample(df_sample)
        self.m_sample = df_data.sample(frac=1,random_state=random_state).reset_index(drop=True)
        self.m_col_x = self.m_sample.keys().tolist()

        if len(self.m_col_x) < 2:
            print("Must have 2 features (including Label)")
            return False

        if self.m_sample[self.m_col_y].nunique() < 2:
            print("Less Label category ( class < 2 )")
            return False
        return True

    def preprocess(self):
        """Format, binarise, scale and split ``self.m_sample``.

        Every label value different from 1 is rewritten to 0. The result is
        scaled through ``CSHDataProcess.get_scale`` and split into
        ``self.m_train`` / ``self.m_test`` by ``CSHSample.split_dataset``.

        Returns:
            ``True`` when the training set has at least 20 rows, the test set
            at least 10 rows, and both contain two label classes; otherwise
            ``False`` after printing the reason.
        """
        df_data = self.m_ioTSample.format(self.m_sample)
        df_data.loc[df_data['Label']!=1,'Label'] = 0

        # Note: this list still contains the label column.
        self.m_col_x = df_data.keys().tolist()

        #df_data = CSHSample.resample_smote(df_data,x_columns=self.m_col_x,y_column=self.m_col_y)
        self.m_sample, _scale_columns = CSHDataProcess.get_scale(df_data,y_column=self.m_col_y)

        self.m_train,self.m_test = CSHSample.split_dataset(self.m_sample)

        if self.m_train.shape[0] < 20:
            print("Less sample m_train (  < 20")
            return False
        if self.m_test.shape[0] < 10:
            print("Less sample m_test (  < 10")
            return False
        if self.m_train[self.m_col_y].nunique() < 2:
            print("Less Label m_train category ( class < 2")
            return False
        if self.m_test[self.m_col_y].nunique() < 2:
            print("Less Label m_test category ( class < 2")
            return False
        return True

    def train(self):
        """Fit the classifier on ``self.m_train`` and return ``m_model.importance()``.

        NOTE(review): ``m_col_x`` includes the label column and is passed as
        ``x_columns``; presumably CSHModelClassify drops the target from the
        predictors - confirm, otherwise the label leaks into the features.
        """
        self.m_model.train(self.m_train,x_columns=self.m_col_x,y_column=self.m_col_y,train_ratio = 0)
        return self.m_model.importance()

    def test(self):
        """Evaluate the trained classifier on ``self.m_test`` and return the result."""
        return self.m_model.evaluate(self.m_test,x_columns=self.m_col_x,y_column=self.m_col_y)

In [12]:
def do_train_test(attack,protocol):
    """Run load -> preprocess -> train -> test for one (attack, protocol) pair.

    Args:
        attack: attack type handed to ``CIoTModel``.
        protocol: protocol handed to ``CIoTModel``.

    Returns:
        A tuple ``(df_model, df_importance, df_verify)``:
        ``df_model`` has one row per entry of ``model.m_col_x`` together with
        the per-class row counts of the train and test sets;
        ``df_importance`` and ``df_verify`` are the frames returned by
        ``model.train()`` and ``model.test()``, tagged with ``attack`` and
        ``protocol`` columns. Three empty DataFrames are returned when loading
        or preprocessing reports failure.
    """
    model = CIoTModel(attack,protocol)
    if not model.load_sample():
        print("No Sample",attack,protocol)
        return pd.DataFrame(),pd.DataFrame(),pd.DataFrame()
    if not model.preprocess():
        print("No Sample for pregrcess",attack,protocol)
        return pd.DataFrame(),pd.DataFrame(),pd.DataFrame()

    df_importance = model.train()
    df_verify = model.test()

    # Count rows per class on the label column only. Masking the whole frame
    # (frame[frame == 1]) keeps every row, so it always yields the total row count.
    train_labels = model.m_train[model.m_col_y]
    test_labels = model.m_test[model.m_col_y]
    train_sample_true = int((train_labels == 1).sum())
    train_sample_false = int((train_labels == 0).sum())
    test_sample_true = int((test_labels == 1).sum())
    test_sample_false = int((test_labels == 0).sum())

    df_model = pd.DataFrame({"attack":attack,"protocol":protocol,"feature":model.m_col_x})
    df_model['train_sample_true'] = train_sample_true
    df_model['train_sample_false'] = train_sample_false
    df_model['test_sample_true'] = test_sample_true
    df_model['test_sample_false'] = test_sample_false

    df_importance['attack'] = attack
    df_importance['protocol'] = protocol
    df_verify['attack'] = attack
    df_verify['protocol'] = protocol
    return df_model,df_importance,df_verify

In [13]:
# Notebook-level sample accessor; the driver loop below uses it to enumerate
# attack types and the protocols available for each attack.
ioTSample = CIoTSample()

In [None]:
# Train and evaluate every (attack, protocol) combination, then write one
# result folder per attack containing model.csv, importance.csv and verify.csv.
result_path = "%s/result/" % g_data_root
for attack in ioTSample.get_attack_type():
    # To restrict a run to a few attacks, skip unwanted values of `attack` here.
    model_frames = []
    importance_frames = []
    verify_frames = []
    for protocol in ioTSample.get_attack_protocol(attack):
        df_model, df_importance, df_verify = do_train_test(attack, protocol)
        # An empty df_model signals that this pair had no usable sample.
        if df_model.shape[0] <= 0:
            continue
        model_frames.append(df_model)
        importance_frames.append(df_importance)
        verify_frames.append(df_verify)

    # Merge the per-protocol results; fall back to an empty frame when nothing was produced.
    df_all_model = pd.concat(model_frames, ignore_index=True) if model_frames else pd.DataFrame()
    df_all_importance = pd.concat(importance_frames, ignore_index=True) if importance_frames else pd.DataFrame()
    df_all_verify = pd.concat(verify_frames, ignore_index=True) if verify_frames else pd.DataFrame()

    df_all_model = df_all_model.reset_index(drop=True)
    df_all_importance = df_all_importance.reset_index(drop=True)
    df_all_verify = df_all_verify.reset_index(drop=True)

    model_path = "%s%s" % (result_path, attack)
    os.makedirs(model_path, exist_ok=True)
    df_all_model.to_csv("%s/model.csv" % model_path)
    df_all_importance.to_csv("%s/importance.csv" % model_path)
    df_all_verify.to_csv("%s/verify.csv" % model_path)

No Normal sample found trafficType='Flame_Sensor',protocol='eth:ethertype:arp'
begin train  dt
end train  dt
begin train  svm
end train  svm
begin train  rf
end train  rf
begin train  ann
end train  ann
begin train  knn
end train  knn
begin train  bayes
end train  bayes
begin train  glm
end train  glm
begin train  gbm
end train  gbm
begin train  xgboost
end train  xgboost
No Normal sample found trafficType='Distance',protocol='eth:ethertype:ip:tcp:data'
No Normal sample found trafficType='Flame_Sensor',protocol='eth:ethertype:ip:tcp:data'
No Normal sample found trafficType='Heart_Rate',protocol='eth:ethertype:ip:tcp:data'
No Normal sample found trafficType='IR_Receiver',protocol='eth:ethertype:ip:tcp:data'
No Normal sample found trafficType='Modbus',protocol='eth:ethertype:ip:tcp:data'
No Normal sample found trafficType='phValue',protocol='eth:ethertype:ip:tcp:data'
No Normal sample found trafficType='Soil_Moisture',protocol='eth:ethertype:ip:tcp:data'
No Normal sample found trafficTyp