# Task 7 : 训练模型

In [1]:
import os,json,sys,logging
sys.path.append("./share")
sys.path.append("./common")
import pandas as pd
import json
import numpy as np
from tqdm.notebook import tqdm
from IoTCommon import CIoTCommon
from SHDataProcess import CSHDataProcess
from IoTSample import CIoTSample
from Config import g_data_root
from SHModelClassify import CSHModelClassify
from IoTToken import CIoTCommonSample
from IoTNoise import CIoTNoise
from SHEvaluation import CSHROC,CSHEvaluate
from IoTNoise import CIoTNoise
from IoTModel import CIoTModel
import warnings
import h2o
# Suppress every Python warning for the remainder of the session.
warnings.simplefilter("ignore")
# Derived data directories. The name is appended to g_data_root without a
# separator, so g_data_root presumably already ends with "/" — TODO confirm.
g_sample_root = "%ssample"%g_data_root
g_fixed_root = "%sfixed"%g_data_root

# Seed NumPy's global random generator so NumPy-based randomness is repeatable.
np.random.seed(42)
# Start (or attach to) the local H2O backend quietly; per the H2O docs
# nthreads=-1 requests all available cores.
h2o.init(nthreads = -1, verbose=False)

In [2]:
# Project sample helper from IoTSample.
# NOTE(review): not referenced by the function cells visible in this notebook — confirm it is still needed.
ioTSample = CIoTSample()

# 使用原始标签训练模型

## 训练模型，并对单个分类器进行评估

In [3]:
def _count_label_rows(data, label_col, value):
    """Count the entries of `data` whose label equals `value`.

    If `data` is a DataFrame containing `label_col`, only that column is
    compared. Masking a whole DataFrame (`data[data == value]`) keeps every row
    and merely blanks the non-matching cells, so its `.shape[0]` is always the
    total row count. Any other input (for example a Series of labels) is counted
    with the original mask-and-shape expression.
    """
    if isinstance(data, pd.DataFrame):
        try:
            has_label = label_col in data.columns
        except TypeError:
            # Unhashable label spec (e.g. a list): fall through to the generic path.
            has_label = False
        if has_label:
            return int((data[label_col] == value).sum())
    return data[data == value].shape[0]


def do_signle_train_test(attack,protocol):
    """Train the single classifier for one (attack, protocol) pair and evaluate it.

    Args:
        attack: attack name passed to CIoTModel.
        protocol: protocol name passed to CIoTModel.

    Returns:
        A 5-tuple `(df_model, df_importance, df_verify1, df_verify2, model)`:
        - df_model: one row per feature in `model.m_col_x`, with the attack,
          protocol and the train/test counts of label 1 and label 0.
        - df_importance: result of `model.train()`, tagged with attack/protocol.
        - df_verify1 / df_verify2: `model.test()` on the test set and on the
          noised set, tagged with attack/protocol.
        - model: `model.m_model`.
        When no sample can be loaded, four empty DataFrames and None are returned.
    """
    model = CIoTModel(attack,protocol)
    if not model.load_sample(maxCount = 20000 , noise_ratio = 0.1 , n_class = 0,num_samples = 5000):
        print("No Sample",attack,protocol)
        return pd.DataFrame(),pd.DataFrame(),pd.DataFrame(),pd.DataFrame(),None

    df_importance = model.train()
    sample_uuid,df_test,df_noised = model.get_test()

    df_verify1 = model.test(df_test)
    df_verify2 = model.test(df_noised)

    # NOTE(review): m_train / m_test are defined outside this notebook. When they
    # are DataFrames holding the label column reported by get_col_x_y(), count on
    # that column; otherwise the original expression is used unchanged.
    _,col_y = model.get_col_x_y()
    train_sample_true = _count_label_rows(model.m_train,col_y,1)
    train_sample_false = _count_label_rows(model.m_train,col_y,0)
    test_sample_true = _count_label_rows(model.m_test,col_y,1)
    test_sample_false = _count_label_rows(model.m_test,col_y,0)

    df_model = pd.DataFrame({"attack":attack,"protocol":protocol,"feature":model.m_col_x})
    df_model['train_sample_true'] = train_sample_true
    df_model['train_sample_false'] = train_sample_false
    df_model['test_sample_true'] = test_sample_true
    df_model['test_sample_false'] = test_sample_false

    # Tag every output frame so results of several pairs can be stacked later.
    for df_tagged in (df_importance,df_verify1,df_verify2):
        df_tagged['attack'] = attack
        df_tagged['protocol'] = protocol

    return df_model,df_importance,df_verify1,df_verify2,model.m_model

## 组合分类器评估

In [4]:
def get_combined_predict(attack):
    """Predict with every saved per-protocol classifier and majority-vote the results.

    The protocol list is read from `<g_data_root>/result/<attack>/single_model.csv`.
    For each protocol with loadable samples, the models stored under
    `<protocol>/models-single/` predict on the test set and on the noised set.
    Predictions are then grouped by (type, uuid, model) and the per-protocol
    votes are combined: the combined prediction is 1 when the votes for 1 are
    at least as many as the votes for 0 (ties go to 1).

    Args:
        attack: attack name; selects the result sub-directory and the samples.

    Returns:
        `(df_result, df_predict)` where df_predict holds every individual
        prediction (tagged with uuid, type and protocol) and df_result holds one
        row per (type, uuid, model) with the ground truth, the vote counts
        ('one', 'zero'), one column per protocol and the combined 'predict'.
        Two empty DataFrames are returned when no protocol produced predictions.
    """
    result_path = "%s/result/"%g_data_root
    model_path = "%s%s"%(result_path,attack)
    df_index = pd.read_csv("%s/single_model.csv"%(model_path),index_col=0)

    predict_frames = []
    for protocol in df_index['protocol'].unique().tolist():
        sample = CIoTModel(attack,protocol)
        # NOTE(review): same arguments as the single-model training cell; presumably
        # needed so both steps see the same samples — confirm.
        if not sample.load_sample(maxCount = 20000 , noise_ratio = 0.1 , n_class = 0,num_samples = 5000):
            print("No Sample",attack,protocol)
            continue

        # ":" in protocol names is replaced by "-" in the model directory name.
        model_file_path = "%s/%s/models-single/"%(model_path,protocol.replace(":","-"))
        model = CSHModelClassify()
        model.load(model_file_path)

        col_x,col_y = sample.get_col_x_y()
        sample_uuid,df_test,df_noised = sample.get_test()

        df_test_pred = model.predict(df_test,x_columns = col_x,y_column=col_y)
        df_noised_pred = model.predict(df_noised,x_columns = col_x,y_column=col_y)

        for model_name in list(model.m_models.keys()):
            for sample_type,df_pred in (("test",df_test_pred),("noised",df_noised_pred)):
                df_tmp = df_pred[df_pred['model']==model_name].reset_index(drop=True)
                df_tmp['uuid'] = sample_uuid
                df_tmp['type'] = sample_type
                df_tmp['protocol'] = protocol
                predict_frames.append(df_tmp)

    if not predict_frames:
        # Nothing was predicted; grouping an empty frame by missing columns would raise.
        return pd.DataFrame(),pd.DataFrame()

    # Concatenate once instead of growing a DataFrame inside the loops.
    df_predict = pd.concat(predict_frames,ignore_index=True)

    all_data = []
    for item,df_group in tqdm(df_predict.groupby(["type","uuid","model"])):
        sample_type,sample_id,model_name = item[0],item[1],item[2]
        if df_group['true'].nunique() > 1:
            # Protocols disagree on the ground truth of this sample: report and skip it.
            print("Grand True count > 1",sample_type,sample_id,model_name)
            display(df_group)
            continue
        row = {}
        row['type'] = sample_type
        row['uuid'] = sample_id
        row['model'] = model_name
        row['true'] = df_group.iloc[0]['true']
        row['one'] = 0
        row['zero'] = 0
        for protocol,df_protocol in df_group.groupby("protocol"):
            # Only the first prediction of each protocol takes part in the vote.
            predict = df_protocol.iloc[0]['predict']
            row[protocol] = predict
            if predict == 0:
                row['zero'] += 1
            if predict == 1:
                row['one'] += 1

        row['predict'] = 1 if row['one'] >= row['zero'] else 0
        all_data.append(row)

    df_result = pd.DataFrame(all_data)

    return df_result,df_predict

In [5]:
def do_combined_test(attack):
    """Evaluate the majority-vote classifier of one attack and save the results.

    Calls `get_combined_predict(attack)`, evaluates the combined prediction with
    `CSHEvaluate.evaluate` separately for every (model, type) pair, and writes
    `combined_predict.csv`, `combined_result.csv` and `combined_summary.csv`
    into `<g_data_root>/result/<attack>`.

    Args:
        attack: attack name; selects the samples and the output directory.

    Returns:
        None. When there is no combined result, a notice is printed and no
        file is written.
    """
    df_result,df_predict = get_combined_predict(attack)
    if df_result.empty:
        # No votable sample: grouping by the missing 'model'/'type' columns would raise.
        print("No Combined Result",attack)
        return

    summary_frames = []
    for item,df_tmp in df_result.groupby(["model","type"]):
        df_t = CSHEvaluate.evaluate(df_tmp['true'],df_tmp['predict'])
        df_t['model'] = item[0]
        df_t['type'] = item[1]
        summary_frames.append(df_t)
    # Concatenate once instead of growing the summary inside the loop.
    df_summary = pd.concat(summary_frames,ignore_index=True) if summary_frames else pd.DataFrame()

    result_path = "%s/result/"%g_data_root
    model_path = "%s%s"%(result_path,attack)
    os.makedirs(model_path,exist_ok = True)
    df_predict.to_csv("%s/combined_predict.csv"%model_path)
    df_result.to_csv("%s/combined_result.csv"%model_path)
    df_summary.to_csv("%s/combined_summary.csv"%model_path)