# Task 5 : 生成训练用的数据集

1. 过滤掉异常数据
2. 生成全局的字典（类别代码）
3. 对数据集进行数据类型转换

In [13]:
import os,json,sys,logging
from numpy import nan
sys.path.append("./share")
sys.path.append("./common")
import pandas as pd
import json
import numpy as np
from tqdm import tqdm
from IoTCommon import CIoTCommon
from IoTTotalFeature import CIoTTotalFeature
from Config import g_data_root
import warnings
warnings.simplefilter("ignore")

### 读取有效特征，过滤常量特征，将特征划分到不同的协议

In [14]:
# Load the feature table. The sheet's first column is read as the index and
# then turned back into an ordinary column by reset_index().
df_features = (
    pd.read_excel("%s/features/features.xlsx"%g_data_root, index_col=0)
    .reset_index()
    # Drop the rows explicitly flagged "N" in the 'effective' column.
    .loc[lambda frame: frame['effective'] != "N"]
    # Keep rows whose 'count' exceeds 1 (presumably the number of distinct
    # observed values, i.e. this removes constant features -- TODO confirm).
    .loc[lambda frame: frame['count'] > 1]
    .reset_index(drop=True)
)
totalFeature = CIoTTotalFeature()
# Expand the table through the project helper; the result replaces df_features.
df_features = CIoTTotalFeature.get_expand_features(df_features, totalFeature)

In [15]:
def find_type_values(protocol,feature):
    """Look up the declared type and value list of one feature.

    Searches the module-level ``df_features`` table for rows whose 'protocol'
    and 'feature' columns equal the given arguments and uses the first match.

    Args:
        protocol: Value to match against the 'protocol' column.
        feature: Value to match against the 'feature' column.

    Returns:
        A ``(type, values)`` tuple taken from the 'type' and 'values' columns
        of the first matching row, or ``(None, None)`` when nothing matches.
    """
    matches = df_features[
        (df_features['protocol'] == protocol) & (df_features['feature'] == feature)
    ]
    if matches.empty:
        return None, None
    first_row = matches.iloc[0]
    return first_row['type'], first_row['values']

### 生成类别代码

In [16]:
# Build the global category dictionary:
#   g_category_map[protocol][feature][category value as str] -> integer code
# Every feature gets an entry; it stays empty for non-category features.
g_category_map = {}
for protocol in totalFeature.get_protocols():
    protocol_map = g_category_map.setdefault(protocol, {})
    for feature in totalFeature.get_features(protocol):
        feature_map = protocol_map.setdefault(feature, {})

        # Named feature_type (not `type`) so the builtin is not clobbered for
        # the rest of the session by this module-level assignment.
        feature_type, raw_values = find_type_values(protocol, feature)
        if feature_type != 'category':
            continue

        # NOTE(review): eval() executes whatever text is stored in the feature
        # spreadsheet, so that file must be trusted. It is kept because the
        # stored lists may contain the bare name `nan`, which
        # ast.literal_eval would reject.
        # The JSON round trip normalises the result to plain JSON types.
        category_values = json.loads(json.dumps(eval(raw_values), allow_nan=True))

        for code, value in enumerate(category_values):
            # Integral values are keyed without a decimal part ("1", not "1.0").
            try:
                key = str(int(value))
            except (TypeError, ValueError, OverflowError):
                # Non-numeric text, None, NaN or infinity: use the plain string.
                key = str(value)
            if key == 'nan':
                key = "None"
            feature_map[key] = code

        # Missing values always need a code; add one after the listed values.
        feature_map.setdefault("None", len(category_values))

# os.path.join gives the same location whether or not g_data_root ends with a
# path separator.
with open(os.path.join(g_data_root, "features", "categorymap.json"), 'w') as fp:
    fp.write(json.dumps(g_category_map, indent=4))

In [17]:
def format_feature_data(protocol,feature,type,ds_data):
    """Convert one raw feature column to the dtype declared for it.

    Args:
        protocol: Protocol key used to find the feature's codes in the
            module-level ``g_category_map`` (only read for "category").
        feature: Feature name, used together with ``protocol`` for that lookup.
        type: Declared type name: "string", "int", "float", "bool",
            "category", "binary" or "datetime". The parameter shadows the
            builtin; the name is kept so existing callers keep working.
        ds_data: pandas Series holding the raw values.

    Returns:
        The converted Series, or ``None`` (after printing a message) when
        ``type`` is not one of the names listed above.

    Raises:
        KeyError: for "category" when a value has no code in
            ``g_category_map[protocol][feature]``; the offending value is
            printed first.
    """
    if type == "string":
        ds = ds_data.fillna("None")
    elif type == "int":
        # Missing values, blanks and the literal 'None' all count as zero.
        ds = ds_data.fillna('0').replace([' ', '', 'None'], '0').astype(int)
    elif type == "float":
        ds = ds_data.fillna('0.0').replace(['', ' ', 'None'], '0.0').astype(float)
    elif type == "bool":
        # Values are compared after str().strip().lower(), so the tokens must
        # be lower-case strings. "false" and "0.0" are included so that real
        # False / 0.0 values are not turned into True, and blanks are treated
        # as missing exactly like in the numeric branches.
        false_tokens = {"", "none", "nan", "0", "0.0", "false"}
        ds = ds_data.fillna("0")
        ds = ds.apply(lambda x: str(x).strip().lower() not in false_tokens)
        ds = ds.astype(bool)
    elif type == "category":
        codes = g_category_map[protocol][feature]

        def category2int(x):
            # Integral values are keyed without a decimal part ("1", not "1.0").
            try:
                key = str(int(x))
            except (TypeError, ValueError, OverflowError):
                key = str(x)
            try:
                return codes[key]
            except KeyError:
                print("Not in category map",protocol,feature,key)
                raise

        ds = ds_data.fillna("None").apply(category2int).astype(int)
    elif type == "binary":
        ds = ds_data.fillna("")
    elif type == "datetime":
        ds = pd.to_datetime(ds_data)
    else:
        ds = None
        print("unkown data type",type)
    return ds

### 生成新数据集
1. 根据协议划分特征子集
2. 特征值类型转换

In [None]:
# Split every raw capture file into one typed sample file per protocol.
temp_folder = "%s/dataset"%g_data_root
for fi in tqdm(CIoTCommon.get_json_files(temp_folder)):
    # Normalise Windows separators so the path can be split on "/".
    fi = fi.replace("\\","/")
    tmp = fi.split("/")
    # The last three path components are used as
    # <attackType>/<trafficType>/<fileName>.<extension>.
    attackType,trafficType,fileName = tmp[-3],tmp[-2],tmp[-1].split(".")[0]
    new_path = "%ssample/%s/%s/%s/"%(g_data_root,attackType,trafficType,fileName)
    os.makedirs(new_path,exist_ok = True)

    df_sample = pd.read_json(fi)
    df_sample = df_sample.rename(columns={"frame.protocols":"protocol"})
    # Keep only the rows whose protocol is known to the feature table.
    df_sample = df_sample[df_sample['protocol'].isin(totalFeature.get_protocols())].reset_index(drop=True)
    df_sample['id'] = df_sample.index
    public_columns = ['id','Label',"protocol","frame.time_utc","frame.time_delta"]
    available_columns = set(df_sample.columns)

    # Build the full protocol space (including parent and child protocols).
    total_protocols = set()
    for protocol in df_sample['protocol'].unique().tolist():
        total_protocols.update(CIoTTotalFeature.get_expand_protocols(protocol))

    # Generate the samples for each protocol.
    for protocol in totalFeature.get_protocols():
        if protocol not in total_protocols:
            continue

        new_features = totalFeature.get_features(protocol)
        # Keep the feature order given by get_features() (de-duplicated) so the
        # output column order is the same on every run; a set intersection
        # would order the columns by randomised string hashes.
        proto_columns = [col for col in dict.fromkeys(new_features) if col in available_columns]
        if not proto_columns:
            continue

        total_columns = public_columns + [col for col in proto_columns if col not in public_columns]
        # Nothing protocol-specific beyond the public columns: skip.
        if len(total_columns) == len(public_columns):
            continue

        # Drop the rows that carry none of this protocol's features.
        df_tmp = df_sample[total_columns].dropna(axis=0,subset=proto_columns,how='all')
        if df_tmp.shape[0] <= 0:
            continue

        # Iterate over the column list rather than over the frame that is
        # being assigned to.
        for feature in total_columns:
            if feature in ['id','protocol','Label']:
                continue
            # The remaining public columns are looked up under "eth"; the same
            # protocol is passed to the conversion so that a category column
            # would use the map matching the type that was found.
            lookup_protocol = "eth" if feature in public_columns else protocol
            # Named feature_type (not `type`) so the builtin is not clobbered
            # at module level.
            feature_type,_ = find_type_values(lookup_protocol,feature)
            if not feature_type:
                print("No type found",protocol,feature,feature_type)
            else:
                df_tmp[feature] = format_feature_data(lookup_protocol,feature,feature_type,df_tmp[feature])

        # One output file per protocol; ":" is replaced by "-" in the file name.
        new_json_file = "%s%s.json"%(new_path,protocol.replace(":","-"))
        df_tmp.to_json(new_json_file,orient="records")

  7%|███████▋                                                                                                           | 138/2076 [15:02<1:29:51,  2.78s/it]