# Task 4 : 特征选择（可跳过这一步，直接下载转换好的特征）
下载地址：https://huggingface.co/datasets/JimXie/IIoTset/resolve/main/features.tar.gz

1. 扫描数据集，发现所有特征和特征值
2. 根据特征值，生成推荐的数据类型，将特征划分至不同的协议
3. 人工选择特征，分配特征的数据类型

In [1]:
import os,json,sys,logging
sys.path.append("./share")
sys.path.append("./common")
import pandas as pd
import numpy as np
from IoTCommon import CIoTCommon
from IoTTotalFeature import CIoTTotalFeature
from Config import g_data_root
from tqdm.notebook import tqdm
import warnings
warnings.simplefilter("ignore")
tqdm.pandas()

### 提取所有特征

In [None]:
# Gather the feature records returned for every JSON file under
# <g_data_root>/json/ into one flat list, then save them as a single CSV table.
all_data = []
json_files = CIoTCommon.get_json_files("%s/json/"%g_data_root)
for json_path in tqdm(json_files):
    file_features = CIoTCommon.get_total_features(json_path)
    if not file_features:
        # Falsy result for this file: nothing to add.
        continue
    all_data += file_features

df_total_raw_features = pd.DataFrame(all_data)
df_total_raw_features.to_csv("%s/features/total_raw_features.csv"%g_data_root)

### 只保留有效的ipv4特征

In [3]:
# Reload the raw feature table saved by the extraction step.
df_total_raw_features = pd.read_csv("%s/features/total_raw_features.csv"%g_data_root,index_col=0)

# Protocol stacks that are dropped outright.
excluded_protocols = [
    "eth:ethertype:ip:tcp:http:data-text-lines",
    "eth:ethertype:ip:tcp:tls:x509sat:x509sat:x509sat:x509af:x509sat:x509ce:x509ce:x509ce:x509ce:x509sat:x509sat:x509sat:x509sat:x509ce:x509ce:x509sat:x509sat:x509ce:x509ce:tls",
]

df_tmp = df_total_raw_features.copy()
df_tmp = df_tmp[~df_tmp["frame.protocols"].isin(excluded_protocols)]
# na=False makes a missing protocol string evaluate to False instead of NaN,
# so the mask stays purely boolean and can be negated safely.
df_tmp = df_tmp[~df_tmp['frame.protocols'].str.contains('ipv6', na=False)]
# Drop the pseudo-feature whose name is the literal text \r\n.
df_tmp = df_tmp[df_tmp['feature']!="\\r\\n"]
df_tmp = df_tmp.sort_values(by="feature").reset_index(drop=True)

# One row per distinct (protocol, feature) pair, ordered by protocol and then
# feature. Built without a loop so df_tmp is not overwritten by a loop
# variable, and so the 'protocol' / 'feature' columns exist even when no row
# survives the filters. Rows with a missing key are skipped.
df_total_base_features = (
    df_tmp[['frame.protocols', 'feature']]
    .dropna()
    .drop_duplicates()
    .sort_values(by=['frame.protocols', 'feature'])
    .rename(columns={'frame.protocols': 'protocol'})
    .reset_index(drop=True)
)

### 提取所有特征值

In [None]:
def is_total_features(protocol,feature):
    """Tell whether the (protocol, feature) pair is listed in df_total_base_features.

    Reads the module-level ``df_total_base_features`` frame and compares its
    'protocol' and 'feature' columns against the given values.

    Returns:
        True when at least one row matches both values, otherwise False.
    """
    same_protocol = df_total_base_features['protocol'].eq(protocol)
    same_feature = df_total_base_features['feature'].eq(feature)
    matching_rows = df_total_base_features[same_protocol & same_feature]
    return len(matching_rows) > 0
    
# g_raw_values[protocol][feature] = {'count': <number of kept values>,
#                                    'value': [distinct values in first-seen order]}
g_raw_values = {}
for json_path in tqdm(CIoTCommon.get_json_files("%s/json/"%g_data_root)):
    df_file = pd.DataFrame(CIoTCommon.get_feature_data(json_path))
    for protocol,df_protocol in df_file.groupby('frame.protocols'):
        protocol_entry = g_raw_values.setdefault(protocol, {})

        for feature in df_protocol.columns.tolist():
            # Only track columns that survived the base-feature filtering.
            if not is_total_features(protocol,feature):
                continue

            feature_entry = protocol_entry.setdefault(feature, {'count': 0, 'value': []})
            kept_values = feature_entry['value']

            # Remember each new distinct value, but keep at most 256 per feature.
            for value in df_protocol[feature].unique().tolist():
                if value not in kept_values and len(kept_values) < 256:
                    kept_values.append(value)

            feature_entry['count'] = len(kept_values)

# Flatten the nested dictionary into one record per (protocol, feature).
all_feature_values = [
    {
        'protocol': protocol,
        'feature': feature,
        'count': feature_entry['count'],
        'value': feature_entry['value'],
    }
    for protocol, protocol_entry in g_raw_values.items()
    for feature, feature_entry in protocol_entry.items()
]

df_raw_values = pd.DataFrame(all_feature_values)
df_raw_values.to_csv("%s/features/raw_value.csv"%g_data_root)

In [3]:
# Reload the per-feature value table from disk; the first CSV column is the
# saved row index.
df_raw_values = pd.read_csv(
    "%s/features/raw_value.csv"%g_data_root,
    index_col=0,
)

### 根据特征值生成推荐特征类型

In [4]:
def _recommend_type(count):
    """Map a distinct-value count to the suggested feature type label.

    1 -> "const", 2 -> "bool", any other count below 10 -> "category",
    everything else -> "" (left for manual assignment).
    """
    if count == 1:
        return "const"
    if count == 2:
        return "bool"
    if count < 10:
        return "category"
    return ""


# Build one candidate row per (protocol, feature) and export the sheet that is
# edited by hand afterwards. Every feature starts out as effective ('Y') and
# 'type' is pre-filled with the recommendation.
df_tmp_values = df_raw_values
all_data = []
for (protocol, feature), df_group in tqdm(df_tmp_values.groupby(["protocol","feature"])):
    first_row = df_group.iloc[0]
    count = first_row['count']
    recommended = _recommend_type(count)
    all_data.append({
        'protocol': protocol,
        'feature': feature,
        'effective': 'Y',
        'recommand_type': recommended,
        'type': recommended,
        'count': count,
        'values': first_row['value'],
    })

df_all_data = pd.DataFrame(all_data)
df_all_data.to_excel("%s/features/candidate_base_features.xlsx"%g_data_root,index=False)

  0%|          | 0/3197 [00:00<?, ?it/s]

### 人工选择特征，将特征划分到特定的协议中

In [6]:
# Attach the manually reviewed attributes (effective, type, ...) to every
# feature that CIoTTotalFeature lists for a protocol, then save the result.
pg = CIoTTotalFeature()
df_manual_base_features = pd.read_excel("%s/features/manual_base_features.xlsx"%g_data_root,index_col=0).reset_index()

# Index the first spreadsheet row of each feature once, instead of rebuilding
# the feature list and filtering the whole frame for every (protocol, feature)
# pair inside the loops.
manual_rows = {}
for record in df_manual_base_features.to_dict("records"):
    manual_rows.setdefault(record["feature"], record)

all_data = []
for protocol in pg.get_protocols():
    for feature in pg.get_features(protocol):
        manual = manual_rows.get(feature)
        if manual is None:
            # Feature has no manually reviewed row: leave it out.
            continue
        all_data.append({
            "protocol": protocol,
            # 'frame.protocols' is exported under the name 'protocol'.
            "feature": 'protocol' if feature == 'frame.protocols' else feature,
            'effective': manual['effective'],
            'type': manual['type'],
            'recommand_type': manual['recommand_type'],
            'count': manual['count'],
            'values': manual['values'],
        })

df_base_features = pd.DataFrame(all_data)
df_base_features.to_excel("%s/features/manual_features.xlsx"%g_data_root,index=False)

### 生成有效的特征列表
1. 添加人工分配的字段（effective,type)
2. 过滤掉常量特征

### 有效特征

In [9]:
# Load the reviewed feature sheet.
# NOTE(review): this reads features.xlsx while the previous cell writes
# manual_features.xlsx; confirm how features.xlsx is produced.
df_base_features = pd.read_excel("%s/features/features.xlsx"%g_data_root,index_col=0).reset_index()

# Keep features that are not marked "N" and that have more than one distinct
# value (constant features are filtered out).
not_disabled = df_base_features['effective'].ne("N")
not_constant = df_base_features['count'].gt(1)
df_features = df_base_features[not_disabled & not_constant].reset_index(drop=True)

totalFeature = CIoTTotalFeature()
df_features = CIoTTotalFeature.get_expand_features(df_features,totalFeature)

In [10]:
# Bare expression: as the last statement of a notebook cell it renders the
# resulting df_features table.
df_features

Unnamed: 0,index,protocol,feature,effective,type,recommand_type,count,values
0,14,eth,eth.src,,string,category,6,"['dc:a6:32:fb:69:b5', 'd8:f2:ca:8e:17:69', '48..."
1,16,eth,eth.dst,,string,string,7,"['d8:f2:ca:8e:17:69', 'dc:a6:32:fb:69:b5', '48..."
2,18,eth,eth.dst_resolved,,category,category,7,"['Intel_8e:17:69', 'RaspberryPiT_fb:69:b5', 'F..."
3,23,eth,eth.src_resolved,,category,category,6,"['RaspberryPiT_fb:69:b5', 'Intel_8e:17:69', 'F..."
4,24,eth,frame.time_delta,,int,int,4352,"['0.036697000', '0.003410000', '0.163624000', ..."
...,...,...,...,...,...,...,...,...
329,616,eth:ethertype:ip:udp:ssdp,http.request.version,,category,bool,14,"[nan, 'HTTP/1.0', 'HTTP/1.1', 'By Dr HTTP/1.1'..."
330,617,eth:ethertype:ip:udp:ssdp,http.request.method,,category,bool,9,"[nan, 'TRACE', 'GET', 'POST', 'OPTIONS', 'PROP..."
331,619,eth:ethertype:ip:udp:ssdp,udp.payload,,binary,bool,1004,['00:41:00:00:00:02:00:00:00:00:00:00:09:5f:32...
332,621,eth:ethertype:ip:udp:ssdp,http.chat,,string,const,2,"['', nan]"
