Adapted from [Kitsune](https://github.com/ymirsky/Kitsune-py). The code uses the raw data files with the .pcap extension in the pcaps folder to create 115 customized CSV files and saves them in the csvs folder.


The Kitsune Study: 

* Paper: https://arxiv.org/abs/1802.09089
* Code:  https://github.com/ymirsky/Kitsune-py


In [27]:
####  import related modules

import warnings
warnings.filterwarnings("ignore")
import os
from tqdm import tqdm
import pandas as pd
from FeatureExtractor import *
import numpy as np
# Imported after the star import so these names are guaranteed to be the stdlib modules.
import subprocess
import time

In [28]:
#### Folder creator
def folder(f_name):
    """Create the directory ``f_name`` (including missing parents) if needed.

    Args:
        f_name: Path of the directory to create.

    A failure is reported with a printed message instead of an exception, so
    the caller continues even when the directory could not be created.
    """
    try:
        # exist_ok=True keeps the call idempotent and removes the race between
        # a separate existence check and the creation itself.
        os.makedirs(f_name, exist_ok=True)
    except OSError:
        print ("The folder could not be created!")
# Make sure the "csvs" directory exists before any CSV file is written into it.
folder("csvs")

In [29]:
#### File finder
def find_the_way(path,file_format,con=""):
    """Collect the paths of matching files found anywhere below ``path``.

    A file matches when its name contains ``file_format`` and also contains
    ``con``; both are plain substring tests, not suffix tests.

    Args:
        path: Directory that is walked recursively.
        file_format: Substring the file name must contain, e.g. ".pcap".
        con: Additional substring the file name must contain. The default
            empty string matches every name.

    Returns:
        List of joined ``root/name`` paths in the order ``os.walk`` yields them.
    """
    return [
        os.path.join(root, name)
        for root, _dirs, names in os.walk(path)
        for name in names
        if file_format in name and con in name
    ]
path="pcaps"  # NOTE(review): not referenced by any code visible in this notebook

# Collect every file below ./pcaps/ whose name contains ".pcap".
files_add=find_the_way('./pcaps/',".pcap")
files_add

['./pcaps/16-09-29.pcap',
 './pcaps/16-10-03.pcap',
 './pcaps/16-11-18.pcap',
 './pcaps/16-11-22.pcap',
 './pcaps/18-06-11.pcap',
 './pcaps/18-06-14.pcap',
 './pcaps/18-10-13.pcap',
 './pcaps/18-10-16.pcap']

In [30]:
#### Delete unnecessary files
def killthemall(path,key):
    """Delete every file below ``path`` whose name contains ``key``.

    Args:
        path: Directory searched recursively via ``find_the_way``.
        key: Substring a file name must contain to be deleted.

    A file that cannot be removed is reported with a printed message and
    skipped; the remaining files are still processed.
    """
    for t in find_the_way(path,key):
        try:
            os.remove(t)
        except OSError:
            # Only file-system failures are expected here; a bare ``except``
            # would also swallow KeyboardInterrupt and SystemExit.
            print(f"error about delete {t} file")
# Remove every file below ./pcaps whose name contains ".tsv".
# NOTE(review): names such as "x.pcap.tsv" also contain ".pcap", so they would be
# picked up by find_the_way('./pcaps/', ".pcap") as well — presumably the reason
# for this clean-up; confirm the intended cell order when re-running.
killthemall("./pcaps",".tsv")

In [31]:
# Column names of the feature vector, generated from their naming pattern:
# <family>_<level>_<statistic>, with the families in the order MI_dir, HH,
# HH_jit, HpHp and the levels 5, 3, 1, 0.1, 0.01 inside every family.
main_labels=ln=[
    f"{family}_{level}_{stat}"
    for family, stats in (
        ("MI_dir", ("weight", "mean", "std")),
        ("HH", ("weight_0", "mean_0", "std_0", "radius_0_1", "magnitude_0_1", "covariance_0_1", "pcc_0_1")),
        ("HH_jit", ("weight", "mean", "std")),
        ("HpHp", ("weight_0", "mean_0", "std_0", "radius_0_1", "magnitude_0_1", "covariance_0_1", "pcc_0_1")),
    )
    for level in ("5", "3", "1", "0.1", "0.01")
    for stat in stats
]

In [32]:
# Collapse the label list into one comma-separated header string.
main_labels=",".join(main_labels)
main_labels

'MI_dir_5_weight,MI_dir_5_mean,MI_dir_5_std,MI_dir_3_weight,MI_dir_3_mean,MI_dir_3_std,MI_dir_1_weight,MI_dir_1_mean,MI_dir_1_std,MI_dir_0.1_weight,MI_dir_0.1_mean,MI_dir_0.1_std,MI_dir_0.01_weight,MI_dir_0.01_mean,MI_dir_0.01_std,HH_5_weight_0,HH_5_mean_0,HH_5_std_0,HH_5_radius_0_1,HH_5_magnitude_0_1,HH_5_covariance_0_1,HH_5_pcc_0_1,HH_3_weight_0,HH_3_mean_0,HH_3_std_0,HH_3_radius_0_1,HH_3_magnitude_0_1,HH_3_covariance_0_1,HH_3_pcc_0_1,HH_1_weight_0,HH_1_mean_0,HH_1_std_0,HH_1_radius_0_1,HH_1_magnitude_0_1,HH_1_covariance_0_1,HH_1_pcc_0_1,HH_0.1_weight_0,HH_0.1_mean_0,HH_0.1_std_0,HH_0.1_radius_0_1,HH_0.1_magnitude_0_1,HH_0.1_covariance_0_1,HH_0.1_pcc_0_1,HH_0.01_weight_0,HH_0.01_mean_0,HH_0.01_std_0,HH_0.01_radius_0_1,HH_0.01_magnitude_0_1,HH_0.01_covariance_0_1,HH_0.01_pcc_0_1,HH_jit_5_weight,HH_jit_5_mean,HH_jit_5_std,HH_jit_3_weight,HH_jit_3_mean,HH_jit_3_std,HH_jit_1_weight,HH_jit_1_mean,HH_jit_1_std,HH_jit_0.1_weight,HH_jit_0.1_mean,HH_jit_0.1_std,HH_jit_0.01_weight,HH_jit_0.01_

In [33]:
class Kitsune:
    """Thin wrapper that only drives the packet feature extractor (AfterImage)."""

    def __init__(self,file_path,limit,max_autoencoder_size=10,FM_grace_period=None,AD_grace_period=10000,learning_rate=0.1,hidden_ratio=0.75,):
        """Create the feature extractor.

        ``file_path`` and ``limit`` are forwarded to ``FE``; the remaining
        parameters are accepted but not used by this class.
        """
        self.FE = FE(file_path,limit)

    def proc_next_packet(self):
        """Return the next feature vector, or -1 when the extractor yields an empty one."""
        vector = self.FE.get_next_vector()
        # An empty vector signals an error or that no packets are left.
        return vector if len(vector) else -1


In [34]:
def howmanylines(file):
    """Return the number of lines in the text file at ``file``.

    Args:
        file: Path of the file to inspect.

    Returns:
        The line count; 0 for an empty file. A final line without a trailing
        newline still counts as a line.
    """
    with open(file, 'r') as fp:
        # Count while streaming so a large file is never held in memory at once.
        return sum(1 for _ in fp)

In [35]:
# KitNET params:
maxAE = 10 #maximum size for any autoencoder in the ensemble layer
FMgrace = 5000 #the number of instances taken to learn the feature mapping (the ensemble's architecture)
ADgrace = 50000 #the number of instances used to train the anomaly detector (ensemble itself)
limit=np.inf

In [36]:
# Maps a source MAC address (the tshark `eth.src` field) to the device name
# that replaces it in the "Label" column. Addresses missing from this mapping
# are left unchanged by the replacement.
macrenamer={'d0:52:a8:00:67:5e': 'Smart Things',
 '44:65:0d:56:cc:d3': 'Amazon Echo',
 '70:ee:50:18:34:43': 'Netatmo Welcome',
 'f4:f2:6d:93:51:f1': 'TP-Link Day Night Cloud camera',
 '00:16:6c:ab:6b:88': 'Samsung SmartCam',
 '30:8c:fb:2f:e4:b2': 'Dropcam',
 'ec:1a:59:79:f4:89': 'Belkin Wemo switch',
 '50:c7:bf:00:56:39': 'TP-Link Smart plug',
 '74:c6:3b:29:d7:1d': 'iHome',
 'ec:1a:59:83:28:11': 'Belkin wemo motion sensor',
 '18:b4:30:25:be:e4': 'NEST Protect smoke alarm',
 '70:ee:50:03:b8:ac': 'Netatmo weather station',
 'd0:73:d5:01:83:08': 'Light Bulbs LiFX Smart Bulb',
 '18:b7:9e:02:20:44': 'Triby Speaker',
 'e0:76:d0:33:bb:85': 'PIX-STAR Photo-frame',
 '70:5a:0f:e4:9b:c0': 'HP Printer',
 '08:21:ef:3b:fc:e3': 'Samsung Galaxy Tab',
 'ac:bc:32:d4:6f:2f': 'MacBook',
 '14:cc:20:51:33:ea': 'TPLink Router Bridge LAN',
 '00:17:88:2b:9a:25': 'Phillip Hue Lightbulb',
# '7c:70:bc:5d:5e:dc': 'Canary Camera',
 '70:88:6b:10:0f:c6': 'Awair air quality monitor'}


In [37]:
# FE Main function

for i in files_add:  # one feature CSV with a device "Label" column is produced per capture file
    # "./pcaps/<name>.pcap" -> "./csvs/<name>.csv": str.replace rewrites every
    # "pcap" occurrence, i.e. both the folder name and the extension.
    output = i.replace("pcap", "csv").replace("\\", "/")
    folder(output[:output.rfind("/")])

    # Build Kitsune. Only the path and the limit are used by the Kitsune
    # wrapper in this notebook; the other arguments are accepted and ignored.
    K = Kitsune(i, limit, maxAE, FMgrace, ADgrace)

    start = time.time()
    # "<pcap>.tsv" is presumably written by the feature extractor while it
    # parses the capture with tshark — TODO confirm. Its line count sizes the bar.
    linenum = howmanylines(f"{i}.tsv")
    sayac = 0  # number of extractor calls made so far

    # Both the CSV handle and the progress bar are closed even if extraction fails.
    with open(output, "w") as ths, tqdm(total=linenum) as pbar:
        ths.write(str(main_labels) + "\n")
        while True:
            sayac += 1
            if sayac % 1000 == 0:
                pbar.update(1000)
            rmse = K.proc_next_packet()
            # proc_next_packet returns the int -1 when no vector is left. Test
            # for that sentinel explicitly instead of relying on `rmse == -1`
            # raising for array-like vectors inside a bare try/except.
            if isinstance(rmse, int) and rmse == -1:
                break
            ths.write(",".join(str(j) for j in rmse) + "\n")

    stop = time.time()
    print("Complete. Time elapsed: " + str(stop - start))

    # Labelling: export the source MAC address of every packet with tshark.
    # An argument list avoids shell quoting problems with unusual file names.
    label_output = output.replace(".csv", "_label.csv")
    tshark_cmd = [
        "tshark", "-r", i, "-T", "fields", "-e", "eth.src",
        "-E", "header=y", "-E", "separator=,", "-E", "quote=d", "-E", "occurrence=f",
    ]
    with open(label_output, "w") as label_file:
        result = subprocess.run(tshark_cmd, stdout=label_file)
    if result.returncode != 0:
        print(f"tshark exited with status {result.returncode} for {i}")

    df = pd.read_csv(output)
    label = pd.read_csv(label_output)
    if len(df) != len(label):
        # Rows are matched by position, so a length mismatch means some rows
        # end up without a label or some labels are discarded.
        print(f"row count mismatch for {i}: {len(df)} feature rows vs {len(label)} label rows")

    # Replace known MAC addresses with device names; unknown ones stay as they are.
    df["Label"] = label["eth.src"].replace(macrenamer)
    df.to_csv(output, index=False)




Parsing with tshark...
tshark parsing complete. File saved as: ./pcaps/16-09-29.pcap.tsv
counting lines in file...
There are 10001 Packets.


100%|███████████████████████████████████████████████████████████████████████████▉| 10000/10001 [02:14<00:00, 74.13it/s]
100%|█████████████████████████████████████████████████████████████████████████▉| 10000/10001 [00:06<00:00, 1495.26it/s]

Complete. Time elapsed: 6.434539318084717
Parsing with tshark...
tshark parsing complete. File saved as: ./pcaps/16-10-03.pcap.tsv
counting lines in file...
There are 10001 Packets.



100%|█████████████████████████████████████████████████████████████████████████▉| 10000/10001 [00:09<00:00, 1051.55it/s]

 10%|███████▍                                                                   | 1000/10001 [00:00<00:05, 1747.43it/s]
 20%|██████████████▉                                                            | 2000/10001 [00:01<00:04, 1654.39it/s]
 30%|██████████████████████▍                                                    | 3000/10001 [00:01<00:04, 1532.97it/s]
 40%|█████████████████████████████▉                                             | 4000/10001 [00:02<00:03, 1539.36it/s]
 50%|█████████████████████████████████████▍                                     | 5000/10001 [00:03<00:03, 1562.07it/s]
 60%|████████████████████████████████████████████▉                              | 6000/10001 [00:03<00:02, 1544.54it/s]
 70%|████████████████████████████████████████████████████▍                      | 7000/10001 [00:04<00:01, 1535.25it/s]
 80%|█████████████████████████████████

Complete. Time elapsed: 6.548250675201416
Parsing with tshark...
tshark parsing complete. File saved as: ./pcaps/16-11-18.pcap.tsv
counting lines in file...
There are 10001 Packets.


100%|█████████████████████████████████████████████████████████████████████████▉| 10000/10001 [00:09<00:00, 1054.25it/s]
100%|█████████████████████████████████████████████████████████████████████████▉| 10000/10001 [00:06<00:00, 1446.14it/s]

Complete. Time elapsed: 6.752547979354858
Parsing with tshark...
tshark parsing complete. File saved as: ./pcaps/16-11-22.pcap.tsv
counting lines in file...
There are 10001 Packets.



100%|█████████████████████████████████████████████████████████████████████████▉| 10000/10001 [00:09<00:00, 1015.80it/s]

 10%|███████▍                                                                   | 1000/10001 [00:00<00:05, 1587.14it/s]
 20%|██████████████▉                                                            | 2000/10001 [00:01<00:05, 1538.64it/s]
 30%|██████████████████████▍                                                    | 3000/10001 [00:01<00:04, 1511.24it/s]
 40%|█████████████████████████████▉                                             | 4000/10001 [00:02<00:04, 1483.14it/s]
 50%|█████████████████████████████████████▍                                     | 5000/10001 [00:03<00:03, 1458.37it/s]
 60%|████████████████████████████████████████████▉                              | 6000/10001 [00:04<00:02, 1460.19it/s]
 70%|████████████████████████████████████████████████████▍                      | 7000/10001 [00:04<00:02, 1464.81it/s]
 80%|█████████████████████████████████

Complete. Time elapsed: 6.792440891265869
Parsing with tshark...
tshark parsing complete. File saved as: ./pcaps/18-06-11.pcap.tsv
counting lines in file...
There are 10001 Packets.


100%|█████████████████████████████████████████████████████████████████████████▉| 10000/10001 [00:09<00:00, 1017.93it/s]
100%|█████████████████████████████████████████████████████████████████████████▉| 10000/10001 [00:06<00:00, 1368.53it/s]

Complete. Time elapsed: 6.803783416748047
Parsing with tshark...
tshark parsing complete. File saved as: ./pcaps/18-06-14.pcap.tsv
counting lines in file...
There are 10001 Packets.



100%|█████████████████████████████████████████████████████████████████████████▉| 10000/10001 [00:09<00:00, 1007.74it/s]

 10%|███████▍                                                                   | 1000/10001 [00:00<00:05, 1664.11it/s]
 20%|██████████████▉                                                            | 2000/10001 [00:01<00:05, 1588.77it/s]
 30%|██████████████████████▍                                                    | 3000/10001 [00:01<00:04, 1542.74it/s]
 40%|█████████████████████████████▉                                             | 4000/10001 [00:02<00:03, 1514.39it/s]
 50%|█████████████████████████████████████▍                                     | 5000/10001 [00:03<00:03, 1504.63it/s]
 60%|████████████████████████████████████████████▉                              | 6000/10001 [00:03<00:02, 1461.11it/s]
 70%|████████████████████████████████████████████████████▍                      | 7000/10001 [00:04<00:02, 1423.27it/s]
 80%|█████████████████████████████████

Complete. Time elapsed: 7.053796052932739
Parsing with tshark...
tshark parsing complete. File saved as: ./pcaps/18-10-13.pcap.tsv
counting lines in file...
There are 10001 Packets.


100%|██████████████████████████████████████████████████████████████████████████▉| 10000/10001 [00:10<00:00, 996.81it/s]
100%|██████████████████████████████████████████████████████████████████████████▉| 10000/10001 [00:20<00:00, 339.50it/s]

Complete. Time elapsed: 20.32079577445984
Parsing with tshark...
tshark parsing complete. File saved as: ./pcaps/18-10-16.pcap.tsv
counting lines in file...
There are 10001 Packets.



100%|██████████████████████████████████████████████████████████████████████████▉| 10000/10001 [00:23<00:00, 429.21it/s]

 10%|███████▍                                                                   | 1000/10001 [00:00<00:07, 1194.08it/s]
 20%|███████████████▏                                                            | 2000/10001 [00:02<00:11, 690.06it/s]
 30%|██████████████████████▊                                                     | 3000/10001 [00:04<00:12, 564.80it/s]
 40%|██████████████████████████████▍                                             | 4000/10001 [00:08<00:15, 394.70it/s]
 50%|█████████████████████████████████████▉                                      | 5000/10001 [00:11<00:13, 358.54it/s]
 60%|█████████████████████████████████████████████▌                              | 6000/10001 [00:15<00:11, 337.75it/s]
 70%|█████████████████████████████████████████████████████▏                      | 7000/10001 [00:20<00:11, 270.00it/s]
 80%|█████████████████████████████████

Complete. Time elapsed: 37.36553597450256


In [38]:
# In order to cope with missing devices in the experimental environment, appropriate sessions were combined to obtain sessions containing all devices.

In [39]:
# Maps the name of each combined session to the per-capture CSV files that are
# concatenated into "./csvs/<name>.csv".
them={"DI-S1": ['./csvs/16-09-29.csv', './csvs/16-11-18.csv'],
"DI-S2": ['./csvs/16-10-03.csv','./csvs/16-11-22.csv'],
"AD-S1": ['./csvs/18-06-14.csv',  './csvs/18-10-13.csv'],
"AD-S2": [ './csvs/18-06-11.csv','./csvs/18-10-16.csv']}

In [40]:
# Concatenate the per-capture CSV files of every session into one CSV file.
for output, sources in them.items():
    with open(f"./csvs/{output}.csv", "w") as ths:
        for position, t in enumerate(sources):
            with open(t, "r") as file:
                # The first line of every source is its header row. Keep the
                # header of the first source only, so the merged file does not
                # contain header rows in the middle of the data. (A prefix test
                # such as startswith("dns") never matches the header written by
                # this notebook, which starts with the first feature name.)
                header = file.readline()
                if position == 0:
                    ths.write(header)
                ths.writelines(file)

    # Delete the inputs only after the merged file has been written and closed.
    for t in sources:
        os.remove(t)
        os.remove(t.replace(".csv","_label.csv"))


100%|██████████████████████████████████████████████████████████████████████████▉| 10000/10001 [00:57<00:00, 195.32it/s]

In [41]:
# MAC address -> device name mapping (same content as the mapping used while
# labelling); its values are the labels kept by the filtering step.
macrenamer={'d0:52:a8:00:67:5e': 'Smart Things',
 '44:65:0d:56:cc:d3': 'Amazon Echo',
 '70:ee:50:18:34:43': 'Netatmo Welcome',
 'f4:f2:6d:93:51:f1': 'TP-Link Day Night Cloud camera',
 '00:16:6c:ab:6b:88': 'Samsung SmartCam',
 '30:8c:fb:2f:e4:b2': 'Dropcam',
 'ec:1a:59:79:f4:89': 'Belkin Wemo switch',
 '50:c7:bf:00:56:39': 'TP-Link Smart plug',
 '74:c6:3b:29:d7:1d': 'iHome',
 'ec:1a:59:83:28:11': 'Belkin wemo motion sensor',
 '18:b4:30:25:be:e4': 'NEST Protect smoke alarm',
 '70:ee:50:03:b8:ac': 'Netatmo weather station',
 'd0:73:d5:01:83:08': 'Light Bulbs LiFX Smart Bulb',
 '18:b7:9e:02:20:44': 'Triby Speaker',
 'e0:76:d0:33:bb:85': 'PIX-STAR Photo-frame',
 '70:5a:0f:e4:9b:c0': 'HP Printer',
 '08:21:ef:3b:fc:e3': 'Samsung Galaxy Tab',
 'ac:bc:32:d4:6f:2f': 'MacBook',
 '14:cc:20:51:33:ea': 'TPLink Router Bridge LAN',
 '00:17:88:2b:9a:25': 'Phillip Hue Lightbulb',
# '7c:70:bc:5d:5e:dc': 'Canary Camera',
 '70:88:6b:10:0f:c6': 'Awair air quality monitor'}

# Device names that count as valid labels.
devices=list(macrenamer.values())
#devices

In [42]:
# This section reduces the CSV file size by eliminating unnecessary or unusable samples (samples from irrelevant devices).

In [43]:
# Collect every file below ./csvs/ whose name contains ".csv"; these are the inputs of the filtering step.
them=find_the_way('./csvs/',".csv")
them

['./csvs/AD-S1.csv',
 './csvs/AD-S2.csv',
 './csvs/DI-S1.csv',
 './csvs/DI-S2.csv']

In [44]:
proError=[]  # NOTE(review): never filled or read in the code visible here
# Keep only the rows whose last column (the label) is a known device name.
known_devices = set(devices)  # set membership instead of scanning the list for every row
for t in them:
    # The filtered copy is written next to the input as "<name>_.csv".
    with open(t, "r") as file, open(t.replace(".csv","_.csv"), "w") as ths:
        # The first line is the header and is copied unchanged.
        ths.write(file.readline())
        for line in file:
            # Strip only the newline, so a final line without one keeps its
            # complete label; only the text after the last comma is needed.
            if line.rstrip("\n").rsplit(",", 1)[-1] in known_devices:
                ths.write(line)
    # The unfiltered input is removed once its filtered copy has been closed.
    os.remove(t)
    

In [47]:
# Collect every file below ./csvs/ whose name contains ".csv".
files_add=find_the_way('./csvs/',".csv")
files_add

['./csvs/AD-S1_.csv',
 './csvs/AD-S2_.csv',
 './csvs/DI-S1_.csv',
 './csvs/DI-S2_.csv']

In [48]:
# Print, for every CSV file, the number of distinct labels and the row count per label.
for csv_path in files_add:
    label_counts = pd.read_csv(csv_path, usecols=["Label"]).groupby("Label").size()
    print(len(label_counts), label_counts)
    print("#" * 120, "\n\n\n")

16 Label
Amazon Echo                         157
Awair air quality monitor            68
Belkin Wemo switch                  111
Belkin wemo motion sensor          1859
Dropcam                             666
HP Printer                           24
Light Bulbs LiFX Smart Bulb          30
Netatmo Welcome                      67
Phillip Hue Lightbulb               211
Samsung SmartCam                    722
Smart Things                        236
TP-Link Day Night Cloud camera       12
TP-Link Smart plug                    8
TPLink Router Bridge LAN          11454
Triby Speaker                        11
iHome                                31
dtype: int64
######################################################################################################################## 



18 Label
Amazon Echo                          30
Awair air quality monitor            66
Belkin Wemo switch                  127
Belkin wemo motion sensor          1630
Dropcam                             602
HP P

In [51]:
# CREATE SMALLER CSVs
# NOTE(review): the former "(1/20)" ratio in this title is not reflected by the sampling code.

#### CSV files may be too large to be processed. You can sample CSV files using these sections.
#### The sampling keeps at most 10,000 randomly chosen rows per label; labels with fewer rows are kept in full,
#### so large classes are down-sampled rather than scaled proportionally.
folder("small")

In [52]:
# Write a down-sampled copy of every CSV file into the "small" folder.
for i in files_add:
    df=pd.read_csv(i)
    # Draw up to 10,000 random rows per label. Sampling each group explicitly
    # keeps the "Label" column in the result and does not rely on
    # groupby.apply operating on the grouping column.
    parts=[group.sample(n=min(10000, len(group))) for _, group in df.groupby('Label')]
    if parts:  # pd.concat raises on an empty list; an empty frame is written as-is
        df=pd.concat(parts)
    name=i.replace("csvs","small")
    df.to_csv(name,index=False)