## This notebook first splits each file in the "csvs" folder into a train set and a test set. It then applies data augmentation to the classes in these files.

### Input & Output

`Input Files`: All files with the csv extension in the “./csvs/” folder are read.

`Output Files`: Each input file is split into a train file and a test file. Augmented versions of these train and test files are then created.

--------------

###  importing relevant libraries

In [20]:
from imblearn.over_sampling import SMOTE
from sklearn.model_selection import train_test_split
from sklearn.utils import resample
import numpy as np
import pandas as pd
import warnings
import os


### Discovering csv extension files under "csvs" folder.

In [21]:
def find_the_way(path,file_format):
    """Recursively collect the files under *path* whose name ends with *file_format*.

    Args:
        path: Directory to search; sub-directories are searched as well.
        file_format: File-name suffix to keep, e.g. ``'.csv'``.

    Returns:
        A list of paths (directory joined with file name). Directories and
        files are visited in sorted order so the result is deterministic.
        The list is empty when *path* does not exist or nothing matches.
    """
    files_add = []
    for root, dirs, files in os.walk(path):
        # Sorting in place makes os.walk descend in a stable order; the
        # raw listing order depends on the operating system.
        dirs.sort()
        for file in sorted(files):
            # Match the suffix only: a substring test would also pick up
            # names such as "data.csv.bak".
            if file.endswith(file_format):
                files_add.append(os.path.join(root, file))
    return files_add
name_list=find_the_way('./csvs','.csv')

### List of csv files to be processed

In [22]:
# Show the list of CSV paths that will be split into train and test files.
name_list

['./csvs\\IoTDevID_FP_MAIN.csv']

### Split datasets train and test

In [23]:
# Split every discovered CSV into a stratified 75 % / 25 % train/test pair
# and write both parts next to the source file.
for name in name_list:
    df = pd.read_csv(name)
    label_col = df.columns[-1]

    # Every column except the last is a feature; the last one is the label.
    X = df[df.columns[:-1]]
    df[label_col] = df[label_col].astype('category')
    y = df[label_col]

    # Stratify on the label so both parts keep the class proportions;
    # the fixed seed keeps the split identical between runs.
    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=0.25, random_state=27, stratify=y)

    # NOTE(review): the slice removes 5 characters, i.e. ".csv" plus the last
    # character of the stem ("..._MAIN.csv" becomes "..._MAI__train.csv").
    # Kept as is because the generated file names may be referenced elsewhere.
    stem = name[0:-5]

    # Re-attach the label column to each part before writing it out.
    train = pd.concat([X_train, y_train], axis=1)
    test = pd.concat([X_test, y_test], axis=1)

    train.to_csv(stem + "_" + "_train.csv", index=False)
    file = stem + "_" + "_test.csv"
    test.to_csv(file, index=False)

### Discovering csv extension files under "csvs" folder for augmentation.

In [24]:
def find_the_way(path,file_format):
    """Recursively collect the files under *path* whose name ends with *file_format*.

    Args:
        path: Directory to search; sub-directories are searched as well.
        file_format: File-name suffix to keep, e.g. ``'.csv'``.

    Returns:
        A list of paths (directory joined with file name). Directories and
        files are visited in sorted order so the result is deterministic.
        The list is empty when *path* does not exist or nothing matches.
    """
    files_add = []
    for root, dirs, files in os.walk(path):
        # Sorting in place makes os.walk descend in a stable order; the
        # raw listing order depends on the operating system, and the first
        # entry of the result is used to derive the label list.
        dirs.sort()
        for file in sorted(files):
            # Match the suffix only: a substring test would also pick up
            # names such as "data.csv.bak".
            if file.endswith(file_format):
                files_add.append(os.path.join(root, file))
    return files_add
files_add=find_the_way('./csvs/','.csv')

### Lists the labels and the number of samples per label for the first file.

In [25]:
# Build the index -> device-name lookup from the first discovered file.
df = pd.read_csv(files_add[0])
label_column = df.columns[-1]

# Distinct labels in sorted order; the position in this list becomes the key.
devices = sorted(df[label_column].unique())
device_names = dict(enumerate(devices))
for i, ii in device_names.items():
    print(i, ii)

# Last expression of the cell: number of rows per label.
df.groupby("Label").size()

0 Aria
1 D-LinkCam
2 D-LinkDayCam
3 D-LinkDevice
4 D-LinkSensor
5 D-LinkSiren
6 D-LinkSwitch
7 D-LinkWaterSensor
8 EdimaxCam
9 EdimaxPlug1101W
10 EdimaxPlug2101W
11 EdnetCam
12 EdnetGateway
13 HomeMaticPlug
14 Hue-Device
15 Lightify
16 MAXGateway
17 SmarterCoffee
18 TP-LinkPlugHS100
19 TP-LinkPlugHS110
20 WeMoInsightSwitch
21 WeMoLink
22 WeMoSwitch
23 Withings
24 iKettle2


Label
Aria                   520
D-LinkCam             6358
D-LinkDayCam          1235
D-LinkDevice         10678
D-LinkSensor          6633
D-LinkSiren           6289
D-LinkSwitch          6614
D-LinkWaterSensor     6538
EdimaxCam              896
EdimaxPlug1101W       1247
EdimaxPlug2101W       1131
EdnetCam               390
EdnetGateway           850
HomeMaticPlug          639
Hue-Device           32581
Lightify              4384
MAXGateway             634
SmarterCoffee          190
TP-LinkPlugHS100       729
TP-LinkPlugHS110       698
WeMoInsightSwitch     6077
WeMoLink              6769
WeMoSwitch            4607
Withings               777
iKettle2               188
dtype: int64

# Data augmentation with Resampling

In [26]:
# Resample (with replacement) every device class of every discovered file to
# a fixed number of rows and append the result to "<file>_sk_resample.csv".
#
# NOTE: the output is opened in append mode and written without a header, so
# running this cell twice appends the same rows a second time.
for iii in files_add:
    print(iii)
    # Read each input once; it does not change between device classes.
    df = pd.read_csv(iii)
    # Files whose path contains "test" get 3000 rows per class, all others 10000.
    number = 3000 if "test" in iii else 10000
    name = str(iii) + "_sk_resample.csv"

    for i in device_names:
        label = device_names[i]
        print(label)
        df_1 = df[df['Label'] == str(label)]
        print(df_1.shape[0])

        # Draw exactly `number` rows with replacement; the fixed seed makes
        # the draw reproducible.
        df_1 = resample(df_1,
                        replace=True,
                        n_samples=number,
                        random_state=27)

        # Drop and re-add "Label" so it ends up as the last column. drop()
        # returns a new frame, so the assignment does not write into a slice
        # of `df` (which would raise SettingWithCopyWarning).
        X_train = df_1.drop(columns="Label")
        X_train["Label"] = label
        X_train.to_csv(name, mode="a", index=False, header=False)

    

./csvs/IoTDevID_FP_MAIN.csv
Aria
520
D-LinkCam
6358
D-LinkDayCam
1235
D-LinkDevice
10678
D-LinkSensor
6633
D-LinkSiren
6289
D-LinkSwitch
6614
D-LinkWaterSensor
6538
EdimaxCam
896
EdimaxPlug1101W
1247
EdimaxPlug2101W
1131
EdnetCam
390
EdnetGateway
850
HomeMaticPlug
639
Hue-Device
32581
Lightify
4384
MAXGateway
634
SmarterCoffee
190
TP-LinkPlugHS100
729
TP-LinkPlugHS110
698
WeMoInsightSwitch
6077
WeMoLink
6769
WeMoSwitch
4607
Withings
777
iKettle2
188
./csvs/IoTDevID_FP_MAI__test.csv
Aria
130
D-LinkCam
1590
D-LinkDayCam
309
D-LinkDevice
2670
D-LinkSensor
1658
D-LinkSiren
1572
D-LinkSwitch
1653
D-LinkWaterSensor
1634
EdimaxCam
224
EdimaxPlug1101W
312
EdimaxPlug2101W
283
EdnetCam
97
EdnetGateway
213
HomeMaticPlug
160
Hue-Device
8145
Lightify
1096
MAXGateway
159
SmarterCoffee
47
TP-LinkPlugHS100
182
TP-LinkPlugHS110
175
WeMoInsightSwitch
1519
WeMoLink
1692
WeMoSwitch
1152
Withings
194
iKettle2
47
./csvs/IoTDevID_FP_MAI__train.csv
Aria
390
D-LinkCam
4768
D-LinkDayCam
926
D-LinkDevice
8008
D-

# Data augmentation with SMOTE

In [27]:
# One-vs-rest SMOTE augmentation: for every file and every device class,
# balance the class against a sample of the remaining rows, keep the rows of
# that class (original and synthetic), cap them at `number`, and append them
# to "<file>_smoote_resample.csv".
#
# NOTE: the output is opened in append mode and written without a header, so
# running this cell twice appends the same rows a second time.
for iii in files_add:
    # Files whose path contains "test" use 3000 rows per class, all others
    # 10000; this was the only difference between the two original branches.
    number = 3000 if "test" in iii else 10000
    name = iii + "_smoote_resample.csv"
    # Read each input once; it does not change between device classes.
    source = pd.read_csv(iii)

    for i in device_names:
        label = device_names[i]
        print(label)

        # Binary relabelling: 1 for the current device, 0 for everything else.
        # assign() returns new frames, so nothing is written into a slice of
        # `source` (which would raise SettingWithCopyWarning).
        df_1 = source[source['Label'] == str(label)].assign(Label=1.0)
        df_0 = source[source['Label'] != label].assign(Label=0.0)

        # Take at most `number` negative rows; the cap avoids a ValueError
        # when fewer negatives exist. Seeded for reproducibility.
        df_0 = df_0.sample(n=min(number, df_0.shape[0]), random_state=27)

        # Shuffle positives and negatives together (seeded).
        df = pd.concat([df_1, df_0]).sample(frac=1, random_state=27)
        y_train = df["Label"]
        del df["Label"]
        X_train = df

        # `sampling_strategy` / `fit_resample` replace the removed `ratio` /
        # `fit_sample` spellings; 1.0 means both classes end up equally large.
        sm = SMOTE(random_state=27, sampling_strategy=1.0)
        X_train, y_train = sm.fit_resample(X_train, y_train)

        # Depending on the imbalanced-learn version the result is an array or
        # a DataFrame; normalise to a DataFrame and attach labels by position.
        X_train = pd.DataFrame(X_train)
        X_train["Label"] = np.asarray(y_train)

        # Keep only the current device's rows and restore its real name.
        X_train = X_train[X_train['Label'] == 1].copy()
        X_train["Label"] = label

        print(X_train.shape)
        if X_train.shape[0] > number:
            X_train = X_train.sample(n=number, random_state=27)
        print(X_train.shape)
        X_train.to_csv(name, mode="a", index=False, header=False)


    

Aria




(10000, 26)
(10000, 26)
D-LinkCam




(10000, 26)
(10000, 26)
D-LinkDayCam




(10000, 26)
(10000, 26)
D-LinkDevice




(10678, 26)
(10000, 26)
D-LinkSensor




(10000, 26)
(10000, 26)
D-LinkSiren




(10000, 26)
(10000, 26)
D-LinkSwitch




(10000, 26)
(10000, 26)
D-LinkWaterSensor




(10000, 26)
(10000, 26)
EdimaxCam




(10000, 26)
(10000, 26)
EdimaxPlug1101W




(10000, 26)
(10000, 26)
EdimaxPlug2101W




(10000, 26)
(10000, 26)
EdnetCam




(10000, 26)
(10000, 26)
EdnetGateway




(10000, 26)
(10000, 26)
HomeMaticPlug




(10000, 26)
(10000, 26)
Hue-Device




(32581, 26)
(10000, 26)
Lightify




(10000, 26)
(10000, 26)
MAXGateway




(10000, 26)
(10000, 26)
SmarterCoffee




(10000, 26)
(10000, 26)
TP-LinkPlugHS100




(10000, 26)
(10000, 26)
TP-LinkPlugHS110




(10000, 26)
(10000, 26)
WeMoInsightSwitch




(10000, 26)
(10000, 26)
WeMoLink




(10000, 26)
(10000, 26)
WeMoSwitch




(10000, 26)
(10000, 26)
Withings




(10000, 26)
(10000, 26)
iKettle2




(10000, 26)
(10000, 26)
Aria




(3000, 26)
(3000, 26)
D-LinkCam
(3000, 26)
(3000, 26)
D-LinkDayCam
(3000, 26)
(3000, 26)
D-LinkDevice




(3000, 26)
(3000, 26)
D-LinkSensor




(3000, 26)
(3000, 26)
D-LinkSiren
(3000, 26)
(3000, 26)
D-LinkSwitch
(3000, 26)
(3000, 26)




D-LinkWaterSensor
(3000, 26)
(3000, 26)
EdimaxCam




(3000, 26)
(3000, 26)
EdimaxPlug1101W




(3000, 26)
(3000, 26)
EdimaxPlug2101W




(3000, 26)
(3000, 26)
EdnetCam




(3000, 26)
(3000, 26)
EdnetGateway




(3000, 26)
(3000, 26)
HomeMaticPlug
(3000, 26)




(3000, 26)
Hue-Device
(8145, 26)
(3000, 26)
Lightify
(3000, 26)
(3000, 26)




MAXGateway
(3000, 26)
(3000, 26)
SmarterCoffee
(3000, 26)
(3000, 26)
TP-LinkPlugHS100




(3000, 26)
(3000, 26)
TP-LinkPlugHS110
(3000, 26)
(3000, 26)
WeMoInsightSwitch




(3000, 26)
(3000, 26)
WeMoLink




(3000, 26)
(3000, 26)
WeMoSwitch
(3000, 26)
(3000, 26)
Withings
(3000, 26)




(3000, 26)
iKettle2
(3000, 26)
(3000, 26)
Aria




(10000, 26)
(10000, 26)
D-LinkCam




(10000, 26)
(10000, 26)
D-LinkDayCam




(10000, 26)
(10000, 26)
D-LinkDevice




(10000, 26)
(10000, 26)
D-LinkSensor




(10000, 26)
(10000, 26)
D-LinkSiren




(10000, 26)
(10000, 26)
D-LinkSwitch




(10000, 26)
(10000, 26)
D-LinkWaterSensor




(10000, 26)
(10000, 26)
EdimaxCam




(10000, 26)
(10000, 26)
EdimaxPlug1101W




(10000, 26)
(10000, 26)
EdimaxPlug2101W




(10000, 26)
(10000, 26)
EdnetCam




(10000, 26)
(10000, 26)
EdnetGateway




(10000, 26)
(10000, 26)
HomeMaticPlug




(10000, 26)
(10000, 26)
Hue-Device




(24436, 26)
(10000, 26)
Lightify




(10000, 26)
(10000, 26)
MAXGateway




(10000, 26)
(10000, 26)
SmarterCoffee




(10000, 26)
(10000, 26)
TP-LinkPlugHS100




(10000, 26)
(10000, 26)
TP-LinkPlugHS110




(10000, 26)
(10000, 26)
WeMoInsightSwitch




(10000, 26)
(10000, 26)
WeMoLink




(10000, 26)
(10000, 26)
WeMoSwitch




(10000, 26)
(10000, 26)
Withings




(10000, 26)
(10000, 26)
iKettle2




(10000, 26)
(10000, 26)
