# Simulate Data

* Simulate Clients' Data
* Simulate Rules' Data

In [8]:
import pandas as pd
import numpy as np

## Simulate Clients' Data

* Each client will trigger certain rules, these rules may not be the real attack type

In [10]:
client_lst = ['Lego', 'Microsoft', 'Google', 'Facebook', 'Amazon', 'Bank_of_America', 'BMO']

In [4]:
source_df = pd.read_csv('simulate_source.csv')
print(source_df.shape)
print(source_df['y_test'].value_counts())
source_df.head()

(322498, 42)
0     243844
4      72645
15      1506
10      1117
9       1069
5        902
17       466
13       290
8        275
20       268
7         62
6         16
1          9
11         6
19         6
14         4
2          3
22         3
12         2
18         2
21         1
16         1
3          1
Name: y_test, dtype: int64


Unnamed: 0,duration,protocol_type,service,flag,src_bytes,dst_bytes,land,wrong_fragment,urgent,hot,...,dst_host_diff_srv_rate,dst_host_same_src_port_rate,dst_host_srv_diff_host_rate,dst_host_serror_rate,dst_host_srv_serror_rate,dst_host_rerror_rate,dst_host_srv_rerror_rate,src_dst_bytes_diff,y_test,y_pred
0,0,1,24,9,245,2413,0,0,0,0,...,0.0,0.01,0.02,0.0,0.0,0.0,0.0,2168,0,0
1,0,1,49,1,0,0,0,0,0,0,...,0.07,0.0,0.0,0.0,0.0,1.0,1.0,0,4,4
2,0,1,24,9,266,282,0,0,0,0,...,0.01,0.0,0.0,0.0,0.0,0.0,0.0,16,0,0
3,0,1,49,5,0,0,0,0,0,0,...,0.08,0.0,0.0,1.0,1.0,0.0,0.0,0,4,4
4,0,1,24,9,219,390,0,0,0,0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,171,0,0


In [52]:
wrong_pred_df = source_df.loc[source_df['y_test'] != source_df['y_pred']]
print(wrong_pred_df.shape)
right_pred_df = source_df.loc[source_df['y_test'] == source_df['y_pred']]
print(right_pred_df.shape)

(70, 42)
(322428, 42)


In [53]:
wrong_pred_df.reset_index(drop=True, inplace=True)
right_pred_df.reset_index(drop=True, inplace=True)

In [40]:
from sklearn.model_selection import train_test_split

# random split percentage for right_pred_df
mu_right, sigma_right = 0.5, 0.1
rand_right = np.random.normal(mu_right, sigma_right, len(client_lst))
rand_right

array([0.47469487, 0.55646708, 0.44179264, 0.41822246, 0.50157988,
       0.69217533, 0.31781001])

In [29]:
# random split percentage for wrong_pred_df
mu_wrong, sigma_wrong = 0.7, 0.1
rand_wrong = np.random.normal(mu_wrong, sigma_wrong, len(client_lst))
rand_wrong

array([0.78330208, 0.85294667, 0.7354534 , 0.63154804, 0.62601003,
       0.80214872, 0.62242887])

In [54]:
X_wrong = wrong_pred_df.iloc[:, 0:-2]
y_wrong = wrong_pred_df['y_pred']

X_right = right_pred_df.iloc[:, 0:-2]
y_right = right_pred_df['y_pred']

In [57]:
for i in range(len(client_lst)):
    right_perct = rand_right[i]
    wrong_perct = rand_wrong[i]
    client = client_lst[i]
    
    X_right_train, _, y_right_train, _ = train_test_split(X_right, y_right, train_size=right_perct, 
                                              test_size=1-right_perct, random_state=10, shuffle=True)
    X_wrong_train, _, y_wrong_train, _ = train_test_split(X_wrong, y_wrong, train_size=wrong_perct, 
                                              test_size=1-wrong_perct, random_state=10, shuffle=True)
    
    right_df = X_right_train.copy()
    right_df['triggered_rule'] = y_right_train
    wrong_df = X_wrong_train.copy()
    wrong_df['triggered_rule'] = y_wrong_train
    
    client_df = right_df.append(wrong_df)  # keep in mind, in real world, there will be much more wrongly labeled
    print(client)
    print(right_df.shape, wrong_df.shape, client_df.shape)
    print(client_df['triggered_rule'].value_counts())
    
    output_file = 'feature_rule_'+client+'.csv'
    client_df.to_csv(output_file, index=False)

Lego
(153054, 41) (54, 41) (153108, 41)
0     115816
4      34420
15       703
10       550
9        501
5        450
17       218
13       150
20       132
8        124
7         29
19         4
11         4
6          4
14         2
16         1
Name: triggered_rule, dtype: int64
Microsoft
(179420, 41) (59, 41) (179479, 41)
0     135741
4      40401
15       817
10       635
9        590
5        524
17       240
13       169
20       156
8        154
7         34
6          6
11         5
19         4
14         2
16         1
Name: triggered_rule, dtype: int64
Google
(142446, 41) (51, 41) (142497, 41)
0     107811
4      32014
15       653
10       506
9        470
5        423
17       206
13       139
20       124
8        113
7         24
19         4
11         4
6          4
14         2
Name: triggered_rule, dtype: int64
Facebook
(134846, 41) (44, 41) (134890, 41)
0     102005
4      30351
15       620
10       479
9        444
5        410
17       192
13       133
20       

## Simulate Rules' Data