# Create Clients' Similarity

* Create common features for each client, then calculate their similarity

In [10]:
import pandas as pd
from scipy import stats

In [2]:
# Client names, in the order their data is loaded and compared.
client_lst = [
    'Lego',
    'Microsoft',
    'Google',
    'Facebook',
    'Amazon',
    'Bank_of_America',
    'BMO',
]

In [3]:
# Read each client's 'feature_rule_<client>.csv' in client_lst order,
# reporting the shape of every frame as it is loaded.
client_df_lst = []

for client in client_lst:
    client_df = pd.read_csv('feature_rule_{}.csv'.format(client))
    client_df_lst.append(client_df)
    print(client, client_df.shape)

# Number of frames loaded.
print(len(client_df_lst))

Lego (153108, 41)
Microsoft (179479, 41)
Google (142497, 41)
Facebook (134890, 41)
Amazon (161766, 41)
Bank_of_America (223232, 41)
BMO (102513, 41)
7


In [6]:
# Inspect the column names of the first client's DataFrame.
client_df_lst[0].columns

Index(['duration', 'protocol_type', 'service', 'flag', 'src_bytes',
       'dst_bytes', 'land', 'wrong_fragment', 'urgent', 'hot',
       'num_failed_logins', 'logged_in', 'num_compromised', 'root_shell',
       'su_attempted', 'num_root', 'num_file_creations', 'num_shells',
       'num_access_files', 'is_guest_login', 'count', 'srv_count',
       'serror_rate', 'srv_serror_rate', 'rerror_rate', 'srv_rerror_rate',
       'same_srv_rate', 'diff_srv_rate', 'srv_diff_host_rate',
       'dst_host_count', 'dst_host_srv_count', 'dst_host_same_srv_rate',
       'dst_host_diff_srv_rate', 'dst_host_same_src_port_rate',
       'dst_host_srv_diff_host_rate', 'dst_host_serror_rate',
       'dst_host_srv_serror_rate', 'dst_host_rerror_rate',
       'dst_host_srv_rerror_rate', 'src_dst_bytes_diff', 'triggered_rule'],
      dtype='object')

In [9]:
# Preview the first five rows of the first ten columns for the first client.
client_df_lst[0].iloc[:, 0:10].head()

Unnamed: 0,duration,protocol_type,service,flag,src_bytes,dst_bytes,land,wrong_fragment,urgent,hot
0,0,1,49,5,0,0,0,0,0,0
1,0,1,24,9,280,3197,0,0,0,0
2,0,1,24,9,311,2880,0,0,0,0
3,0,1,24,9,303,1841,0,0,0,0
4,0,1,49,5,0,0,0,0,0,0


In [27]:
# Client similarity score: run a t-test on each feature column, then average the p-values across columns
def compare_col(col1, col2):
    """
    Score how alike two columns are using an independent two-sample t-test.

    Returns the p-value of the test; the higher the p-value, the more
    similar the two columns are taken to be.
    """
    return stats.ttest_ind(col1, col2).pvalue

In [31]:
# Every column except the last one ('triggered_rule') is compared as a feature.
feature_lst = client_df_lst[0].columns[0:-1]
client_similarity_dct = {}

# Visit each unordered pair of clients once; the score is the mean per-column
# p-value and is stored under both key orders so either lookup works.
for i in range(len(client_lst)-1):
    for j in range(i+1, len(client_lst)):
        clienti, clientj = client_df_lst[i], client_df_lst[j]
        pvalue_lst = [compare_col(clienti[col], clientj[col]) for col in feature_lst]
        similarity_score = sum(pvalue_lst, 0.0) / len(feature_lst)
        for first, second in ((i, j), (j, i)):
            client_similarity_dct['_'.join([client_lst[first], client_lst[second]])] = similarity_score

In [32]:
# Display the pairwise similarity scores; each pair appears under both key orders.
client_similarity_dct

{'Lego_Microsoft': 0.8594561743270323,
 'Microsoft_Lego': 0.8594561743270323,
 'Lego_Google': 0.9315755913018533,
 'Google_Lego': 0.9315755913018533,
 'Lego_Facebook': 0.8551215815902846,
 'Facebook_Lego': 0.8551215815902846,
 'Lego_Amazon': 0.8936735879736102,
 'Amazon_Lego': 0.8936735879736102,
 'Lego_Bank_of_America': 0.7919567168983626,
 'Bank_of_America_Lego': 0.7919567168983626,
 'Lego_BMO': 0.735840629820065,
 'BMO_Lego': 0.735840629820065,
 'Microsoft_Google': 0.8386409792456604,
 'Google_Microsoft': 0.8386409792456604,
 'Microsoft_Facebook': 0.8119727455977749,
 'Facebook_Microsoft': 0.8119727455977749,
 'Microsoft_Amazon': 0.8652825836943148,
 'Amazon_Microsoft': 0.8652825836943148,
 'Microsoft_Bank_of_America': 0.8435358010178904,
 'Bank_of_America_Microsoft': 0.8435358010178904,
 'Microsoft_BMO': 0.7147037057136554,
 'BMO_Microsoft': 0.7147037057136554,
 'Google_Facebook': 0.8574145421055276,
 'Facebook_Google': 0.8574145421055276,
 'Google_Amazon': 0.864084258346419,
 'Ama

In [48]:
# Signal ids 1 through 22 (the upper bound of range is exclusive).
# NOTE(review): presumably the full set of rule ids that can appear in 'triggered_rule' — confirm.
all_signal_lst = range(1, 23)

In [57]:
# generate signal frequency for each client
def get_signal_frequency(client_df, all_signal_lst):
    """
    Compute the relative frequency of each signal in one client's data.

    Parameters
    ----------
    client_df : DataFrame with a 'triggered_rule' column.
    all_signal_lst : iterable of signal ids to report on.

    Returns
    -------
    dict mapping every id in all_signal_lst to its count divided by the
    combined count of all labels except the most frequent one. Ids that never
    occur in 'triggered_rule' map to 0.
    """
    # Count once up front instead of recomputing value_counts() on every iteration.
    signal_ct = client_df['triggered_rule'].value_counts()
    # value_counts() sorts by descending count, so iloc[1:] leaves the single
    # most frequent label out of the denominator.
    # NOTE(review): presumably that label is the "no rule triggered" value — confirm.
    total_signal_ct = signal_ct.iloc[1:].sum()
    return {
        signal: (signal_ct[signal] / total_signal_ct if signal in signal_ct.index else 0)
        for signal in all_signal_lst
    }
    
# The client at target_client_idx is left out; every other client gets its
# signal-frequency dict stored under its name.
target_client_idx = 0
all_client_signal_frequency = {}
for i, client_df in enumerate(client_df_lst):
    if i != target_client_idx:
        signal_frequency_dct = get_signal_frequency(client_df, all_signal_lst)
        all_client_signal_frequency[client_lst[i]] = signal_frequency_dct

In [58]:
# Peek at the first stored client only, as a quick sanity check.
for client, client_frequency in list(all_client_signal_frequency.items())[:1]:
    print(client)
    print(client_frequency)

Microsoft
{1: 0, 2: 0, 3: 0, 4: 0.9237047875988843, 5: 0.011980428917645983, 6: 0.0001371804837898395, 7: 0.0007773560748090904, 8: 0.0035209657506058804, 9: 0.013489414239334217, 10: 0.014518267867758014, 11: 0.00011431706982486625, 12: 0, 13: 0.0038639169600804793, 14: 4.57268279299465e-05, 15: 0.018679409209383144, 16: 2.286341396497325e-05, 17: 0.0054872193515935795, 18: 0, 19: 9.1453655859893e-05, 20: 0.003566692578535827, 21: 0, 22: 0}
