# Recommend Signals to Clients

* Recommend signals to a target client, based on how similar the other clients are to it and on how highly rated (frequent) each signal is among those clients

In [200]:
import pandas as pd
from scipy import stats
import numpy as np
import operator

In [2]:
# Names of the clients compared in this notebook.
client_lst = [
    'Lego',
    'Microsoft',
    'Google',
    'Facebook',
    'Amazon',
    'Bank_of_America',
    'BMO',
]

In [3]:
# Read one CSV per client, reporting each frame's shape as it is loaded.
client_df_lst = []
for client in client_lst:
    loaded_df = pd.read_csv('feature_rule_' + client + '.csv')
    client_df_lst.append(loaded_df)
    print(client, loaded_df.shape)

# Number of frames loaded.
print(len(client_df_lst))

Lego (153108, 41)
Microsoft (179479, 41)
Google (142497, 41)
Facebook (134890, 41)
Amazon (161766, 41)
Bank_of_America (223232, 41)
BMO (102513, 41)
7


In [6]:
# Column names of the first client's frame (index 0 in client_df_lst).
client_df_lst[0].columns

Index(['duration', 'protocol_type', 'service', 'flag', 'src_bytes',
       'dst_bytes', 'land', 'wrong_fragment', 'urgent', 'hot',
       'num_failed_logins', 'logged_in', 'num_compromised', 'root_shell',
       'su_attempted', 'num_root', 'num_file_creations', 'num_shells',
       'num_access_files', 'is_guest_login', 'count', 'srv_count',
       'serror_rate', 'srv_serror_rate', 'rerror_rate', 'srv_rerror_rate',
       'same_srv_rate', 'diff_srv_rate', 'srv_diff_host_rate',
       'dst_host_count', 'dst_host_srv_count', 'dst_host_same_srv_rate',
       'dst_host_diff_srv_rate', 'dst_host_same_src_port_rate',
       'dst_host_srv_diff_host_rate', 'dst_host_serror_rate',
       'dst_host_srv_serror_rate', 'dst_host_rerror_rate',
       'dst_host_srv_rerror_rate', 'src_dst_bytes_diff', 'triggered_rule'],
      dtype='object')

In [63]:
# Preview the first rows of the last seven columns of the first client's frame.
print('Client:', client_lst[0])
client_df_lst[0].iloc[:, -7:].head()

Client: Lego


Unnamed: 0,dst_host_srv_diff_host_rate,dst_host_serror_rate,dst_host_srv_serror_rate,dst_host_rerror_rate,dst_host_srv_rerror_rate,src_dst_bytes_diff,triggered_rule
0,0.0,1.0,1.0,0.0,0.0,0,4
1,0.01,0.0,0.0,0.0,0.0,2917,0
2,0.01,0.0,0.0,0.0,0.0,2569,0
3,0.01,0.0,0.0,0.0,0.0,1538,0
4,0.0,1.0,1.0,0.0,0.0,0,4


In [27]:
# for client similarity score, just use t-test to compare each column and aggregate the results
def compare_col(col1, col2):
    """
    Score how similar two numeric columns are using an independent two-sample t-test.

    Parameters
    ----------
    col1, col2 : array-like of numbers
        The two samples to compare.

    Returns
    -------
    float
        The t-test p-value; the higher it is, the more similar the two columns
        are considered. Two constant columns holding the same value return 1.0.
        NaN is still returned for other undefined cases (for example when the
        inputs themselves contain NaN).
    """
    pvalue = stats.ttest_ind(col1, col2)[1]
    if np.isnan(pvalue):
        # Two constant columns with the same value give a 0/0 test statistic and
        # therefore a NaN p-value. They are as similar as columns can be, and a
        # NaN here would turn any average built from these scores into NaN.
        first = np.asarray(col1, dtype=float)
        second = np.asarray(col2, dtype=float)
        if first.size and second.size:
            reference = first.flat[0]
            if np.all(first == reference) and np.all(second == reference):
                return 1.0
    return pvalue

In [31]:
# Every column except the last one is compared as a feature.
feature_lst = client_df_lst[0].columns[:-1]
client_similarity_dct = {}

# Visit each unordered pair of clients once and store the score under both key orders.
for first_idx, first_name in enumerate(client_lst[:-1]):
    first_df = client_df_lst[first_idx]
    for second_idx in range(first_idx + 1, len(client_lst)):
        second_name = client_lst[second_idx]
        second_df = client_df_lst[second_idx]

        # Mean of the per-feature p-values is the pair's similarity score.
        pvalue_total = 0.0
        for feature in feature_lst:
            pvalue_total += compare_col(first_df[feature], second_df[feature])
        mean_pvalue = pvalue_total / len(feature_lst)

        client_similarity_dct[first_name + '_' + second_name] = mean_pvalue
        client_similarity_dct[second_name + '_' + first_name] = mean_pvalue

In [207]:
# List the pairwise similarity scores in alphabetical order of their 'A_B' keys.
sorted(client_similarity_dct.items(), key=lambda pair: pair[0])

[('Amazon_BMO', 0.7365484950772161),
 ('Amazon_Bank_of_America', 0.8092270979910955),
 ('Amazon_Facebook', 0.881454452090848),
 ('Amazon_Google', 0.864084258346419),
 ('Amazon_Lego', 0.8936735879736102),
 ('Amazon_Microsoft', 0.8652825836943148),
 ('BMO_Amazon', 0.7365484950772161),
 ('BMO_Bank_of_America', 0.6514256358722161),
 ('BMO_Facebook', 0.7663877469709095),
 ('BMO_Google', 0.7613793177815622),
 ('BMO_Lego', 0.735840629820065),
 ('BMO_Microsoft', 0.7147037057136554),
 ('Bank_of_America_Amazon', 0.8092270979910955),
 ('Bank_of_America_BMO', 0.6514256358722161),
 ('Bank_of_America_Facebook', 0.7741339744525769),
 ('Bank_of_America_Google', 0.775981014720559),
 ('Bank_of_America_Lego', 0.7919567168983626),
 ('Bank_of_America_Microsoft', 0.8435358010178904),
 ('Facebook_Amazon', 0.881454452090848),
 ('Facebook_BMO', 0.7663877469709095),
 ('Facebook_Bank_of_America', 0.7741339744525769),
 ('Facebook_Google', 0.8574145421055276),
 ('Facebook_Lego', 0.8551215815902846),
 ('Facebook_Mi

In [48]:
# Candidate signal ids 1 through 22 (the upper bound of range is exclusive).
all_signal_lst = range(1, 23)

In [184]:
# Scratch check: draw one sample from a normal distribution with mean -0.3 and std 0.1.
# NOTE(review): the draw is unseeded, so the displayed value changes on every run;
# mu2 and sigma2 are assigned again in the following cell.
mu2, sigma2 = -0.3, 0.1
np.random.normal(mu2, sigma2)

-0.28527531046680044

In [192]:
# generate signal frequency for each client
## to mimic the signal frequency in the real world, add some randomness
# (mean, std) of the normal noise applied to each frequency band
mu1, sigma1 = 30, 10        # multiplicative factor when frequency < 0.05
mu2, sigma2 = -0.05, 0.05   # additive shift when 0.05 <= frequency < 0.8
mu3, sigma3 = -0.3, 0.1     # additive shift when frequency >= 0.8

def get_signal_frequency(client_df, all_signal_lst, baseline_rule=0, rng=None):
    """
    Compute a noisy relative frequency for every signal in ``all_signal_lst``.

    The raw frequency of a signal is its row count in ``client_df['triggered_rule']``
    divided by the number of rows whose rule is not ``baseline_rule``. Random
    noise is then applied depending on the raw frequency band (see the
    module-level ``mu*``/``sigma*`` constants).

    Parameters
    ----------
    client_df : pandas.DataFrame
        Must contain a ``'triggered_rule'`` column.
    all_signal_lst : iterable
        Signal ids to report; ids absent from the column get frequency 0.
    baseline_rule : optional
        Rule id excluded from the denominator. Defaults to 0.
        NOTE(review): assumes rule id 0 means "no rule triggered" - confirm.
        The denominator used to be built by dropping the single most frequent
        value by position, which only matches this when the baseline id has the
        highest count.
    rng : optional
        Object exposing ``normal(mean, std)`` (for example
        ``np.random.RandomState(seed)`` or ``np.random.default_rng(seed)``).
        Defaults to the global ``np.random`` state.

    Returns
    -------
    dict
        Maps each signal id to its noisy frequency. Because of the noise a
        value may fall below 0 or above 1.
    """
    noise_source = np.random if rng is None else rng

    # Count once instead of recomputing value_counts() for every signal.
    signal_ct = client_df['triggered_rule'].value_counts()
    # Rows that triggered something other than the baseline rule, selected by label.
    triggered_total = signal_ct.drop(labels=baseline_rule, errors='ignore').sum()

    signal_frequency_dct = {}
    for signal in all_signal_lst:
        # The total guard avoids a division by zero when only baseline rows exist.
        if triggered_total > 0 and signal in signal_ct.index:
            frequency = signal_ct[signal] / triggered_total
            if frequency < 0.05:
                frequency *= noise_source.normal(mu1, sigma1)
            elif frequency < 0.8:
                frequency += noise_source.normal(mu2, sigma2)
            else:
                frequency += noise_source.normal(mu3, sigma3)
            signal_frequency_dct[signal] = frequency
        else:
            signal_frequency_dct[signal] = 0
    return signal_frequency_dct
    
target_client_idx = 0

# Build {client name: {signal: frequency}} in the same order the frames were loaded.
all_client_signal_frequency = {}
for frame_idx, client_frame in enumerate(client_df_lst):
    all_client_signal_frequency[client_lst[frame_idx]] = get_signal_frequency(client_frame, all_signal_lst)

In [202]:
# Show each client's signals ordered from most to least frequent.
for client_name, frequency_by_signal in all_client_signal_frequency.items():
    print(client_name)
    ranked_frequencies = sorted(frequency_by_signal.items(), key=lambda pair: pair[1], reverse=True)
    print(ranked_frequencies)
    print()

Lego
[(4, 0.5710381409997853), (15, 0.5148847759722488), (10, 0.4238081421270834), (5, 0.42278923630247256), (9, 0.3718811397642264), (17, 0.16038304085825608), (8, 0.1262906629867076), (13, 0.11335112352804091), (20, 0.11290833044153772), (7, 0.024075971714223474), (19, 0.003570804839843054), (6, 0.0024425931562765462), (11, 0.002070887376836249), (14, 0.0010109016001502466), (16, 0.0008439926595273739), (1, 0), (2, 0), (3, 0), (12, 0), (18, 0), (21, 0), (22, 0)]

Microsoft
[(15, 0.7054851552455377), (9, 0.6104524770307895), (10, 0.5232505896804764), (4, 0.47011429804352245), (5, 0.42686660375480856), (17, 0.2545844804265569), (8, 0.18650098453578962), (13, 0.17866334157813824), (20, 0.07303335501333971), (7, 0.030314334589995003), (6, 0.004915687322643149), (19, 0.002828181350875369), (14, 0.002084219663490933), (11, 0.001903825747441487), (16, 0.0008316536710304754), (1, 0), (2, 0), (3, 0), (12, 0), (18, 0), (21, 0), (22, 0)]

Google
[(4, 0.7325415972330528), (9, 0.572231747116455),

In [221]:
# Calculate signal ranking score, return top k
def recommend_signals(target_client, client_similarity_dct, all_client_signal_rating, all_signal_lst, k=3):
    """
    Rank signals for ``target_client`` by the similarity-weighted mean rating of its peers.

    For each signal, every other client with a rating above 0 contributes that
    rating, weighted by its similarity to the target. The score is the weighted
    sum divided by the sum of the contributing weights, or 0 when no peer has a
    positive rating. The target's own ratings are never read, so signals it
    already has are not filtered out of the result.

    Parameters
    ----------
    target_client : str
        Client to recommend for.
    client_similarity_dct : dict
        Similarity scores keyed by ``'<target>_<peer>'``.
    all_client_signal_rating : dict
        Maps client name to a ``{signal: rating}`` dict. A peer with no entry
        for a signal is treated as rating it 0.
    all_signal_lst : iterable
        Signals to score.
    k : int, optional
        Number of top signals to return. Defaults to 3.

    Returns
    -------
    tuple
        ``(sorted_recommend_score, recommended_signals)``: all ``(signal, score)``
        pairs sorted by descending score, and the first ``k`` signal ids.

    Raises
    ------
    KeyError
        If a contributing peer has no ``'<target>_<peer>'`` entry in
        ``client_similarity_dct``.
    """
    signal_recommend_dct = {}

    for signal in all_signal_lst:
        total_weighted_rating = 0.0
        total_weights = 0.0
        for client, client_signal_rating in all_client_signal_rating.items():
            if client == target_client:
                continue
            # A missing entry carries no evidence; treat it like a zero rating
            # instead of raising KeyError.
            signal_rating = client_signal_rating.get(signal, 0)
            if signal_rating > 0:
                weight = client_similarity_dct['_'.join([target_client, client])]
                total_weights += weight
                total_weighted_rating += weight * signal_rating

        if total_weights > 0:
            recommend_score = total_weighted_rating / total_weights
        else:
            recommend_score = 0
        signal_recommend_dct[signal] = recommend_score

    # sorted() is stable, so equally scored signals keep their all_signal_lst order.
    sorted_recommend_score = sorted(signal_recommend_dct.items(), key=operator.itemgetter(1), reverse=True)
    recommended_signals = [signal_set[0] for signal_set in sorted_recommend_score[0:k]]
    return sorted_recommend_score, recommended_signals

In [222]:
target_client = 'Microsoft'  # most similar to Amazon

# Score every signal for the target and keep the three best.
sorted_recommend_score, recommended_signals = recommend_signals(
    target_client=target_client,
    client_similarity_dct=client_similarity_dct,
    all_client_signal_rating=all_client_signal_frequency,
    all_signal_lst=all_signal_lst,
    k=3,
)
print(recommended_signals)
print(sorted_recommend_score)

[4, 10, 9]
[(4, 0.637074146135053), (10, 0.42456722302100997), (9, 0.41498030639398437), (15, 0.4109380725867866), (5, 0.3866639021486581), (17, 0.19191723429917448), (13, 0.11914150224462429), (20, 0.10483976736889156), (8, 0.10056852012978633), (7, 0.02388035771504711), (19, 0.0036467967298500436), (6, 0.0035320281531721377), (11, 0.0033656440263885212), (14, 0.0013415187373657512), (16, 0.0008945185608252312), (1, 0.0004624478140611595), (2, 0), (3, 0), (12, 0), (18, 0), (21, 0), (22, 0)]
