In [1]:
import pandas as pd
import numpy as np
import sklearn
from sklearn.neighbors import LocalOutlierFactor
from sklearn.preprocessing import StandardScaler, MinMaxScaler
from sklearn.preprocessing import OneHotEncoder
from scipy.special import erf
from sklearn.metrics import f1_score, recall_score, roc_auc_score, average_precision_score
from sklearn.metrics import precision_recall_fscore_support
from sklearn.metrics import roc_curve, auc, confusion_matrix
import matplotlib.pyplot as plt

In [2]:
print('The scikit-learn version is {}.'.format(sklearn.__version__))

The scikit-learn version is 1.0.2.


# Defining funtions

In [3]:
def oh_encoder_data700k(df, experimento=1, n=400000):
    df = df.rename(columns={'is_malicious': 'label'})
    df = df.drop(['time'], axis=1)
    features = ['src_user', 'src_comp', 'dst_comp', 'label']
    
    df_train = df[features][df.label == 0][0:n]
    #df_test = df[features][df.label == 1]
    df_test = df[features][~df[features].dst_comp.isin(df_train.dst_comp.values) & \
                           ~df[features].src_user.isin(df_train.src_user.values)]
    
    y_test = df_test.label.values
    
    df_train = df_train.drop(['label'], axis=1)
    df_test = df_test.drop(['label'], axis=1)
    
    if(experimento == 1):
      df_train = df_train.drop(['src_comp'], axis=1)
      df_test = df_test.drop(['src_comp'], axis=1)
      features.remove('src_comp')
      fit = OneHotEncoder().fit(df[features[:-1]])
    else:
      df_train = df_train.drop(['src_user'], axis=1)
      df_test = df_test.drop(['src_user'], axis=1)
      features.remove('src_user')
      fit = OneHotEncoder().fit(df[features[:-1]])
    
    
    X_train = fit.transform(df_train.to_numpy()).toarray()
    X_test = fit.transform(df_test.to_numpy()).toarray()
    
    return X_train, X_test, y_test

def oh_encoder_data_unified_auth(df, experimento=1, percentage=0.6):
    df = df.drop(['time'], axis=1)
    features = ['computer_source', 'user', 'computer_dest', 'label']
    
    separator = int(percentage*len(df))

    df_train = df[features][df.label == 0][0:separator]
    #df_test = df[features][df.label == 1]
    df_test = df[features][~df[features].computer_dest.isin(df_train.computer_dest.values) & \
                           ~df[features].user.isin(df_train.user.values)]
    
    y_test = df_test.label.values
    
    df_train = df_train.drop(['label'], axis=1)
    df_test = df_test.drop(['label'], axis=1)
    
    if(experimento == 1):
      df_train = df_train.drop(['computer_source'], axis=1)
      df_test = df_test.drop(['computer_source'], axis=1)
      features.remove('computer_source')
      fit = OneHotEncoder().fit(df[features[:-1]])
    else:
      df_train = df_train.drop(['user'], axis=1)
      df_test = df_test.drop(['user'], axis=1)
      features.remove('user')
      fit = OneHotEncoder().fit(df[features[:-1]])
    
    
    X_train = fit.transform(df_train.to_numpy()).toarray()
    X_test = fit.transform(df_test.to_numpy()).toarray()
    
    return X_train, X_test, y_test

def get_percentile(scores, y_true):
    out_perc = np.size(np.where(y_true==1)) / len(y_true)
    per = np.percentile(scores, 100 * (1 - out_perc))
    return per

def standardizer(X_train, X_test, type='standard'):
    if type == 'standard':
        scaler = StandardScaler().fit(X_train)
    elif type == 'minmax' or type == 'min_max':
        scaler = MinMaxScaler().fit(X_train)
        
    return scaler.transform(X_train), scaler.transform(X_test)

class Lof(LocalOutlierFactor):
    def fit(self, X_train, y=None):
        self.X_train = X_train
        super().fit(X=X_train, y=y)
        return self

    def decision_function(self, X_test):
        return super().decision_function(X_test)

    def predict_proba(self, X_test):
        nof = self.negative_outlier_factor_ * -1
        test_scores = self.decision_function(X_test) * -1
        scaler = MinMaxScaler().fit(nof.reshape(-1, 1))
        proba = scaler.transform(test_scores.reshape(-1, 1))
        return proba.clip(0, 1)

# Handling datasets

In [4]:
df_700k = pd.read_csv("lanl-comprehensive_700k.csv")

FileNotFoundError: ignored

In [13]:
df_700k.head()

Unnamed: 0,time,src_user,src_comp,dst_comp,is_malicious
0,1.0,ANONYMOUS LOGON@C586,C1250,C586,0
1,1.0,ANONYMOUS LOGON@C586,C586,C586,0
2,1.0,C1021$@DOM1,C1021,C625,0
3,1.0,C1035$@DOM1,C1035,C586,0
4,1.0,C1035$@DOM1,C586,C586,0


In [5]:
df_unified_auth = pd.read_csv("lanl-comprehensive-unified-auth.csv")

In [6]:
df_unified_auth.head()

Unnamed: 0,time,computer_source,user,computer_dest,label
0,87151,C288750,U451666,C313779,0
1,87451,C288750,U451666,C313779,0
2,100695,C206546,U457678,C639081,0
3,100695,C206546,U457678,C639081,0
4,101274,C206546,U416717,C825721,0


## Experimento 1 (lanl-comprehensive_700k)

In [14]:
X_train_Ex1, X_test_Ex1, y_test_Ex1 = oh_encoder_data700k(df_700k, experimento=1, n=50000)

  "X does not have valid feature names, but"
  "X does not have valid feature names, but"


In [15]:
X_train_norm_Ex1, X_test_norm_Ex1 = standardizer(X_train_Ex1, X_test_Ex1)

## Experimento 2 (lanl-comprehensive_700k)

In [16]:
X_train_Ex2, X_test_Ex2, y_test_Ex2 = oh_encoder_data700k(df_700k, experimento=2, n=50000)

  "X does not have valid feature names, but"
  "X does not have valid feature names, but"


In [17]:
X_train_norm_Ex2, X_test_norm_Ex2 = standardizer(X_train_Ex2, X_test_Ex2)

## Experimento 3 (lanl-comprehensive_unified-auth)

In [7]:
X_train_Ex3, X_test_Ex3, y_test_Ex3 = oh_encoder_data_unified_auth(df_unified_auth, experimento=1)

  "X does not have valid feature names, but"
  "X does not have valid feature names, but"


In [8]:
X_train_norm_Ex3, X_test_norm_Ex3 = standardizer(X_train_Ex3, X_test_Ex3)

## Experimento 4 (lanl-comprehensive_unified-auth)

In [9]:
X_train_Ex4, X_test_Ex4, y_test_Ex4 = oh_encoder_data_unified_auth(df_unified_auth, experimento=2)

  "X does not have valid feature names, but"
  "X does not have valid feature names, but"


In [10]:
X_train_norm_Ex4, X_test_norm_Ex4 = standardizer(X_train_Ex4, X_test_Ex4)

# Train

## Experimento 1 (lanl-comprehensive_700k)

In [None]:
%%time
model_Ex1 = Lof(n_neighbors=10, novelty= True)
model_Ex1.fit(X_train_norm_Ex1)

#train_scores = model.negative_outlier_factor_ * -1
#test_scores = model.decision_function(X_test_norm) * -1

test_scores_Ex1 = model_Ex1.predict_proba(X_test_norm_Ex1)

## Experimento 2 (lanl-comprehensive_700k)

In [None]:
%%time
model_Ex2 = Lof(n_neighbors=10, novelty= True)
model_Ex2.fit(X_train_norm_Ex2)

#train_scores = model.negative_outlier_factor_ * -1
#test_scores = model.decision_function(X_test_norm) * -1

test_scores_Ex2 = model_Ex2.predict_proba(X_test_norm_Ex2)

CPU times: user 19min 19s, sys: 9.27 s, total: 19min 28s
Wall time: 10min 28s


## Experimento 3 (lanl-comprehensive_unified-auth)

In [11]:
%%time
model_Ex3 = Lof(n_neighbors=10, novelty= True)
model_Ex3.fit(X_train_norm_Ex3)

#train_scores = model.negative_outlier_factor_ * -1
#test_scores = model.decision_function(X_test_norm) * -1

test_scores_Ex3 = model_Ex3.predict_proba(X_test_norm_Ex3)

CPU times: user 2min 10s, sys: 4.67 s, total: 2min 14s
Wall time: 1min 42s


## Experimento 4 (lanl-comprehensive_unified-auth)

In [None]:
%%time
model_Ex4 = Lof(n_neighbors=10, novelty= True)
model_Ex4.fit(X_train_norm_Ex4)

#train_scores = model.negative_outlier_factor_ * -1
#test_scores = model.decision_function(X_test_norm) * -1

test_scores_Ex4 = model_Ex4.predict_proba(X_test_norm_Ex4)

# Metrics

## Experimento 1 (lanl-comprehensive_700k)

In [None]:
roc = roc_auc_score(y_test_Ex1, test_scores_Ex1)
ap = average_precision_score(y_test_Ex1, test_scores_Ex1)

print(f"ROC score: {roc}")
print(f"Average precision score: {ap}")

ROC score: 0.7538884524744698
Average precision score: 0.644274413855101


In [None]:
per = get_percentile(test_scores_Ex1, y_test_Ex1.astype(int))    
y_pred = (test_scores_Ex1>=per)

roc_score_auc = roc_auc_score(y_test_Ex1.astype(int), y_pred.astype(int)) 
average_precision = average_precision_score(y_test_Ex1.astype(int), y_pred.astype(int))

precision, recall, f1, _ = precision_recall_fscore_support(y_test_Ex1.astype(int),
                                                            y_pred.astype(int),
                                                            average='binary')
print(f"F1 {f1}")
print(f"Recall: {recall}")

F1 0.5864197530864198
Recall: 1.0


In [None]:
tn, fp, fn, tp = confusion_matrix(y_test_Ex1, y_pred).ravel()

In [None]:
print(f"True positives: {tp}")
print(f"False positives: {fp}")

True positives: 285
False positives: 402


## Experimento 2 (lanl-comprehensive_700k)

In [None]:
roc = roc_auc_score(y_test_Ex2, test_scores_Ex2)
ap = average_precision_score(y_test_Ex2, test_scores_Ex2)

print(f"ROC score: {roc}")
print(f"Average precision score: {ap}")

ROC score: 0.4159334904425242
Average precision score: 0.44427468057272906


In [None]:
per = get_percentile(test_scores_Ex2, y_test_Ex2.astype(int))    
y_pred = (test_scores_Ex2>=per)

roc_score_auc = roc_auc_score(y_test_Ex2.astype(int), y_pred.astype(int)) 
average_precision = average_precision_score(y_test_Ex2.astype(int), y_pred.astype(int))

precision, recall, f1, _ = precision_recall_fscore_support(y_test_Ex2.astype(int),
                                                            y_pred.astype(int),
                                                            average='binary')
print(f"F1 {f1}")
print(f"Recall: {recall}")

F1 0.3868954758190328
Recall: 0.43508771929824563


In [None]:
tn, fp, fn, tp = confusion_matrix(y_test_Ex2, y_pred).ravel()

In [None]:
print(f"True positives: {tp}")
print(f"False positives: {fp}")

True positives: 124
False positives: 232


## Experimento 3 (lanl-comprehensive_unified-auth)

In [None]:
roc_Ex3 = roc_auc_score(y_test_Ex3, test_scores_Ex3)
ap_Ex3 = average_precision_score(y_test_Ex3, test_scores_Ex3)

print(f"ROC score: {roc_Ex3}")
print(f"Average precision score: {ap_Ex3}")

In [None]:
per = get_percentile(test_scores_Ex3, y_test_Ex3.astype(int))    
y_pred_Ex3 = (test_scores_Ex3>=per)

roc_score_auc_Ex3 = roc_auc_score(y_test_Ex3.astype(int), y_pred_Ex3.astype(int)) 
average_precision_Ex3 = average_precision_score(y_test_Ex3.astype(int), y_pred_Ex3.astype(int))

precision_Ex3, recall_Ex3, f1_Ex3, _ = precision_recall_fscore_support(y_test_Ex3.astype(int),
                                                            y_pred_Ex3.astype(int),
                                                            average='binary')
print(f"F1 {f1_Ex3}")
print(f"Recall: {recall_Ex3}")

In [None]:
tn_Ex3, fp_Ex3, fn_Ex3, tp_Ex3 = confusion_matrix(y_test_Ex3, y_pred_Ex3).ravel()

In [None]:
print(f"True positives: {tp_Ex3}")
print(f"False positives: {fp_Ex3}")

## Experimento 4 (lanl-comprehensive_unified-auth)

In [None]:
ap_Ex4 = average_precision_score(y_test_Ex4, test_scores_Ex4)
roc_Ex4 = roc_auc_score(y_test_Ex4, test_scores_Ex4)

print(f"ROC score: {roc_Ex4}")
print(f"Average precision score: {ap_Ex4}")

In [None]:
per = get_percentile(test_scores_Ex4, y_test_Ex4.astype(int))    
y_pred_Ex4 = (test_scores_Ex4>=per)

roc_score_auc_Ex4 = roc_auc_score(y_test_Ex4.astype(int), y_pred_Ex4.astype(int)) 
average_precision_Ex4 = average_precision_score(y_test_Ex4.astype(int), y_pred_Ex4.astype(int))

precision_Ex4, recall_Ex4, f1_Ex4, _ = precision_recall_fscore_support(y_test_Ex4.astype(int),
                                                            y_pred_Ex4.astype(int),
                                                            average='binary')
print(f"F1 {f1_Ex4}")
print(f"Recall: {recall_Ex4}")

In [None]:
tn_Ex4, fp_Ex4, fn_Ex4, tp_Ex4 = confusion_matrix(y_test_Ex4, y_pred_Ex4).ravel()

In [None]:
print(f"True positives: {tp_Ex4}")
print(f"False positives: {fp_Ex4}")

# Graphic representation

In [None]:
experimentos = ['Experimento 1','Experimento 2','Experimento 3','Experimento 4' ]
tps = [tp_Ex1,tp_Ex2,tp_Ex3,tp_Ex4]
fps = [fp_Ex1,fp_Ex2,fp_Ex3,fp_Ex4]

fig, axs = plt.subplots(1,3,sharex=True,figsize =(18, 8))

barWidth = 0.25
br1 = np.arange(len(experimentos))
br2 = [x + barWidth for x in br1]

axs.flat[0].bar(br1,tps,color='r',width = barWidth,label="tp")
axs.flat[0].bar(br2,fps,color='b',width = barWidth,label="fp")
axs.flat[0].set_xticks([r + barWidth for r in range(len(experimentos))],
        experimentos)
axs.flat[0].legend()