In [12]:
import pandas as pd

training_data = pd.read_csv('training.csv')

In [14]:
training_data.head()

Unnamed: 0,duration,protocol_type,service,flag,src_bytes,dst_bytes,land,wrong_fragment,urgent,hot,...,dst_host_same_srv_rate,dst_host_diff_srv_rate,dst_host_same_src_port_rate,dst_host_srv_diff_host_rate,dst_host_serror_rate,dst_host_srv_serror_rate,dst_host_rerror_rate,dst_host_srv_rerror_rate,attack,last_flag
0,0,tcp,ftp_data,SF,491,0,0,0,0,0,...,0.17,0.03,0.17,0.0,0.0,0.0,0.05,0.0,normal,20
1,0,udp,other,SF,146,0,0,0,0,0,...,0.0,0.6,0.88,0.0,0.0,0.0,0.0,0.0,normal,15
2,0,tcp,private,S0,0,0,0,0,0,0,...,0.1,0.05,0.0,0.0,1.0,1.0,0.0,0.0,neptune,19
3,0,tcp,http,SF,232,8153,0,0,0,0,...,1.0,0.0,0.03,0.04,0.03,0.01,0.0,0.01,normal,21
4,0,tcp,http,SF,199,420,0,0,0,0,...,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,normal,21


In [33]:
temp = training_data.value_counts(['protocol_type', 'attack'])
temp = temp.rename('count')
temp = temp.reset_index()
temp

Unnamed: 0,protocol_type,attack,count
0,tcp,normal,53600
1,tcp,neptune,41214
2,udp,normal,12434
3,icmp,ipsweep,3117
4,tcp,portsweep,2926
5,icmp,smurf,2646
6,tcp,satan,2184
7,udp,satan,1417
8,icmp,normal,1309
9,icmp,nmap,981


In [35]:
import plotly.express as px
px.treemap(temp, path=['protocol_type', 'attack'], values='count')

In [3]:
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import QuantileTransformer
from sklearn.feature_extraction.text import CountVectorizer

def get_label(label):
    if label == 'normal':
        return 1;
    else:
        return -1;
    
from sklearn.base import BaseEstimator, TransformerMixin

class Get_top_categories(BaseEstimator, TransformerMixin):
    """Create a class to keep the top categories, the rest categories are labeled as 'other'
    """
    
    def __init__(self, top_num = 10): # no *args or **kargs
        """Create a class
        
        Arg:
            top_num (int), the number of top categories kept, default number is 10
        """
        self.top_num = top_num
    
    def fit(self, X, y = None):
        """Fit the class
        
        Arg:
            X (Pandas.Series), a column of a Pandas.DataFrame
            y (None), not used
        """
        temp = X.value_counts()
        self.columns = list(temp[:self.top_num].index)
        return self
    
    def containe(self, s):
        """Process record
        
        Arg:
            s (str), a recod in the categorical column
            
        Return:
            str, return the same string is a recod in the top category list; otherwise, return 'other'
        """
        if s in self.columns:
            return s
        else:
            return 'other_category'
        
    def transform(self, X):
        """Convert a specific categorical column
        
        Arg:
            X (Pandas.Series), a column of a Pandas.DataFrame
            
        Return:
            Pandas.Series, processed column
        """
        temp = X.apply(self.containe)
        return temp
    
class DoNothing(BaseEstimator, TransformerMixin):
    """Do not change anything"""
    def __init__(self):
        pass
    def fit(self, X, y=None):
        return self
    def transform(self, X):
        temp = X.copy()
        return temp
    
# process numerical features
num_pipeline = Pipeline([
    ('std_scaler', StandardScaler()),
])

num_pipeline_gaussian = Pipeline([
    ('quantile', QuantileTransformer(output_distribution='normal', random_state=0)),
    #('std_scaler', StandardScaler()),  
])

# process categorical features with bag of words
cat_pipeline = Pipeline([
    ('bag_of_words', CountVectorizer()),
])

cat_pipeline_five = Pipeline([
    ('more_than_five', Get_top_categories(top_num=5)),
    ('bag_of_words', CountVectorizer()),    
])

cat_pipeline_ten = Pipeline([
    ('more_than_ten', Get_top_categories()),
    ('bag_of_words', CountVectorizer()),    
])

# do not change features
do_nothing_pipeline = Pipeline([
    ('do_nothing', DoNothing())
])

from sklearn.compose import ColumnTransformer

preprocess_pipeline = ColumnTransformer([
        ("num_pipeline_guassion", num_pipeline_gaussian, ['duration', 'src_bytes', 'dst_bytes', 'hot', 'num_compromised', 'num_root', 'num_file_creations', 'num_access_files', 'count', 'srv_count', 'dst_host_count', 'dst_host_srv_count']), # 3, pass a DataFrame to num_pipeline
        ("cat_pipeline_protocol_type", cat_pipeline, 'protocol_type'), # 3, pass a Series to cat_pipeline
        ("cat_pipeline_service", cat_pipeline_ten, 'service'), # 11, pass a Series to cat_pipeline_ten  
        ("cat_pipeline_flag", cat_pipeline_five, 'flag'), # 6, pass a Series to cat_pipeline_ten
        ("do_nothing", do_nothing_pipeline, ['land', 'wrong_fragment', 'urgent', 'num_failed_logins', 'logged_in', 'root_shell', 'su_attempted', 'num_shells', 'num_outbound_cmds', 'is_host_login', 'is_guest_login', 'serror_rate', 'srv_serror_rate', 'rerror_rate', 'srv_rerror_rate', 'same_srv_rate', 'diff_srv_rate', 'srv_diff_host_rate', 'dst_host_same_srv_rate', 'dst_host_diff_srv_rate', 'dst_host_same_src_port_rate', 'dst_host_srv_diff_host_rate', 'dst_host_serror_rate', 'dst_host_srv_serror_rate', 'dst_host_rerror_rate', 'dst_host_srv_rerror_rate']) # 1, pass a DataFrame to num_pipeline
    ])

In [6]:
num_features = ['duration', 'src_bytes', 'dst_bytes', 'hot', 'num_compromised', 'num_root', 'num_file_creations', 'num_access_files', 'count', 'srv_count', 'dst_host_count', 'dst_host_srv_count'] # 3, pass a DataFrame to num_pipeline
cat_features = ['protocol_type', 'service', 'flag', 'land', 'wrong_fragment', 'urgent', 'num_failed_logins', 'logged_in', 'root_shell', 'su_attempted', 'num_shells', 'num_outbound_cmds', 'is_host_login', 'is_guest_login', 'serror_rate', 'srv_serror_rate', 'rerror_rate', 'srv_rerror_rate', 'same_srv_rate', 'diff_srv_rate', 'srv_diff_host_rate', 'dst_host_same_srv_rate', 'dst_host_diff_srv_rate', 'dst_host_same_src_port_rate', 'dst_host_srv_diff_host_rate', 'dst_host_serror_rate', 'dst_host_srv_serror_rate', 'dst_host_rerror_rate', 'dst_host_srv_rerror_rate']

In [13]:
training_data.shape

(125973, 43)

In [8]:
features = num_features + cat_features
len(features)

41

In [11]:
training_data = pd.read_csv('training.csv')
train_x = preprocess_pipeline.fit_transform(training_data) 
train_y = training_data['attack'].apply(get_label)

NameError: name 'pd' is not defined

In [27]:
from sklearn.neighbors import LocalOutlierFactor
lof = LocalOutlierFactor(novelty=True, n_neighbors = 200, algorithm = 'auto', metric = 'manhattan')
lof.fit(train_x[train_y==1])
predict = lof.predict(train_x)

In [28]:
from sklearn.metrics import classification_report
print(classification_report(train_y, predict, target_names=['normal', 'malicious']))

              precision    recall  f1-score   support

      normal       0.88      0.97      0.92     58630
   malicious       0.97      0.88      0.92     67343

    accuracy                           0.92    125973
   macro avg       0.92      0.93      0.92    125973
weighted avg       0.93      0.92      0.92    125973



In [29]:
import joblib
joblib.dump(preprocess_pipeline, 'preprocessing.pkl')
joblib.dump(lof, 'lof.pkl')

['lof.pkl']

In [30]:
preprocessing_pipeline = joblib.load('preprocessing.pkl')
lof = joblib.load('lof.pkl')

In [31]:
test_data = pd.read_csv('test.csv')
test_x = preprocess_pipeline.transform(test_data) 
test_y = test_data['attack'].apply(get_label)

In [32]:
test_pred = lof.predict(test_x)
print(classification_report(test_y, test_pred, target_names=['normal', 'malicious']))

              precision    recall  f1-score   support

      normal       0.87      0.89      0.88     12833
   malicious       0.85      0.83      0.84      9711

    accuracy                           0.86     22544
   macro avg       0.86      0.86      0.86     22544
weighted avg       0.86      0.86      0.86     22544



In [36]:
test_y

NameError: name 'test_y' is not defined