In [3]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import RobustScaler
from sklearn.preprocessing import LabelEncoder

def prepare_nsl_kdd_data(train_path, test_path, validation_split=0.25, random_state=42):
    """Load the NSL-KDD train/test files and build binary-classification splits.

    Both files are read as header-less, comma-separated tables with the 43
    column names listed below. The target is 0 when ``attack == 'normal'`` and
    1 otherwise. The categorical columns ``protocol_type``, ``service`` and
    ``flag`` are one-hot encoded, and the test frame is aligned to the training
    columns so every returned feature matrix has the same layout.

    The training file is split into a fitting part and a validation part. A
    ``RobustScaler`` is fitted on the fitting part only and then applied to the
    validation part and to the test set.

    Parameters
    ----------
    train_path : str or path-like
        Location of the training file.
    test_path : str or path-like
        Location of the test file.
    validation_split : float or int, default 0.25
        Passed to ``train_test_split`` as ``test_size``: the share of the
        training file held out for validation.
    random_state : int, default 42
        Seed passed to ``train_test_split``.

    Returns
    -------
    tuple
        ``(X_train_train, Y_train_train, X_test_train, Y_test_train, X_test, Y_test)``

        * ``X_train_train`` / ``X_test_train`` -- scaled arrays for the fitting
          and validation parts of the training file.
        * ``Y_train_train`` / ``Y_test_train`` -- matching binary label Series.
        * ``X_test`` -- scaled DataFrame for the test file, with the training
          column layout.
        * ``Y_test`` -- binary label Series for the test file.
    """
    # Define column names
    columns = ['duration','protocol_type','service','flag','src_bytes','dst_bytes','land','wrong_fragment','urgent','hot',
               'num_failed_logins','logged_in','num_compromised','root_shell','su_attempted','num_root','num_file_creations',
               'num_shells','num_access_files','num_outbound_cmds','is_host_login','is_guest_login','count','srv_count',
               'serror_rate','srv_serror_rate','rerror_rate','srv_rerror_rate','same_srv_rate','diff_srv_rate','srv_diff_host_rate',
               'dst_host_count','dst_host_srv_count','dst_host_same_srv_rate','dst_host_diff_srv_rate','dst_host_same_src_port_rate',
               'dst_host_srv_diff_host_rate','dst_host_serror_rate','dst_host_srv_serror_rate','dst_host_rerror_rate',
               'dst_host_srv_rerror_rate','attack','level']
    categorical_columns = ['protocol_type', 'service', 'flag']
    non_feature_columns = ['attack', 'level', 'attack_state']

    # Load data
    train_df = pd.read_csv(train_path, sep=",", names=columns)
    test_df = pd.read_csv(test_path, sep=",", names=columns)

    # Binary target: 0 for normal traffic, 1 for any attack
    train_df['attack_state'] = train_df['attack'].map(lambda a: 0 if a == 'normal' else 1)
    test_df['attack_state'] = test_df['attack'].map(lambda a: 0 if a == 'normal' else 1)

    # One-hot encoding
    train_df = pd.get_dummies(train_df, columns=categorical_columns, prefix="", prefix_sep="")
    test_df = pd.get_dummies(test_df, columns=categorical_columns, prefix="", prefix_sep="")

    # Feature / target separation
    X_train = train_df.drop(non_feature_columns, axis=1)
    Y_train = train_df['attack_state']
    Y_test = test_df['attack_state']

    # The two files are encoded independently, so their dummy columns can
    # differ. Align the test frame to the training layout: categories missing
    # from the test file become all-zero columns, and categories unseen in
    # training are dropped.
    X_test = (
        test_df
        .drop(non_feature_columns, axis=1)
        .reindex(columns=X_train.columns, fill_value=0)
    )

    # Hold out part of the training file for validation
    X_train_train, X_test_train, Y_train_train, Y_test_train = train_test_split(
        X_train, Y_train, test_size=validation_split, random_state=random_state
    )

    # Data scaling: fit on the fitting split only, then reuse the same
    # statistics everywhere else to avoid leaking validation/test information.
    Ro_scaler = RobustScaler()
    X_train_train = Ro_scaler.fit_transform(X_train_train)
    X_test_train = Ro_scaler.transform(X_test_train)
    # Keep X_test a DataFrame (as before) while applying the train-fitted scaler.
    X_test = pd.DataFrame(
        Ro_scaler.transform(X_test), columns=X_test.columns, index=X_test.index
    )

    return (X_train_train, Y_train_train, X_test_train, Y_test_train, X_test, Y_test)

# Usage example: prepare the NSL-KDD splits from the bundled data files.
(
    X_train_train, Y_train_train,
    X_test_train, Y_test_train,
    X_test, Y_test,
) = prepare_nsl_kdd_data(
    train_path="nsl-kdd-data/KDDTrain+.txt",
    test_path="nsl-kdd-data/KDDTest+.txt",
)

In [4]:
import pickle

# NOTE: unpickling executes arbitrary code; only load model files you trust.
with open('Random_Forest.pkl', 'rb') as model_file:
    loaded_DT = pickle.load(model_file)

# Score the restored model on the fitting split first, then on the held-out
# validation split.
training_score, validation_score = (
    loaded_DT.score(features, labels)
    for features, labels in (
        (X_train_train, Y_train_train),
        (X_test_train, Y_test_train),
    )
)

print(f"Training Score: {training_score}")
print(f"Validation Score: {validation_score}")

Training Score: 0.9999788312746748
Validation Score: 0.9987934209690734
