In [10]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import RobustScaler

def prepare_nsl_kdd_data(train_path, test_path, validation_split=0.25, random_state=42,
                         expected_feature_count=124):
    """Load the NSL-KDD train/test files and build scaled train/validation/test sets.

    Both files are read as header-less, comma-separated files using the fixed
    column list below. A binary ``attack_state`` target is derived from the
    ``attack`` column (0 when it equals ``'normal'``, 1 otherwise), the three
    categorical columns are one-hot encoded, and the two frames are aligned to
    the same alphabetically sorted set of columns (categories missing from one
    file become all-zero columns). The feature matrices are then zero-padded up
    to ``expected_feature_count`` columns, the training data is split into a
    train and a validation part, and a ``RobustScaler`` fitted on the train part
    only is applied to all three feature matrices.

    Parameters
    ----------
    train_path : str or path-like
        Location of the training file.
    test_path : str or path-like
        Location of the test file.
    validation_split : float, default 0.25
        Passed to ``train_test_split`` as ``test_size``.
    random_state : int, default 42
        Passed to ``train_test_split`` as ``random_state``.
    expected_feature_count : int, default 124
        Number of feature columns the result must have. If fewer features are
        present, all-zero ``added_feature_<i>`` columns are appended.

    Returns
    -------
    tuple
        ``(X_train_scaled, Y_train, X_validation_scaled, Y_validation,
        X_test_scaled, Y_test)``. The ``X`` items are the outputs of the
        scaler; the ``Y`` items are pandas Series of 0/1 values.

    Raises
    ------
    AssertionError
        If, after padding, the feature count differs from
        ``expected_feature_count`` (i.e. the data has more features than expected).
    """
    # Define column names
    columns = ['duration','protocol_type','service','flag','src_bytes','dst_bytes','land','wrong_fragment','urgent','hot',
               'num_failed_logins','logged_in','num_compromised','root_shell','su_attempted','num_root','num_file_creations',
               'num_shells','num_access_files','num_outbound_cmds','is_host_login','is_guest_login','count','srv_count',
               'serror_rate','srv_serror_rate','rerror_rate','srv_rerror_rate','same_srv_rate','diff_srv_rate','srv_diff_host_rate',
               'dst_host_count','dst_host_srv_count','dst_host_same_srv_rate','dst_host_diff_srv_rate','dst_host_same_src_port_rate',
               'dst_host_srv_diff_host_rate','dst_host_serror_rate','dst_host_srv_serror_rate','dst_host_rerror_rate',
               'dst_host_srv_rerror_rate','attack','level']
    categorical_columns = ['protocol_type', 'service', 'flag']
    drop_columns = ['attack', 'level', 'attack_state']

    def _load_and_encode(path):
        """Read one file, add the binary target and one-hot encode the categoricals."""
        frame = pd.read_csv(path, sep=",", names=columns)
        # 0 = normal traffic, 1 = any kind of attack
        frame['attack_state'] = (frame['attack'] != 'normal').astype('int64')
        return pd.get_dummies(frame, columns=categorical_columns,
                              prefix=categorical_columns, prefix_sep="_")

    train_df_encoded = _load_and_encode(train_path)
    test_df_encoded = _load_and_encode(test_path)

    # Align both frames to the same, deterministically ordered column set in one
    # step; a category seen in only one file becomes an all-zero column in the other.
    aligned_columns = sorted(set(train_df_encoded.columns) | set(test_df_encoded.columns))
    train_df_encoded = train_df_encoded.reindex(columns=aligned_columns, fill_value=0)
    test_df_encoded = test_df_encoded.reindex(columns=aligned_columns, fill_value=0)

    # Prepare features and target
    X_train = train_df_encoded.drop(drop_columns, axis=1)
    Y_train = train_df_encoded['attack_state']
    X_test = test_df_encoded.drop(drop_columns, axis=1)
    Y_test = test_df_encoded['attack_state']

    # Pad with all-zero columns up to the expected width.
    # NOTE(review): padding only makes the *shape* match. It does not guarantee that
    # the column order/meaning agrees with whatever data a downstream model was
    # trained on -- confirm against the model's training pipeline.
    missing_feature_count = expected_feature_count - X_train.shape[1]
    if missing_feature_count > 0:
        padded_columns = list(X_train.columns) + [
            f'added_feature_{i}' for i in range(missing_feature_count)
        ]
        X_train = X_train.reindex(columns=padded_columns, fill_value=0)
        X_test = X_test.reindex(columns=padded_columns, fill_value=0)

    # Ensure we have the expected number of features
    assert X_train.shape[1] == expected_feature_count, \
        f"Expected {expected_feature_count} features, but got {X_train.shape[1]}"
    assert X_test.shape[1] == expected_feature_count, \
        f"Expected {expected_feature_count} features, but got {X_test.shape[1]}"

    # Split training data into train and validation sets
    X_train_train, X_test_train, Y_train_train, Y_test_train = train_test_split(
        X_train, Y_train, test_size=validation_split, random_state=random_state)

    # Fit the scaler on the train part only so validation/test statistics do not leak in
    scaler = RobustScaler()
    X_train_train_scaled = scaler.fit_transform(X_train_train)
    X_test_train_scaled = scaler.transform(X_test_train)
    X_test_scaled = scaler.transform(X_test)

    return (X_train_train_scaled, Y_train_train, X_test_train_scaled, Y_test_train, X_test_scaled, Y_test)

# Usage example: build the scaled train / validation / test sets from the NSL-KDD files.
# X_test_train / Y_test_train hold the validation part split off the training file.
(X_train_train, Y_train_train,
 X_test_train, Y_test_train,
 X_test, Y_test) = prepare_nsl_kdd_data(
    train_path="nsl-kdd-data/KDDTrain+.txt",
    test_path="nsl-kdd-data/KDDTest+.txt",
)

In [9]:
import pickle
from sklearn.metrics import accuracy_score

def load_model_and_predict(model_path, X_train, Y_train, X_test, Y_test):
    """Unpickle a model and report its accuracy on two labelled datasets.

    Parameters
    ----------
    model_path : str or path-like
        File containing the pickled model; the loaded object must expose ``predict``.
    X_train, Y_train
        Features and true labels of the first dataset.
    X_test, Y_test
        Features and true labels of the second dataset.

    Returns
    -------
    tuple
        ``(train_score, test_score)`` as computed by ``accuracy_score``.
    """
    # NOTE: unpickling can execute arbitrary code -- only load model files you trust.
    with open(model_path, 'rb') as model_file:
        model = pickle.load(model_file)

    # Predict on both datasets first, then score each against its true labels.
    predictions = [model.predict(features) for features in (X_train, X_test)]
    scores = [
        accuracy_score(truth, guess)
        for truth, guess in zip((Y_train, Y_test), predictions)
    ]
    return tuple(scores)

# Usage
model_path = "Random_Forest.pkl"  # Path to your saved model

# The second dataset passed here is the validation split taken from the training
# file, so the "Testing Score" printed below is measured on that split.
train_score, test_score = load_model_and_predict(
    model_path,
    X_train=X_train_train,
    Y_train=Y_train_train,
    X_test=X_test_train,
    Y_test=Y_test_train,
)

print(f"Training Score: {train_score}", f"Testing Score: {test_score}", sep="\n")

Training Score: 0.5352406354851342
Testing Score: 0.5325776338350162


In [11]:
import pickle
from sklearn.metrics import accuracy_score
import numpy as np

def load_model_and_predict(model_path, X_train, Y_train, X_test, Y_test):
    """Unpickle a model, score it on two datasets and label its predictions.

    Parameters
    ----------
    model_path : str or path-like
        File containing the pickled model; the loaded object must expose ``predict``.
    X_train, Y_train
        Features and true labels of the first dataset.
    X_test, Y_test
        Features and true labels of the second dataset.

    Returns
    -------
    tuple
        ``(train_score, test_score, train_labels, test_labels)`` where the scores
        come from ``accuracy_score`` and the label items are NumPy arrays holding
        ``'normal'`` for prediction 0 and ``'attack'`` for prediction 1.

    Raises
    ------
    KeyError
        If the model predicts a value other than 0 or 1.
    """
    # NOTE: unpickling can execute arbitrary code -- only load model files you trust.
    with open(model_path, 'rb') as model_file:
        model = pickle.load(model_file)

    # Predict on both datasets first, then score each against its true labels.
    train_guess, test_guess = (model.predict(features) for features in (X_train, X_test))
    train_score = accuracy_score(Y_train, train_guess)
    test_score = accuracy_score(Y_test, test_guess)

    # Translate the numeric classes into human-readable names.
    class_names = {0: 'normal', 1: 'attack'}

    def _as_labels(codes):
        return np.array(list(map(class_names.__getitem__, codes)))

    return train_score, test_score, _as_labels(train_guess), _as_labels(test_guess)

# Usage
model_path = "Random_Forest.pkl"  # Path to your saved model

# The second dataset passed here is the validation split taken from the training file.
(train_score,
 test_score,
 train_pred_labels,
 test_pred_labels) = load_model_and_predict(
    model_path,
    X_train=X_train_train,
    Y_train=Y_train_train,
    X_test=X_test_train,
    Y_test=Y_test_train,
)

# Optional inspection, left disabled so the cell does not flood the output.
# Slice the label arrays to really show only the first 10 entries:
# print(f"Training Score: {train_score}")
# print(f"Testing Score: {test_score}")
# print(f"Train Predictions (first 10): {train_pred_labels[:10]}")
# print(f"Test Predictions (first 10): {test_pred_labels[:10]}")

Training Score: 0.5351982980344838
Testing Score: 0.5325776338350162
Train Predictions (first 10): ['normal' 'normal' 'normal' ... 'normal' 'normal' 'normal']
Test Predictions (first 10): ['normal' 'normal' 'normal' ... 'normal' 'normal' 'normal']
normal
normal
normal
normal
normal
normal
normal
normal
normal
normal
normal
normal
normal
normal
normal
normal
normal
normal
normal
normal
normal
normal
normal
normal
normal
normal
normal
normal
normal
normal
normal
normal
normal
normal
normal
normal
normal
normal
normal
normal
normal
normal
normal
normal
normal
normal
normal
normal
normal
normal
normal
normal
normal
normal
normal
normal
normal
normal
normal
normal
normal
normal
normal
normal
normal
normal
normal
normal
normal
normal
normal
normal
normal
normal
normal
normal
normal
normal
normal
normal
normal
normal
normal
normal
normal
normal
normal
normal
normal
normal
normal
normal
normal
normal
normal
normal
normal
normal
normal
normal
normal
normal
normal
normal
normal
normal
normal
nor