In [21]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import RobustScaler

def prepare_nsl_kdd_data(train_path, test_path, validation_split=0.25, random_state=42,
                         expected_feature_count=124, verbose=True):
    """Load the NSL-KDD train/test files and build scaled feature sets.

    Steps performed:
      1. Read both CSV files with the 43 column names listed below.
      2. Derive a binary ``attack_state`` label (0 when ``attack == 'normal'``,
         1 otherwise).
      3. One-hot encode ``protocol_type``, ``service`` and ``flag``; categories
         seen in only one of the two files become all-zero columns in the
         other, and columns are sorted by name so both frames share one layout.
      4. Optionally pad both feature frames with all-zero ``added_feature_<i>``
         columns until they have ``expected_feature_count`` columns.
      5. Split the training file into train / validation parts.
      6. Fit a ``RobustScaler`` on the train part only and apply it to the
         validation part and to the test file.

    Parameters
    ----------
    train_path, test_path : path-like
        Locations of the training and test CSV files (no header row).
    validation_split : float, default 0.25
        Passed to ``train_test_split`` as ``test_size``.
    random_state : int, default 42
        Seed passed to ``train_test_split``.
    expected_feature_count : int or None, default 124
        Number of feature columns required after padding. ``None`` disables
        both the zero-padding and the final shape check.
    verbose : bool, default True
        Print the feature diagnostics (counts and the full column list).

    Returns
    -------
    tuple
        ``(X_train_scaled, Y_train, X_val_scaled, Y_val, X_test_scaled, Y_test)``.
        The ``X_*`` items are the scaler's outputs; the ``Y_*`` items are
        pandas Series of 0/1 labels.

    Raises
    ------
    AssertionError
        If ``expected_feature_count`` is given and the final number of feature
        columns differs from it (e.g. the data already has more columns).
    """
    # Define column names
    columns = ['duration','protocol_type','service','flag','src_bytes','dst_bytes','land','wrong_fragment','urgent','hot',
               'num_failed_logins','logged_in','num_compromised','root_shell','su_attempted','num_root','num_file_creations',
               'num_shells','num_access_files','num_outbound_cmds','is_host_login','is_guest_login','count','srv_count',
               'serror_rate','srv_serror_rate','rerror_rate','srv_rerror_rate','same_srv_rate','diff_srv_rate','srv_diff_host_rate',
               'dst_host_count','dst_host_srv_count','dst_host_same_srv_rate','dst_host_diff_srv_rate','dst_host_same_src_port_rate',
               'dst_host_srv_diff_host_rate','dst_host_serror_rate','dst_host_srv_serror_rate','dst_host_rerror_rate',
               'dst_host_srv_rerror_rate','attack','level']
    categorical_columns = ['protocol_type', 'service', 'flag']
    # Columns that describe the label, not the traffic; never fed to the model.
    label_columns = ['attack', 'level', 'attack_state']

    def _load_and_encode(path):
        """Read one file, add the binary label and one-hot encode categoricals."""
        frame = pd.read_csv(path, sep=",", names=columns)
        frame['attack_state'] = frame['attack'].ne('normal').astype('int64')
        return pd.get_dummies(frame, columns=categorical_columns,
                              prefix=categorical_columns, prefix_sep="_")

    train_encoded = _load_and_encode(train_path)
    test_encoded = _load_and_encode(test_path)

    # Give both frames the same, name-sorted column layout; a category that is
    # present in only one file becomes an all-zero column in the other.
    shared_columns = sorted(set(train_encoded.columns) | set(test_encoded.columns))
    train_encoded = train_encoded.reindex(columns=shared_columns, fill_value=0)
    test_encoded = test_encoded.reindex(columns=shared_columns, fill_value=0)

    # Prepare features and target
    X_train = train_encoded.drop(columns=label_columns)
    Y_train = train_encoded['attack_state']
    X_test = test_encoded.drop(columns=label_columns)
    Y_test = test_encoded['attack_state']

    # Pad with all-zero columns up to the requested width.
    # NOTE(review): padding only makes the *shape* match. It does not guarantee
    # that column order/meaning agrees with whatever model consumes these
    # features -- verify against the pipeline that produced that model.
    current_feature_count = X_train.shape[1]
    if expected_feature_count is None:
        missing_feature_count = 0
    else:
        missing_feature_count = expected_feature_count - current_feature_count

    if missing_feature_count > 0:
        padded_columns = list(X_train.columns) + [
            f'added_feature_{i}' for i in range(missing_feature_count)
        ]
        X_train = X_train.reindex(columns=padded_columns, fill_value=0)
        X_test = X_test.reindex(columns=padded_columns, fill_value=0)

    if verbose:
        # Detailed diagnostic information
        print(f"Total number of features: {X_train.shape[1]}")
        print(f"Number of original features: {current_feature_count}")
        print(f"Number of added zero features: {missing_feature_count}")
        for prefix in categorical_columns:
            category_count = sum(col.startswith(f'{prefix}_') for col in X_train.columns)
            print(f"Number of {prefix} categories: {category_count}")

        # List all feature names
        print("All feature names:")
        print(X_train.columns.tolist())

    # Enforce the expected width. Raised explicitly (rather than via `assert`)
    # so the check still runs when Python is started with -O.
    if expected_feature_count is not None:
        for features in (X_train, X_test):
            if features.shape[1] != expected_feature_count:
                raise AssertionError(
                    f"Expected {expected_feature_count} features, but got {features.shape[1]}"
                )

    # Split training data into train and validation sets
    X_fit, X_val, Y_fit, Y_val = train_test_split(
        X_train, Y_train,
        test_size=validation_split,
        random_state=random_state,
    )

    # Fit the scaler on the training part only so that no statistics from the
    # validation or test data leak into the transformation.
    scaler = RobustScaler()
    X_fit_scaled = scaler.fit_transform(X_fit)
    X_val_scaled = scaler.transform(X_val)
    X_test_scaled = scaler.transform(X_test)

    return (X_fit_scaled, Y_fit, X_val_scaled, Y_val, X_test_scaled, Y_test)

# Usage example: build the scaled train / validation / test sets from the
# NSL-KDD files. The "*_train_train" names hold the training part and the
# "*_test_train" names hold the validation part split off the training file.
(
    X_train_train, Y_train_train,
    X_test_train, Y_test_train,
    X_test, Y_test,
) = prepare_nsl_kdd_data(
    train_path="nsl-kdd-data/KDDTrain+.txt",
    test_path="nsl-kdd-data/KDDTest+.txt",
)

# Confirm that the training part and the test file ended up with the same width.
print(f"Number of features in X_train_train: {X_train_train.shape[1]}")
print(f"Number of features in X_test: {X_test.shape[1]}")

Total number of features: 124
Number of original features: 122
Number of added zero features: 2
Number of protocol_type categories: 3
Number of service categories: 70
Number of flag categories: 11
All feature names:
['count', 'diff_srv_rate', 'dst_bytes', 'dst_host_count', 'dst_host_diff_srv_rate', 'dst_host_rerror_rate', 'dst_host_same_src_port_rate', 'dst_host_same_srv_rate', 'dst_host_serror_rate', 'dst_host_srv_count', 'dst_host_srv_diff_host_rate', 'dst_host_srv_rerror_rate', 'dst_host_srv_serror_rate', 'duration', 'flag_OTH', 'flag_REJ', 'flag_RSTO', 'flag_RSTOS0', 'flag_RSTR', 'flag_S0', 'flag_S1', 'flag_S2', 'flag_S3', 'flag_SF', 'flag_SH', 'hot', 'is_guest_login', 'is_host_login', 'land', 'logged_in', 'num_access_files', 'num_compromised', 'num_failed_logins', 'num_file_creations', 'num_outbound_cmds', 'num_root', 'num_shells', 'protocol_type_icmp', 'protocol_type_tcp', 'protocol_type_udp', 'rerror_rate', 'root_shell', 'same_srv_rate', 'serror_rate', 'service_IRC', 'service_X1

In [22]:
import pickle
from sklearn.metrics import accuracy_score

def load_model_and_predict(model_path, X_train, Y_train, X_test, Y_test):
    """Unpickle a fitted model and measure its accuracy on two labelled sets.

    Parameters
    ----------
    model_path : path-like
        Pickle file containing an object that exposes ``predict``.
    X_train, Y_train
        Features and true labels of the first dataset.
    X_test, Y_test
        Features and true labels of the second dataset.

    Returns
    -------
    tuple
        ``(train_score, test_score)`` -- the ``accuracy_score`` of the model's
        predictions on the first and second dataset respectively.
    """
    # Caution: unpickling can execute arbitrary code, so only point this at
    # model files that come from a trusted source.
    with open(model_path, 'rb') as model_file:
        estimator = pickle.load(model_file)

    def _accuracy(features, labels):
        """Predict on `features` and compare against the true `labels`."""
        return accuracy_score(labels, estimator.predict(features))

    return _accuracy(X_train, Y_train), _accuracy(X_test, Y_test)

# Usage: score the saved model on the splits produced by prepare_nsl_kdd_data.
# The "test" inputs passed here are X_test_train / Y_test_train, i.e. the
# validation part split off the training file, so "Testing Score" below is
# validation accuracy; X_test / Y_test are not evaluated in this cell.
model_path = "Random_Forest.pkl"  # Path to your saved model
train_score, test_score = load_model_and_predict(
    model_path,
    X_train=X_train_train,
    Y_train=Y_train_train,
    X_test=X_test_train,
    Y_test=Y_test_train,
)

print(f"Training Score: {train_score}")
print(f"Testing Score: {test_score}")

Training Score: 0.5351982980344838
Testing Score: 0.5325776338350162
