In [3]:
import pandas as pd

# Load the dataset
file_path = "C:\\Users\\ASUS\\Desktop\\BigData\\Final Project\\CODE\\test\\Cleaned_data.csv"
data = pd.read_csv(file_path)

# Display the first few rows of the dataframe
data.head()


Unnamed: 0,duration,protocol_type,service,flag,src_bytes,dst_bytes,land,wrong_fragment,urgent,hot,...,dst_host_srv_count,dst_host_same_srv_rate,dst_host_diff_srv_rate,dst_host_same_src_port_rate,dst_host_srv_diff_host_rate,dst_host_serror_rate,dst_host_srv_serror_rate,dst_host_rerror_rate,dst_host_srv_rerror_rate,type_of_attack
0,0.0,udp,domain_u,SF,43.0,59.0,0.0,0.0,0.0,0.0,...,240.0,0.94,0.01,0.01,0.0,0.0,0.0,0.0,0.0,normal
1,0.0,tcp,http,SF,214.0,306.0,0.0,0.0,0.0,0.0,...,255.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,normal
2,0.0,udp,domain_u,SF,44.0,74.0,0.0,0.0,0.0,0.0,...,254.0,1.0,0.01,0.0,0.0,0.0,0.0,0.0,0.0,normal
3,0.0,tcp,courier,S0,0.0,0.0,0.0,0.0,0.0,0.0,...,6.0,0.02,0.07,0.0,0.0,1.0,1.0,0.0,0.0,neptune
4,0.0,tcp,private,S0,0.0,0.0,0.0,0.0,0.0,0.0,...,7.0,0.03,0.08,0.0,0.0,1.0,1.0,0.0,0.0,neptune


In [4]:
# Checking for categorical columns and their unique values
categorical_cols = data.select_dtypes(include=['object']).columns
unique_values = {col: data[col].unique() for col in categorical_cols}

# Checking for missing values in the dataset
missing_values = data.isnull().sum()

categorical_cols, unique_values, missing_values[missing_values > 0]


(Index(['protocol_type', 'service', 'flag', 'type_of_attack'], dtype='object'),
 {'protocol_type': array(['udp', 'tcp', 'icmp', nan], dtype=object),
  'service': array(['domain_u', 'http', 'courier', 'private', 'other', 'name', 'efs',
         'whois', 'exec', 'ldap', 'ftp_data', nan, 'auth', 'gopher', 'bgp',
         'link', 'smtp', 'nnsp', 'mtp', 'ecr_i', 'Z39_50', 'finger',
         'telnet', 'echo', 'vmnet', 'ftp', 'eco_i', 'shell', 'systat',
         'uucp_path', 'kshell', 'remote_job', 'iso_tsap', 'imap4', 'uucp',
         'daytime', 'csnet_ns', 'pop_3', 'ctf', 'http_443', 'supdup',
         'hostnames', 'discard', 'netbios_dgm', 'domain', 'netbios_ns',
         'printer', 'netbios_ssn', 'ssh', 'klogin', 'time', 'urp_i', 'nntp',
         'login', 'sql_net', 'pop_2', 'sunrpc', 'netstat', 'rje', 'ntp_u',
         'X11', 'urh_i', 'IRC', 'red_i', 'tim_i', 'tftp_u'], dtype=object),
  'flag': array(['SF', 'S0', 'REJ', 'RSTO', 'RSTR', 'RSTOS0', nan, 'S1', 'S2', 'S3',
         'OTH'], dt

In [5]:
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import OneHotEncoder
from sklearn.compose import ColumnTransformer

# Imputing missing values with the most frequent value in each column
imputer = SimpleImputer(strategy='most_frequent')
data_imputed = imputer.fit_transform(data[categorical_cols])
data[categorical_cols] = data_imputed

# Now let's apply OneHotEncoder to the categorical features
# But let's first split features and target variable
X = data.drop('type_of_attack', axis=1)  # Features
y = data['type_of_attack']  # Target variable

# Apply OneHotEncoder
# We will one-hot encode the 'protocol_type' and 'flag' columns
# and drop 'service' due to its high cardinality which may result in too many features after one-hot encoding
X = X.drop('service', axis=1)

# Defining the transformer for the OneHotEncoder
transformer = ColumnTransformer(transformers=[
    ('one_hot', OneHotEncoder(), ['protocol_type', 'flag'])
], remainder='passthrough')

# Apply the transformer to the feature set X
X_transformed = transformer.fit_transform(X)

# Convert the transformed features back into a DataFrame
# Get the one-hot encoding feature names
categories = transformer.named_transformers_['one_hot'].get_feature_names_out(input_features=['protocol_type', 'flag'])
numeric_features = X.drop(columns=['protocol_type', 'flag']).columns

# Combine the one-hot encoded columns with the rest of the data
all_features = list(categories) + list(numeric_features)
X_encoded = pd.DataFrame(X_transformed, columns=all_features)

# Checking the encoded features and their first few rows
X_encoded.head()


Unnamed: 0,protocol_type_icmp,protocol_type_tcp,protocol_type_udp,flag_OTH,flag_REJ,flag_RSTO,flag_RSTOS0,flag_RSTR,flag_S0,flag_S1,...,dst_host_count,dst_host_srv_count,dst_host_same_srv_rate,dst_host_diff_srv_rate,dst_host_same_src_port_rate,dst_host_srv_diff_host_rate,dst_host_serror_rate,dst_host_srv_serror_rate,dst_host_rerror_rate,dst_host_srv_rerror_rate
0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,255.0,240.0,0.94,0.01,0.01,0.0,0.0,0.0,0.0,0.0
1,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,255.0,255.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,255.0,254.0,1.0,0.01,0.0,0.0,0.0,0.0,0.0,0.0
3,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,...,255.0,6.0,0.02,0.07,0.0,0.0,1.0,1.0,0.0,0.0
4,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,...,255.0,7.0,0.03,0.08,0.0,0.0,1.0,1.0,0.0,0.0


In [6]:
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import OneHotEncoder
from sklearn.compose import ColumnTransformer

# Imputing missing values with the most frequent value in each column
imputer = SimpleImputer(strategy='most_frequent')
data_imputed = imputer.fit_transform(data[categorical_cols])
data[categorical_cols] = data_imputed

# Now let's apply OneHotEncoder to the categorical features
# But let's first split features and target variable
X = data.drop('type_of_attack', axis=1)  # Features
y = data['type_of_attack']  # Target variable

# Apply OneHotEncoder
# We will one-hot encode the 'protocol_type' and 'flag' columns
# and drop 'service' due to its high cardinality which may result in too many features after one-hot encoding
X = X.drop('service', axis=1)

# Defining the transformer for the OneHotEncoder
transformer = ColumnTransformer(transformers=[
    ('one_hot', OneHotEncoder(), ['protocol_type', 'flag'])
], remainder='passthrough')

# Apply the transformer to the feature set X
X_transformed = transformer.fit_transform(X)

# Convert the transformed features back into a DataFrame
# Get the one-hot encoding feature names
categories = transformer.named_transformers_['one_hot'].get_feature_names_out(input_features=['protocol_type', 'flag'])
numeric_features = X.drop(columns=['protocol_type', 'flag']).columns

# Combine the one-hot encoded columns with the rest of the data
all_features = list(categories) + list(numeric_features)
X_encoded = pd.DataFrame(X_transformed, columns=all_features)

# Checking the encoded features and their first few rows
X_encoded.head()


Unnamed: 0,protocol_type_icmp,protocol_type_tcp,protocol_type_udp,flag_OTH,flag_REJ,flag_RSTO,flag_RSTOS0,flag_RSTR,flag_S0,flag_S1,...,dst_host_count,dst_host_srv_count,dst_host_same_srv_rate,dst_host_diff_srv_rate,dst_host_same_src_port_rate,dst_host_srv_diff_host_rate,dst_host_serror_rate,dst_host_srv_serror_rate,dst_host_rerror_rate,dst_host_srv_rerror_rate
0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,255.0,240.0,0.94,0.01,0.01,0.0,0.0,0.0,0.0,0.0
1,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,255.0,255.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,255.0,254.0,1.0,0.01,0.0,0.0,0.0,0.0,0.0,0.0
3,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,...,255.0,6.0,0.02,0.07,0.0,0.0,1.0,1.0,0.0,0.0
4,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,...,255.0,7.0,0.03,0.08,0.0,0.0,1.0,1.0,0.0,0.0


In [7]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.feature_selection import SelectFromModel

# Encode the target variable
target_encoder = OneHotEncoder()
y_encoded = target_encoder.fit_transform(y.values.reshape(-1, 1)).toarray()

# Initialize the random forest classifier
rf = RandomForestClassifier(n_estimators=100, random_state=42)

# Fit the random forest classifier
rf.fit(X_encoded, y_encoded)

# Feature importances
importances = rf.feature_importances_

# Create a model for feature selection
selector = SelectFromModel(estimator=rf, prefit=True, threshold="mean")

# Transform the dataset according to the selector
X_selected = selector.transform(X_encoded)

# Get selected feature names
selected_features = X_encoded.columns[(selector.get_support())]

# Number of features selected and the selected feature names
num_selected_features = X_selected.shape[1]
selected_features, num_selected_features



(Index(['flag_S0', 'flag_SF', 'src_bytes', 'dst_bytes', 'logged_in', 'count',
        'serror_rate', 'srv_serror_rate', 'dst_host_srv_count',
        'dst_host_same_srv_rate', 'dst_host_diff_srv_rate',
        'dst_host_same_src_port_rate', 'dst_host_serror_rate',
        'dst_host_srv_serror_rate'],
       dtype='object'),
 14)

In [8]:
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report
from sklearn.preprocessing import LabelEncoder

# Since y is categorical, we need to encode it before using it in logistic regression
label_encoder = LabelEncoder()
y_encoded = label_encoder.fit_transform(y)

# Splitting the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(
    X_encoded[selected_features], y_encoded, test_size=0.2, random_state=42
)

# Initialize the Logistic Regression model
logreg = LogisticRegression(max_iter=1000, random_state=42)

# Fit the Logistic Regression model on the training data
logreg.fit(X_train, y_train)

# Predictions on the test set
y_pred = logreg.predict(X_test)

# Generating a classification report
report = classification_report(y_test, y_pred, target_names=label_encoder.classes_)

report

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


'                          precision    recall  f1-score   support\n\nDenial of Service Attack       0.00      0.00      0.00         1\n                 ipsweep       0.00      0.00      0.00        32\n                 neptune       1.00      1.00      1.00      6180\n                    nmap       0.55      0.66      0.60       140\n                  normal       0.99      0.99      0.99      6899\n               portsweep       0.86      0.76      0.81        99\n                   satan       0.98      0.90      0.94       182\n                   smurf       0.96      0.91      0.94        79\n\n                accuracy                           0.99     13612\n               macro avg       0.67      0.65      0.66     13612\n            weighted avg       0.98      0.99      0.98     13612\n'

In [9]:
print(report)

                          precision    recall  f1-score   support

Denial of Service Attack       0.00      0.00      0.00         1
                 ipsweep       0.00      0.00      0.00        32
                 neptune       1.00      1.00      1.00      6180
                    nmap       0.55      0.66      0.60       140
                  normal       0.99      0.99      0.99      6899
               portsweep       0.86      0.76      0.81        99
                   satan       0.98      0.90      0.94       182
                   smurf       0.96      0.91      0.94        79

                accuracy                           0.99     13612
               macro avg       0.67      0.65      0.66     13612
            weighted avg       0.98      0.99      0.98     13612



In [10]:
import pickle

# Since the model training was not completed due to the timeout, we'll use the model with the current state.
# It might not be well trained, but for the purpose of demonstration, we'll proceed with saving it.

# Define the file path for the model
model_file_path = 'logistic_regression_model.pkl'

# Save the model to a file
with open(model_file_path, 'wb') as file:
    pickle.dump(logreg, file)

model_file_path

'logistic_regression_model.pkl'

In [11]:
import pickle

# Define the file path for the label encoder
label_encoder_file_path = 'label_encoder.pkl'

# Save the label encoder to a file
with open(label_encoder_file_path, 'wb') as file:
    pickle.dump(label_encoder, file)

label_encoder_file_path


'label_encoder.pkl'