In [None]:
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.model_selection import train_test_split
from torch_geometric.nn import GraphConv
from torch_geometric.loader import DataLoader
from torch_geometric.data import HeteroData
from torch.nn import functional as F
from torch.optim import Adam
from torch import nn
from sklearn.metrics import f1_score, classification_report, confusion_matrix, precision_score, recall_score
from sklearn.preprocessing import LabelEncoder
import pandas as pd
import numpy as np
import warnings
import torch.optim as optim
import torch
import matplotlib.pyplot as plt
import seaborn as sns

# Notebook-wide display/diagnostic settings: let pandas show up to 100 columns
# before truncating, and suppress every warning category for the session.
pd.options.display.max_columns = 100
warnings.filterwarnings('ignore')

In [None]:
# Reproducibility settings.
# cuDNN: disable the autotuner and request deterministic algorithms.
torch.backends.cudnn.benchmark = False
torch.backends.cudnn.deterministic = True
# Seed torch's default generator and the CUDA generators with the same value.
torch.manual_seed(0)
torch.cuda.manual_seed_all(0)
torch.cuda.manual_seed(0)

In [None]:
# Load the dataset from a CSV located in the current working directory.
# Later cells address columns by name, so the file's first row is expected
# to be a header row (pandas' default `header='infer'`).
df = pd.read_csv("UNSW-NB15.csv")

In [None]:
# --- Tabular overview ---------------------------------------------------
# Printed in order: first rows, summary statistics of numeric columns,
# missing-value count per column, column dtypes, and the frequency of each
# value in the 'Label' column.
print(
    df.head(),
    df.describe(),
    df.isnull().sum(),
    df.dtypes,
    df['Label'].value_counts(),
    sep='\n',
)

# --- Numeric columns ----------------------------------------------------
num_cols = list(df.select_dtypes(include=np.number).columns)

# One histogram figure per numeric column ...
for col in num_cols:
    df[col].hist()
    plt.title(col)
    plt.show()

# ... followed by one box-plot figure per numeric column.
for col in num_cols:
    df.boxplot(column=col)
    plt.title(col)
    plt.show()

# --- Categorical columns ------------------------------------------------
# Object-typed columns, minus the address/port columns listed below, each
# get a count plot.
elements_to_remove = ['srcip', 'dstip', 'sport', 'dsport']
cat_cols = [
    name
    for name in df.select_dtypes(include=['object']).columns
    if name not in elements_to_remove
]

for col in cat_cols:
    sns.countplot(data=df, x=col)
    plt.xticks(rotation=90)  # rotate tick labels so they stay readable
    plt.title(col)
    plt.show()

# --- Pairwise correlation of the numeric columns -------------------------
plt.figure(figsize=(20, 20))
sns.heatmap(data=df[num_cols].corr(), annot=True, fmt=".2f")
plt.show()

In [None]:
# Columns treated as numeric features.
# NOTE(review): 'ct_src_ ltm' contains an embedded space - presumably it
# mirrors the CSV header verbatim; confirm before "fixing" it.
numerical_cols = [
    'dur', 'sbytes', 'dbytes', 'sttl', 'dttl', 'sloss', 'dloss',
    'Sload', 'Dload', 'Spkts', 'Dpkts', 'swin', 'dwin', 'stcpb', 'dtcpb',
    'smeansz', 'dmeansz', 'trans_depth', 'res_bdy_len', 'Sjit', 'Djit',
    'Stime', 'Ltime', 'Sintpkt', 'Dintpkt', 'tcprtt', 'synack', 'ackdat',
    'ct_state_ttl', 'ct_flw_http_mthd', 'ct_ftp_cmd',
    'ct_srv_src', 'ct_srv_dst', 'ct_dst_ltm', 'ct_src_ ltm',
    'ct_src_dport_ltm', 'ct_dst_sport_ltm', 'ct_dst_src_ltm',
]
# Columns treated as categorical features.
categorical_cols = ['proto', 'state', 'service']

# Force numeric dtype; values that cannot be parsed become NaN ('coerce').
df[numerical_cols] = df[numerical_cols].apply(
    lambda series: pd.to_numeric(series, errors='coerce')
)
# Categorical columns are normalised to plain strings.
df[categorical_cols] = df[categorical_cols].astype(str)

In [None]:
# Rows without an attack category are labelled 'Normal'.
# `fillna` replaces the former `.replace(np.NaN, ...)`: the `np.NaN` alias was
# removed in NumPy 2.0, so the old spelling raises AttributeError there.
# NOTE(review): if the raw labels contain stray whitespace or spelling
# variants, each variant becomes its own class downstream - inspect
# df['attack_cat'].unique() to confirm.
df['attack_cat'] = df['attack_cat'].fillna('Normal')

In [None]:
# One-hot encode the categorical columns.
# dtype=float is explicit because pandas >= 2.0 emits boolean dummy columns by
# default; a frame mixing bool and float columns converts to an object-dtype
# array in `to_numpy()`, which cannot be turned into a torch tensor later on.
df = pd.get_dummies(df, columns=categorical_cols, dtype=float)

# Standardise the numeric columns (zero mean, unit variance per column).
# NOTE(review): the scaler is fitted on the full frame, before the
# train/validation/test split performed in a later cell, so statistics of the
# held-out rows influence the training features - confirm this is acceptable.
scaler = StandardScaler()
df[numerical_cols] = scaler.fit_transform(df[numerical_cols])

In [None]:
def ip_to_binary(df, column_name, new_column_name):
    """Append the last two octets of an IPv4 column as 16 binary columns.

    Each value of ``df[column_name]`` is converted to a string and stripped.
    If it is a dotted quad (four groups of 1-3 digits), its third and fourth
    octets are expanded, most-significant bit first, into 16 columns named
    ``f'{new_column_name}_0'`` ... ``f'{new_column_name}_15'`` (third octet in
    columns 0-7, fourth octet in columns 8-15). Any other value (missing,
    malformed, wrong number of groups) is treated as ``0.0.0.0`` and yields
    all-zero bits.

    Args:
        df: Source frame; it is not modified.
        column_name: Name of the column holding the addresses.
        new_column_name: Prefix for the generated bit columns.

    Returns:
        A new DataFrame: ``df`` joined (on the index) with the 16 ``int32``
        bit columns.
    """
    addresses = df[column_name].astype(str).str.strip()

    # Anchored pattern: only complete dotted quads match. Non-matching rows
    # come back as NaN and are mapped to octet 0, i.e. the same result as the
    # '0.0.0.0' placeholder.
    octets = (
        addresses.str.extract(r'^\d{1,3}\.\d{1,3}\.(\d{1,3})\.(\d{1,3})$')
        .fillna('0')
        .astype(int)
    )

    # Bit k of an octet (MSB first) is (octet >> (7 - k)) & 1. Only the low
    # 8 bits are kept, so an out-of-range group such as 999 cannot widen the
    # output beyond 16 columns.
    shifts = np.arange(7, -1, -1)
    third_octet = octets[0].to_numpy()[:, None]
    fourth_octet = octets[1].to_numpy()[:, None]
    bits = np.concatenate(
        [(third_octet >> shifts) & 1, (fourth_octet >> shifts) & 1],
        axis=1,
    ).astype('int32')

    bit_columns = pd.DataFrame(
        bits,
        index=df.index,
        columns=[f'{new_column_name}_{i}' for i in range(16)],
    )
    return df.join(bit_columns)

In [None]:
# Expand the source address into the 'ipsrc_*' bit columns first, then the
# destination address into the 'ipdst_*' bit columns.
df = ip_to_binary(
    ip_to_binary(df, 'srcip', 'ipsrc'),
    'dstip',
    'ipdst',
)

In [None]:
# Map each distinct 'attack_cat' string to an integer class id. The fitted
# encoder is kept so ids can be translated back to category names later.
label_encoder = LabelEncoder().fit(df['attack_cat'])
df['attack_cat_encoded'] = label_encoder.transform(df['attack_cat'])

In [None]:
# Columns that must not be used as model inputs: the targets, raw
# address/port identifiers, the original categorical columns, and three
# columns that are dropped from the frame in a later cell.
columns_to_exclude = [
    'attack_cat', 'Label', 'attack_cat_encoded',
    'sport', 'dsport', 'proto', 'state', 'srcip', 'dstip', 'service',
    'ct_flw_http_mthd', 'is_ftp_login', 'ct_ftp_cmd',
]
all_columns = list(df.columns)
# Every remaining column, in frame order, becomes a feature.
columns_array = list(filter(lambda name: name not in columns_to_exclude, all_columns))

In [None]:
# Remove columns that are neither features nor needed to build the graphs.
# Rebinding `df` (instead of `inplace=True`) together with errors='ignore'
# keeps this cell re-runnable: executing it a second time no longer raises
# KeyError because the columns are already gone.
df = df.drop(
    columns=['attack_cat', "Label", 'dsport', 'sport',
             'ct_flw_http_mthd', 'is_ftp_login', 'ct_ftp_cmd'],
    errors='ignore',
)

In [None]:
# 80 % of the rows go to training; the remaining 20 % are then halved into
# validation and test. Both splits are stratified on the encoded class and
# use a fixed random_state.
df_train, df_test = train_test_split(
    df,
    test_size=0.2,
    stratify=df['attack_cat_encoded'],
    random_state=0,
)
df_val, df_test = train_test_split(
    df_test,
    test_size=0.5,
    stratify=df_test['attack_cat_encoded'],
    random_state=0,
)

In [None]:
#### Implement the GNN model

In [None]:
# Number of subgraphs collated into one mini-batch by the data loaders.
BATCH_SIZE = 16
# Host-node features: the 16 source-address bit columns followed by the 16
# destination-address bit columns.
features_host = [f'{prefix}_{bit}' for prefix in ('ipsrc', 'ipdst') for bit in range(16)]
# Flow-node features: every column selected earlier as a model input.
features_flow = columns_array

def get_connections(ip_map, src_ip, dst_ip):
    """Build the host<->flow edge indices for one subgraph.

    Flow ``i`` (position ``i`` in ``src_ip`` / ``dst_ip``) is connected to two
    hosts: ``ip_map[src_ip[i]]`` and ``ip_map[dst_ip[i]]``. Edges are emitted
    interleaved: (src host of flow 0, dst host of flow 0, src host of flow 1,
    ...).

    Args:
        ip_map: Mapping from address to integer host index.
        src_ip: Sequence of source addresses, one per flow.
        dst_ip: Sequence of destination addresses, same length as ``src_ip``.

    Returns:
        ``(host_to_flow, flow_to_host)``: two ``[2, 2 * n_flows]`` tensors of
        dtype ``torch.long``. ``host_to_flow`` has host indices in row 0 and
        flow indices in row 1; ``flow_to_host`` is the same with rows swapped.

    Raises:
        KeyError: If an address is missing from ``ip_map``.
    """
    # Indices are built directly as int64: PyTorch Geometric expects
    # `edge_index` to be torch.long, and the previous float32 -> int32 round
    # trip produced a dtype that scatter-based aggregation rejects.
    src_hosts = np.array([ip_map[ip] for ip in src_ip], dtype=np.int64)
    dst_hosts = np.array([ip_map[ip] for ip in dst_ip], dtype=np.int64)

    host_idx = np.column_stack((src_hosts, dst_hosts)).ravel()
    # Every flow index appears twice, once per endpoint.
    flow_idx = np.repeat(np.arange(len(src_hosts), dtype=np.int64), 2)

    host_to_flow = torch.from_numpy(np.stack((host_idx, flow_idx)))
    flow_to_host = torch.from_numpy(np.stack((flow_idx, host_idx)))
    return host_to_flow, flow_to_host

def create_dataloader(df, subgraph_size=1024):
    """Cut ``df`` into fixed-size row windows and wrap each as a ``HeteroData`` graph.

    Rows are consumed in frame order, ``subgraph_size`` at a time. Each window
    becomes one graph with 'host' and 'flow' node types and edges in both
    directions, built by ``get_connections``. Relies on the module-level
    ``features_host``, ``features_flow`` and ``BATCH_SIZE``.

    The trailing ``len(df) % subgraph_size`` rows do not fill a complete
    window and are left out.

    Args:
        df: Frame containing 'srcip', 'dstip', 'attack_cat_encoded' and all
            feature columns.
        subgraph_size: Number of rows (flows) per graph.

    Returns:
        A ``DataLoader`` over the graphs with ``batch_size=BATCH_SIZE``.
    """
    graphs = []
    n_subgraphs = len(df) // subgraph_size
    for start in range(0, n_subgraphs * subgraph_size, subgraph_size):
        subgraph = df.iloc[start:start + subgraph_size]
        src_ip = subgraph['srcip'].to_numpy()
        dst_ip = subgraph['dstip'].to_numpy()

        # Host indices are local to the window: position in the sorted set of
        # distinct addresses appearing in it.
        ip_map = {ip: index for index, ip in enumerate(np.unique(np.append(src_ip, dst_ip)))}
        host_to_flow, flow_to_host = get_connections(ip_map, src_ip, dst_ip)

        graph = HeteroData()
        # An explicit float32 conversion is required: the feature frame mixes
        # float, int and (on pandas >= 2.0) bool columns, for which a plain
        # `to_numpy()` yields an object array that torch cannot convert.
        # NOTE(review): host features carry one row per flow, whereas host
        # indices in the edge lists enumerate distinct addresses - confirm
        # this pairing is intended.
        graph['host'].x = torch.tensor(subgraph[features_host].to_numpy(dtype=np.float32))
        graph['flow'].x = torch.tensor(subgraph[features_flow].to_numpy(dtype=np.float32))
        # Labels are converted straight to int64 rather than via float32.
        graph['flow'].y = torch.tensor(subgraph['attack_cat_encoded'].to_numpy(dtype=np.int64))
        # PyTorch Geometric expects int64 edge indices; `.long()` is a no-op
        # when they already are.
        graph['host', 'flow'].edge_index = host_to_flow.long()
        graph['flow', 'host'].edge_index = flow_to_host.long()
        graphs.append(graph)

    return DataLoader(graphs, batch_size=BATCH_SIZE)


# One loader per split, built in train -> validation -> test order with the
# default subgraph size.
train_loader, val_loader, test_loader = map(
    create_dataloader, (df_train, df_val, df_test)
)


In [None]:
class HeteroGNN(torch.nn.Module):
    """Two parallel ``GraphConv`` layers followed by a linear classifier.

    One convolution consumes the 'host' node features with the host->flow
    edges, the other consumes the 'flow' node features with the flow->host
    edges. Their ReLU-activated outputs are concatenated along the feature
    axis and projected to ``out_channels`` log-probabilities.

    NOTE(review): the concatenation requires both convolutions to return the
    same number of rows, i.e. as many 'host' feature rows as 'flow' feature
    rows in every batch - confirm against the data pipeline.
    """

    def __init__(self, in_channels_host, in_channels_flow, hidden_channels, out_channels):
        """Create the layers.

        Args:
            in_channels_host: Width of the 'host' node features.
            in_channels_flow: Width of the 'flow' node features.
            hidden_channels: Output width of each convolution.
            out_channels: Number of classes.
        """
        super().__init__()

        # Creation order (host conv, flow conv, classifier) is kept fixed so
        # that seeded parameter initialisation stays the same.
        self.conv_host = GraphConv(in_channels_host, hidden_channels)
        self.conv_flow = GraphConv(in_channels_flow, hidden_channels)
        self.fc = nn.Linear(hidden_channels * 2, out_channels)

    def forward(self, data):
        """Return per-row log-probabilities of shape ``[rows, out_channels]``."""
        host_to_flow = data['host', 'flow'].edge_index
        flow_to_host = data['flow', 'host'].edge_index

        host_hidden = self.conv_host(data['host'].x, host_to_flow).relu()
        flow_hidden = self.conv_flow(data['flow'].x, flow_to_host).relu()

        scores = self.fc(torch.cat((host_hidden, flow_hidden), dim=-1))
        return scores.log_softmax(dim=-1)

# Hyper-parameters
in_channels_host = len(features_host)
in_channels_flow = len(features_flow)
hidden_channels = 32  # choose as needed
# One output unit per class known to the fitted label encoder. Deriving the
# count (instead of hard-coding 14) keeps the classifier head in sync with
# the labels actually present in the data.
out_channels = len(label_encoder.classes_)
learning_rate = 0.01
num_epochs = 100

# Use the GPU when one is available.
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

# Model
model = HeteroGNN(in_channels_host, in_channels_flow, hidden_channels, out_channels).to(device)

# Optimizer
optimizer = optim.Adam(model.parameters(), lr=learning_rate)

# Halve the learning rate when the monitored quantity has not decreased for
# 10 consecutive scheduler steps.
scheduler = torch.optim.lr_scheduler.ReduceLROnPlateau(optimizer, 'min', patience=10, factor=0.5)

# Gradient clipping has to run between loss.backward() and optimizer.step().
# The former one-off `clip_grad_norm_` call in this cell executed before any
# backward pass, found no gradients and did nothing, so it was removed.
# Training Function
def train(max_grad_norm=1.0):
    """Run one optimisation pass over ``train_loader``.

    Uses the module-level ``model``, ``optimizer``, ``train_loader`` and
    ``device``.

    Args:
        max_grad_norm: Upper bound on the total gradient norm, applied after
            every backward pass. Pass ``None`` to disable clipping.

    Returns:
        The mean of the per-batch losses (a Python float).
    """
    model.train()
    total_loss = 0.0
    for batch in train_loader:
        batch = batch.to(device)
        optimizer.zero_grad()
        out = model(batch)
        # `out` already holds log-probabilities; cross_entropy applies
        # log_softmax again, which leaves normalised log-probabilities
        # unchanged, so the loss is the usual negative log-likelihood.
        loss = F.cross_entropy(out, batch['flow'].y)
        loss.backward()
        # Clip here, where gradients exist, so the bound takes effect on
        # every update.
        if max_grad_norm is not None:
            torch.nn.utils.clip_grad_norm_(model.parameters(), max_norm=max_grad_norm)
        optimizer.step()
        total_loss += loss.item()
    return total_loss / len(train_loader)

def test(loader):
    """Evaluate the module-level ``model`` on every batch of ``loader``.

    Returns:
        ``(accuracy, preds, targets)``: the fraction of rows whose arg-max
        prediction equals the label, plus the flat lists of predicted and
        true class ids in loader order.
    """
    model.eval()
    preds, targets = [], []
    # Inference only: no autograd bookkeeping is needed.
    with torch.no_grad():
        for batch in loader:
            batch = batch.to(device)
            predicted = model(batch).argmax(dim=1)
            preds += predicted.tolist()
            targets += batch['flow'].y.tolist()
    accuracy = np.mean(np.asarray(preds) == np.asarray(targets))
    return accuracy, preds, targets


# Training loop: one optimisation pass per epoch, followed by evaluation on
# the training and validation loaders.
for epoch in range(num_epochs):
    train_loss = train()
    train_acc = test(train_loader)[0]
    val_acc, val_preds, val_targets = test(val_loader)
    # Micro-averaged over all validation rows.
    val_prec, val_recall = (
        metric(val_targets, val_preds, average='micro')
        for metric in (precision_score, recall_score)
    )
    print(
        f'Epoch: {epoch+1}, Train Loss: {train_loss:.4f}, Train Acc: {train_acc:.4f}, '
        f'Val Acc: {val_acc:.4f}, Val Precision: {val_prec:.4f}, Val Recall: {val_recall:.4f}'
    )

    # The plateau scheduler monitors the training loss.
    scheduler.step(train_loss)
    

# Final evaluation on the held-out test split.
# This step previously re-evaluated `val_loader` while printing the result as
# "Test Acc"; it now scores `test_loader` and labels every metric as a test
# metric. Results are stored under `test_*` names so the last epoch's
# validation values stay available.
test_acc, test_preds, test_targets = test(test_loader)
test_prec = precision_score(test_targets, test_preds, average='micro')
test_recall = recall_score(test_targets, test_preds, average='micro')
print(f'Test Acc: {test_acc:.4f}, Test Precision: {test_prec:.4f}, Test Recall: {test_recall:.4f}')

