In [None]:
import pandas as pd
from sklearn.model_selection import train_test_split
import xgboost as xgb
import numpy as np
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
from sklearn.preprocessing import LabelEncoder, OneHotEncoder

# Load the labeled dataset
df_labeled = pd.read_csv('/content/drive/MyDrive/Colab Notebooks/csv/darknet-normal.csv')

# Turn +/-inf into NaN, then zero-fill every missing value in one pass so the
# frame contains neither inf nor NaN before features are extracted.
df_labeled = df_labeled.replace([np.inf, -np.inf], np.nan).fillna(0)

# Define the features to keep, based on the extract_features function
features_to_keep = [
    'Flow Duration', 'Total Fwd Packet', 'Total Bwd packets',
    'Packet Length Min', 'Packet Length Mean', 'Fwd IAT Total',
    'Flow IAT Min', 'Flow IAT Max', 'Fwd IAT Mean', 'Flow Packets/s',
    'Flow Bytes/s', 'Idle Min', 'Idle Max', 'Idle Mean',
    'Idle Std', 'FWD Init Win Bytes', 'Bwd Init Win Bytes', 'ACK Flag Count'
]

# Keep only the relevant features (plus the target column)
df_relevant_features = df_labeled[features_to_keep + ['Label']]

# Split the data into features and labels
X = df_relevant_features.drop(columns='Label')

# Collapse the per-network labels into the two classes the model predicts.
label_to_class = {
    'Normal': 'normal',
    'FreeNet': 'darknet',
    'I2P': 'darknet',
    'Tor': 'darknet',
    'ZeroNet': 'darknet',
}
y = df_relevant_features['Label'].map(label_to_class)

# Series.map() yields NaN for any label missing from the mapping. Fail here,
# naming the offending values, instead of letting NaN targets reach the label
# encoder / classifier further down.
unmapped_rows = y.isna()
if unmapped_rows.any():
    unknown_labels = sorted(
        df_relevant_features.loc[unmapped_rows, 'Label'].astype(str).unique()
    )
    raise ValueError(f"Unmapped values in 'Label' column: {unknown_labels}")

# One-hot encode any non-numeric feature columns ONCE, before splitting.
# Encoding the train and test frames separately (with drop_first=True) can
# drop a different baseline category in each split and produce mismatched
# columns; encoding first guarantees both splits share the same columns.
X_encoded = pd.get_dummies(X, drop_first=True)

# Split into training and testing set. stratify=y keeps the darknet/normal
# ratio the same in both splits, so the test metrics are not skewed by an
# unlucky draw of the minority class.
X_train, X_test, y_train, y_test = train_test_split(
    X_encoded, y, test_size=0.2, random_state=42, stratify=y
)

# Initialize XGBoost classifier. random_state is the documented scikit-learn
# style parameter for reproducibility (the bare `seed` keyword is only passed
# through as an undeclared extra booster argument).
Xgb_classify = xgb.XGBClassifier(
    objective='binary:logistic',
    n_estimators=100,
    random_state=42,
)

# Encode the string labels as integers. The encoder is fitted on the training
# labels only and then reused for the test labels, so both share one mapping.
# LabelEncoder orders classes by sorted value, so 'darknet' -> 0, 'normal' -> 1.
label_encoder = LabelEncoder()
y_train_encoded = label_encoder.fit_transform(y_train)
y_test_encoded = label_encoder.transform(y_test)

# Train the classifier with the encoded binary labels
Xgb_classify.fit(X_train, y_train_encoded)

# Predict integer class codes for the held-out set
encoded_predictions = Xgb_classify.predict(X_test)

# Map the integer codes back to the original string labels
predictions = label_encoder.inverse_transform(encoded_predictions)

# Integer code the encoder assigned to 'darknet'; precision, recall and F1
# are all reported with darknet as the positive class.
darknet_code = label_encoder.transform(['darknet'])[0]

accuracy = accuracy_score(y_test_encoded, encoded_predictions)
precision, recall, f1 = (
    scorer(y_test_encoded, encoded_predictions, pos_label=darknet_code)
    for scorer in (precision_score, recall_score, f1_score)
)


# Report each metric on its own line
for metric_name, metric_value in (
    ('Accuracy', accuracy),
    ('Precision', precision),
    ('Recall', recall),
    ('F1 Score', f1),
):
    print(f'{metric_name}: {metric_value}')
