<a href="https://colab.research.google.com/github/harshit0413/Darkweb-Analyzer-/blob/main/Darkweb-Analyzer.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
pip install category_encoders

Collecting category_encoders
  Downloading category_encoders-2.6.3-py2.py3-none-any.whl (81 kB)
[?25l     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/81.9 kB[0m [31m?[0m eta [36m-:--:--[0m[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m81.9/81.9 kB[0m [31m2.6 MB/s[0m eta [36m0:00:00[0m
Installing collected packages: category_encoders
Successfully installed category_encoders-2.6.3


In [None]:
!pip install scapy



In [18]:
import pandas as pd
from sklearn.model_selection import train_test_split
import xgboost as xgb
import numpy as np
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
from sklearn.preprocessing import LabelEncoder, OneHotEncoder

# Train a binary darknet-vs-normal XGBoost classifier on labelled flow records
# and report accuracy / precision / recall / F1 on a held-out 20% split.

# Load the labelled dataset. Infinite values are turned into NaN and every
# missing value is then replaced by 0, so the model never sees inf/NaN.
df_labeled = (
    pd.read_csv('/content/drive/MyDrive/Colab Notebooks/csv/darknet-normal.csv')
    .replace([np.inf, -np.inf], np.nan)
    .fillna(0)
)

# Features to keep. The names and their order mirror the dictionary built by
# extract_features, so a pcap-derived row lines up with the training columns.
features_to_keep = [
    'Flow Duration', 'Total Fwd Packet', 'Total Bwd packets',
    'Packet Length Min', 'Packet Length Mean', 'Fwd IAT Total',
    'Flow IAT Min', 'Flow IAT Max', 'Fwd IAT Mean', 'Flow Packets/s',
    'Flow Bytes/s', 'Idle Min', 'Idle Max', 'Idle Mean',
    'Idle Std', 'FWD Init Win Bytes', 'Bwd Init Win Bytes', 'ACK Flag Count'
]

# Keep only the relevant features plus the target column
df_relevant_features = df_labeled[features_to_keep + ['Label']]

# Split the data into features and labels
X = df_relevant_features.drop(columns='Label')

# Collapse the individual darknet families into a single 'darknet' class
label_to_class = {
    'Normal': 'normal',
    'FreeNet': 'darknet',
    'I2P': 'darknet',
    'Tor': 'darknet',
    'ZeroNet': 'darknet',
}
y = df_relevant_features['Label'].map(label_to_class)

# Series.map yields NaN for any label missing from the mapping; fail loudly
# instead of handing NaN targets to the encoder.
unmapped_labels = df_relevant_features.loc[y.isna(), 'Label'].unique()
if len(unmapped_labels):
    raise ValueError(
        f"Unexpected values in 'Label' column: {sorted(map(str, unmapped_labels))}"
    )

# Split into training and testing set
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# One-hot encode any non-numeric columns. This is done separately on the train
# and test frames, so the two may end up with different dummy columns ...
X_train = pd.get_dummies(X_train, drop_first=True)
X_test = pd.get_dummies(X_test, drop_first=True)

# ... which is why only the columns present in both are kept.
X_train, X_test = X_train.align(X_test, join='inner', axis=1)

# Initialize XGBoost classifier
Xgb_classify = xgb.XGBClassifier(objective='binary:logistic', n_estimators=100, seed=42)

# Encode the string labels as integers (LabelEncoder sorts classes alphabetically)
label_encoder = LabelEncoder()
y_train_encoded = label_encoder.fit_transform(y_train)
y_test_encoded = label_encoder.transform(y_test)

# Train the classifier with the encoded binary labels
Xgb_classify.fit(X_train, y_train_encoded)

# Make predictions with the encoded labels
encoded_predictions = Xgb_classify.predict(X_test)

# Decode the predictions back to original labels
predictions = label_encoder.inverse_transform(encoded_predictions)

# 'darknet' is the positive class for precision / recall / F1
darknet_label = label_encoder.transform(['darknet'])[0]
accuracy = accuracy_score(y_test_encoded, encoded_predictions)
precision = precision_score(y_test_encoded, encoded_predictions, pos_label=darknet_label)
recall = recall_score(y_test_encoded, encoded_predictions, pos_label=darknet_label)
f1 = f1_score(y_test_encoded, encoded_predictions, pos_label=darknet_label)


print(f'Accuracy: {accuracy}')
print(f'Precision: {precision}')
print(f'Recall: {recall}')
print(f'F1 Score: {f1}')


Accuracy: 0.9944990484135882
Precision: 0.9959913326110509
Recall: 0.9926037898828484
F1 Score: 0.9942946759321851


In [19]:
# Persist the trained classifier as JSON, relative to the current working directory
saved_model_file = 'xgb_model.json'
Xgb_classify.save_model(saved_model_file)

In [20]:
# Rebuild a classifier from the JSON model file in the current working directory
model_file = 'xgb_model.json'
loaded_model = xgb.XGBClassifier()
loaded_model.load_model(model_file)

In [21]:
from scapy.all import rdpcap, IP, TCP
import numpy as np
import pandas as pd


def _finalize_features(features):
    """Turn the feature dict into a one-row DataFrame with inf/NaN replaced by 0."""
    for key, value in features.items():
        if isinstance(value, float) and (np.isinf(value) or np.isnan(value)):
            features[key] = 0

    df_features = pd.DataFrame([features])
    # Second pass on the frame itself, in case a non-float inf/NaN slipped through
    return df_features.replace([np.inf, -np.inf], np.nan).fillna(0)


def extract_features(pcap_file):
    """Summarise a pcap capture as a single row of flow-level features.

    The whole capture is treated as one flow. Only packets carrying both an
    IP and a TCP layer contribute to the length, timing, window and flag
    statistics; the forward/backward packet counts and ``Flow Packets/s``
    are computed over every packet returned by ``rdpcap``.

    Parameters
    ----------
    pcap_file : str
        Path of the capture, passed straight to ``scapy.all.rdpcap``.

    Returns
    -------
    pandas.DataFrame
        One row whose columns are, in order, the keys of the ``features``
        dictionary below. Infinite and NaN values are replaced by 0. If the
        capture is empty, or contains no IP/TCP packet, every feature is 0.

    Notes
    -----
    The ``Idle *`` features are never computed here and are always 0.
    """
    packets = rdpcap(pcap_file)
    features = {
        'Flow Duration': 0.0,
        'Total Fwd Packet': 0,
        'Total Bwd packets': 0,
        'Packet Length Min': np.inf,
        'Packet Length Mean': 0.0,
        'Fwd IAT Total': 0.0,
        'Flow IAT Min': np.inf,
        'Flow IAT Max': 0.0,
        'Fwd IAT Mean': 0.0,
        'Flow Packets/s': 0.0,
        'Flow Bytes/s': 0.0,
        'Idle Min': np.inf,
        'Idle Max': 0.0,
        'Idle Mean': 0.0,
        'Idle Std': 0.0,
        'FWD Init Win Bytes': 0,
        'Bwd Init Win Bytes': 0,
        'ACK Flag Count': 0
    }

    if not packets:
        # Clean the inf placeholders so an empty capture yields an all-zero row
        return _finalize_features(features)

    start_times = []
    packet_lengths = []
    iats = []
    total_bytes = 0

    for packet in packets:
        if IP not in packet or TCP not in packet:
            continue

        packet_length = len(packet)
        packet_lengths.append(packet_length)
        total_bytes += packet_length

        tcp_layer = packet[TCP]
        if 'S' in tcp_layer.flags:
            # The first SYN seen is taken as the forward direction; any later
            # SYN overwrites the backward initial window.
            if features['FWD Init Win Bytes'] == 0:
                features['FWD Init Win Bytes'] = tcp_layer.window
            else:
                features['Bwd Init Win Bytes'] = tcp_layer.window

        if 'A' in tcp_layer.flags:
            features['ACK Flag Count'] += 1

        start_times.append(float(packet.time))

        # Inter-arrival time between consecutive IP/TCP packets
        if len(start_times) > 1:
            iats.append(start_times[-1] - start_times[-2])

    if not packet_lengths:
        # No IP/TCP packet at all: min()/max() below would raise on empty lists
        return _finalize_features(features)

    features['Flow Duration'] = max(start_times) - min(start_times)
    # Direction is approximated by comparing the source and destination addresses
    features['Total Fwd Packet'] = sum(1 for p in packets if IP in p and p[IP].src < p[IP].dst)
    features['Total Bwd packets'] = sum(1 for p in packets if IP in p and p[IP].src > p[IP].dst)
    features['Packet Length Min'] = min(packet_lengths)
    features['Packet Length Mean'] = np.mean(packet_lengths)
    features['Fwd IAT Total'] = sum(iats)
    features['Flow IAT Min'] = min(iats) if iats else 0
    features['Flow IAT Max'] = max(iats) if iats else 0
    # The mean belongs under 'Fwd IAT Mean'; it must not overwrite 'Flow IAT Min'
    features['Fwd IAT Mean'] = np.mean(iats) if iats else 0

    duration = features['Flow Duration']
    # NOTE(review): the packet rate counts every packet in the capture, while
    # the duration and byte total only cover IP/TCP packets - confirm intended.
    features['Flow Packets/s'] = len(packets) / duration if duration else 0
    features['Flow Bytes/s'] = total_bytes / duration if duration else 0

    return _finalize_features(features)


In [22]:
import xgboost as xgb

def classify_traffic(df_features, model_path):
    """Classify flow features with an XGBoost model loaded from disk.

    Parameters
    ----------
    df_features : pandas.DataFrame
        Feature rows to classify, e.g. the frame returned by extract_features.
    model_path : str
        Path of a model file previously written with ``save_model``.

    Returns
    -------
    The array returned by ``XGBClassifier.predict`` - one encoded class per row.

    Raises
    ------
    ValueError
        If the loaded booster reports no features.
    """
    # Load the trained model
    xgb_model = xgb.XGBClassifier()
    xgb_model.load_model(model_path)

    # Ensure that the model has been fitted before making predictions.
    # Booster.attr() only reads free-form string attributes and returns None
    # for a key that was never set; nothing in this notebook sets "n_features",
    # so checking it rejected every model. Ask the booster for its feature
    # count instead.
    if not xgb_model.get_booster().num_features():
        raise ValueError("Model needs to be fitted before making predictions")

    # Predict the traffic class
    return xgb_model.predict(df_features)


In [36]:
import xgboost as xgb

model_path = '/content/xgb_model.json'  # Make sure this is the correct path to your model file
pcap_file_path = '/content/drive/MyDrive/Colab Notebooks/pcap/zeronet-p2p_00001_20200421125502.pcap'

# Loading the trained XGBoost model
xgb_classifier = xgb.XGBClassifier()
xgb_classifier.load_model(model_path)

# Extract features from the pcap file
df_features = extract_features(pcap_file_path)

# Classify the extracted features in this cell, so the value printed below
# belongs to this capture and not to whatever `predictions` held from an
# earlier cell run.
predictions = xgb_classifier.predict(df_features)

# Output the predicted (encoded) class, followed by the class names in
# encoded order. `label_encoder` comes from the training cell.
print("Predicted Class:", predictions[0])
print(label_encoder.classes_)

Predicted Class: 0
['darknet' 'normal']


In [37]:
# The model expects a 2-D input: promote a single feature Series to a one-row frame
if isinstance(df_features, pd.Series):
    df_features = df_features.to_frame().T

# Encoded class prediction for every row of df_features
predictions = xgb_classifier.predict(df_features)

In [38]:
# Encoded class 0 is 'darknet' and 1 is 'normal' (see the label_encoder.classes_ output above)
print("Darknet" if predictions[0] == 0 else "Normal")

Darknet
