# **1st Level Classification**

In [1]:
pip install category_encoders

^C
Note: you may need to restart the kernel to use updated packages.


Collecting category_encoders
  Downloading category_encoders-2.6.3-py2.py3-none-any.whl.metadata (8.0 kB)
Collecting numpy>=1.14.0 (from category_encoders)
  Downloading numpy-1.26.4-cp311-cp311-win_amd64.whl.metadata (61 kB)
     ---------------------------------------- 0.0/61.0 kB ? eta -:--:--
     ---------------------------------------- 0.0/61.0 kB ? eta -:--:--
     ------ --------------------------------- 10.2/61.0 kB ? eta -:--:--
     ------ --------------------------------- 10.2/61.0 kB ? eta -:--:--
     ------------------- ------------------ 30.7/61.0 kB 220.2 kB/s eta 0:00:01
     ------------------------------- ------ 51.2/61.0 kB 262.6 kB/s eta 0:00:01
     -------------------------------------- 61.0/61.0 kB 295.3 kB/s eta 0:00:00
Collecting scikit-learn>=0.20.0 (from category_encoders)
  Downloading scikit_learn-1.4.1.post1-cp311-cp311-win_amd64.whl.metadata (11 kB)
Collecting scipy>=1.0.0 (from category_encoders)
  Downloading scipy-1.12.0-cp311-cp311-win_amd64.whl.met

In [3]:
pip install scapy

Collecting scapy
  Downloading scapy-2.5.0.tar.gz (1.3 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.3/1.3 MB[0m [31m7.1 MB/s[0m eta [36m0:00:00[0m
[?25h  Preparing metadata (setup.py) ... [?25l[?25hdone
Building wheels for collected packages: scapy
  Building wheel for scapy (setup.py) ... [?25l[?25hdone
  Created wheel for scapy: filename=scapy-2.5.0-py2.py3-none-any.whl size=1444328 sha256=2cfd1dda17d847a81ec7f4a1d8f9dbb2be483a3062dcb659596316f00fac7480
  Stored in directory: /root/.cache/pip/wheels/82/b7/03/8344d8cf6695624746311bc0d389e9d05535ca83c35f90241d
Successfully built scapy
Installing collected packages: scapy
Successfully installed scapy-2.5.0


In [2]:
import pandas as pd
from sklearn.model_selection import train_test_split
import xgboost as xgb
import numpy as np
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
from sklearn.preprocessing import LabelEncoder, OneHotEncoder

# Load the labeled dataset
df_labeled = pd.read_csv('./csv/darknet-normal.csv')

df_labeled.replace([np.inf, -np.inf], np.nan, inplace=True)

df_labeled.fillna(0, inplace=True)  # or

# Define the features to keep, based on the extract_features function
features_to_keep = [
    'Flow Duration', 'Total Fwd Packet', 'Total Bwd packets',
    'Packet Length Min', 'Packet Length Mean', 'Fwd IAT Total',
    'Flow IAT Min', 'Flow IAT Max', 'Fwd IAT Mean', 'Flow Packets/s',
    'Flow Bytes/s', 'Idle Min', 'Idle Max', 'Idle Mean',
    'Idle Std', 'FWD Init Win Bytes', 'Bwd Init Win Bytes', 'ACK Flag Count'
]

# Keep only the relevant features
df_relevant_features = df_labeled[features_to_keep + ['Label']]

# Split the data into features and labels
X = df_relevant_features.drop('Label', axis=1)

y = df_relevant_features['Label'].map({'Normal': 'normal', 'FreeNet': 'darknet', 'I2P': 'darknet', 'Tor': 'darknet', 'ZeroNet': 'darknet'})

df_labeled.fillna(0, inplace=True)

# Split into training and testing set
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Apply one-hot encoding only to the train dataset to avoid memory issues
X_train = pd.get_dummies(X_train, drop_first=True)
X_test = pd.get_dummies(X_test, drop_first=True)

# Align X_train and X_test to ensure they have the same columns
X_train, X_test = X_train.align(X_test, join='inner', axis=1)

# Initialize XGBoost classifier
Xgb_classify = xgb.XGBClassifier(objective='binary:logistic', n_estimators=100, seed=42)

# Encode the labels with LabelEncoder
label_encoder = LabelEncoder()
y_train_encoded = label_encoder.fit_transform(y_train)
y_test_encoded = label_encoder.transform(y_test)

# Train the classifier with the encoded binary labels
Xgb_classify.fit(X_train,y_train_encoded)

# Make predictions with the encoded labels
encoded_predictions = Xgb_classify.predict(X_test)

# Decode the predictions back to original labels
predictions = label_encoder.inverse_transform(encoded_predictions)
accuracy = accuracy_score(y_test_encoded, encoded_predictions)
precision = precision_score(y_test_encoded, encoded_predictions, pos_label=label_encoder.transform(['darknet'])[0])
recall = recall_score(y_test_encoded, encoded_predictions, pos_label=label_encoder.transform(['darknet'])[0])
f1 = f1_score(y_test_encoded, encoded_predictions, pos_label=label_encoder.transform(['darknet'])[0])


print(f'Accuracy: {accuracy}')
print(f'Precision: {precision}')
print(f'Recall: {recall}')
print(f'F1 Score: {f1}')


Accuracy: 0.9944990484135882
Precision: 0.9959913326110509
Recall: 0.9926037898828484
F1 Score: 0.994294675932185


In [3]:
#saving the model
Xgb_classify.save_model('xgb_model.json')

In [4]:
#load the model
loaded_model = xgb.XGBClassifier()
loaded_model.load_model('xgb_model.json')

In [5]:
from scapy.all import rdpcap, IP, TCP
import numpy as np
import pandas as pd


def extract_features(pcap_file):
    packets = rdpcap(pcap_file)
    features = {
        'Flow Duration': 0.0,
        'Total Fwd Packet': 0,
        'Total Bwd packets': 0,
        'Packet Length Min': np.inf,
        'Packet Length Mean': 0.0,
        'Fwd IAT Total': 0.0,
        'Flow IAT Min': np.inf,
        'Flow IAT Max': 0.0,
        'Fwd IAT Mean': 0.0,
        'Flow Packets/s': 0.0,
        'Flow Bytes/s': 0.0,
        'Idle Min': np.inf,
        'Idle Max': 0.0,
        'Idle Mean': 0.0,
        'Idle Std': 0.0,
        'FWD Init Win Bytes': 0,
        'Bwd Init Win Bytes': 0,
        'ACK Flag Count': 0
    }

    if not packets:
        return pd.DataFrame(features, index=[0])

    start_times = []
    packet_lengths = []
    iats = []
    total_bytes = 0

    for packet in packets:
        if IP in packet and TCP in packet:
            packet_length = len(packet)
            packet_lengths.append(packet_length)
            total_bytes += packet_length

            if 'S' in packet[TCP].flags:
                if features['FWD Init Win Bytes'] == 0:
                    features['FWD Init Win Bytes'] = packet[TCP].window
                else:
                    features['Bwd Init Win Bytes'] = packet[TCP].window

            if 'A' in packet[TCP].flags:
                features['ACK Flag Count'] += 1

            start_times.append(float(packet.time))

            if len(start_times) > 1:
                iat = start_times[-1] - start_times[-2]
                iats.append(iat)

    features['Flow Duration'] = max(start_times) - min(start_times)
    features['Total Fwd Packet'] = len([p for p in packets if IP in p and p[IP].src < p[IP].dst])
    features['Total Bwd packets'] = len([p for p in packets if IP in p and p[IP].src > p[IP].dst])
    features['Packet Length Min'] = min(packet_lengths)
    features['Packet Length Mean'] = np.mean(packet_lengths) if packet_lengths else 0
    features['Fwd IAT Total'] = sum(iats)
    features['Flow IAT Min'] = min(iats) if iats else 0
    features['Flow IAT Max'] = max(iats) if iats else 0
    features['Flow IAT Min'] = np.mean(iats) if iats else 0
    features['Flow Packets/s'] = len(packets) / features['Flow Duration'] if features['Flow Duration'] else 0
    features['Flow Bytes/s'] = total_bytes / features['Flow Duration'] if features['Flow Duration'] else 0

    # Handle potential NaNs and infs
    for key, value in features.items():
        if isinstance(value, float) and (np.isinf(value) or np.isnan(value)):
            features[key] = 0

    df_features = pd.DataFrame([features])

# Handle potential NaNs and infs again before returning
    df_features.replace([np.inf, -np.inf], np.nan, inplace=True)
    df_features.fillna(0, inplace=True)

    return df_features




In [6]:
import xgboost as xgb

def classify_traffic(df_features, model_path):
    # Load the trained model
    xgb_model = xgb.XGBClassifier()

    # Load the model
    xgb_model.load_model(model_path)

    # Ensure that the model has been fitted before making predictions
    if not xgb_model.get_booster().attr("n_features"):
        raise ValueError("Model needs to be fitted before making predictions")

    # Predict the traffic class
    predictions = xgb_model.predict(df_features)
    return predictions


In [8]:
import xgboost as xgb

model_path = './xgb_model.json'  # Make sure this is the correct path to your model file
pcap_file_path = './i2p-chat_00001_20200407201731.pcap'

# Loading the trained XGBoost model
xgb_classifier = xgb.XGBClassifier()
xgb_classifier.load_model(model_path)

# Extract features from the pcap file
df_features = extract_features(pcap_file_path)

# Output the predicted class
print("Predicted Class:", predictions[0])
print(label_encoder.classes_)

Predicted Class: darknet
['darknet' 'normal']


In [9]:
if isinstance(df_features, pd.Series):
    df_features = df_features.to_frame().transpose()

predictions = xgb_classifier.predict(df_features)

In [10]:
if predictions[0] == 0:
    print("Darknet")
else:
    print("Normal")

Darknet


# **2nd Level Classification**

In [11]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
import xgboost as xgb
from sklearn.metrics import accuracy_score, f1_score
from sklearn.preprocessing import LabelEncoder, OneHotEncoder

# Load the labeled dataset
df_labeled = pd.read_csv('/content/drive/MyDrive/Colab Notebooks/csv/darknet-type.csv')

# Replace infinite values with NaN and fill NaN values with zeros
df_labeled.replace([np.inf, -np.inf], np.nan, inplace=True)
df_labeled.fillna(0, inplace=True)

# Define the features to keep
features_to_keep = [
    'Flow Duration', 'Total Fwd Packet', 'Total Bwd packets',
    'Flow Bytes/s', 'Flow Packets/s', 'Fwd IAT Mean',
    'Flow IAT Max', 'Flow IAT Min', 'Fwd IAT Total',
    'Packet Length Min', 'Packet Length Mean', 'ACK Flag Count',
    'FWD Init Win Bytes', 'Bwd Init Win Bytes', 'Idle Mean',
    'Idle Std', 'Idle Max', 'Idle Min'
]

# Keep only the relevant features
df_relevant_features = df_labeled[features_to_keep + ['Label']]

# Split the data into features and labels
X = df_relevant_features.drop('Label', axis=1)
y = df_relevant_features['Label'].map({'Normal': 'normal', 'I2P': 'I2P', 'Tor': 'Tor', 'FreeNet': 'freenet','ZeroNet': 'ZeroNet'})

# Ensure all feature columns are numeric
X = X.apply(pd.to_numeric, errors='coerce')
df_labeled.fillna(0, inplace=True)

# Split into training and testing set
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Apply one-hot encoding only to the train and test datasets
X_train = pd.get_dummies(X_train, drop_first=True)
X_test = pd.get_dummies(X_test, drop_first=True)

# Align X_train and X_test
X_train, X_test = X_train.align(X_test, join='inner', axis=1)

# Initialize XGBoost classifier
Xgb_level2 = xgb.XGBClassifier(objective='multi:softprob', num_class=5, seed=42)

# Encode the labels
label_encoder = LabelEncoder()
y_train_encoded = label_encoder.fit_transform(y_train)
y_test_encoded = label_encoder.transform(y_test)

# Train the classifier
Xgb_level2.fit(X_train, y_train_encoded)

# Make predictions
encoded_predictions = Xgb_level2.predict(X_test)

# Decode the predictions
predictions = label_encoder.inverse_transform(encoded_predictions)

# Calculate accuracy and F1 score
accuracy = accuracy_score(y_test_encoded, encoded_predictions)
f1 = f1_score(y_test_encoded, encoded_predictions, average='weighted')

print("Accuracy:", accuracy)
print("F1 Score:", f1)

FileNotFoundError: [Errno 2] No such file or directory: '/content/drive/MyDrive/Colab Notebooks/csv/darknet-type.csv'

In [25]:
print(predictions[1])

Tor


In [26]:
# Save the model to a binary file
Xgb_level2.save_model('xgblevel2_model.json')

In [33]:
# To load the model from the file
loaded_model = xgb.XGBClassifier()
loaded_model.load_model('xgblevel2_model.json')

In [34]:
import xgboost as xgb

def classify_traffic(df_features, model_path):
    # Load the trained model
    xgb_model = xgb.XGBClassifier()

    # Load the model
    xgb_model.load_model(model_path)

    # Ensure that the model has been fitted before making predictions
    if not xgb_model.get_booster():
        raise ValueError("Model needs to be fitted before making predictions")

    # Predict the traffic class
    predictions = xgb_model.predict(df_features)
    return predictions

In [47]:
import xgboost as xgb

# Define the path to your model and pcap file
model_path = '/content/xgblevel2_model.json'  # Make sure this is the correct path to your model file
pcap_file_path = '/content/drive/MyDrive/Colab Notebooks/pcap/i2p-p2p_00001_20200506213643.pcap'

# Load the trained XGBoost model
Xgb_level2 = xgb.XGBClassifier()
Xgb_level2.load_model(model_path)

# Extract features from the pcap file
df_features = extract_features(pcap_file_path)

# Prepare the features for the model (make sure df_features is a DataFrame with the correct shape)
# The model expects a 2D array-like structure. If df_features is a Series, convert it to a DataFrame

# Check the predicted class and print the corresponding label


# Output the predicted class
#print("Predicted Class:", predictions[0])


In [None]:
import xgboost as xgb
import pandas as pd

# Assuming the extract_features function is defined elsewhere and correctly implemented
# from your_feature_extraction_module import extract_features

model_path = '/content/xgblevel2_model.json'  # Correct path to your model file
pcap_file_path = '/content/drive/MyDrive/Colab Notebooks/pcap/freenet-file1_00001_20200416195121.pcap'

# Load the trained XGBoost model
Xgb_level2 = xgb.XGBClassifier()
Xgb_level2.load_model(model_path)

# Extract features from the pcap file
df_features = extract_features(pcap_file_path)

# Ensure df_features is correctly formatted as a DataFrame
if isinstance(df_features, pd.Series):
    df_features = df_features.to_frame().T  # Convert Series to DataFrame if necessary

# Prepare the features for the model (e.g., scaling, encoding) as was done for the training data
# Ensure df_features has the correct columns in the correct order
expected_feature_names = ['Flow Duration', 'Total Fwd Packet', 'Total Bwd packets',
                          'Flow Bytes/s', 'Flow Packets/s', 'Fwd IAT Mean',
                          'Flow IAT Max', 'Flow IAT Min', 'Fwd IAT Total',
                          'Packet Length Min', 'Packet Length Mean', 'ACK Flag Count',
                          'FWD Init Win Bytes', 'Bwd Init Win Bytes', 'Idle Mean',
                          'Idle Std', 'Idle Max', 'Idle Min']

df_features = df_features[expected_feature_names]

# Now, predict the traffic class using the model
predictions = Xgb_level2.predict(df_features)

# Assuming the predictions need to be mapped back to readable class labels
# This step requires you to have the mapping or use the original encoder used during model training
# Assuming label_encoder is your LabelEncoder instance used during model training
original_label = label_encoder.inverse_transform([predictions[0]])
print("Predicted Class:", original_label[0])



# **Flask**

In [17]:
!pip install flask
!pip install scapy



In [18]:
!pip install flask flask-ngrok




In [19]:
!pip install flask-ngrok



In [20]:
!pip install pyngrok



In [21]:
!ngrok authtoken '2cpp1zJAcAZMCHxMkJkMa0cx3yL_6MW1WjyGsbC6JCuEFmm8B'


Authtoken saved to configuration file: C:\Users\hp\AppData\Local/ngrok/ngrok.yml


In [26]:
pip install flask-cors


Collecting flask-cors
  Downloading Flask_Cors-4.0.0-py2.py3-none-any.whl (14 kB)
Installing collected packages: flask-cors
Successfully installed flask-cors-4.0.0


In [None]:
from pyngrok import ngrok

# Setup a tunnel to the Flask application on port 5000
ngrok_tunnel = ngrok.connect(5000)


t=2024-02-25T06:06:52+0530 lvl=eror msg="heartbeat timeout, terminating session" obj=tunnels.session obj=csess id=b4d319fc75a5 clientid=6c1748ce15e8c4ade5984fae366bafb3
t=2024-02-25T06:06:52+0530 lvl=eror msg="session closed, starting reconnect loop" obj=tunnels.session obj=csess id=08ab938f3a88 err="session closed"
t=2024-02-25T06:06:52+0530 lvl=eror msg="failed to reconnect session" obj=tunnels.session obj=csess id=08ab938f3a88 err="failed to dial ngrok server with address \"connect.ngrok-agent.com:443\": dial tcp: lookup connect.ngrok-agent.com: no such host"
t=2024-02-25T06:06:53+0530 lvl=eror msg="failed to reconnect session" obj=tunnels.session obj=csess id=08ab938f3a88 err="failed to dial ngrok server with address \"connect.ngrok-agent.com:443\": dial tcp: lookup connect.ngrok-agent.com: no such host"
t=2024-02-25T06:06:54+0530 lvl=eror msg="failed to reconnect session" obj=tunnels.session obj=csess id=08ab938f3a88 err="failed to dial ngrok server with address \"connect.ngrok-ag

In [35]:
from flask import Flask, request, jsonify, render_template
from werkzeug.utils import secure_filename
from flask_cors import CORS
import os
import numpy as np
import pandas as pd
from scapy.all import rdpcap, IP, TCP
import xgboost as xgb

app = Flask(__name__, template_folder='.')
app.config['UPLOAD_FOLDER'] = 'uploads'
os.makedirs(app.config['UPLOAD_FOLDER'], exist_ok=True)
CORS(app)

MODEL_PATH = './xgb_model.json'
xgb_model = xgb.XGBClassifier()
xgb_model.load_model(MODEL_PATH)

def preprocess_pcap(file_path):
    packets = rdpcap(file_path)
    features = {
        'Flow Duration': 0.0,
        'Total Fwd Packet': 0,
        'Total Bwd packets': 0,
        'Packet Length Min': np.inf,
        'Packet Length Mean': 0.0,
        'Fwd IAT Total': 0.0,
        'Flow IAT Min': np.inf,
        'Flow IAT Max': 0.0,
        'Fwd IAT Mean': 0.0,
        'Flow Packets/s': 0.0,
        'Flow Bytes/s': 0.0,
        'Idle Min': np.inf,
        'Idle Max': 0.0,
        'Idle Mean': 0.0,
        'Idle Std': 0.0,
        'FWD Init Win Bytes': 0,
        'Bwd Init Win Bytes': 0,
        'ACK Flag Count': 0
    }

    if not packets:
        return pd.DataFrame(features, index=[0])

    start_times = []
    packet_lengths = []
    iats = []
    total_bytes = 0

    for packet in packets:
        if IP in packet and TCP in packet:
            packet_length = len(packet)
            packet_lengths.append(packet_length)
            total_bytes += packet_length

            if 'S' in packet[TCP].flags:
                if features['FWD Init Win Bytes'] == 0:
                    features['FWD Init Win Bytes'] = packet[TCP].window
                else:
                    features['Bwd Init Win Bytes'] = packet[TCP].window

            if 'A' in packet[TCP].flags:
                features['ACK Flag Count'] += 1

            start_times.append(float(packet.time))

            if len(start_times) > 1:
                iat = start_times[-1] - start_times[-2]
                iats.append(iat)

    features['Flow Duration'] = max(start_times) - min(start_times)
    features['Total Fwd Packet'] = len([p for p in packets if IP in p and p[IP].src < p[IP].dst])
    features['Total Bwd packets'] = len([p for p in packets if IP in p and p[IP].src > p[IP].dst])
    features['Packet Length Min'] = min(packet_lengths)
    features['Packet Length Mean'] = np.mean(packet_lengths) if packet_lengths else 0
    features['Fwd IAT Total'] = sum(iats)
    features['Flow IAT Min'] = min(iats) if iats else 0
    features['Flow IAT Max'] = max(iats) if iats else 0
    features['Fwd IAT Mean'] = np.mean(iats) if iats else 0
    features['Flow Packets/s'] = len(packets) / features['Flow Duration'] if features['Flow Duration'] else 0
    features['Flow Bytes/s'] = total_bytes / features['Flow Duration'] if features['Flow Duration'] else 0

    df_features = pd.DataFrame([features])
    df_features.replace([np.inf, -np.inf], np.nan, inplace=True)
    df_features.fillna(0, inplace=True)
    return df_features

def predict_darknet(file_path):
    try:
        df_features = preprocess_pcap(file_path)
        predictions = xgb_model.predict(df_features)
        return predictions.tolist()  # Convert numpy array to list for JSON serialization
    except Exception as e:
        print("Error predicting:", e)
        return None

@app.route("/")
def home():
    return render_template("index.html")

@app.route('/predict', methods=['POST'])
def predict():
    try:
        if 'file' not in request.files:
            return jsonify({'error': 'No file part'}), 400

        file = request.files['file']
        if file.filename == '':
            return jsonify({'error': 'No selected file'}), 400

        filename = secure_filename(file.filename)
        file_path = os.path.join(app.config['UPLOAD_FOLDER'], filename)
        file.save(file_path)

        predictions = predict_darknet(file_path)
        if predictions is not None:
            return jsonify({'prediction': predictions})
        else:
            return jsonify({'error': 'Failed to make predictions'}), 500
    except Exception as e:
        print("Error processing file:", e)
        return jsonify({'error': 'Failed to process the file. Please try again.'}), 500

if __name__ == "__main__":
    app.run(debug=True, use_reloader=False)


 * Serving Flask app '__main__'
 * Debug mode: on


 * Running on http://127.0.0.1:5000
Press CTRL+C to quit


In [37]:
# Pseudocode for REST API design

# Endpoint: /predict
# Method: POST
# Description: Receives a file upload and returns predictions from an XGBoost model.

# Define the REST API interface
class RestAPIInterface:
    
    # Constructor to load the XGBoost model
    def _init_(self, model_path):
        self.model = self.load_model(model_path)
    
    # Method to load the model
    def load_model(self, model_path):
        # Load the XGBoost model
        model = xgb.XGBClassifier()
        model.load_model(model_path)
        return model
    
    # Method to preprocess pcap file and extract features
    def preprocess_pcap(self, file_path):
        # Preprocessing logic here
        pass
    
    # Method to predict using the preprocessed data
    def predict_darknet(self, file_path):
        features = self.preprocess_pcap(file_path)
        # Predict and return results
        pass
    
    # Endpoint to handle file upload and prediction
    def predict(self, file):
        # Logic to handle file upload
        # Logic to save the file
        file_path = self.save_file(file)
        
        # Make prediction
        predictions = self.predict_darknet(file_path)
        
        # Logic to return the prediction result
        return predictions

# This interface would be used by a web framework (like Flask, Django, FastAPI) to handle requests
# and responses. For example, in Flask, you would have:

from flask import Flask, request, jsonify

app = Flask(__name__)
api = RestAPIInterface('/path/to/model.json')
@app.route('/predict', methods=['POST'])
def predict_route():
    file = request.files['file']
    predictions = api.predict(file)
    return jsonify(predictions)

TypeError: RestAPIInterface() takes no arguments