In [1]:
from datasets import load_dataset
import os
dataset = load_dataset('parquet', data_files='5G_SecOps_Data_10K.parquet')
api_key = os.getenv('OPENAI_API_KEY')
org_key = os.getenv('OPENAI_ORG_ID')

  from .autonotebook import tqdm as notebook_tqdm
Generating train split: 10000 examples [00:00, 396167.45 examples/s]


In [2]:
train_data = dataset['train']
train_data[0]

{'Cluster': '5G_Core_Cluster_eu-west1',
 'Namespace': 'ericsson',
 'Policy Violations': 5,
 'Suspicious Process Executions': 1,
 'Image Vulnerabilities': 7,
 'Service Configurations': 'Good',
 'Service Reachability': 'External',
 'NEP Vulnerabilities': 5,
 'RBAC Configuration': 'Medium',
 'Protocol Anomalies': 1,
 'Routing Table Changes': 1,
 'DoS Attacks': 2,
 'Firewall Events': 3,
 'Authentication Failures': 0,
 'Authorization Failures': 0,
 'SSL/TLS Certificate Expiry': 0,
 'NTP Synchronization': 0,
 'Unauthorized Device Inventory': 0,
 'Backup Integrity': 0,
 'NMS Alerts': 3,
 'Risk Score': 55.0}

In [3]:
import openai

class RiskScorePredictor:
    def __init__(self, api_key=api_key, model="gpt-3.5-turbo-1106", org_key=org_key):
        openai.api_key = api_key
        openai.organization = org_key
        self.model = model
        self.system_message = {
            "role": "system",
            "content": "You are an expert AI assistant that predicts the risk score based on provided system characteristics over a historical window."
        }

    def create_user_message(self, historical_window):
        """Creates a user message with a historical window of data."""
        message_content = "Given the following historical system characteristics, predict the Risk Score:\n\n"
        
        for i, entry in enumerate(historical_window):
            message_content += (
                f"--- Entry {i+1} ---\n"
                f"Cluster: {entry['Cluster']}\n"
                f"Namespace: {entry['Namespace']}\n"
                f"Policy Violations: {entry['Policy Violations']}\n"
                f"Suspicious Process Executions: {entry['Suspicious Process Executions']}\n"
                f"Image Vulnerabilities: {entry['Image Vulnerabilities']}\n"
                f"Service Configurations: {entry['Service Configurations']}\n"
                f"Service Reachability: {entry['Service Reachability']}\n"
                f"NEP Vulnerabilities: {entry['NEP Vulnerabilities']}\n"
                f"RBAC Configuration: {entry['RBAC Configuration']}\n"
                f"Protocol Anomalies: {entry['Protocol Anomalies']}\n"
                f"Routing Table Changes: {entry['Routing Table Changes']}\n"
                f"DoS Attacks: {entry['DoS Attacks']}\n"
                f"Firewall Events: {entry['Firewall Events']}\n"
                f"Authentication Failures: {entry['Authentication Failures']}\n"
                f"Authorization Failures: {entry['Authorization Failures']}\n"
                f"SSL/TLS Certificate Expiry: {entry['SSL/TLS Certificate Expiry']}\n"
                f"NTP Synchronization: {entry['NTP Synchronization']}\n"
                f"Unauthorized Device Inventory: {entry['Unauthorized Device Inventory']}\n"
                f"Backup Integrity: {entry['Backup Integrity']}\n"
                f"NMS Alerts: {entry['NMS Alerts']}\n\n"
            )

        message_content += "What is the predicted Risk Score for next entry? Only output your predicted score (float number). Only output a single float number. No other text."
        
        return {
            "role": "user",
            "content": message_content
        }

    def create_user_message_w_reasoning(self, historical_window):
        """Creates a user message with a historical window of data."""
        message_content = "Given the following historical system characteristics, predict the Risk Score:\n\n"
        
        for i, entry in enumerate(historical_window):
            message_content += (
                f"--- Entry {i+1} ---\n"
                f"Cluster: {entry['Cluster']}\n"
                f"Namespace: {entry['Namespace']}\n"
                f"Policy Violations: {entry['Policy Violations']}\n"
                f"Suspicious Process Executions: {entry['Suspicious Process Executions']}\n"
                f"Image Vulnerabilities: {entry['Image Vulnerabilities']}\n"
                f"Service Configurations: {entry['Service Configurations']}\n"
                f"Service Reachability: {entry['Service Reachability']}\n"
                f"NEP Vulnerabilities: {entry['NEP Vulnerabilities']}\n"
                f"RBAC Configuration: {entry['RBAC Configuration']}\n"
                 f"Protocol Anomalies: {entry['Protocol Anomalies']}\n"
                f"Routing Table Changes: {entry['Routing Table Changes']}\n"
                f"DoS Attacks: {entry['DoS Attacks']}\n"
                f"Firewall Events: {entry['Firewall Events']}\n"
                f"Authentication Failures: {entry['Authentication Failures']}\n"
                f"Authorization Failures: {entry['Authorization Failures']}\n"
                f"SSL/TLS Certificate Expiry: {entry['SSL/TLS Certificate Expiry']}\n"
                f"NTP Synchronization: {entry['NTP Synchronization']}\n"
                f"Unauthorized Device Inventory: {entry['Unauthorized Device Inventory']}\n"
                f"Backup Integrity: {entry['Backup Integrity']}\n"
                f"NMS Alerts: {entry['NMS Alerts']}\n\n"
            )
        message_content += "What is the predicted Risk Score for next entry? Do some reasoning. Only output your predicted score (float number). Only output a single float number. No other text."
        
        return {
            "role": "user",
            "content": message_content
        }


    def predict_risk_score(self, historical_window):
        """Predicts the risk score based on a historical window of entries."""
        messages = [
            self.system_message,
            #self.create_user_message(historical_window),
            self.create_user_message_w_reasoning(historical_window),
            {"role": "assistant", "content": ""}
        ]
        
        response = openai.chat.completions.create(
            model=self.model,
            messages=messages,
            max_tokens=50,
            temperature=0.0
        )
        
        return response.choices[0].message.content

In [4]:
import numpy as np

def evaluate_predictions(predictor, dataset, n=10, window_size=3):
    predictions = []
    ground_truth = []
    
    for i in range(window_size, n):
        # Prepare the historical window
        historical_window = [dataset[j] for j in range(i-window_size, i)]
        
        # Predict the risk score
        predicted_score = float(predictor.predict_risk_score(historical_window))
        
        # Extract the ground truth risk score
        actual_score = dataset[i]['Risk Score']
        
        # Append the results for evaluation
        predictions.append(float(predicted_score))
        ground_truth.append(actual_score)
        
        print(f"Predicted Risk Score: {predicted_score}, Actual Risk Score: {actual_score}")
    
    # Calculate the Mean Squared Error (MSE) as an evaluation metric
    mse = np.mean((np.array(predictions) - np.array(ground_truth)) ** 2)
    print(f"Mean Squared Error over first {n} entries: {mse}")

## zero shot raw prediction

In [5]:
# Initialize the predictor
predictor = RiskScorePredictor(model="gpt-3.5-turbo-1106")

# Assume we want to use a window of the last 3 entries to predict the risk score
historical_window = [train_data[i] for i in range(10)]

# Predict the risk score based on the historical window
predicted_risk_score = predictor.predict_risk_score(historical_window)
print(f"Predicted Risk Score: {predicted_risk_score}")

evaluate_predictions(predictor, train_data, n=10, window_size=3)

AttributeError: module 'openai' has no attribute 'chat'

## with reasoning

In [11]:
# Initialize the predictor
predictor = RiskScorePredictor(model="gpt-3.5-turbo")

# Assume we want to use a window of the last 3 entries to predict the risk score
historical_window = [train_data[i] for i in range(10)]

# Predict the risk score based on the historical window
predicted_risk_score = predictor.predict_risk_score(historical_window)
print(f"Predicted Risk Score: {predicted_risk_score}")

evaluate_predictions(predictor, train_data, n=10, window_size=3)

Predicted Risk Score: 6.5
Predicted Risk Score: 7.3, Actual Risk Score: 46.5
Predicted Risk Score: 7.2, Actual Risk Score: 15.0
Predicted Risk Score: 7.2, Actual Risk Score: 49.0
Predicted Risk Score: 7.5, Actual Risk Score: 30.0
Predicted Risk Score: 7.5, Actual Risk Score: 46.0
Predicted Risk Score: 6.8, Actual Risk Score: 20.0
Predicted Risk Score: 4.2, Actual Risk Score: 40.0
Mean Squared Error over first 10 entries: 969.8714285714286
