In [1]:
import pandas as pd
import numpy as np

# Build a reproducible synthetic risk dataset.
# The legacy global NumPy RNG is seeded once; the three draws below must stay
# in this order (TEF, Vulnerability, Asset_Value) to reproduce the same values.
np.random.seed(42)
num_samples = 5000

tef = np.random.uniform(low=0.1, high=1.0, size=num_samples)            # Threat Event Frequency
vulnerability = np.random.uniform(low=0, high=1, size=num_samples)      # Vulnerability
asset_value = np.random.uniform(low=1000, high=1000000, size=num_samples)  # Asset Value

# Risk level is the likelihood term only: frequency times vulnerability.
# Asset_Value is carried along as a column but does not enter this product.
risk_level = tef * vulnerability

# Assemble the feature columns, then attach the label column.
# pd.qcut with q=3 splits Risk_Level at its terciles, so the three labels
# (Low < Medium < High) each cover roughly a third of the rows.
data = pd.DataFrame(
    dict(
        TEF=tef,
        Vulnerability=vulnerability,
        Asset_Value=asset_value,
        Risk_Level=risk_level,
    )
)
data = data.assign(
    Risk_Category=pd.qcut(
        data['Risk_Level'],
        q=3,
        labels=['Low', 'Medium', 'High'],
    )
)

# Show the first five rows as a quick sanity check.
print(data.head())


        TEF  Vulnerability    Asset_Value  Risk_Level Risk_Category
0  0.437086       0.393636  374267.177648    0.172053        Medium
1  0.955643       0.473436  333579.184135    0.452435          High
2  0.758795       0.854547  176977.758590    0.648426          High
3  0.638793       0.340004  607659.403431    0.217192        Medium
4  0.240417       0.869650  477147.536348    0.209078        Medium


In [2]:
# Features: the three numeric input columns. Target: the categorical risk label.
X = data.loc[:, ['TEF', 'Vulnerability', 'Asset_Value']]
y = data.loc[:, 'Risk_Category']

In [3]:
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score
# Hold out 30% of the rows as a test set; the fixed random_state makes the
# shuffle (and therefore the split) repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.3,
    random_state=42,
)


In [4]:
# Baseline classifier: a 100-tree random forest with a fixed seed,
# fitted on the training split only.
model = RandomForestClassifier(
    random_state=42,
    n_estimators=100,
)
model.fit(X_train, y_train)



In [5]:
# Label every held-out test row with the baseline model.
predictions = model.predict(X=X_test)


In [6]:
# Fraction of test rows whose predicted category matches the true one.
accuracy = accuracy_score(y_true=y_test, y_pred=predictions)
print(f"Model Accuracy: {accuracy}")

Model Accuracy: 0.984


In [7]:
from sklearn.model_selection import GridSearchCV

# Hyper-parameter search space: 3 * 4 * 3 * 3 = 108 candidate forests.
param_grid = {
    'n_estimators': [50, 100, 200],
    'max_depth': [None, 10, 20, 30],
    'min_samples_split': [2, 5, 10],
    'min_samples_leaf': [1, 2, 4]
}

# Exhaustive search scored by 5-fold cross-validated accuracy.
# GridSearchCV clones `model` for every candidate, so the clones inherit its
# random_state=42; n_jobs=-1 runs the fits on all available cores.
grid_search = GridSearchCV(estimator=model, param_grid=param_grid,
                           cv=5, scoring='accuracy', n_jobs=-1)

# Run the search on the training split only (the test split stays unseen).
grid_search.fit(X_train, y_train)

# Report the winning parameter combination.
best_params = grid_search.best_params_
print(f"Best Parameters: {best_params}")

# With the default refit=True, GridSearchCV has already refitted a clone of
# `model` with best_params on the whole training split. Reuse that estimator
# instead of constructing and fitting an identical forest a second time.
best_model = grid_search.best_estimator_

# Evaluate the tuned model on the held-out test split.
best_predictions = best_model.predict(X_test)
best_accuracy = accuracy_score(y_test, best_predictions)
print(f"Best Model Accuracy: {best_accuracy}")

Best Parameters: {'max_depth': None, 'min_samples_leaf': 1, 'min_samples_split': 2, 'n_estimators': 100}
Best Model Accuracy: 0.984


In [8]:
from sklearn.model_selection import cross_val_score

# 5-fold cross-validated accuracy of the tuned configuration.
# cross_val_score fits a fresh clone of best_model on each fold.
# NOTE(review): this runs on the full X, y, which includes the rows held out
# as the test split above.
cv_scores = cross_val_score(
    estimator=best_model,
    X=X,
    y=y,
    scoring='accuracy',
    cv=5,
)

# Per-fold scores followed by their average.
print(f"Cross-Validation Accuracy Scores: {cv_scores}")
print(f"Mean Cross-Validation Accuracy: {np.mean(cv_scores)}")

Cross-Validation Accuracy Scores: [0.987 0.978 0.982 0.976 0.978]
Mean Cross-Validation Accuracy: 0.9802


In [9]:
# Predict Risk Scenarios

# Draw 100 fresh synthetic scenarios from the same value ranges used for the
# training data. No reseed happens here, so the values depend on how much of
# the global NumPy random stream has been consumed before this cell runs.
new_tef = np.random.uniform(low=0.1, high=1.0, size=100)  # Threat Event Frequency
new_vulnerability = np.random.uniform(low=0, high=1, size=100)  # Vulnerability
new_asset_value = np.random.uniform(low=1000, high=1000000, size=100)  # Asset Value

# Collect the draws under the same column names the model was trained on.
new_data = pd.DataFrame(
    dict(
        TEF=new_tef,
        Vulnerability=new_vulnerability,
        Asset_Value=new_asset_value,
    )
)

In [10]:
# Predict risk categories for the new scenarios.
# Pass only the three feature columns the model was trained on. This cell adds
# a prediction column to new_data below, so handing the whole frame to
# predict() would feed the model an extra string column if the cell is re-run.
new_predictions = best_model.predict(
    new_data[['TEF', 'Vulnerability', 'Asset_Value']]
)

# Store the predicted label next to each scenario's inputs.
new_data['Predicted_Risk_Category'] = new_predictions

# Display the first few rows of the new data with predictions
print(new_data.head())

        TEF  Vulnerability    Asset_Value Predicted_Risk_Category
0  0.549703       0.465872  637262.073193                  Medium
1  0.772072       0.057683  522305.646889                     Low
2  0.606400       0.186405  434642.317803                     Low
3  0.174972       0.560509  192783.530353                     Low
4  0.267022       0.866189  488731.139840                  Medium


In [17]:
# Write new_data (scenario inputs plus any columns added to it so far) to CSV,
# leaving out the row index.
new_data.to_csv(
    path_or_buf='predicted_risk_scenarios_updated.csv',
    index=False,
)

In [11]:
# Save the baseline model's test-set predictions as a one-column CSV
# (column 'Predicted Risk', no row index written).
predictions_df = pd.DataFrame({'Predicted Risk': predictions})
predictions_df.to_csv('predicted_risk_scenarios.csv', index=False)

In [13]:
# Peek at the last five exported predictions.
print(predictions_df.tail(n=5))

     Predicted Risk
1495           High
1496         Medium
1497         Medium
1498           High
1499         Medium


In [30]:
# Median asset value across the new scenarios (shown as the cell's output).
new_data.loc[:, "Asset_Value"].median()

538390.5272271815