# Notebook 3: Stratified train/validation/test split

In [1]:
# Load libraries
import json
from pathlib import Path

import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split


In [2]:
# Read the engineered dataset from the processed-data folder.
# The path is relative to the notebook's working directory.
working_data = pd.read_csv(
    filepath_or_buffer="../data/processed/dataset_engineered.csv"
)


### Prepare features and target

In [3]:
# Build the list of model input columns.
#
# The target score, its categorical counterpart and (only when the dataset
# actually has one) the "weight" column are kept out of the feature matrix.
exclude_cols = ["vulnerability_score", "vulnerability_category"] + (
    ["weight"] if "weight" in working_data.columns else []
)

# A boolean mask over the column index keeps the original column order.
is_feature = ~working_data.columns.isin(exclude_cols)
feature_columns = working_data.columns[is_feature].tolist()
print(f"Feature columns identified: {len(feature_columns)}")


Feature columns identified: 65


### Separate features and target

In [4]:
# Pull the feature matrix and both target views out of the working frame.
# Each piece is copied so that later edits to X / y / y_cat cannot write
# through to working_data.
X = working_data.loc[:, feature_columns].copy()
y, y_cat = (
    working_data[target_name].copy()
    for target_name in ("vulnerability_score", "vulnerability_category")
)


## Stratified train/validation/test split

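- We stratify on `vulnerability_category` rather than on the continuous `vulnerability_score`: stratification needs discrete classes, and the categories represent meaningful business segments. Stratifying on them ensures that every split contains all segments in the same proportions as the full dataset, including the rare "Very High" segment (about 1% of rows).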

In [7]:
# Stage 1 - hold out 30% of the rows; the remaining 70% is the training set.
# The held-out part is divided into validation and test sets in stage 2.
X_train, X_temp, y_train, y_temp, y_cat_train, y_cat_temp = train_test_split(
    X,
    y,
    y_cat,
    test_size=0.30,
    stratify=y_cat,
    random_state=42,
)

# Stage 2 - halve the held-out rows: 50% of 30% is 15% of the total, once for
# validation and once for test. Stratifying again on the held-out categories
# keeps the category mix in both halves.
X_val, X_test, y_val, y_test, y_cat_val, y_cat_test = train_test_split(
    X_temp, y_temp, y_cat_temp, test_size=0.50, stratify=y_cat_temp, random_state=42
)

# Report the absolute and relative size of every split.
n_total = len(working_data)
print("Split sizes:")
for split_label, split_frame in (
    ("Training", X_train),
    ("Validation", X_val),
    ("Test", X_test),
):
    n_rows = split_frame.shape[0]
    print(f"  {split_label}: {n_rows:,} ({n_rows / n_total * 100:.1f}%)")
print(f"  Total: {X_train.shape[0] + X_val.shape[0] + X_test.shape[0]:,}")


Split sizes:
  Training: 16,933 (70.0%)
  Validation: 3,628 (15.0%)
  Test: 3,629 (15.0%)
  Total: 24,190


### Verify stratification

In [8]:
# Verify stratification


def print_category_distribution(y_cat, split_name):
    """Print the percentage share of each vulnerability category in a split.

    Args:
        y_cat: pandas Series of category labels for one split.
        split_name: Heading printed above the percentages.

    Categories are always listed in the fixed order Low, Medium, High,
    Very High so that the output of different splits lines up. A category
    from that list that does not occur in ``y_cat`` is skipped, and labels
    outside the list are not printed (they still count towards the total
    used for the percentages).
    """
    pct_by_category = y_cat.value_counts(normalize=True).mul(100).sort_index()

    print(f"{split_name}:")
    present_labels = (
        label
        for label in ("Low", "Medium", "High", "Very High")
        if label in pct_by_category.index
    )
    for label in present_labels:
        print(f"  {label}: {pct_by_category[label]:.1f}%")


# Show the category mix of the full dataset followed by each split so the
# stratification can be checked by eye.
print("Category distributions across splits:")
for split_label, split_categories in (
    ("Original", y_cat),
    ("Training", y_cat_train),
    ("Validation", y_cat_val),
    ("Test", y_cat_test),
):
    print_category_distribution(split_categories, split_label)


Category distributions across splits:
Original:
  Low: 25.0%
  Medium: 58.1%
  High: 15.9%
  Very High: 1.0%
Training:
  Low: 25.0%
  Medium: 58.1%
  High: 15.9%
  Very High: 1.0%
Validation:
  Low: 24.9%
  Medium: 58.1%
  High: 15.9%
  Very High: 1.0%
Test:
  Low: 25.0%
  Medium: 58.1%
  High: 15.9%
  Very High: 1.0%


In [9]:
# Quantify how far each split's category mix drifts from the full dataset.
original_dist = y_cat.value_counts(normalize=True).sort_index()
train_dist = y_cat_train.value_counts(normalize=True).sort_index()
val_dist = y_cat_val.value_counts(normalize=True).sort_index()
test_dist = y_cat_test.value_counts(normalize=True).sort_index()

# For every split, take the largest absolute gap in category share:
#   - abs(): only the size of the gap matters, not its direction
#   - max(): report the worst category
# sub(..., fill_value=0) treats a category that is absent from one side as a
# 0% share. A plain `original_dist - split_dist` yields NaN for such a
# category and max() skips NaN, which would hide a split that lost a whole
# category - exactly the failure this check is meant to catch.
max_diff_train = original_dist.sub(train_dist, fill_value=0).abs().max()
max_diff_val = original_dist.sub(val_dist, fill_value=0).abs().max()
max_diff_test = original_dist.sub(test_dist, fill_value=0).abs().max()

print("Max distribution differences:")
print(f"  Training: {max_diff_train:.3f}")
print(f"  Validation: {max_diff_val:.3f}")
print(f"  Test: {max_diff_test:.3f}")

# A gap of 5 percentage points or more is treated as a stratification problem.
# NOTE(review): the threshold is absolute, so it is loose for small categories
# (a class with a share of about 1% could vanish and still pass) - consider a
# relative or per-category threshold.
worst_diff = max(max_diff_train, max_diff_val, max_diff_test)
if worst_diff < 0.05:
    print("Stratification good (differences < 5%)")
else:
    # Make a failed check visible instead of printing nothing.
    print(f"WARNING: stratification off (max difference {worst_diff:.3f} >= 5%)")


Max distribution differences:
  Training: 0.000
  Validation: 0.000
  Test: 0.000
Stratification good (differences < 5%)


In [12]:
# Compare the target score's location and spread across the splits.
# Similar rows mean no split ended up with an unusual score distribution.

splits = [("Training", y_train), ("Validation", y_val), ("Test", y_test), ("Overall", y)]

print("Vulnerability score statistics:")
header_cells = [f"{'Split':<12}"] + [
    f"{title:<6}" for title in ("Mean", "Std", "Min", "Max")
]
print(" ".join(header_cells))
print()

for split_name, y_split in splits:
    mean_val, std_val, min_val, max_val = (
        y_split.mean(),
        y_split.std(),
        y_split.min(),
        y_split.max(),
    )
    # One left-aligned row per split: 12-wide label, then four 6-wide numbers.
    row_cells = [f"{split_name:<12}"] + [
        f"{stat:<6.1f}" for stat in (mean_val, std_val, min_val, max_val)
    ]
    print(" ".join(row_cells))


Vulnerability score statistics:
Split        Mean   Std    Min    Max   

Training     35.6   15.1   0.0    94.0  
Validation   35.6   15.2   0.0    97.0  
Test         35.6   15.2   0.0    100.0 
Overall      35.6   15.2   0.0    100.0 


#### Feature statistics validation

In [13]:
# Spot-check that feature means are similar across the splits.

# Pick a small sample of columns to compare: "age" when present ...
key_features = ["age"] if "age" in X.columns else []

# ... up to three columns whose name ends in "_binary" ...
binary_features = [col for col in X.columns if col.endswith("_binary")]
key_features += binary_features[:3]

# ... and up to two further columns intended to be one-hot encoded features.
# NOTE(review): the pattern below matches every column that contains an
# underscore and does not end in "_binary", so it is not limited to one-hot
# columns - confirm this is the intended selection.
onehot_features = [
    col for col in X.columns if "_" in col and not col.endswith("_binary")
]
key_features += onehot_features[:2]

print(f"Validating {len(key_features)} key features:")

# Every entry of key_features was taken from X.columns above, so no further
# membership check is needed inside the loop.
for feature in key_features:
    overall_mean = X[feature].mean()
    train_mean, val_mean, test_mean = (
        split_frame[feature].mean() for split_frame in (X_train, X_val, X_test)
    )

    # Largest absolute deviation of any split mean from the overall mean.
    # The value is in the feature's own units, so it is not comparable
    # between features on different scales.
    max_diff = max(
        abs(split_mean - overall_mean)
        for split_mean in (train_mean, val_mean, test_mean)
    )

    print(f"  {feature}: max difference = {max_diff:.4f}")

# NOTE(review): printed unconditionally - no threshold is applied to the
# differences above, so this line is informational only.
print("Feature distributions validated")


Validating 6 key features:
  age: max difference = 0.3944
  use_news_tvshows_binary: max difference = 0.0102
  use_news_tvchannels_binary: max difference = 0.0051
  use_news_radio_binary: max difference = 0.0068
  income_ordinal: max difference = 0.0138
  country_AT: max difference = 0.0051
Feature distributions validated


## Save split datasets


In [16]:
# Write the three splits and a small metadata file to ../data/splits.
splits_dir = Path("../data/splits")
# to_csv does not create missing folders, so make sure the target exists.
splits_dir.mkdir(parents=True, exist_ok=True)

# Features and target go into separate files per split (X_<split>.csv and
# y_<split>.csv). The row index is dropped and the target column is always
# labelled "vulnerability_score".
for split_label, features_part, target_part in (
    ("train", X_train, y_train),
    ("val", X_val, y_val),
    ("test", X_test, y_test),
):
    features_part.to_csv(splits_dir / f"X_{split_label}.csv", index=False)
    target_part.to_csv(
        splits_dir / f"y_{split_label}.csv",
        index=False,
        header=["vulnerability_score"],
    )

# Save feature names together with the split sizes.
feature_info = {
    "feature_columns": feature_columns,
    "n_features": len(feature_columns),
    "split_info": {
        "train_size": len(X_train),
        "val_size": len(X_val),
        "test_size": len(X_test),
        "total_size": len(working_data),
    },
}

# The dictionary was previously built but never written anywhere; persist it
# as JSON next to the CSV files so the feature list travels with the splits.
with open(splits_dir / "feature_info.json", "w", encoding="utf-8") as info_file:
    json.dump(feature_info, info_file, indent=2)


In [21]:
# Final recap of the split. The percentages are computed from the actual row
# counts rather than hard-coded, so the summary stays correct if the split
# ratios are changed.
n_total = len(working_data)

print("Data splitting complete:")
print(f"- Total observations: {n_total:,}")
print(f"- Features: {len(feature_columns)}")
print(f"- Training set: {len(X_train):,} ({len(X_train) / n_total:.0%})")
print(f"- Validation set: {len(X_val):,} ({len(X_val) / n_total:.0%})")
print(f"- Test set: {len(X_test):,} ({len(X_test) / n_total:.0%})")
print()

# Worst category-share gap over all three splits, as computed in the
# stratification check cell.
print("Stratification verified:")
print("- Category distributions preserved across splits")
print(
    f"- Maximum distribution difference: {max(max_diff_train, max_diff_val, max_diff_test):.3f}"
)


Data splitting complete:
- Total observations: 24,190
- Features: 65
- Training set: 16,933 (70%)
- Validation set: 3,628 (15%)
- Test set: 3,629 (15%)

Stratification verified:
- Category distributions preserved across splits
- Maximum distribution difference: 0.000
