
# Analyzing Data Breach Trends Over Time (2013–2024)

This notebook performs an end-to-end analysis of publicly reported data breaches from **2013–2024**.

**What you'll see:**
- Data cleaning & validation
- Yearly trend analysis
- Industry × Attack Vector heatmap
- Severity classification from `records_exposed`
- *(Optional)* simple ML to predict severity from features

> If `data/breaches.csv` is missing, empty, or contains fewer than 100 rows, the notebook generates a small **synthetic sample** instead, so the pipeline can still be demonstrated end to end.


In [None]:

# Imports
import os, math, json, warnings
warnings.filterwarnings("ignore")

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

from pathlib import Path
from src.utils import ensure_schema, classify_severity

# Locations relative to the notebook: the input CSV and the folder that receives saved figures.
DATA_PATH = Path('..') / 'data' / 'breaches.csv'
FIG_DIR = Path('..') / 'figures'
# Create the figure folder up front (no error if it already exists).
FIG_DIR.mkdir(parents=True, exist_ok=True)

# Keep rendered DataFrames compact: cap the rows and columns pandas will display.
pd.options.display.max_rows = 20
pd.options.display.max_columns = 50


## 1) Load data

In [None]:

# Load the breach CSV.
# A missing file and a zero-byte file are treated the same way: pd.read_csv raises
# EmptyDataError on a file with no columns to parse, so we catch it and fall back
# to an empty frame, letting the synthetic-sample branch below take over.
try:
    df = pd.read_csv(DATA_PATH) if DATA_PATH.exists() else pd.DataFrame()
except pd.errors.EmptyDataError:
    df = pd.DataFrame()

# Project helper from src.utils; presumably guarantees the expected columns exist
# (NOTE(review): confirm against src/utils.py).
df = ensure_schema(df)

# If the dataset is empty or has fewer than 100 rows, replace it with a synthetic
# sample so the notebook stays runnable. Say so explicitly: every figure below
# would otherwise look as if it were based on real data.
if df.dropna(how='all').empty or len(df) < 100:
    print(f"NOTE: using a synthetic sample ({DATA_PATH} is missing, empty, or has fewer than 100 rows).")

    rng = np.random.default_rng(42)  # fixed seed -> identical sample on every run
    years = np.arange(2013, 2025)
    industries = ["Healthcare", "Financial", "Public Sector", "Retail", "Technology", "Education"]
    vectors = ["Phishing", "Ransomware", "Web App", "Insider", "Lost/Stolen", "Misconfiguration"]
    vector_priors = [0.22, 0.18, 0.25, 0.10, 0.12, 0.13]  # rough priors, one per entry in `vectors`

    rows = []
    for y in years:
        n = rng.integers(50, 120)  # incidents per year
        for _ in range(int(n)):
            # The order of the rng calls below is kept fixed so the generated sample is stable.
            ind = rng.choice(industries)
            vec = rng.choice(vectors, p=vector_priors)
            # Records exposed: log-normal (heavy tail) whose scale grows with the year.
            base = max(1, (y - 2012)) * 1000
            exposed = int(rng.lognormal(mean=math.log(base), sigma=1.0))
            org_size = rng.choice(["Small", "Medium", "Large"])
            country = rng.choice(["US", "CA", "UK", "EU", "IN"])
            rows.append((y, ind, vec, exposed, org_size, country))
    df = pd.DataFrame(rows, columns=["year", "industry", "attack_vector", "records_exposed", "org_size", "country"])

df.head()


## 2) Clean & basic validation

In [None]:

# Keep only incidents dated 2013–2024 (bounds inclusive; rows with a missing year drop out here too).
df = df[df['year'].between(2013, 2024)].copy()

# Drop rows missing any field the analysis depends on.
df = df.dropna(subset=['year', 'industry', 'attack_vector', 'records_exposed'])

# Fail early with a readable message: every later cell assumes at least one incident,
# and the min()/max() below would otherwise die on int(NaN).
if df.empty:
    raise ValueError("No rows left after filtering to 2013–2024 and dropping incomplete records.")

# Add severity: classify_severity (project helper) maps each records_exposed value to a label.
df['severity'] = df['records_exposed'].apply(classify_severity)

# Quick sanity checks
print("Rows:", len(df))
print("Years:", int(df['year'].min()), "to", int(df['year'].max()))
print("Industries:", df['industry'].nunique(), "| Attack vectors:", df['attack_vector'].nunique())

# Show a reproducible random peek; cap the sample size so small datasets do not raise.
df.sample(min(5, len(df)), random_state=1)


## 3) Yearly trend analysis

In [None]:

# Per-year summary: incident count plus total and median records exposed.
yearly_spec = {
    'incidents': ('year', 'count'),
    'total_records': ('records_exposed', 'sum'),
    'median_records': ('records_exposed', 'median'),
}
yearly = df.groupby('year', as_index=False).agg(**yearly_spec)

display(yearly.head())

# Figure 1: number of incidents per year.
fig, ax = plt.subplots()
ax.plot(yearly['year'], yearly['incidents'], marker='o')
ax.set_title('Incidents per Year (2013–2024)')
ax.set_xlabel('Year')
ax.set_ylabel('Number of Incidents')
ax.grid(True, alpha=0.3)
fig.tight_layout()
fig.savefig(FIG_DIR / 'incidents_per_year.png', dpi=200)
plt.show()

# Figure 2: total records exposed per year, on a log axis for readability.
fig, ax = plt.subplots()
ax.plot(yearly['year'], yearly['total_records'], marker='o')
ax.set_title('Total Records Exposed per Year (2013–2024)')
ax.set_xlabel('Year')
ax.set_ylabel('Records Exposed (log scale)')
ax.set_yscale('log')
ax.grid(True, which='both', alpha=0.3)  # major and minor grid lines
fig.tight_layout()
fig.savefig(FIG_DIR / 'records_per_year.png', dpi=200)
plt.show()


## 4) Industry × Attack Vector heatmap

In [None]:

# Incident counts with one row per industry and one column per attack vector;
# combinations that never occur are shown as 0.
pair_counts = df.groupby(['industry', 'attack_vector']).size()
pivot = pair_counts.unstack('attack_vector').fillna(0)

fig, ax = plt.subplots()
sns.heatmap(pivot, annot=True, fmt='.0f', ax=ax)  # annotate each cell with its count, no decimals
ax.set_title('Incidents by Industry × Attack Vector (2013–2024)')
ax.set_xlabel('Attack Vector')
ax.set_ylabel('Industry')
fig.tight_layout()
fig.savefig(FIG_DIR / 'industry_attack_heatmap.png', dpi=200)
plt.show()


## 5) Severity distribution and top categories

In [None]:

# Incidents per severity level, forced into Low → Critical order; levels that never occur show as 0.
severity_levels = ['Low', 'Medium', 'High', 'Critical']
sev_counts = df['severity'].value_counts()
sev_counts = sev_counts.reindex(severity_levels).fillna(0)
display(sev_counts)

fig, ax = plt.subplots()
sev_counts.plot(kind='bar', ax=ax)
ax.set_title('Severity Distribution')
ax.set_xlabel('Severity')
ax.set_ylabel('Incidents')
fig.tight_layout()
fig.savefig(FIG_DIR / 'severity_distribution.png', dpi=200)
plt.show()


def _top_median_records(group_col):
    """Return the 10 groups of `group_col` with the highest median records_exposed, largest first."""
    medians = df.groupby(group_col)['records_exposed'].median()
    return medians.sort_values(ascending=False).head(10)


# Rank industries and attack vectors by median records exposed (used here as a proxy for severity).
by_industry = _top_median_records('industry')
by_vector = _top_median_records('attack_vector')

display(by_industry, by_vector)


## 6) (Optional) Simple ML: Predict severity from features

In [None]:

# scikit-learn is imported here rather than at the top because this section is optional.
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.metrics import classification_report
from sklearn.linear_model import LogisticRegression

# Encode target as ordinal: Low < Medium < High < Critical
order = {'Low': 0, 'Medium': 1, 'High': 2, 'Critical': 3}
severity_names = list(order)            # display names, in label order
severity_codes = list(order.values())   # matching integer labels

# Keep only rows whose severity is one of the four known levels, then map to integers.
df_ml = df[df['severity'].isin(severity_names)].copy()
df_ml['severity_label'] = df_ml['severity'].map(order)

# Features are all categorical; missing values become their own 'Unknown' category.
X = df_ml[['industry', 'attack_vector', 'org_size', 'country']].fillna('Unknown')
y = df_ml['severity_label']

categorical = list(X.columns)
pre = ColumnTransformer(
    transformers=[('cat', OneHotEncoder(handle_unknown='ignore'), categorical)]
)

# `multi_class` is omitted on purpose: 'auto' is already the default, and the
# parameter is deprecated in recent scikit-learn releases.
clf = Pipeline(steps=[('pre', pre),
                     ('lr', LogisticRegression(max_iter=200))])

# Stratify when possible; train_test_split rejects stratification if any class
# has fewer than two members, so fall back to a plain split in that case.
stratify_on = y if y.value_counts().min() >= 2 else None
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=stratify_on
)
clf.fit(X_train, y_train)
pred = clf.predict(X_test)

# Pass `labels` explicitly so the four target names always line up, even when a
# severity level is absent from the test split (otherwise classification_report
# raises on the name/class count mismatch). zero_division=0 reports 0 for such classes.
print(classification_report(
    y_test,
    pred,
    labels=severity_codes,
    target_names=severity_names,
    zero_division=0,
))



## 7) Takeaways (edit these after reviewing your actual data)

- Incidents show a (fill in) trend from 2013 to 2024.
- The most frequent attack vectors are (fill in), with notable spikes in (years).
- Industries most impacted include (fill in) driven by (vectors).
- Median records exposed indicate higher severity for (categories).
- The simple ML baseline achieves (X%) macro-F1; feature engineering could improve this.
