# Data Staging & Exfiltration Early Warning

This notebook surfaces early indicators of data staging and exfiltration by correlating endpoint, network, and storage telemetry from Microsoft Sentinel Data Lake.

## Why it matters
- Focuses on the *exfiltration* phase of the cyber kill chain, complementing identity and threat-hunting notebooks.
- Helps SOC analysts validate whether suspicious file staging is progressing toward actual data loss.
- Provides repeatable detections you can operationalize into scheduled jobs or Sentinel analytics rules.

ℹ️ **Run the setup notebook first** to populate workspace names and attach to your Spark pool.

## Detection focus
- **Local staging**: creation of archive files or bulk copies into staging folders.
- **Cloud/object uploads**: command-line tools pushing data to blob/object storage.
- **Outbound spikes**: anomalous per-device egress bursts, measured against the device's recent hourly average (destination rarity is not scored here).
- **Audit corroboration**: storage key/list operations that often precede exfiltration.

| Table | Signal | Why it matters |
| --- | --- | --- |
| `DeviceFileEvents` | File writes, archive creation | Identify staging of data sets |
| `DeviceProcessEvents` | Command-line telemetry | Catch compression tools and storage CLIs |
| `DeviceNetworkEvents` | Network flows | Measure outbound volume and destinations |
| `StorageBlobLogs` / `BlobServiceLogs` | Storage access | Confirm uploads to cloud storage |
| `AuditLogs` | Azure activity trail | Highlight storage key/permission operations |

In [None]:
# Imports and analyst configuration
from datetime import datetime, timedelta, timezone

import pandas as pd
from pyspark.sql.functions import (
    col, lower, regexp_extract, sum as spark_sum, countDistinct,
    lit, date_trunc
)
from pyspark.sql.window import Window
from sentinel_lake.providers import MicrosoftSentinelProvider

# Provider handle used for every table read in this notebook. `spark` is not
# defined here; it is expected to be supplied by the notebook session.
data_provider = MicrosoftSentinelProvider(spark)

ANALYSIS_HOURS = 24  # length of the look-back window, in hours
PRIMARY_WORKSPACE = "ak-SecOps"  # replace with your workspace
STORAGE_WORKSPACE = PRIMARY_WORKSPACE  # override if storage logs sit elsewhere

# Table name -> workspace that is queried for it.
WORKSPACE_MAPPING = {
    'DeviceFileEvents': PRIMARY_WORKSPACE,
    'DeviceProcessEvents': PRIMARY_WORKSPACE,
    'DeviceNetworkEvents': PRIMARY_WORKSPACE,
    'AuditLogs': PRIMARY_WORKSPACE,
    'StorageBlobLogs': STORAGE_WORKSPACE,
    'BlobServiceLogs': STORAGE_WORKSPACE
}

# Timezone-aware bounds: the values denote UTC explicitly instead of relying
# on how a naive datetime is interpreted downstream (datetime.utcnow() returns
# a naive value and is deprecated since Python 3.12).
UTC_NOW = datetime.now(timezone.utc)
START_TIME = UTC_NOW - timedelta(hours=ANALYSIS_HOURS)
print(f"Analysis window (UTC): {START_TIME} → {UTC_NOW}")
for table, workspace in WORKSPACE_MAPPING.items():
    print(f"• {table} → {workspace}")

In [None]:
# Helper utilities
def try_load(table_name: str, timestamp_col: str | None = None, workspace: str | None = None):
    workspace = workspace or WORKSPACE_MAPPING.get(table_name, PRIMARY_WORKSPACE)
    try:
        df = data_provider.read_table(table_name, workspace)
        if timestamp_col and timestamp_col in df.columns:
            df = df.filter((col(timestamp_col) >= lit(START_TIME)) & (col(timestamp_col) <= lit(UTC_NOW)))
        return df
    except Exception as exc:
        print(f"⚠️ {table_name} not available: {exc}")
        return None


def preview(df, label: str, sample: int = 5):
    """Print a one-line row count and the leading column names of a frame.

    Args:
        df: Frame exposing ``count()`` and ``columns``, or ``None``.
        label: Text used to identify the frame in the printed output.
        sample: Accepted for interface compatibility; not used by the body.

    Prints an "unavailable" notice when ``df`` is ``None``. Otherwise prints
    the row count and, if the frame has any columns, up to the first eight
    column names.
    """
    if df is not None:
        row_total = df.count()
        print(f"✅ {label}: {row_total} rows in window")
        leading_names = df.columns[:8]
        if leading_names:
            print("   columns:", ", ".join(leading_names))
    else:
        print(f"⚠️ {label}: table unavailable")


# Load every telemetry source for the analysis window; each value is None
# when the corresponding table could not be read.
device_files = try_load('DeviceFileEvents', 'Timestamp')
device_processes = try_load('DeviceProcessEvents', 'Timestamp')
device_network = try_load('DeviceNetworkEvents', 'Timestamp')

# Storage logs: fall back to BlobServiceLogs only if StorageBlobLogs is missing.
storage_logs = try_load('StorageBlobLogs', 'TimeGenerated')
if storage_logs is None:
    storage_logs = try_load('BlobServiceLogs', 'TimeGenerated')

audit_logs = try_load('AuditLogs', 'TimeGenerated')

# Report availability and size of each source, in a fixed order.
for _frame, _label in (
    (device_files, 'DeviceFileEvents'),
    (device_processes, 'DeviceProcessEvents'),
    (device_network, 'DeviceNetworkEvents'),
    (storage_logs, 'Storage blob logs'),
    (audit_logs, 'AuditLogs'),
):
    preview(_frame, _label)

In [None]:
# Detection 1 – Compression / staging behavior
# Allow-lists used by detect_compression_activity to suppress known-good
# activity. The pattern lists are regular expressions matched against
# lower-cased column values; the account/device sets are compared by
# case-insensitive equality.

# Regexes for sanctioned compression command lines (matched against
# ProcessCommandLine).
ALLOWED_COMMAND_PATTERNS = [
    # Add fully qualified command lines or fragments for sanctioned compression jobs
    r".*\\backup_agent\\.*zip",
    r"/usr/bin/tar .* /var/backups"
]
# Regexes for trusted staging directories (matched against FolderPath).
ALLOWED_PATH_PATTERNS = [
    # Add trusted staging directories used by backup tooling
    r".*\\CompanyBackup\\",
    r"/var/backups/.*"
]
# Accounts whose compression activity is ignored (compared with
# InitiatingProcessAccountName).
ALLOWED_ACCOUNTS = {"backup-svc"}
# Devices whose compression activity is ignored (compared with DeviceName).
ALLOWED_DEVICES = {"backup-server01"}


def _compile_or_pattern(patterns: list[str]) -> str:
    """Combine regex fragments into one case-insensitive alternation.

    Args:
        patterns: Regex fragments; empty/falsy entries are skipped.

    Returns:
        ``"(?iu)(p1)|(p2)|..."``, or ``""`` when no usable fragment remains
        (callers treat the empty string as "no allow-list").

    Case-insensitivity is requested with an inline flag rather than by
    lower-casing the pattern text: lower-casing would also rewrite escape
    classes such as ``\\S`` or ``\\D`` into ``\\s`` / ``\\d`` and invert their
    meaning.
    """
    alternatives = [f"({p})" for p in patterns if p]
    if not alternatives:
        return ""
    return "(?iu)" + "|".join(alternatives)


# Regexes below are applied with Spark `rlike` to lower-cased column values.

# Substring match on the command line.
# NOTE(review): short tokens such as "tar"/"rar" also match inside unrelated
# words (e.g. "start", "library"); tighten if the result set is too noisy.
COMPRESSION_COMMAND_REGEX = 'compress|zip|7z|rar|archive|tar'
# Any path containing "temp", or a "staging"/"backup" segment introduced by
# either a Windows (\) or a POSIX (/) separator.
STAGING_FOLDER_REGEX = r'temp|[\\/]staging|[\\/]backup'
# File name ends with a common archive extension (a literal dot, then the
# extension).
ARCHIVE_FILE_REGEX = r'\.(zip|7z|rar|tar|gz|bz2)$'


def detect_compression_activity(file_df, process_df):
    """Pair compression-like process launches with archive files in staging paths.

    Args:
        file_df: File-event frame with ``DeviceName``, ``FileName``,
            ``FolderPath`` and ``Timestamp`` columns, or ``None``.
        process_df: Process-event frame with ``DeviceName``,
            ``ProcessCommandLine`` and ``Timestamp`` columns, or ``None``.

    Returns:
        A pandas DataFrame with one row per (process event, staged archive on
        the same device) pair; process events without a staged archive are
        kept with empty ``Staged*`` columns (left join). An empty
        ``pd.DataFrame()`` is returned when either input is ``None`` or no
        process event survives the allow-lists.
    """
    if file_df is None or process_df is None:
        return pd.DataFrame()

    compression_procs = process_df.filter(
        lower(col('ProcessCommandLine')).rlike(COMPRESSION_COMMAND_REGEX)
    )

    command_allowlist = _compile_or_pattern(ALLOWED_COMMAND_PATTERNS)
    if command_allowlist:
        compression_procs = compression_procs.filter(~lower(col('ProcessCommandLine')).rlike(command_allowlist))

    # For the two equality allow-lists, rows with a NULL value are kept
    # explicitly: `~NULL.isin(...)` evaluates to NULL and would otherwise
    # drop those events from the detection.
    if ALLOWED_ACCOUNTS and 'InitiatingProcessAccountName' in compression_procs.columns:
        allowed_accounts = [acct.lower() for acct in ALLOWED_ACCOUNTS]
        account = col('InitiatingProcessAccountName')
        compression_procs = compression_procs.filter(
            account.isNull() | ~lower(account).isin(allowed_accounts)
        )

    if ALLOWED_DEVICES and 'DeviceName' in compression_procs.columns:
        allowed_devices = [device.lower() for device in ALLOWED_DEVICES]
        device = col('DeviceName')
        compression_procs = compression_procs.filter(
            device.isNull() | ~lower(device).isin(allowed_devices)
        )

    # Fetch at most one row to test for emptiness (no RDD conversion needed).
    if not compression_procs.take(1):
        return pd.DataFrame()

    staging_candidates = file_df.filter(
        lower(col('FolderPath')).rlike(STAGING_FOLDER_REGEX) &
        lower(col('FileName')).rlike(ARCHIVE_FILE_REGEX)
    )

    path_allowlist = _compile_or_pattern(ALLOWED_PATH_PATTERNS)
    if path_allowlist:
        staging_candidates = staging_candidates.filter(~lower(col('FolderPath')).rlike(path_allowlist))

    # Rename the file-side columns so they cannot collide with the
    # process-side columns after the join.
    staging_files = staging_candidates.select(
        'DeviceName',
        col('FileName').alias('StagedFileName'),
        col('FolderPath').alias('StagedFolderPath'),
        col('Timestamp').alias('StagedFileTimestamp')
    )

    # NOTE(review): the join key is the device only, so every compression
    # event is paired with every staged archive on that device regardless of
    # time proximity.
    joined = compression_procs.join(
        staging_files,
        on='DeviceName',
        how='left'
    )

    # Optional process columns are included only when the source frame has them.
    process_cols = [
        name for name in (
            'DeviceName',
            'InitiatingProcessAccountName',
            'ProcessCommandLine',
            'Timestamp',
            'InitiatingProcessParentFileName',
        )
        if name in compression_procs.columns
    ]

    return joined.select(*process_cols,
                         'StagedFileName',
                         'StagedFolderPath',
                         'StagedFileTimestamp').toPandas()

# Run detection 1 and show the first rows of its result.
compression_findings = detect_compression_activity(
    file_df=device_files,
    process_df=device_processes,
)
compression_findings.head()

In [None]:
# Detection 2 – Cloud storage uploads via CLI tools
# Regexes below are applied with Spark `rlike` to lower-cased column values.

# Command-line fragments of tools commonly used to push data to remote storage.
CLI_UPLOAD_TOOL_REGEX = 'azcopy|az storage|aws s3|gsutil|rclone|scp'
# Cloud object-storage host suffixes; dots are escaped so they match literally.
CLOUD_STORAGE_HOST_REGEX = (
    r'blob\.core\.windows\.net|amazonaws\.com|'
    r'storage\.googleapis\.com|digitaloceanspaces\.com'
)


def detect_cli_storage_uploads(process_df, network_df):
    """Correlate storage/transfer CLI launches with connections to cloud storage.

    Args:
        process_df: Process-event frame with ``DeviceName`` and
            ``ProcessCommandLine`` columns, or ``None``.
        network_df: Network-event frame with ``DeviceName`` and ``RemoteUrl``
            columns, or ``None``.

    Returns:
        A pandas DataFrame holding whichever of ``Timestamp``, ``DeviceName``,
        ``InitiatingProcessAccountName``, ``ProcessCommandLine`` (process
        side) and ``RemoteUrl``, ``BytesSent`` (network side) are available.
        An empty ``pd.DataFrame()`` is returned when either input is ``None``
        or no CLI launch is found.
    """
    if process_df is None or network_df is None:
        return pd.DataFrame()

    cli_exec = process_df.filter(
        lower(col('ProcessCommandLine')).rlike(CLI_UPLOAD_TOOL_REGEX)
    )
    # Fetch at most one row to test for emptiness instead of counting all rows.
    if not cli_exec.take(1):
        return pd.DataFrame()

    storage_net = network_df.filter(
        lower(col('RemoteUrl')).rlike(CLOUD_STORAGE_HOST_REGEX)
    )

    join_keys = ['DeviceName']
    net_select = [col('DeviceName')]
    # The CLI tool is the process *created* by the process event (ProcessId);
    # on the network side the same process appears as the initiator of the
    # connection (InitiatingProcessId). Joining on InitiatingProcessId from
    # both tables would match the CLI's parent process instead.
    if 'ProcessId' in cli_exec.columns and 'InitiatingProcessId' in storage_net.columns:
        join_keys.append('ProcessId')
        net_select.append(col('InitiatingProcessId').alias('ProcessId'))
    net_select.extend(col(name) for name in ('RemoteUrl', 'BytesSent') if name in storage_net.columns)

    # Project each side to the columns it contributes before joining, so that
    # names present in both tables (e.g. Timestamp) are not ambiguous afterwards.
    proc_extra = [
        name for name in ('Timestamp', 'InitiatingProcessAccountName', 'ProcessCommandLine')
        if name in cli_exec.columns
    ]
    joined = cli_exec.select(*join_keys, *proc_extra).join(
        storage_net.select(*net_select), join_keys, 'inner'
    )

    cols = [c for c in ['Timestamp', 'DeviceName', 'InitiatingProcessAccountName', 'ProcessCommandLine', 'RemoteUrl', 'BytesSent'] if c in joined.columns]
    return joined.select(*cols).toPandas()

# Run detection 2 and show the first rows of its result.
cli_uploads = detect_cli_storage_uploads(
    process_df=device_processes,
    network_df=device_network,
)
cli_uploads.head()

In [None]:
# Detection 3 – Anomalous egress spikes
def detect_egress_spikes(network_df, baseline_hours: int = 24, spike_factor: float = 3.0):
    """Flag device-hours whose outbound bytes far exceed the device's recent average.

    Args:
        network_df: Network-event frame with ``DeviceName``, ``Timestamp`` and
            ``BytesSent`` columns, or ``None``.
        baseline_hours: Maximum number of preceding hourly rows (per device)
            averaged into the baseline. Must be at least 1.
        spike_factor: An hour is flagged when its ``BytesSent`` is greater
            than ``spike_factor`` times the baseline.

    Returns:
        A pandas DataFrame with ``DeviceName``, ``Hour``, ``BytesSent`` and
        ``RollingAvg`` (the baseline), sorted by ``BytesSent`` descending.
        An empty ``pd.DataFrame()`` is returned when the frame is ``None`` or
        has no ``BytesSent`` column.

    Raises:
        ValueError: If ``baseline_hours`` is less than 1.
    """
    if network_df is None or 'BytesSent' not in network_df.columns:
        return pd.DataFrame()
    if baseline_hours < 1:
        raise ValueError("baseline_hours must be at least 1")

    # Local import: `avg` is not part of the notebook's import cell.
    from pyspark.sql.functions import avg as spark_avg

    hourly = network_df.groupBy('DeviceName', date_trunc('hour', col('Timestamp')).alias('Hour')).agg(
        spark_sum('BytesSent').alias('BytesSent')
    )

    # Baseline = mean of the *preceding* hourly rows only. The current hour is
    # excluded so a burst cannot raise its own baseline, and the mean divides
    # by the number of rows actually present, so devices with a short history
    # are not flagged merely for having few rows.
    # The frame counts rows, not clock hours: hours without traffic produce no
    # row and are skipped.
    window = Window.partitionBy('DeviceName').orderBy('Hour').rowsBetween(-baseline_hours, -1)
    enriched = hourly.withColumn('RollingAvg', spark_avg('BytesSent').over(window))

    # The first hour of a device has no preceding rows: its baseline is NULL,
    # the comparison is NULL, and the row is not reported.
    suspicious = enriched.filter(col('BytesSent') > col('RollingAvg') * lit(spike_factor))

    return suspicious.orderBy(col('BytesSent').desc()).toPandas()

# Run detection 3 and show the first rows of its result.
egress_spikes = detect_egress_spikes(network_df=device_network)
egress_spikes.head()

In [None]:
# Detection 4 – Storage account audit anomalies
def detect_storage_audit_events(audit_df):
    """Return audit rows whose operation name points at storage account activity.

    Args:
        audit_df: Audit-log frame with an ``OperationName`` column, or ``None``.

    Returns:
        A pandas DataFrame restricted to whichever of ``TimeGenerated``,
        ``OperationName``, ``ResultDescription``, ``InitiatedBy`` and
        ``TargetResources`` exist in the frame; an empty ``pd.DataFrame()``
        when ``audit_df`` is ``None``.
    """
    if audit_df is None:
        return pd.DataFrame()

    # Matched against the lower-cased operation name.
    operation_pattern = 'storageaccounts|accesskeys|listkeys|setserviceproperties'
    is_storage_operation = lower(col('OperationName')).rlike(operation_pattern)
    matching_events = audit_df.filter(is_storage_operation)

    wanted = ('TimeGenerated', 'OperationName', 'ResultDescription', 'InitiatedBy', 'TargetResources')
    present = [name for name in wanted if name in matching_events.columns]
    return matching_events.select(*present).toPandas()

# Run detection 4 and show the first rows of its result.
storage_audit_events = detect_storage_audit_events(audit_df=audit_logs)
storage_audit_events.head()

## Triage summary
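Correlate findings to decide whether to escalate. Each signal that returned rows adds one point per row (capped at 10 per signal) to the risk score: 20 or more is HIGH, 10–19 is MEDIUM, 1–9 is LOW, and 0 is NONE. Tune thresholds and allow-lists for your environment.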

In [None]:
# Aggregate and score findings
def summarize_findings(**signals):
    """Condense detection outputs into a summary table, a score and a level.

    Args:
        **signals: Signal name -> detection result. Only non-empty pandas
            DataFrames contribute; anything else is ignored.

    Returns:
        A ``(summary, score, level)`` tuple:
        - ``summary``: DataFrame with one ``signal``/``rows`` record per
          contributing signal (empty when nothing contributed).
        - ``score``: sum over contributing signals of ``min(row count, 10)``.
        - ``level``: ``'HIGH'`` for a score of 20 or more, ``'MEDIUM'`` for 10
          or more, ``'LOW'`` for any other positive score, else ``'NONE'``.
    """
    row_counts = {
        label: len(frame)
        for label, frame in signals.items()
        if isinstance(frame, pd.DataFrame) and not frame.empty
    }

    summary = pd.DataFrame(
        [{'signal': label, 'rows': total} for label, total in row_counts.items()]
    )
    # Each signal is capped so one noisy detection cannot dominate the score.
    score = sum(min(total, 10) for total in row_counts.values())

    if score >= 20:
        level = 'HIGH'
    elif score >= 10:
        level = 'MEDIUM'
    elif score > 0:
        level = 'LOW'
    else:
        level = 'NONE'
    return summary, score, level

# Score the four detection outputs together and report the result.
summary_table, risk_score, risk_level = summarize_findings(
    compression=compression_findings,
    cli_uploads=cli_uploads,
    egress_spikes=egress_spikes,
    storage_audit=storage_audit_events,
)

print(f"Risk score: {risk_score} ({risk_level})")
if summary_table.empty:
    print('No detections in current window.')
else:
    # `display` is the notebook's rich-output helper.
    display(summary_table)

## Next steps for defenders
- Validate compression hosts with scheduled backup windows.
- Investigate CLI uploads for credential or tool misuse.
- Cross-reference egress hosts with identity anomalies or incident queues.
- Convert high-confidence detections into Sentinel analytics rules or scheduled notebooks.
- Iterate on thresholds and allow-lists after each hunting cycle.