# 🔐 Service Principal Sign-In & Risk Analysis - Microsoft Sentinel Data Lake

Actionable analytics and security hunting over Azure AD (Entra ID) service principal activity using `AADServicePrincipalSignInLogs` and optional risk enrichment from `AADRiskyServicePrincipals`.

## 🎯 Objectives
- Understand service principal authentication patterns
- Detect abnormal spikes, failures, geographic or resource anomalies
- Identify risky / high-impact service principals

## 🚀 Zero-Config Loading
This notebook attempts to load required tables automatically (no `PRIMARY_WORKSPACE` variable). It will:
1. Try `data_provider.read_table(table)` directly
2. If that fails, try common fallback workspace names (e.g. `default`, `ak-SecOps`)
3. Gracefully continue if a table is missing

You can later introduce explicit workspace mapping if needed for multi-workspace deployments.

---

In [None]:
# üì¶ Imports & Initialization (Zero Manual Workspace Config)
from sentinel_lake.providers import MicrosoftSentinelProvider
from pyspark.sql.functions import (
    col, lower, upper, count as spark_count, countDistinct, expr, when, avg, stddev,
    date_trunc, hour, dayofweek, to_timestamp, current_timestamp, lit, percentile_approx
)
from pyspark.sql import DataFrame
import seaborn as sns, matplotlib.pyplot as plt
from datetime import datetime, timedelta
import math, statistics, json, warnings
warnings.filterwarnings('ignore')

# Plot styling applied to the seaborn/matplotlib charts drawn in this notebook.
sns.set_theme(style='whitegrid')

# NOTE(review): `spark` is not defined in the visible code; presumably the
# notebook runtime injects the active session — confirm.
data_provider = MicrosoftSentinelProvider(spark)
print('‚úÖ Environment initialized (no workspace variables required)')

TARGET_HOURS = 24  # Analysis window in hours; used to build the time filter cutoff
FALLBACK_WORKSPACES = ['default', 'ak-SecOps']  # Attempted in order, only if direct load fails
# Table names probed in order; the first one that loads becomes the sign-in source.
SERVICE_PRINCIPAL_TABLE_CANDIDATES = [
    'AADServicePrincipalSignInLogs',
    'ServicePrincipalSignInLogs',
    'AADServicePrincipalSignIns'
]
# NOTE(review): not referenced anywhere else in the visible code; the risk table
# name is hard-coded at the point where it is loaded.
TABLES_OPTIONAL = ['AADRiskyServicePrincipals']


def try_read(table_name: str, workspace: str | None):
    """Read `table_name` through the shared `data_provider`.

    A truthy `workspace` is forwarded as the second positional argument;
    `None` (or an empty string) calls `read_table` with the table name only.
    Exceptions raised by the provider propagate to the caller.
    """
    read_args = (table_name, workspace) if workspace else (table_name,)
    return data_provider.read_table(*read_args)


def smart_load(table_name: str):
    """Attempt to load a table with minimal assumptions.

    Strategy:
      1. Direct read (implicit/default binding), reported as source 'auto'
      2. Each entry of FALLBACK_WORKSPACES, in order (positional second argument)

    Returns:
      A tuple (df, source_workspace, error).
      On success `error` is None and `source_workspace` is 'auto' or the
      fallback workspace that worked.
      On failure `df` and `source_workspace` are None and `error` is a single
      string listing the error of every attempt, so the direct-read failure is
      not hidden by later fallback failures.
    """
    failures = []
    # None stands for the direct read without an explicit workspace.
    for workspace in [None, *FALLBACK_WORKSPACES]:
        label = 'auto' if workspace is None else workspace
        try:
            df = try_read(table_name, workspace)
        except Exception as exc:
            # Deliberate best-effort probing: record the failure and try the next source.
            failures.append(f"[{label}] {exc}")
        else:
            return df, label, None
    return None, None, ' | '.join(failures)

loaded = {}

# Probe the candidate table names in order and keep the first one that loads.
# Errors of candidates that fail are collected for the final diagnostic output.
sp_df, sp_workspace = None, None
sp_error_chain = []
for candidate in SERVICE_PRINCIPAL_TABLE_CANDIDATES:
    df, ws, err = smart_load(candidate)
    if df is None:
        sp_error_chain.append(f"{candidate}: {err}")
        continue
    sp_df, sp_workspace = df, ws
    print(f"‚úÖ Loaded service principal sign-ins from '{candidate}' (workspace={ws})")
    break

if sp_df is None:
    print('‚ö†Ô∏è No dedicated service principal sign-in table found. Attempting to derive from SigninLogs...')
    signin_df, signin_ws, signin_err = smart_load('SigninLogs')
    if signin_df is None:
        print('‚ùå Could not derive from SigninLogs either.')
        for e in sp_error_chain:
            print('   ‚Ä¢ ' + e)
    else:
        print(f"üîÑ Deriving service principal activity from SigninLogs (workspace={signin_ws})")
        # Heuristic, schema-dependent filters: the first matching rule wins.
        candidate_cols = signin_df.columns
        has_app_id = 'AppId' in candidate_cols
        if 'ServicePrincipalId' in candidate_cols:
            derived = signin_df.filter(col('ServicePrincipalId').isNotNull())
        elif has_app_id and 'UserId' in candidate_cols:
            # AppId present but UserId null is treated as service principal activity.
            derived = signin_df.filter(col('AppId').isNotNull() & col('UserId').isNull())
        elif has_app_id:
            derived = signin_df.filter(col('AppId').isNotNull())
        else:
            # None of the identifying columns exist: the frame is used unfiltered.
            derived = signin_df
        sp_df = derived
        sp_workspace = signin_ws or 'derived'

# Optional risk table: only attempted when sign-in data was found.
if sp_df is None:
    risk_df, risk_ws, risk_err = None, None, None
else:
    risk_df, risk_ws, risk_err = smart_load('AADRiskyServicePrincipals')

if risk_df is None:
    if risk_err:
        print(f"‚ÑπÔ∏è Risk enrichment table not available: {risk_err}")
    else:
        print('‚ÑπÔ∏è Risk enrichment skipped (service principal data missing)')
else:
    print(f"üß© Loaded AADRiskyServicePrincipals (workspace={risk_ws})")

# Without any sign-in data the rest of the notebook cannot run, so stop here.
if sp_df is None:
    raise RuntimeError('No service principal sign-in data available from candidates or SigninLogs.')

# Names used by all later cells.
service_principal_signins = sp_df
risky_principals = risk_df

# Restrict the sign-ins to the last TARGET_HOURS hours, using the first
# recognizable time column that the frame actually has.
TIME_COLUMN_CANDIDATES = ['CreatedDateTime', 'TimeGenerated', 'Timestamp']
actual_time_col = next(
    (name for name in TIME_COLUMN_CANDIDATES if name in service_principal_signins.columns),
    None,
)

if not actual_time_col:
    print('‚ö†Ô∏è No recognizable time column found; skipping time window filter')
else:
    cutoff = current_timestamp() - expr(f'INTERVAL {TARGET_HOURS} HOURS')
    service_principal_signins = service_principal_signins.filter(col(actual_time_col) >= cutoff)
    # count() runs the filtered query once to report how many rows remain.
    filtered_count = service_principal_signins.count()
    print(f"‚è±Ô∏è Filtered sign-ins to last {TARGET_HOURS}h using column '{actual_time_col}': {filtered_count:,} rows")

if risky_principals is None:
    print('‚ÑπÔ∏è Risk enrichment table not available (optional)')
else:
    print('üß© Risk enrichment table available (AADRiskyServicePrincipals)')

## 1. Data Exploration & Profiling

Understand schema, volume, temporal coverage, and key distributions for service principal sign-ins.

In [None]:
# COMPACT PROFILE (security-focused essentials only)
# Prints the observed time window, row count, distinct app/IP counts, a sample
# of applications, null counts for key columns, and how many observed
# principals also appear in the risk table.
if service_principal_signins is None:
    print('‚ùå Required sign-in data missing')
else:
    sp_columns = service_principal_signins.columns
    total_rows = service_principal_signins.count()
    time_col = next((c for c in ['CreatedDateTime','TimeGenerated','Timestamp'] if c in sp_columns), None)
    if time_col:
        tb = service_principal_signins.selectExpr(f'min({time_col}) as start', f'max({time_col}) as end').collect()[0]
        print(f'üóÇÔ∏è Window: {tb.start} ‚Üí {tb.end}')
    print(f'üßÆ Rows: {total_rows:,}')
    # Distinct core entities (distinct().count() counts a null value as one entry)
    app_cnt = service_principal_signins.select('AppId').distinct().count() if 'AppId' in sp_columns else None
    ip_cnt = service_principal_signins.select('IpAddress').distinct().count() if 'IpAddress' in sp_columns else None
    print('üî¢ Distinct:', end=' ')
    if app_cnt is not None:
        print(f'Apps={app_cnt}', end=' ')
    if ip_cnt is not None:
        print(f'IPs={ip_cnt}', end=' ')
    print('')

    if app_cnt and 'AppId' in sp_columns:
        name_col = 'AppDisplayName' if 'AppDisplayName' in sp_columns else None
        selected_cols = ([] if name_col is None else [name_col]) + ['AppId']
        sample_apps_df = (
            service_principal_signins
            .select(*selected_cols)
            .dropDuplicates()
            .orderBy(col(name_col) if name_col else col('AppId'))
            .limit(10)
        )
        samples = sample_apps_df.collect()
        if samples:
            formatted = []
            for row in samples:
                app_id = row['AppId']
                display = row[name_col] if name_col else None
                if display and str(display).strip():
                    formatted.append(f"{display} ({app_id})" if app_id else str(display))
                elif app_id:
                    # str() keeps the join below safe if the id is not a string
                    formatted.append(str(app_id))
            if formatted:
                extras = max(app_cnt - len(samples), 0)
                suffix = f" ‚Ä¶ +{extras}" if extras > 0 else ''
                print(f"   ‚Üí Sample apps: {', '.join(formatted)}{suffix}")

    # Quick null signal for status/app, computed for all key columns in ONE pass
    # (count(when(isNull, 1)) counts exactly the rows where the column is null).
    key_cols = [c for c in ['AppId','AppDisplayName','IpAddress','ResultType','Status'] if c in sp_columns]
    if key_cols:
        null_row = service_principal_signins.agg(
            *[spark_count(when(col(c).isNull(), 1)).alias(c) for c in key_cols]
        ).collect()[0]
        null_info = [f"{c}:{null_row[c]}" for c in key_cols if null_row[c] > 0]
        if null_info:
            print('‚ö†Ô∏è Nulls:', ', '.join(null_info))

    # Risk overlap quick stat
    if risky_principals is not None:
        risk_columns = risky_principals.columns
        # Like-for-like identifiers are preferred. An application id and a
        # service principal object id are different identifiers, so the mixed
        # pair is kept only as a last resort for backward compatibility.
        # NOTE(review): the risk table is assumed to expose `Id` / `AppId`
        # (per the Azure Monitor table reference) — confirm against the workspace.
        key_pairs = [
            ('ServicePrincipalId', 'ServicePrincipalId'),
            ('ServicePrincipalId', 'Id'),
            ('AppId', 'AppId'),
            ('AppId', 'ServicePrincipalId'),
        ]
        join_key = next(((l, r) for l, r in key_pairs if l in sp_columns and r in risk_columns), None)
        if join_key:
            # Alias both sides to one name so the join is unambiguous even when
            # the two tables use the same column name.
            observed_ids = service_principal_signins.select(col(join_key[0]).alias('_principal_key')).distinct()
            risky_ids = risky_principals.select(col(join_key[1]).alias('_principal_key')).distinct()
            overlap = observed_ids.join(risky_ids, on='_principal_key', how='inner').count()
            print(f'üß© Risk overlap principals: {overlap}')
        else:
            print('‚ÑπÔ∏è Risk table loaded but no join key matched')

In [None]:
# HOURLY VOLUME & FAILURE RATE
# Buckets sign-ins per hour. With a status column the hourly failure rate is
# drawn on a secondary axis; without one only the event volume is plotted.
if service_principal_signins is None:
    print('‚ùå Dataset not available')
else:
    available = service_principal_signins.columns
    time_col = next((c for c in ['CreatedDateTime','TimeGenerated','Timestamp'] if c in available), None)
    status_col = next((c for c in ['ResultType','Status','status','resulttype'] if c in available), None)
    if not time_col:
        print('‚ö†Ô∏è No time column for temporal analysis')
    else:
        base = service_principal_signins.withColumn('HourBucket', date_trunc('hour', col(time_col)))
        hourly = base.groupBy('HourBucket').agg(spark_count('*').alias('Events'))
        if not status_col:
            pdf = hourly.orderBy('HourBucket').toPandas()
            if pdf.empty:
                print('‚ö†Ô∏è No hourly data')
            else:
                plt.figure(figsize=(10,4))
                sns.lineplot(data=pdf, x='HourBucket', y='Events', marker='o')
                plt.title('Service Principal Sign-Ins per Hour')
                plt.xticks(rotation=45, ha='right')
                plt.tight_layout()
                plt.show()
        else:
            # Assume 0 = success style codes; anything else (including null) counts as failure
            is_success = col(status_col).isin(['0',0,'Success','success'])
            success_expr = when(is_success, 1).otherwise(0)
            fail_expr = when(is_success, 0).otherwise(1)
            flagged = base.withColumn('Success', success_expr).withColumn('Failure', fail_expr)
            fr = (flagged
                  .groupBy('HourBucket')
                  .agg(spark_count('*').alias('Events'),
                       expr('sum(Failure) as Failures'),
                       expr('sum(Success) as Successes'))
                  .withColumn('FailureRate', col('Failures')/col('Events'))
                  .orderBy('HourBucket'))
            pdf = fr.toPandas()
            if pdf.empty:
                print('‚ö†Ô∏è No hourly data')
            else:
                plt.figure(figsize=(10,4))
                sns.lineplot(data=pdf, x='HourBucket', y='Events', label='Events')
                # Second y-axis so the 0..1 rate is readable next to raw counts
                ax2 = plt.twinx()
                sns.lineplot(data=pdf, x='HourBucket', y='FailureRate', color='red', label='FailureRate', ax=ax2)
                plt.title('Hourly Volume & Failure Rate')
                plt.xticks(rotation=45, ha='right')
                ax2.set_ylabel('Failure Rate')
                plt.tight_layout()
                plt.show()

In [None]:
# STATUS, APPLICATION & IP INSIGHTS
# Prints the status breakdown, the applications with the highest failure rate,
# the busiest source IPs, IPs whose whole activity spans under an hour, and the
# number of observed principals that also appear in the risk table.
if service_principal_signins is not None:
    sp_columns = service_principal_signins.columns
    status_col = next((c for c in ['ResultType','Status','status'] if c in sp_columns), None)
    if status_col:
        print(f'üîé Using status column: {status_col}')
        breakdown = (service_principal_signins
                     .groupBy(status_col)
                     .agg(spark_count('*').alias('Count'))
                     .orderBy(col('Count').desc()))
        breakdown.show(10, truncate=False)
    # App failure ranking
    app_col = 'AppId' if 'AppId' in sp_columns else None
    if app_col and status_col:
        # Anything that is not a known success value (including null) counts as a failure
        fail_expr = when(col(status_col).isin(['0',0,'Success','success']), 0).otherwise(1)
        app_fail = (service_principal_signins
                    .withColumn('Fail', fail_expr)
                    .groupBy(app_col)
                    .agg(spark_count('*').alias('Events'), expr('sum(Fail) as Failures'))
                    .withColumn('FailureRate', col('Failures')/col('Events'))
                    .filter(col('Events') >= 3)
                    .orderBy(col('FailureRate').desc(), col('Events').desc())
                    .limit(15))
        print('\nüî• Apps by Failure Rate (>=3 events)')
        app_fail.show(truncate=False)
    if 'IpAddress' in sp_columns:
        # Top IPs
        top_ips = (service_principal_signins
                   .groupBy('IpAddress')
                   .agg(spark_count('*').alias('Events'))
                   .orderBy(col('Events').desc())
                   .limit(10))
        print('\nüåê Top Source IPs:')
        top_ips.show(truncate=False)
        # New IPs (first seen within this window) only if time column present
        time_col = next((c for c in ['CreatedDateTime','TimeGenerated','Timestamp'] if c in sp_columns), None)
        if time_col:
            first_seen = (service_principal_signins
                          .groupBy('IpAddress')
                          .agg(expr(f'min({time_col}) as FirstSeen'), expr(f'max({time_col}) as LastSeen'), spark_count('*').alias('Events'))
                          .orderBy('FirstSeen'))
            # Heuristic: new if lifespan < 1 hour
            lifespan_minutes = (expr('unix_timestamp(LastSeen)') - expr('unix_timestamp(FirstSeen)'))/60.0
            new_ips = first_seen.withColumn('LifespanMinutes', lifespan_minutes).filter(col('LifespanMinutes') < 60)
            if new_ips.count() > 0:
                print('\n? Newly Observed IPs (<60m lifespan):')
                new_ips.show(10, truncate=False)
    # Risk overlay simple metric
    if risky_principals is not None:
        risk_columns = risky_principals.columns
        # Like-for-like identifiers are preferred. An application id and a
        # service principal object id are different identifiers, so the mixed
        # pair is kept only as a last resort for backward compatibility.
        # NOTE(review): the risk table is assumed to expose `Id` / `AppId`
        # (per the Azure Monitor table reference) — confirm against the workspace.
        key_pairs = [
            ('ServicePrincipalId', 'ServicePrincipalId'),
            ('ServicePrincipalId', 'Id'),
            ('AppId', 'AppId'),
            ('AppId', 'ServicePrincipalId'),
        ]
        join_key = next(((l, r) for l, r in key_pairs if l in sp_columns and r in risk_columns), None)
        if join_key:
            # Alias both sides to one name so the join stays unambiguous even
            # when both tables use the same column name.
            observed_ids = service_principal_signins.select(col(join_key[0]).alias('_principal_key')).distinct()
            risky_ids = risky_principals.select(col(join_key[1]).alias('_principal_key')).distinct()
            risky_join = observed_ids.join(risky_ids, on='_principal_key', how='inner').count()
            print(f'üß© Risk-tagged principals in window: {risky_join}')
else:
    print('‚ùå Dataset not available')

## 2. Security Detections & Anomalies

Focused heuristics highlighting suspicious service principal behavior. Thresholds are intentionally simple and can be tuned.

In [None]:
# BURST & SPIKE DETECTION (hours whose event count exceeds mean + 3 * stddev)
if service_principal_signins is not None:
    time_col = next((c for c in ['CreatedDateTime','TimeGenerated','Timestamp'] if c in service_principal_signins.columns), None)
    if time_col:
        bucketed = service_principal_signins.withColumn('HourBucket', date_trunc('hour', col(time_col)))
        hourly = (bucketed
                  .groupBy('HourBucket')
                  .agg(spark_count('*').alias('Events'))
                  .orderBy('HourBucket'))
        stats = hourly.agg(avg('Events').alias('mean'), stddev('Events').alias('std')).collect()[0]
        # Missing aggregates (None) fall back to 0 so the threshold is always numeric
        mean_v = stats['mean'] if stats['mean'] else 0
        std_v = stats['std'] if stats['std'] else 0
        threshold = mean_v + (3 * std_v)
        spikes = hourly.filter(col('Events') > threshold)
        print(f'Mean={mean_v:.2f} Std={std_v:.2f} Threshold={threshold:.2f}')
        cnt = spikes.count()
        if cnt == 0:
            print('‚úÖ No abnormal bursts above threshold')
        else:
            print(f'üö® {cnt} spike hour(s) detected (>{threshold:.2f})')
            spikes.show(truncate=False)
    else:
        print('‚ö†Ô∏è No time column; skipping burst detection')
else:
    print('‚ùå No data for detection phase')

In [None]:
# RARE / NEW APPLICATION DETECTION (apps with <=2 events, plus earliest-seen ordering)
if service_principal_signins is None or 'AppId' not in service_principal_signins.columns:
    print('‚ö†Ô∏è AppId column absent; skipping rare app detection')
else:
    time_col = next((c for c in ['CreatedDateTime','TimeGenerated','Timestamp'] if c in service_principal_signins.columns), None)
    usage = service_principal_signins.groupBy('AppId').agg(spark_count('*').alias('Events'))
    rare = usage.filter(col('Events') <= 2)
    rc = rare.count()
    if rc == 0:
        print('‚úÖ No rare low-volume apps')
    else:
        print(f'üïµÔ∏è Rare apps (<=2 events): {rc}')
        rare.show(15, truncate=False)
    if time_col:
        # No historical baseline is consulted here: this only orders the apps
        # by their earliest timestamp in the loaded data.
        per_app = service_principal_signins.groupBy('AppId').agg(
            expr(f'min({time_col}) as FirstSeen'),
            spark_count('*').alias('Events'),
        )
        first_seen = per_app.orderBy('FirstSeen')
        print('\nüïí Earliest observed apps in window (potentially new):')
        first_seen.show(10, truncate=False)

In [None]:
# FAILURE -> SUCCESS PIVOT (potential credential misuse)
# Flags an application when an hour with more than 3 failures is immediately
# followed by an hour that contains at least one success.
if service_principal_signins is not None:
    status_col = next((c for c in ['ResultType','Status','status'] if c in service_principal_signins.columns), None)
    app_col = 'AppId' if 'AppId' in service_principal_signins.columns else None
    time_col = next((c for c in ['CreatedDateTime','TimeGenerated','Timestamp'] if c in service_principal_signins.columns), None)
    if status_col and app_col and time_col:
        from pyspark.sql.functions import lead
        from pyspark.sql.window import Window

        # Anything that is not a known success value (including null) counts as a failure
        is_success = col(status_col).isin(['0',0,'Success','success'])
        labeled = (service_principal_signins
                   .select(app_col, time_col, status_col)
                   .withColumn('IsSuccess', when(is_success, 1).otherwise(0))
                   .withColumn('IsFailure', when(is_success, 0).otherwise(1)))
        # Aggregate at hour level for simplicity
        hour_app = (labeled
                    .withColumn('HourBucket', date_trunc('hour', col(time_col)))
                    .groupBy('HourBucket', app_col)
                    .agg(expr('sum(IsFailure) as Failures'), expr('sum(IsSuccess) as Successes')))
        # lead() returns the next *active* bucket of the app, which can be many
        # hours later. NextBucket is carried along so the filter can require a
        # truly consecutive hour.
        w = Window.partitionBy(app_col).orderBy('HourBucket')
        shifted_success = (hour_app
                           .withColumn('NextBucket', lead('HourBucket', 1).over(w))
                           .withColumn('NextHourSuccess', lead('Successes', 1).over(w)))
        next_is_adjacent = col('NextBucket') == (col('HourBucket') + expr('INTERVAL 1 HOUR'))
        pivots = shifted_success.filter(
            (col('Failures') > 3) & next_is_adjacent & (col('NextHourSuccess') > 0)
        )
        pc = pivots.count()
        if pc > 0:
            print(f'‚ö†Ô∏è Potential credential pivot events: {pc}')
            pivots.select(app_col, 'HourBucket', 'Failures', 'NextHourSuccess').show(15, truncate=False)
        else:
            print('‚úÖ No failure‚Üísuccess pivot patterns detected (threshold Failures>3 then next-hour success)')
    else:
        print('‚ö†Ô∏è Missing columns for pivot detection')
else:
    print('‚ùå Dataset not available')

In [None]:
# UNUSUAL / NEW IP ADDRESS ACTIVITY
# Reports IPs seen exactly once and IPs whose first-to-last activity spans
# less than 15 minutes.
if service_principal_signins is None or 'IpAddress' not in service_principal_signins.columns:
    print('‚ö†Ô∏è IP column absent; skipping IP anomaly logic')
else:
    time_col = next((c for c in ['CreatedDateTime','TimeGenerated','Timestamp'] if c in service_principal_signins.columns), None)
    ip_usage = service_principal_signins.groupBy('IpAddress').agg(spark_count('*').alias('Events'))
    low_freq = ip_usage.filter(col('Events') == 1)
    lf_count = low_freq.count()
    if lf_count == 0:
        print('‚úÖ No single-use IPs')
    else:
        print(f'üïµÔ∏è Single-use IPs: {lf_count}')
        low_freq.show(10, truncate=False)
    if time_col:
        per_ip = service_principal_signins.groupBy('IpAddress').agg(
            expr(f'min({time_col}) as FirstSeen'),
            expr(f'max({time_col}) as LastSeen'),
            spark_count('*').alias('Events'),
        )
        first_seen = per_ip.orderBy('FirstSeen')
        # Span between first and last sighting, in minutes
        span_minutes = (expr('unix_timestamp(LastSeen)') - expr('unix_timestamp(FirstSeen)'))/60.0
        new_short = first_seen.withColumn('SpanMinutes', span_minutes).filter(col('SpanMinutes') < 15)
        ns_count = new_short.count()
        if ns_count > 0:
            print(f'üÜï Very short-lived IP activity (<15m span): {ns_count}')
            new_short.show(10, truncate=False)

In [None]:
# OFF-HOURS ACTIVITY (events whose hour-of-day is 0 through 5, inclusive)
if service_principal_signins is None:
    print('‚ùå Dataset not available')
else:
    time_col = next((c for c in ['CreatedDateTime','TimeGenerated','Timestamp'] if c in service_principal_signins.columns), None)
    if not time_col:
        print('‚ö†Ô∏è No time column; skipping off-hours analysis')
    else:
        with_hour = service_principal_signins.withColumn('Hour', hour(col(time_col)))
        # Both bounds are inclusive, so hour 5 (05:00-05:59) is part of the bucket
        off_hours = with_hour.filter((col('Hour') >= 0) & (col('Hour') <= 5))
        cnt = off_hours.count()
        if cnt == 0:
            print('‚úÖ No off-hours activity detected')
        else:
            print(f'üåô Off-hours events (00‚Äì05): {cnt}')
            # Top apps active off-hours
            if 'AppId' in off_hours.columns:
                per_app = off_hours.groupBy('AppId').agg(spark_count('*').alias('Events'))
                top_off = per_app.orderBy(col('Events').desc()).limit(10)
                print('\nTop off-hours apps:')
                top_off.show(truncate=False)

In [None]:
# PERSISTENCE PATTERN (apps active in many hours with a steady hourly volume)
if service_principal_signins is None:
    print('‚ùå Dataset not available')
else:
    time_col = next((c for c in ['CreatedDateTime','TimeGenerated','Timestamp'] if c in service_principal_signins.columns), None)
    app_col = 'AppId' if 'AppId' in service_principal_signins.columns else None
    if not (time_col and app_col):
        print('‚ö†Ô∏è Missing columns for persistence analysis')
    else:
        bucketed = service_principal_signins.withColumn('HourBucket', date_trunc('hour', col(time_col)))
        hour_app = bucketed.groupBy('HourBucket', app_col).agg(spark_count('*').alias('Events'))
        from pyspark.sql.window import Window
        # One row per app: number of active hours plus mean / stddev of hourly events
        per_app = hour_app.groupBy(app_col).agg(
            spark_count('*').alias('ActiveHours'),
            avg('Events').alias('MeanPerHour'),
            stddev('Events').alias('StdPerHour'),
        )
        # A null stddev is treated as zero variance
        std_or_zero = when(col('StdPerHour').isNull(), lit(0)).otherwise(col('StdPerHour'))
        app_stats = per_app.withColumn('StdPerHour', std_or_zero)
        # Persistence heuristic: active >= 8 hours AND low variance (std <= 0.5 * mean)
        is_long_running = col('ActiveHours') >= 8
        is_steady = col('StdPerHour') <= (col('MeanPerHour') * 0.5)
        persistent = app_stats.filter(is_long_running & is_steady)
        pc = persistent.count()
        if pc == 0:
            print('‚úÖ No persistence patterns meeting heuristic (>=8 active hours & low variance)')
        else:
            print(f'‚ôªÔ∏è Potential persistence (scheduled/automated) apps: {pc}')
            persistent.orderBy(col('ActiveHours').desc()).show(15, truncate=False)

## 3. Risk Enrichment (If Available)

Correlates observed service principals with entries in `AADRiskyServicePrincipals` and computes a simple priority score.

In [None]:
# RISK ENRICHMENT JOIN & SCORING
# Left-joins the sign-ins with the risk table, maps the risk level to a numeric
# score (high=3, medium=2, low=1, otherwise 0) and ranks principals by a simple
# priority score. The ranked frame is cached as `risk_enriched_principals`.
if risky_principals is None:
    print('‚ÑπÔ∏è Risk dataset not loaded; skipping enrichment')
else:
    signin_cols = service_principal_signins.columns
    risk_table_cols = risky_principals.columns

    # Identify join keys. Like-for-like identifiers are preferred: an application
    # id and a service principal object id are different identifiers, so the
    # mixed pair is kept only as a last resort for backward compatibility.
    # NOTE(review): the risk table is assumed to expose `Id` / `AppId`
    # (per the Azure Monitor table reference) — confirm against the workspace.
    key_pairs = [
        ('ServicePrincipalId', 'ServicePrincipalId'),
        ('ServicePrincipalId', 'Id'),
        ('AppId', 'AppId'),
        ('AppId', 'ServicePrincipalId'),
    ]
    join_key = next(((l, r) for l, r in key_pairs if l in signin_cols and r in risk_table_cols), None)

    if not join_key:
        print('‚ö†Ô∏è No compatible join key between sign-ins and risk table')
    else:
        left_key, right_key = join_key
        # Compactify risk table columns for robustness. Columns that would
        # collide with a sign-in column are skipped to avoid ambiguous names.
        wanted = (
            'serviceprincipalid','risklevel','riskscore','riskstate','detectionid','riskdetections','riskdetail'
        )
        risk_cols = [
            c for c in risk_table_cols
            if c.lower() in wanted and c != right_key and c not in signin_cols
        ]
        # The risk level column is located case-insensitively (e.g. `RiskLevel`)
        level_col = next((c for c in risk_table_cols if c.lower() == 'risklevel'), None)

        # The join key gets a private alias so the join condition and the later
        # groupBy are unambiguous even when both tables share the column name.
        risk_select = [col(right_key).alias('_RiskJoinKey')]
        if level_col:
            # Lower-cased copy so 'High' and 'high' score the same
            risk_select.append(lower(col(level_col)).alias('_RiskLevelNorm'))
        risk_select.extend(col(c) for c in risk_cols)
        # NOTE(review): dropDuplicates keeps an arbitrary row when a principal
        # has several risk records (as in the original code).
        risk_view = risky_principals.select(*risk_select).dropDuplicates(['_RiskJoinKey'])
        enriched = service_principal_signins.join(
            risk_view, col(left_key) == col('_RiskJoinKey'), 'left'
        )

        # Simple scoring heuristic
        def risk_case(colname):
            """Map a lower-cased risk level column to 3/2/1, or 0 for anything else (including null)."""
            return (
                when(col(colname) == 'high', 3)
                .when(col(colname) == 'medium', 2)
                .when(col(colname) == 'low', 1)
                .otherwise(0)
            )

        score_col = risk_case('_RiskLevelNorm') if level_col else lit(0)
        enriched = enriched.withColumn('RiskScore', score_col)

        # Aggregate per principal
        principal_col = left_key
        agg = (enriched.groupBy(principal_col)
               .agg(
                    spark_count('*').alias('Events'),
                    expr('max(RiskScore) as MaxRiskScore'),
                    expr('sum(RiskScore) as TotalRiskScore')
                )
               .withColumn('PriorityScore', col('Events')*0.4 + col('MaxRiskScore')*2 + col('TotalRiskScore')*0.6)
               .orderBy(col('PriorityScore').desc()))

        # Cache before displaying so the join/aggregation runs only once
        agg_cached = agg.cache()
        agg_cached.count()  # trigger

        print('üî• PRIORITIZED PRINCIPALS (Top 15):')
        agg_cached.limit(15).show(truncate=False)

        # Persist small result for later summary
        globals()['risk_enriched_principals'] = agg_cached
        print('‚úÖ Risk enrichment complete')

## 4. Executive Summary & Recommended Actions

Quick situational snapshot suitable for incident review or daily security brief.

In [None]:
# EXECUTIVE SUMMARY
# Collects headline numbers (total events, off-hours share, distinct apps,
# risk-tracked principals), prints them, then derives recommended actions.
from pyspark.sql.functions import sum as spark_sum
summary = {}

if globals().get('service_principal_signins') is None:
    print('‚ö†Ô∏è Sign-in dataset missing; summary limited')
else:
    total = service_principal_signins.count()
    summary['total_events'] = total
    # Off-hours proportion (hour-of-day 0 through 5, inclusive)
    time_col = next((c for c in ['CreatedDateTime','TimeGenerated','Timestamp'] if c in service_principal_signins.columns), None)
    if time_col:
        with_hour = service_principal_signins.withColumn('Hour', hour(col(time_col)))
        off_hours = with_hour.filter(col('Hour').between(0,5)).count()
        summary['off_hours_events'] = off_hours
        # Guard against division by zero on an empty dataset
        summary['off_hours_pct'] = (off_hours/total*100) if total else 0
    # Distinct apps
    if 'AppId' in service_principal_signins.columns:
        summary['distinct_apps'] = service_principal_signins.select('AppId').distinct().count()

# Risk enrichment snapshot (only present when the enrichment cell produced it)
high_prior = None
if 'risk_enriched_principals' in globals():
    rep = risk_enriched_principals
    total_principals = rep.count()
    high_prior = rep.orderBy(col('PriorityScore').desc()).limit(5)
    summary['risk_tracked_principals'] = total_principals

print('=== SERVICE PRINCIPAL ACTIVITY SUMMARY ===')
for k, v in summary.items():
    print(f'{k}: {v}')

if high_prior is not None:
    print('\nTop Priority Principals:')
    high_prior.show(truncate=False)

# Recommended actions heuristics
actions = []
if summary.get('off_hours_pct', 0) > 30:
    actions.append('Investigate high off-hours activity for automation drift or misuse')
if high_prior is not None and high_prior.count() > 0:
    actions.append('Review top priority principals for least privilege & rotation needs')
if len(actions) == 0:
    # Nothing triggered: fall back to the default guidance
    actions.append('Maintain monitoring; no immediate high-risk signals')

print('\n=== RECOMMENDED ACTIONS ===')
for a in actions:
    print('- ' + a)