# 🔐 Identity Security Analysis - Microsoft Sentinel Data Lake

**Hunt for identity-based threats using Entra ID and authentication data.**

## 🎯 Security Scenarios Covered

| Scenario | Detection | Impact |
|----------|-----------|---------|
| **🚨 Brute Force Attacks** | Multiple failed logins, eventual success | Critical |
| **🌍 Impossible Travel** | Sign-ins from distant locations | High |
| **⏰ Suspicious Timing** | Off-hours or unusual activity patterns | Medium |
| **🔓 Credential Stuffing** | Same IP targeting multiple accounts | High |
| **📱 Risky Applications** | High-failure rate apps, unusual access | Medium |

## ⚙️ Quick Setup
1. Update `PRIMARY_WORKSPACE` and `ENTRA_WORKSPACE` in the config cell below
2. Run all cells
3. Analyze the identity security findings

---

In [None]:
# 🔧 CONFIGURATION - UPDATE THESE WORKSPACE NAMES!
# =================================================================
# IMPORTANT: replace both workspace names below with the ones used in
# YOUR environment before running the rest of the notebook.
# =================================================================

PRIMARY_WORKSPACE = "ak-SecOps"    # UPDATE THIS: your primary Sentinel workspace name
ENTRA_WORKSPACE = "ak-SecOps"     # UPDATE THIS: workspace holding the Entra ID tables

# Table -> workspace routing consulted by the data loader.
# Windows security events and Entra ID sign-in logs are read from the
# primary workspace; Entra ID users, groups and applications are read from
# the Entra workspace.
workspace_mapping = {
    **dict.fromkeys(
        ("SecurityEvent", "SigninLogs", "AADNonInteractiveUserSignInLogs"),
        PRIMARY_WORKSPACE,
    ),
    **dict.fromkeys(
        ("EntraUsers", "EntraGroups", "EntraApplications"),
        ENTRA_WORKSPACE,
    ),
}

# Echo the effective configuration so a wrong workspace name is easy to spot
print(
    "‚úÖ Configuration loaded successfully!",
    f"Primary Workspace: {PRIMARY_WORKSPACE}",
    f"Entra Workspace: {ENTRA_WORKSPACE}",
    "\nüîç Workspace Mapping:",
    sep="\n",
)
for table, workspace in workspace_mapping.items():
    print(f"  üìã {table} ‚Üí {workspace}")

print(
    "\n‚ö†Ô∏è  Remember: Update the workspace names above to match YOUR environment!",
    "üìö Publication-ready: No hardcoded values, works anywhere!",
    sep="\n",
)

In [None]:
# 📊 DATA LOADER
# =================================================================
# Simple identity data loading with fallback handling
# =================================================================

from sentinel_lake.providers import MicrosoftSentinelProvider
from pyspark.sql.functions import *
from pyspark.sql.types import *
from datetime import datetime, timedelta
import warnings
import matplotlib.pyplot as plt
warnings.filterwarnings('ignore')

# Initialize the data provider that load_identity_data() uses to read tables.
# NOTE(review): `spark` is not defined anywhere in this notebook; presumably
# it is the Spark session injected by the notebook runtime - confirm.
data_provider = MicrosoftSentinelProvider(spark)

def load_identity_data():
    """Load the identity tables used by this notebook, limited to the last 24 hours.

    Every table is read through ``data_provider.read_table`` from the
    workspace configured in ``workspace_mapping`` and then filtered on
    ``TimeGenerated``. Loading is best effort: a table that cannot be read
    or filtered is reported together with the reason and stored as ``None``
    so that the cells consuming the result can skip their analysis.

    Returns:
        dict: table name -> filtered table object returned by the provider,
        or ``None`` when the table is not available.
    """
    data = {}
    
    # Core identity tables for security analysis (table name -> description)
    tables = {
        "SecurityEvent": "Windows security events",
        "SigninLogs": "Entra ID sign-in logs",
        "AADNonInteractiveUserSignInLogs": "Entra ID non-interactive sign-in logs",
        "EntraUsers": "Entra ID user information",
        "EntraGroups": "Entra ID groups", 
        "EntraApplications": "Entra ID applications"
    }
    
    print("üîÑ Loading identity security data...\n")
    
    for table, description in tables.items():
        try:
            # Tables missing from workspace_mapping fall back by name:
            # sign-in style tables to the Entra workspace, the rest to the
            # primary workspace.
            workspace = workspace_mapping.get(table, ENTRA_WORKSPACE if "AAD" in table or "Signin" in table else PRIMARY_WORKSPACE)
            df = data_provider.read_table(table, workspace)
            # Load last 24 hours for performance
            df = df.filter(col("TimeGenerated") >= (current_timestamp() - expr("INTERVAL 24 HOURS")))
        except Exception as exc:
            # Catch Exception rather than everything so KeyboardInterrupt and
            # SystemExit still stop the cell, and surface the first line of
            # the error so a wrong workspace name is distinguishable from a
            # table that genuinely does not exist.
            reason = str(exc).strip().split("\n", 1)[0]
            print(f"‚ö†Ô∏è {table}: Not available ({type(exc).__name__}: {reason})")
            data[table] = None
        else:
            data[table] = df
            print(f"‚úÖ {table}: {description}")
    
    loaded_count = sum(frame is not None for frame in data.values())
    print(f"\nüöÄ Loaded {loaded_count}/{len(tables)} tables")
    return data

# Load every table once; each scenario cell looks its table up with
# identity_data.get(...) and skips its analysis when the value is None.
identity_data = load_identity_data()

## 🚨 Scenario 1: Brute Force Attack Detection

**Hunt for attackers systematically guessing passwords**

**Detection Patterns:**
- High volume of failed authentication attempts
- Multiple failures followed by success
- Same source IP targeting multiple accounts

In [None]:
# 🚨 BRUTE FORCE ATTACK DETECTION
# =================================================================

security_events = identity_data.get("SecurityEvent")

if security_events is None:
    print("‚ö†Ô∏è SecurityEvent table not available - brute force detection requires Windows security events")
else:
    print("üîç Hunting for brute force attacks using Windows Security Events...")

    # Keep only logon events: 4624 = successful logon, 4625 = failed logon
    logon_events = security_events.where(col("EventID").isin(4624, 4625))

    # Translate the event ID into a readable status label
    logon_analysis = logon_events.withColumn(
        "LogonStatus",
        when(col("EventID") == 4624, "Success")
        .when(col("EventID") == 4625, "Failure")
        .otherwise("Unknown"),
    )

    # Per (Account, IpAddress) pair: count outcomes and keep only the pairs
    # with more than 10 failed logons
    brute_force = (
        logon_analysis
        .groupBy("Account", "IpAddress")
        .agg(
            count(when(col("LogonStatus") == "Failure", True)).alias("FailureCount"),
            count(when(col("LogonStatus") == "Success", True)).alias("SuccessCount"),
            count("*").alias("TotalAttempts"),
        )
        .withColumn("FailureRate", col("FailureCount") / col("TotalAttempts"))
        .where(col("FailureCount") > 10)
    )

    # Critical tier: more than 20 failures and at least one successful logon
    critical_attacks = (
        brute_force
        .where((col("FailureCount") > 20) & (col("SuccessCount") > 0))
        .orderBy(col("FailureCount").desc())
    )

    critical_count = critical_attacks.count()

    if critical_count == 0:
        print("‚úÖ No critical brute force attacks detected")
    else:
        print(f"üî• {critical_count} CRITICAL BRUTE FORCE ATTACKS DETECTED")
        critical_attacks.show(10, truncate=False)

        print("\n‚ö° IMMEDIATE ACTIONS:")
        print("‚Ä¢ Isolate compromised accounts")
        print("‚Ä¢ Block attacking IP addresses")
        print("‚Ä¢ Force password resets")

    # Ongoing tier: failure-heavy pairs that have not produced a success
    ongoing_attacks = (
        brute_force
        .where(col("SuccessCount") == 0)
        .orderBy(col("FailureCount").desc())
    )

    ongoing_count = ongoing_attacks.count()
    if ongoing_count > 0:
        print(f"\n‚ö†Ô∏è {ongoing_count} ongoing brute force attempts:")
        ongoing_attacks.show(5, truncate=False)

## 🌍 Scenario 2: Impossible Travel Detection

**Identify sign-ins from geographically impossible locations**

**Detection Focus:**
- Sign-ins from multiple countries within short timeframes
- Unusual geographic patterns for specific users
- High-risk countries with authentication attempts

In [None]:
# 🌍 IMPOSSIBLE TRAVEL DETECTION
# =================================================================

signin_logs = identity_data.get("SigninLogs")

if signin_logs is not None:
    print("? Hunting for impossible travel...")
    
    # Fields read from the JSON location payload
    location_schema = StructType([
        StructField("city", StringType(), True),
        StructField("state", StringType(), True),
        StructField("countryOrRegion", StringType(), True)
    ])
    
    # In the SigninLogs schema the JSON payload carrying city / state /
    # countryOrRegion is `LocationDetails`; `Location` is a plain
    # country/region string. Parsing `Location` as JSON yields NULL for every
    # row, so the detection would silently never fire. Parse LocationDetails
    # when the column exists and fall back to the original column otherwise.
    has_details = "LocationDetails" in signin_logs.columns
    location_source = "LocationDetails" if has_details else "Location"
    
    country = col("Location_parsed.countryOrRegion")
    if has_details and "Location" in signin_logs.columns:
        # Use the plain Location value when the details payload has no country
        country = coalesce(country, col("Location"))
    
    # Rows without a usable country (NULL or empty) are dropped so that a
    # missing location is never counted as a distinct country.
    location_analysis = signin_logs \
        .withColumn("Location_parsed", from_json(col(location_source), location_schema)) \
        .select(
            "UserPrincipalName", "TimeGenerated", "IPAddress",
            country.alias("Country"),
            col("Location_parsed.city").alias("City")
        ).filter(col("Country").isNotNull() & (col("Country") != ""))
    
    # Users seen in more than one country within the loaded time window
    impossible_travel = location_analysis.groupBy("UserPrincipalName") \
        .agg(
            countDistinct("Country").alias("CountryCount"),
            countDistinct("IPAddress").alias("IPCount"),
            count("*").alias("TotalSignIns"),
            collect_set("Country").alias("Countries")
        ) \
        .filter(col("CountryCount") > 1) \
        .orderBy(desc("CountryCount"))
    
    travel_count = impossible_travel.count()
    
    if travel_count > 0:
        print(f"üö® {travel_count} users with multi-country sign-ins detected")
        impossible_travel.show(10, truncate=False)
        
        # Countries used by fewer than 3 distinct users are listed as
        # potentially risky
        country_risk = location_analysis.groupBy("Country") \
            .agg(
                countDistinct("UserPrincipalName").alias("UniqueUsers"),
                count("*").alias("SignInCount")
            ) \
            .filter(col("UniqueUsers") < 3)  # Countries with few users (suspicious)
        
        print("\n‚ö†Ô∏è Potentially risky countries (few legitimate users):")
        country_risk.show(10, truncate=False)
        
    else:
        print("‚úÖ No impossible travel patterns detected")
        
else:
    print("‚ö†Ô∏è SigninLogs not available - location analysis requires Entra ID logs")

## ⏰ Scenario 3: Suspicious Timing Analysis

**Detect unusual authentication patterns based on timing**

**Detection Patterns:**
- Off-hours authentication (nights, weekends)
- Unusual activity bursts
- Accounts active during non-business hours

In [None]:
# ⏰ SUSPICIOUS TIMING ANALYSIS
# =================================================================

signin_logs = identity_data.get("SigninLogs")

if signin_logs is not None:
    print("üîç Hunting for suspicious timing patterns...")
    
    # NOTE(review): hour() and dayofweek() are evaluated on TimeGenerated as
    # stored, presumably in the Spark session time zone rather than each
    # user's local time - confirm before treating hits as off-hours.
    timing_analysis = signin_logs.withColumn("HourOfDay", hour(col("TimeGenerated"))) \
                                .withColumn("DayOfWeek", dayofweek(col("TimeGenerated")))
    
    # Off-hours activity (10 PM - 6 AM): hours 22-23 and 0-5. The upper bound
    # is exclusive; `<= 6` would also flag 06:00-06:59, i.e. a window that
    # runs until 7 AM.
    off_hours = timing_analysis.filter(
        (col("HourOfDay") >= 22) | (col("HourOfDay") < 6)
    )
    
    # Users with more than 5 off-hours sign-ins
    off_hours_users = off_hours.groupBy("UserPrincipalName") \
        .agg(
            count("*").alias("OffHoursSignIns"),
            countDistinct("IPAddress").alias("UniqueIPs"),
            countDistinct("HourOfDay").alias("UniqueHours")
        ) \
        .filter(col("OffHoursSignIns") > 5) \
        .orderBy(desc("OffHoursSignIns"))
    
    off_hours_count = off_hours_users.count()
    
    if off_hours_count > 0:
        print(f"? {off_hours_count} users with significant off-hours activity:")
        off_hours_users.show(10, truncate=False)
    else:
        print("‚úÖ No unusual off-hours activity detected")
    
    # Weekend activity (Saturday = 7, Sunday = 1); users with more than 10
    # weekend sign-ins
    weekend_activity = timing_analysis.filter(
        col("DayOfWeek").isin([1, 7])
    ).groupBy("UserPrincipalName") \
        .agg(count("*").alias("WeekendSignIns")) \
        .filter(col("WeekendSignIns") > 10) \
        .orderBy(desc("WeekendSignIns"))
    
    weekend_count = weekend_activity.count()
    
    if weekend_count > 0:
        print(f"\nüìÖ {weekend_count} users with high weekend activity:")
        weekend_activity.show(10, truncate=False)
    else:
        print("\n‚úÖ No unusual weekend activity detected")
        
else:
    print("‚ö†Ô∏è SigninLogs not available - timing analysis requires Entra ID logs")

## 🔓 Scenario 4: Credential Stuffing Detection

**Identify attackers using stolen credential lists against multiple accounts**

**Detection Patterns:**
- Same IP address targeting many different usernames
- Systematic authentication attempts across user base
- Low success rate with wide user targeting

In [None]:
# 🔓 CREDENTIAL STUFFING DETECTION
# =================================================================

signin_logs = identity_data.get("SigninLogs")

if signin_logs is not None:
    print("üîç Hunting for credential stuffing attacks...")
    
    # Only the error code is needed from the Status JSON payload
    status_schema = StructType([StructField("errorCode", StringType(), True)])
    
    # Error code "0" is treated as success, anything else (including a
    # missing code) as failure
    stuffing_analysis = signin_logs.withColumn("Status_parsed", from_json(col("Status"), status_schema)) \
                                  .withColumn("ResultType", col("Status_parsed.errorCode")) \
                                  .withColumn("SignInStatus", 
                                             when(col("ResultType") == "0", "Success")
                                             .otherwise("Failure"))
    
    # Per-IP statistics, kept unfiltered so that each risk tier below can
    # apply its own user-count range
    ip_stats = stuffing_analysis.groupBy("IPAddress") \
        .agg(
            countDistinct("UserPrincipalName").alias("UniqueUsers"),
            count(when(col("SignInStatus") == "Failure", True)).alias("FailureCount"),
            count(when(col("SignInStatus") == "Success", True)).alias("SuccessCount"),
            count("*").alias("TotalAttempts")
        ) \
        .withColumn("SuccessRate", col("SuccessCount") / col("TotalAttempts"))
    
    # IP addresses targeting more than 10 different users
    ip_targeting = ip_stats.filter(col("UniqueUsers") > 10)
    
    # High-risk credential stuffing (many users, low success rate)
    credential_stuffing = ip_targeting.filter(
        (col("UniqueUsers") > 20) & 
        (col("SuccessRate") < 0.1)  # Less than 10% success rate
    ).orderBy(desc("UniqueUsers"))
    
    stuffing_count = credential_stuffing.count()
    
    if stuffing_count > 0:
        print(f"üö® {stuffing_count} IP addresses performing credential stuffing:")
        credential_stuffing.show(10, truncate=False)
        
        print("\n‚ö° IMMEDIATE ACTIONS:")
        print("‚Ä¢ Block identified IP addresses")
        print("‚Ä¢ Implement rate limiting")
        print("‚Ä¢ Enable account lockout policies")
        print("‚Ä¢ Force MFA for affected accounts")
        
    else:
        print("‚úÖ No credential stuffing attacks detected")
        
    # IPs with moderate targeting (5-20 users, potential reconnaissance).
    # Derived from the unfiltered statistics: filtering ip_targeting instead
    # would silently restrict this tier to 11-20 users because that frame
    # already excludes everything at or below 10.
    moderate_targeting = ip_stats.filter(
        (col("UniqueUsers") >= 5) & 
        (col("UniqueUsers") <= 20)
    ).orderBy(desc("UniqueUsers"))
    
    moderate_count = moderate_targeting.count()
    if moderate_count > 0:
        print(f"\n‚ö†Ô∏è {moderate_count} IPs with moderate user targeting (reconnaissance):")
        moderate_targeting.show(5, truncate=False)
        
else:
    print("‚ö†Ô∏è SigninLogs not available - credential stuffing detection requires Entra ID logs")

## 📱 Scenario 5: Risky Application Analysis

**Identify suspicious application access patterns and authentication issues**

**Detection Focus:**
- Applications with unusually high failure rates
- Unusual application access patterns
- Applications being targeted by attackers

In [None]:
# 📱 RISKY APPLICATION ANALYSIS
# =================================================================

signin_logs = identity_data.get("SigninLogs")

if signin_logs is None:
    print("‚ö†Ô∏è SigninLogs not available - application analysis requires Entra ID logs")
else:
    print("? Hunting for risky application patterns...")

    # Only the error code is extracted from the Status JSON payload
    status_schema = StructType([StructField("errorCode", StringType(), True)])

    # Label each sign-in: error code "0" is a success, anything else a
    # failure; sign-ins without an application name are ignored
    app_analysis = (
        signin_logs
        .withColumn("Status_parsed", from_json(col("Status"), status_schema))
        .withColumn("ResultType", col("Status_parsed.errorCode"))
        .withColumn(
            "SignInStatus",
            when(col("ResultType") == "0", "Success").otherwise("Failure"),
        )
        .where(col("AppDisplayName").isNotNull())
    )

    # Per-application outcome counts, restricted to applications with more
    # than 10 sign-in attempts
    app_risk = (
        app_analysis
        .groupBy("AppDisplayName", "AppId")
        .agg(
            count(when(col("SignInStatus") == "Failure", True)).alias("FailureCount"),
            count(when(col("SignInStatus") == "Success", True)).alias("SuccessCount"),
            count("*").alias("TotalAttempts"),
            countDistinct("UserPrincipalName").alias("UniqueUsers"),
        )
        .withColumn("FailureRate", col("FailureCount") / col("TotalAttempts"))
        .where(col("TotalAttempts") > 10)
    )

    # Applications where more than half of the attempts failed
    risky_apps = (
        app_risk
        .where(col("FailureRate") > 0.5)
        .orderBy(col("FailureRate").desc())
    )

    risky_count = risky_apps.count()

    if risky_count == 0:
        print("‚úÖ No applications with unusually high failure rates")
    else:
        print(f"‚ö†Ô∏è {risky_count} applications with high failure rates:")
        risky_apps.show(10, truncate=False)

        print("\nüí° Investigation Tips:")
        print("‚Ä¢ High failure rates may indicate:")
        print("  - Application misconfiguration")
        print("  - Targeted attacks against specific apps")
        print("  - Integration issues")

    # Applications ranked by total number of authentication attempts
    print("\nüìä Most authentication attempts by application:")
    top_targeted = app_risk.orderBy(col("TotalAttempts").desc())
    top_targeted.show(10, truncate=False)

    # Ratio of distinct users to attempts for each application
    user_diversity = app_risk.withColumn(
        "UsersPerAttempt",
        col("UniqueUsers") / col("TotalAttempts"),
    )

    # More than 50 attempts coming from comparatively few users might
    # indicate targeting
    potentially_targeted = (
        user_diversity
        .where((col("TotalAttempts") > 50) & (col("UsersPerAttempt") < 0.1))
        .orderBy(col("TotalAttempts").desc())
    )

    targeting_count = potentially_targeted.count()
    if targeting_count > 0:
        print(f"\nüéØ {targeting_count} potentially targeted applications:")
        potentially_targeted.show(5, truncate=False)

## 📋 Identity Security Assessment Summary

**Review the findings above and take action based on detected threats:**

### 🚨 Critical Priority Actions
- **Brute Force Attacks**: Immediately block IPs and reset compromised accounts
- **Impossible Travel**: Investigate multi-country sign-ins for account compromise
- **Credential Stuffing**: Implement IP blocking and rate limiting

### 🔍 Investigation Priorities
- Cross-reference findings across scenarios for comprehensive view
- Validate suspicious patterns with users before taking action
- Correlate with endpoint security data for full attack picture

### 👥 User Risk Hotspots
- Review the user failure hotspot analysis below to pinpoint repeated failures
- Partner with account owners to validate MFA coverage and device hygiene
- Track whether remediation reduces failure volume in future notebook runs

## 👥 User Failure Hotspots
Visualize accounts that repeatedly failed authentication attempts yet eventually succeeded. Use the chart to prioritise user outreach, enforce stronger controls, and validate whether remediation actions reduce failure volume over time.

In [None]:
# 👥 USER FAILURE HOTSPOTS ANALYSIS
# =================================================================
# Highlight accounts with heavy failure volume that eventually succeeded
# =================================================================

# Minimum total number of failed sign-ins for a user to be reported
FAILURE_THRESHOLD = 50  # Tune for your environment
# Either source may be None when its table could not be loaded
signin_logs = identity_data.get("SigninLogs")
noninteractive_logs = identity_data.get("AADNonInteractiveUserSignInLogs")

# Spark's sum aggregate under an explicit alias, distinct from Python's
# built-in sum()
from pyspark.sql.functions import sum as spark_sum

def build_failure_summary(df):
    """Count failed and successful sign-ins per user, display name and IP.

    Args:
        df: Sign-in log DataFrame, or ``None`` when the table was not loaded.

    Returns:
        A DataFrame with the columns ``UserPrincipalName``,
        ``UserDisplayName``, ``IPAddress``, ``FailureCount`` and
        ``SuccessCount``, or ``None`` when ``df`` is ``None``, lacks
        ``UserPrincipalName`` / ``IPAddress``, or contains no rows.
    """
    if df is None:
        return None
    # Check the schema first: it is metadata only, whereas the emptiness
    # test below has to run a Spark job.
    required_cols = {"UserPrincipalName", "IPAddress"}
    if not required_cols.issubset(df.columns):
        return None
    if df.rdd.isEmpty():
        return None

    status_schema = StructType([StructField("errorCode", StringType(), True)])
    # Result codes counted as a successful sign-in.
    # NOTE(review): the non-zero codes are kept as found - confirm against
    # the Entra ID error-code reference.
    success_codes = ["0", "50125", "50140", "70043", "70044"]

    # Optional columns are synthesized as NULL so the pipeline below does
    # not depend on which sign-in table is being summarized.
    if "Status" in df.columns:
        parsed = df.withColumn("Status_struct", from_json(col("Status"), status_schema))
    else:
        parsed = df.withColumn("Status_struct", lit(None).cast(status_schema))

    if "UserDisplayName" not in parsed.columns:
        parsed = parsed.withColumn("UserDisplayName", lit(None).cast(StringType()))

    # Prefer the top-level ResultType and fall back to the code parsed from
    # Status. ResultType gets the same existence guard as the other optional
    # columns; referencing it unconditionally fails on frames without it.
    status_code = col("Status_struct.errorCode")
    if "ResultType" in df.columns:
        result_code = coalesce(col("ResultType"), status_code)
    else:
        result_code = status_code

    enriched = (
        parsed
        .withColumn("NormalizedResultType", result_code)
        .withColumn(
            "Outcome",
            when(col("NormalizedResultType").isin(success_codes), lit("Success")).otherwise(lit("Failure"))
        )
    )

    # The input was verified to be non-empty above and grouping a non-empty
    # frame always yields at least one group, so no second emptiness check
    # (which would execute the whole aggregation an extra time) is needed.
    return (
        enriched
        .groupBy("UserPrincipalName", "UserDisplayName", "IPAddress")
        .agg(
            count(when(col("Outcome") == "Failure", True)).alias("FailureCount"),
            count(when(col("Outcome") == "Success", True)).alias("SuccessCount")
        )
    )

# Summarize every sign-in source and keep only the ones that produced data
frames = [
    summary
    for summary in map(build_failure_summary, (signin_logs, noninteractive_logs))
    if summary is not None
]

if frames:
    # Stack the per-source summaries into a single frame
    combined = frames[0]
    for extra_df in frames[1:]:
        combined = combined.unionByName(extra_df, allowMissingColumns=True)

    # Roll the per-IP rows up to one row per user and keep the users that
    # reached the failure threshold and also signed in successfully
    reached_threshold = col("FailureCount") >= FAILURE_THRESHOLD
    had_success = col("SuccessCount") > 0
    user_totals = (
        combined
        .groupBy("UserPrincipalName", "UserDisplayName")
        .agg(
            spark_sum("FailureCount").alias("FailureCount"),
            spark_sum("SuccessCount").alias("SuccessCount"),
            countDistinct("IPAddress").alias("DistinctIPs"),
        )
        .where(reached_threshold & had_success)
        .orderBy(desc("FailureCount"))
    )

    # limit(1) keeps the existence check from counting every matching row
    sample_count = user_totals.limit(1).count()
    if sample_count:
        print(f"üö® Users with ‚â• {FAILURE_THRESHOLD} failures and at least one successful sign-in:")
        user_totals.show(20, truncate=False)

        # Only the top 20 users are collected for plotting
        top_users = user_totals.limit(20)
        pandas_df = top_users.toPandas()

        if not pandas_df.empty:
            # Label bars with the display name, or the UPN when it is missing
            pandas_df["DisplayName"] = pandas_df["UserDisplayName"].fillna(pandas_df["UserPrincipalName"])
            plt.figure(figsize=(12, 6))
            plt.bar(pandas_df["DisplayName"], pandas_df["FailureCount"], color="#1f77b4")
            plt.xlabel("Users")
            plt.ylabel("Number of failed sign-ins")
            plt.title("Top users by failed sign-ins with eventual success")
            plt.xticks(rotation=45, ha='right')
            plt.tight_layout()
            plt.show()
            plt.close()
        else:
            print("‚ÑπÔ∏è No rows to visualize after filtering.")
    else:
        print(f"‚úÖ No users exceeded the failure threshold of {FAILURE_THRESHOLD} within the last 24 hours.")
else:
    print("‚ö†Ô∏è Sign-in tables unavailable ‚Äì cannot build user failure hotspots.")