# SOC-Level Anomaly Detection (Advanced)
### Scenarios: multi-stage brute force + impossible travel

You will build **SOC-style detections** and produce a concise incident narrative.


In [1]:
import pandas as pd
import numpy as np
from math import radians, sin, cos, asin, sqrt
from datetime import timedelta


In [2]:
# Load the raw auth/activity log, parse the timestamp column, and put the
# events in chronological order with a fresh 0..n-1 index.
# NOTE(review): the path is absolute and user-specific, so the notebook only
# runs on a machine where this exact file exists.
df = (
    pd.read_csv(r'C:/Users/kayro/jupyter/Assignments/SOC_Level_Version/soc_auth_activity_advanced.csv')
    .assign(timestamp=lambda events: pd.to_datetime(events['timestamp']))
    .sort_values('timestamp')
    .reset_index(drop=True)
)
df.head()

Unnamed: 0,timestamp,event_type,user,ip_address,location,country,lat,lon,bytes_out
0,2025-02-14 00:04:42,FILE_READ,alice,104.26.3.2,"New York, US",US,40.71,-74.01,17380
1,2025-02-14 00:05:36,FILE_READ,eva,104.26.3.2,"New York, US",US,40.71,-74.01,47582
2,2025-02-14 00:05:50,FILE_READ,bob,192.168.1.10,"Los Angeles, US",US,34.05,-118.24,29524
3,2025-02-14 00:20:20,FILE_READ,eva,192.168.1.10,"Los Angeles, US",US,34.05,-118.24,40708
4,2025-02-14 00:50:00,AUTH_SUCCESS,admin1,192.168.1.10,"Los Angeles, US",US,34.05,-118.24,0


## Step 1 – Feature Engineering
**Windowed counts · Time deltas · Geo speed**

In [3]:
# --- 1a) Windowed AUTH_FAIL counts (per user+ip, 10-min windows) ---
# Failures are bucketed into fixed 10-minute bins per (user, ip). Grouping
# with pd.Grouper on the timestamp index replaces groupby(...).resample(...),
# which emits a deprecation warning on recent pandas because resample operates
# on the grouping columns. Only bins that contain at least one failure are
# produced, so the ">0" filter below is kept purely as a safety net.
fail = df[df['event_type'] == 'AUTH_FAIL'].copy().set_index('timestamp')
fail_counts = (
    fail.groupby(['user', 'ip_address', pd.Grouper(freq='10min')])
    .size()
    .reset_index(name='auth_fail_10m')
)
print('Windowed AUTH_FAIL counts (>0):')
print(fail_counts[fail_counts['auth_fail_10m'] > 0].to_string(index=False))

# --- 1b) Time delta between consecutive events per user ---
# prev_time is the timestamp of the same user's previous event of any type;
# it is NaT (and the delta NaN) for each user's first event.
df['prev_time'] = df.groupby('user')['timestamp'].shift(1)
df['time_delta_sec'] = (df['timestamp'] - df['prev_time']).dt.total_seconds()
print('\nSample time deltas:')
print(df[df['time_delta_sec'].notna()][['user','timestamp','event_type','time_delta_sec']].head(8).to_string(index=False))

# --- 1c) Geo speed between consecutive AUTH_SUCCESS per user ---
def haversine_km(lat1, lon1, lat2, lon2):
    """Return the great-circle distance in kilometres between two points.

    Args:
        lat1, lon1: latitude and longitude of the first point, in degrees.
        lat2, lon2: latitude and longitude of the second point, in degrees.

    Returns:
        Distance along the sphere in km (float), using a radius of 6371.0 km.
        A NaN coordinate yields NaN rather than raising.
    """
    R = 6371.0
    phi1, phi2 = radians(lat1), radians(lat2)
    dphi, dl = radians(lat2 - lat1), radians(lon2 - lon1)
    a = sin(dphi / 2) ** 2 + cos(phi1) * cos(phi2) * sin(dl / 2) ** 2
    # Rounding can push `a` a hair above 1 for (near-)antipodal points, which
    # would make asin() raise "math domain error". Written as a comparison
    # (not min()) so that a NaN `a` is left untouched and propagates.
    if a > 1.0:
        a = 1.0
    return 2 * R * asin(sqrt(a))

# Pair every AUTH_SUCCESS with the same user's previous AUTH_SUCCESS and
# derive elapsed hours, distance and implied travel speed between the two.
auth = df[df['event_type']=='AUTH_SUCCESS'].copy().sort_values(['user','timestamp'])
auth['prev_lat']   = auth.groupby('user')['lat'].shift(1)
auth['prev_lon']   = auth.groupby('user')['lon'].shift(1)
auth['prev_loc']   = auth.groupby('user')['location'].shift(1)
auth['prev_time_a']= auth.groupby('user')['timestamp'].shift(1)

# Each user's first login has nothing to compare against, so drop it.
mask  = auth['prev_time_a'].notna()
auth2 = auth[mask].copy()
auth2['dt_hours']  = (auth2['timestamp'] - auth2['prev_time_a']).dt.total_seconds() / 3600.0

# Built from a zipped comprehension rather than DataFrame.apply(axis=1) so the
# assignment also works when there are no login pairs at all.
auth2['dist_km']   = pd.Series(
    [
        haversine_km(p_lat, p_lon, c_lat, c_lon)
        for p_lat, p_lon, c_lat, c_lon in zip(
            auth2['prev_lat'], auth2['prev_lon'], auth2['lat'], auth2['lon']
        )
    ],
    index=auth2.index,
    dtype=float,
)

# A zero time delta would divide by zero. Same-place pairs stay NaN, but two
# logins at the same instant from different places are the most extreme case
# of impossible travel, so they get infinite speed instead of vanishing as NaN.
speed = auth2['dist_km'] / auth2['dt_hours'].replace(0, np.nan)
simultaneous = (auth2['dt_hours'] == 0) & (auth2['dist_km'] > 0)
auth2['speed_kmh'] = speed.mask(simultaneous, np.inf)

print('\nGeo speed between consecutive logins:')
print(auth2[['user','prev_loc','location','dt_hours','dist_km','speed_kmh']].to_string(index=False))


Windowed AUTH_FAIL counts (>0):
       user    ip_address           timestamp  auth_fail_10m
     admin1 185.220.101.1 2025-02-14 01:50:00             30
svc_account 198.51.100.77 2025-02-14 04:10:00             28

Sample time deltas:
   user           timestamp   event_type  time_delta_sec
    eva 2025-02-14 00:20:20    FILE_READ           884.0
  alice 2025-02-14 00:53:21 AUTH_SUCCESS          2919.0
    eva 2025-02-14 00:56:30 AUTH_SUCCESS          2170.0
charlie 2025-02-14 01:11:18 AUTH_SUCCESS           373.0
 admin1 2025-02-14 01:29:00 AUTH_SUCCESS          2340.0
 admin1 2025-02-14 01:33:00 AUTH_SUCCESS           240.0
  alice 2025-02-14 01:36:57    FILE_READ          2616.0
  alice 2025-02-14 01:49:42 AUTH_SUCCESS           765.0

Geo speed between consecutive logins:
   user        prev_loc        location  dt_hours     dist_km     speed_kmh
 admin1 Los Angeles, US   Frankfurt, DE  0.650000 9302.605752  14311.701158
 admin1   Frankfurt, DE Los Angeles, US  0.066667 9302.60575

  .size()


## 1) Detection A – Multi-Stage Brute Force Chain

In [4]:
# Windowed counts of AUTH_FAIL per (user, ip) over 10 minutes.
# pd.Grouper on the timestamp index produces the same fixed 10-minute bins as
# groupby(...).resample('10min') without the deprecation warning that pattern
# raises on recent pandas; bins with no failures are simply absent.
fail = df[df['event_type'] == 'AUTH_FAIL'].copy().set_index('timestamp')
counts = (
    fail.groupby(['user', 'ip_address', pd.Grouper(freq='10min')])
    .size()
    .reset_index(name='auth_fail_10m')
)

# A bin is suspicious once it holds at least 25 failures for one (user, ip).
# NOTE(review): bins are fixed, not sliding, so a burst that straddles a bin
# boundary can be split in two and stay under the threshold.
susp_fail = counts[counts['auth_fail_10m'] >= 25]
susp_fail.head()

  counts = fail.groupby(['user','ip_address']).resample('10min').size().reset_index(name='auth_fail_10m')


Unnamed: 0,user,ip_address,timestamp,auth_fail_10m
0,admin1,185.220.101.1,2025-02-14 01:50:00,30
1,svc_account,198.51.100.77,2025-02-14 04:10:00,28


In [5]:
# Shared alert store: every detection in this notebook appends one dict per
# alert here, and the list is later turned into the alerts table.
alerts = []

def add(alert_type, severity, user, ip, start, end, evidence):
    """Record a single alert in the module-level ``alerts`` list.

    Args:
        alert_type: name of the detection that fired.
        severity: severity label for the alert.
        user: account the alert is about.
        ip: source IP address, stored under the ``ip_address`` key.
        start: start of the alert's time span, stored as ``start_time``.
        end: end of the alert's time span, stored as ``end_time``.
        evidence: free-text justification shown to the analyst.

    Returns:
        None. The record is appended to ``alerts`` in place.
    """
    record = dict(
        alert_type=alert_type,
        severity=severity,
        user=user,
        ip_address=ip,
        start_time=start,
        end_time=end,
        evidence=evidence,
    )
    alerts.append(record)

# For each suspicious failure window, raise a CRITICAL alert when the same
# user+IP logs in successfully afterwards.
for _, r in susp_fail.iterrows():
    user, ip, st = r['user'], r['ip_address'], r['timestamp']
    window_end = st + pd.Timedelta('10min')
    same_actor = (df['user'] == user) & (df['ip_address'] == ip)

    # `st` is only the start of the 10-minute bin, not the time of the first
    # failure. Anchor the search on the first real failure so that a login
    # which happened *before* the failures began is not reported as
    # "brute force, then success".
    window_fails = df[
        same_actor
        & (df['event_type'] == 'AUTH_FAIL')
        & (df['timestamp'] >= st)
        & (df['timestamp'] < window_end)
    ]
    if window_fails.empty:
        continue
    first_fail = window_fails['timestamp'].min()

    # Search up to 40 minutes from the bin start, i.e. 30 minutes past the end
    # of the 10-minute window.
    success_after = df[
        same_actor
        & (df['event_type'] == 'AUTH_SUCCESS')
        & (df['timestamp'] > first_fail)
        & (df['timestamp'] <= st + pd.Timedelta('40min'))
    ]
    if not success_after.empty:
        add('BRUTE_FORCE_THEN_SUCCESS','CRITICAL',user,ip,st,window_end,
            evidence=f"{int(r['auth_fail_10m'])} AUTH_FAIL in 10m, then AUTH_SUCCESS within 40m")

pd.DataFrame(alerts).head()

Unnamed: 0,alert_type,severity,user,ip_address,start_time,end_time,evidence
0,BRUTE_FORCE_THEN_SUCCESS,CRITICAL,admin1,185.220.101.1,2025-02-14 01:50:00,2025-02-14 02:00:00,"30 AUTH_FAIL in 10m, then AUTH_SUCCESS within 40m"
1,BRUTE_FORCE_THEN_SUCCESS,CRITICAL,svc_account,198.51.100.77,2025-02-14 04:10:00,2025-02-14 04:20:00,"28 AUTH_FAIL in 10m, then AUTH_SUCCESS within 40m"


## 2) Detection B – Privilege Escalation After Suspicious Auth

In [6]:
# Detection B: PRIV_ESCALATION within 30 min after a CRITICAL brute force alert

# Drop results from a previous run of this cell so that re-running it does not
# pile up duplicate alerts. Done in place so `alerts` stays the same list.
alerts[:] = [a for a in alerts if a['alert_type'] != 'PRIV_ESC_AFTER_BRUTE_FORCE']

# Chain only from brute-force alerts. Filtering on severity alone would also
# pick up any other CRITICAL alert already in the list (including this
# detection's own output).
critical_alerts = [
    a for a in alerts
    if a['alert_type'] == 'BRUTE_FORCE_THEN_SUCCESS' and a['severity'] == 'CRITICAL'
]

for a in critical_alerts:
    user        = a['user']
    # The 30-minute look-ahead is measured from the start of the brute-force
    # window, not from the successful login.
    after_start = a['start_time']
    after_end   = after_start + pd.Timedelta('30min')

    # Matched on user only: an escalation from a different IP still counts.
    priv = df[
        (df['user'] == user) &
        (df['event_type'] == 'PRIV_ESCALATION') &
        (df['timestamp'] >= after_start) &
        (df['timestamp'] <= after_end)
    ]

    for _, r in priv.iterrows():
        add('PRIV_ESC_AFTER_BRUTE_FORCE', 'CRITICAL',
            user, r['ip_address'],
            r['timestamp'], r['timestamp'],
            evidence=f"PRIV_ESCALATION at {r['location']} within 30m of brute force")

pd.DataFrame(alerts).tail()


Unnamed: 0,alert_type,severity,user,ip_address,start_time,end_time,evidence
0,BRUTE_FORCE_THEN_SUCCESS,CRITICAL,admin1,185.220.101.1,2025-02-14 01:50:00,2025-02-14 02:00:00,"30 AUTH_FAIL in 10m, then AUTH_SUCCESS within 40m"
1,BRUTE_FORCE_THEN_SUCCESS,CRITICAL,svc_account,198.51.100.77,2025-02-14 04:10:00,2025-02-14 04:20:00,"28 AUTH_FAIL in 10m, then AUTH_SUCCESS within 40m"
2,PRIV_ESC_AFTER_BRUTE_FORCE,CRITICAL,admin1,185.220.101.1,2025-02-14 02:10:00,2025-02-14 02:10:00,"PRIV_ESCALATION at Frankfurt, DE within 30m of..."


## 3) Detection C – Impossible Travel

In [7]:
def haversine_km(lat1, lon1, lat2, lon2):
    """Return the great-circle distance in kilometres between two points.

    Args:
        lat1, lon1: latitude and longitude of the first point, in degrees.
        lat2, lon2: latitude and longitude of the second point, in degrees.

    Returns:
        Distance along the sphere in km (float), using a radius of 6371.0 km.
        A NaN coordinate yields NaN rather than raising.
    """
    R = 6371.0
    phi1, phi2 = radians(lat1), radians(lat2)
    dphi = radians(lat2 - lat1)
    dl = radians(lon2 - lon1)
    a = sin(dphi / 2) ** 2 + cos(phi1) * cos(phi2) * sin(dl / 2) ** 2
    # Rounding can push `a` a hair above 1 for (near-)antipodal points, which
    # would make asin() raise "math domain error". Written as a comparison
    # (not min()) so that a NaN `a` is left untouched and propagates.
    if a > 1.0:
        a = 1.0
    return 2 * R * asin(sqrt(a))

# Compute consecutive successful auths per user and estimate travel speed
auth = df[df['event_type']=='AUTH_SUCCESS'].copy().sort_values(['user','timestamp'])
auth['prev_time'] = auth.groupby('user')['timestamp'].shift(1)
auth['prev_lat'] = auth.groupby('user')['lat'].shift(1)
auth['prev_lon'] = auth.groupby('user')['lon'].shift(1)
auth['prev_loc'] = auth.groupby('user')['location'].shift(1)

# Each user's first login has no predecessor to measure against.
mask = auth['prev_time'].notna()
auth2 = auth[mask].copy()
auth2['dt_hours'] = (auth2['timestamp'] - auth2['prev_time']).dt.total_seconds() / 3600.0

# Built from a zipped comprehension rather than DataFrame.apply(axis=1) so the
# assignment also works when there are no login pairs at all.
auth2['dist_km'] = pd.Series(
    [
        haversine_km(p_lat, p_lon, c_lat, c_lon)
        for p_lat, p_lon, c_lat, c_lon in zip(
            auth2['prev_lat'], auth2['prev_lon'], auth2['lat'], auth2['lon']
        )
    ],
    index=auth2.index,
    dtype=float,
)

# A zero time delta would divide by zero. Masking it to NaN alone makes
# `speed > 900` False, so two logins at the same instant from different places
# (the clearest impossible travel) would never be flagged. Give those infinite
# speed; same-instant, same-place pairs stay NaN and are ignored.
speed = auth2['dist_km'] / auth2['dt_hours'].replace(0, np.nan)
simultaneous = (auth2['dt_hours'] == 0) & (auth2['dist_km'] > 0)
auth2['speed_kmh'] = speed.mask(simultaneous, np.inf)

# Flag if speed exceeds 900 km/h (faster than realistic travel between logins)
# NOTE(review): in the sample rows the location looks tied to the IP, and
# 192.168.1.10 is a private (RFC 1918) address that cannot be geolocated, so
# many of these hits may be an artefact of the data. Confirm before acting.
impossible = auth2[auth2['speed_kmh'] > 900].copy()
impossible[['user','prev_time','prev_loc','timestamp','location','dist_km','dt_hours','speed_kmh']].head(10)

Unnamed: 0,user,prev_time,prev_loc,timestamp,location,dist_km,dt_hours,speed_kmh
9,admin1,2025-02-14 00:50:00,"Los Angeles, US",2025-02-14 01:29:00,"Frankfurt, DE",9302.605752,0.65,14311.701158
10,admin1,2025-02-14 01:29:00,"Frankfurt, DE",2025-02-14 01:33:00,"Los Angeles, US",9302.605752,0.066667,139539.086287
49,admin1,2025-02-14 01:33:00,"Los Angeles, US",2025-02-14 02:05:00,"Frankfurt, DE",9302.605752,0.533333,17442.385786
12,alice,2025-02-14 00:53:21,"Los Angeles, US",2025-02-14 01:49:42,"New York, US",3935.218414,0.939167,4190.117211
57,alice,2025-02-14 02:02:11,"New York, US",2025-02-14 02:35:05,"Los Angeles, US",3935.218414,0.548333,7176.690117
65,alice,2025-02-14 02:35:05,"Los Angeles, US",2025-02-14 03:43:16,"New York, US",3935.218414,1.136389,3462.915251
120,alice,2025-02-14 05:47:46,"New York, US",2025-02-14 05:54:59,"Los Angeles, US",3935.218414,0.120278,32717.75125
123,alice,2025-02-14 05:54:59,"Los Angeles, US",2025-02-14 06:10:41,"New York, US",3935.218414,0.261667,15039.051265
185,alice,2025-02-14 10:15:30,"New York, US",2025-02-14 13:56:45,"Los Angeles, US",3935.218414,3.6875,1067.177875
196,alice,2025-02-14 13:56:45,"Los Angeles, US",2025-02-14 15:42:11,"New York, US",3935.218414,1.757222,2239.454045


In [8]:
# Convert impossible travel rows into alerts

# Remove alerts from an earlier run of this cell first, so re-running it does
# not duplicate every IMPOSSIBLE_TRAVEL row. Done in place so `alerts` stays
# the same list object.
alerts[:] = [a for a in alerts if a['alert_type'] != 'IMPOSSIBLE_TRAVEL']

# The alert spans from the previous login to the current one and carries the
# IP address of the current (arrival) login.
for _, r in impossible.iterrows():
    add('IMPOSSIBLE_TRAVEL','HIGH', r['user'], r['ip_address'], r['prev_time'], r['timestamp'],
        evidence=f"{r['prev_loc']} -> {r['location']} in {r['dt_hours']:.2f}h (~{r['speed_kmh']:.0f} km/h)")

alerts_df = pd.DataFrame(alerts)
alerts_df.sort_values(['severity','start_time']).head(20)

Unnamed: 0,alert_type,severity,user,ip_address,start_time,end_time,evidence
0,BRUTE_FORCE_THEN_SUCCESS,CRITICAL,admin1,185.220.101.1,2025-02-14 01:50:00,2025-02-14 02:00:00,"30 AUTH_FAIL in 10m, then AUTH_SUCCESS within 40m"
2,PRIV_ESC_AFTER_BRUTE_FORCE,CRITICAL,admin1,185.220.101.1,2025-02-14 02:10:00,2025-02-14 02:10:00,"PRIV_ESCALATION at Frankfurt, DE within 30m of..."
1,BRUTE_FORCE_THEN_SUCCESS,CRITICAL,svc_account,198.51.100.77,2025-02-14 04:10:00,2025-02-14 04:20:00,"28 AUTH_FAIL in 10m, then AUTH_SUCCESS within 40m"
3,IMPOSSIBLE_TRAVEL,HIGH,admin1,185.220.101.1,2025-02-14 00:50:00,2025-02-14 01:29:00,"Los Angeles, US -> Frankfurt, DE in 0.65h (~14..."
6,IMPOSSIBLE_TRAVEL,HIGH,alice,104.26.3.2,2025-02-14 00:53:21,2025-02-14 01:49:42,"Los Angeles, US -> New York, US in 0.94h (~419..."
27,IMPOSSIBLE_TRAVEL,HIGH,charlie,104.26.3.2,2025-02-14 01:05:05,2025-02-14 01:11:18,"Los Angeles, US -> New York, US in 0.10h (~379..."
4,IMPOSSIBLE_TRAVEL,HIGH,admin1,192.168.1.10,2025-02-14 01:29:00,2025-02-14 01:33:00,"Frankfurt, DE -> Los Angeles, US in 0.07h (~13..."
5,IMPOSSIBLE_TRAVEL,HIGH,admin1,185.220.101.1,2025-02-14 01:33:00,2025-02-14 02:05:00,"Los Angeles, US -> Frankfurt, DE in 0.53h (~17..."
28,IMPOSSIBLE_TRAVEL,HIGH,charlie,192.168.1.10,2025-02-14 01:52:44,2025-02-14 02:04:15,"New York, US -> Los Angeles, US in 0.19h (~205..."
7,IMPOSSIBLE_TRAVEL,HIGH,alice,192.168.1.10,2025-02-14 02:02:11,2025-02-14 02:35:05,"New York, US -> Los Angeles, US in 0.55h (~717..."


## 4) Detection D – Data Exfiltration (bytes_out outliers)

In [9]:
# Simple baseline: per-user bytes_out distribution
#
# Outliers are scored with a robust (median / MAD) z-score instead of the
# classic mean / std one. With mean and sample std computed over a set that
# includes the candidate itself, one point among n can never score above
# (n - 1) / sqrt(n), so z > 3 is unreachable for any user with 10 or fewer
# events, and a large export also inflates its own std. Median and MAD are
# barely moved by a few extreme values, so the exports no longer hide
# themselves.
MIN_BASELINE_EVENTS = 5   # fewer events than this: the user's own history is too thin
MAD_TO_SIGMA = 0.6745     # scales MAD so the score is comparable to a z-score on normal data

def _mad(values):
    """Median absolute deviation of a numeric Series."""
    return (values - values.median()).abs().median()

exports = df[df['event_type'].isin(['DATA_EXPORT','FILE_READ'])].copy()
# mean/std/count are kept as descriptive columns; `count` also drives the
# fallback below.
stats = exports.groupby('user')['bytes_out'].agg(['mean','std','count']).reset_index()
exports = exports.merge(stats, on='user', how='left')

per_user_bytes = exports.groupby('user')['bytes_out']
user_median = per_user_bytes.transform('median')
user_mad = per_user_bytes.transform(_mad)
global_median = exports['bytes_out'].median()
global_mad = _mad(exports['bytes_out'])

# Fall back to the all-users baseline when a user has too few events or no
# spread at all (MAD of 0 would divide by zero).
thin_baseline = (exports['count'] < MIN_BASELINE_EVENTS) | (user_mad == 0)
center = user_median.mask(thin_baseline, global_median)
spread = user_mad.mask(thin_baseline, global_mad).replace(0, np.nan)
exports['z'] = MAD_TO_SIGMA * (exports['bytes_out'] - center) / spread

# Only DATA_EXPORT events are alert candidates; FILE_READ only feeds the baseline.
exfil = exports[(exports['event_type']=='DATA_EXPORT') & (exports['z'] > 3)].copy()
exfil[['timestamp','user','ip_address','bytes_out','z']].head(10)

Unnamed: 0,timestamp,user,ip_address,bytes_out,z


In [10]:
# Turn each exfiltration outlier into an alert.

# Remove alerts from an earlier run of this cell first, so re-running it does
# not duplicate DATA_EXFIL rows. Done in place so `alerts` stays the same list.
alerts[:] = [a for a in alerts if a['alert_type'] != 'DATA_EXFIL']

# Severity scales with how extreme the outlier is: z above 5 is CRITICAL,
# anything else that passed the exfil filter is HIGH. These are point-in-time
# events, so start and end time are the same.
for _, r in exfil.iterrows():
    add('DATA_EXFIL', 'CRITICAL' if r['z'] > 5 else 'HIGH',
        r['user'], r['ip_address'],
        r['timestamp'], r['timestamp'],
        evidence=f"bytes_out={r['bytes_out']:,} (z={r['z']:.2f}) via {r['event_type']}")

pd.DataFrame(alerts).tail()


Unnamed: 0,alert_type,severity,user,ip_address,start_time,end_time,evidence
46,IMPOSSIBLE_TRAVEL,HIGH,eva,104.26.3.2,2025-02-14 20:36:17,2025-02-14 21:01:14,"Los Angeles, US -> New York, US in 0.42h (~946..."
47,IMPOSSIBLE_TRAVEL,HIGH,eva,192.168.1.10,2025-02-14 21:01:14,2025-02-14 21:25:10,"New York, US -> Los Angeles, US in 0.40h (~986..."
48,IMPOSSIBLE_TRAVEL,HIGH,eva,104.26.3.2,2025-02-14 21:25:10,2025-02-14 22:04:17,"Los Angeles, US -> New York, US in 0.65h (~603..."
49,IMPOSSIBLE_TRAVEL,HIGH,eva,192.168.1.10,2025-02-14 22:04:17,2025-02-14 22:10:40,"New York, US -> Los Angeles, US in 0.11h (~369..."
50,IMPOSSIBLE_TRAVEL,HIGH,eva,104.26.3.2,2025-02-14 22:10:40,2025-02-14 22:49:09,"Los Angeles, US -> New York, US in 0.64h (~613..."


## 5) Export + SOC Narrative

In [11]:
# Build the final alerts table, most severe first and then chronologically,
# and export it for the SOC narrative.
#
# Severity is ordered by an explicit rank. A plain string sort only gets
# CRITICAL before HIGH by alphabetical coincidence and would put LOW ahead of
# MEDIUM. Labels missing from the map sort last.
SEVERITY_RANK = {'CRITICAL': 0, 'HIGH': 1, 'MEDIUM': 2, 'LOW': 3}

alerts_df = pd.DataFrame(alerts)
alerts_df = alerts_df.sort_values(
    ['severity','start_time'],
    ascending=[True,True],
    key=lambda col: col.map(SEVERITY_RANK) if col.name == 'severity' else col,
)
# NOTE(review): the output path is absolute and user-specific.
alerts_df.to_csv(r'C:/Users/kayro/jupyter/Assignments/SOC_Level_Version/soc_alerts.csv', index=False)
alerts_df


Unnamed: 0,alert_type,severity,user,ip_address,start_time,end_time,evidence
0,BRUTE_FORCE_THEN_SUCCESS,CRITICAL,admin1,185.220.101.1,2025-02-14 01:50:00,2025-02-14 02:00:00,"30 AUTH_FAIL in 10m, then AUTH_SUCCESS within 40m"
2,PRIV_ESC_AFTER_BRUTE_FORCE,CRITICAL,admin1,185.220.101.1,2025-02-14 02:10:00,2025-02-14 02:10:00,"PRIV_ESCALATION at Frankfurt, DE within 30m of..."
1,BRUTE_FORCE_THEN_SUCCESS,CRITICAL,svc_account,198.51.100.77,2025-02-14 04:10:00,2025-02-14 04:20:00,"28 AUTH_FAIL in 10m, then AUTH_SUCCESS within 40m"
3,IMPOSSIBLE_TRAVEL,HIGH,admin1,185.220.101.1,2025-02-14 00:50:00,2025-02-14 01:29:00,"Los Angeles, US -> Frankfurt, DE in 0.65h (~14..."
6,IMPOSSIBLE_TRAVEL,HIGH,alice,104.26.3.2,2025-02-14 00:53:21,2025-02-14 01:49:42,"Los Angeles, US -> New York, US in 0.94h (~419..."
27,IMPOSSIBLE_TRAVEL,HIGH,charlie,104.26.3.2,2025-02-14 01:05:05,2025-02-14 01:11:18,"Los Angeles, US -> New York, US in 0.10h (~379..."
4,IMPOSSIBLE_TRAVEL,HIGH,admin1,192.168.1.10,2025-02-14 01:29:00,2025-02-14 01:33:00,"Frankfurt, DE -> Los Angeles, US in 0.07h (~13..."
5,IMPOSSIBLE_TRAVEL,HIGH,admin1,185.220.101.1,2025-02-14 01:33:00,2025-02-14 02:05:00,"Los Angeles, US -> Frankfurt, DE in 0.53h (~17..."
28,IMPOSSIBLE_TRAVEL,HIGH,charlie,192.168.1.10,2025-02-14 01:52:44,2025-02-14 02:04:15,"New York, US -> Los Angeles, US in 0.19h (~205..."
7,IMPOSSIBLE_TRAVEL,HIGH,alice,192.168.1.10,2025-02-14 02:02:11,2025-02-14 02:35:05,"New York, US -> Los Angeles, US in 0.55h (~717..."


### SOC Analyst Narrative (required)
Write a short narrative (8–12 bullets):
1. **Timeline**: What happened first → next → last?
2. **Most severe alert** and why
3. **Evidence** (counts, windows, IPs, locations)
4. **Likely attack type** (e.g., credential stuffing, account takeover)
5. **Recommended response actions** (containment + investigation)
