In [1]:
import pandas as pd
import ipaddress

In [5]:
df = pd.read_csv('sample_logs_no_status.csv')
df['timestamp'] = pd.to_datetime(df['timestamp'])
df

Unnamed: 0,timestamp,user_id,action,ip_address
0,2025-09-01 08:01:00,user1,download_file,192.168.1.27
1,2025-09-01 08:02:00,user18,login_success,192.168.1.13
2,2025-09-01 08:03:00,user7,login_success,192.168.1.81
3,2025-09-01 08:05:00,user18,view_report,192.168.1.45
4,2025-09-01 08:06:00,user6,view_report,192.168.1.29
...,...,...,...,...
411,2025-09-04 10:03:45,user14,login_failed,203.0.113.200
412,2025-09-04 10:04:30,user14,login_failed,203.0.113.200
413,2025-09-04 10:05:15,user14,login_failed,203.0.113.200
414,2025-09-06 09:00:00,user16,login_success,185.34.100.25


### Brute force detection

In [6]:
failed_logins = df[df['action'] == 'login_failed'].copy()
failed_logins = failed_logins.sort_values(['ip_address', 'timestamp'])

failed_logins['fail_count'] = (
    failed_logins.groupby('ip_address')
    .rolling('10min', on='timestamp')['action']
    .count()
    .values
)
brute_force_anomalies = failed_logins[failed_logins['fail_count'] >= 3]

brute_force_anomalies

Unnamed: 0,timestamp,user_id,action,ip_address,fail_count
408,2025-09-04 10:01:30,user14,login_failed,203.0.113.200,3.0
409,2025-09-04 10:02:15,user14,login_failed,203.0.113.200,4.0
410,2025-09-04 10:03:00,user14,login_failed,203.0.113.200,5.0
411,2025-09-04 10:03:45,user14,login_failed,203.0.113.200,6.0
412,2025-09-04 10:04:30,user14,login_failed,203.0.113.200,7.0
413,2025-09-04 10:05:15,user14,login_failed,203.0.113.200,8.0
403,2025-09-03 14:05:18,user2,login_failed,5.5.5.5,3.0
405,2025-09-03 14:07:25,user19,login_failed,5.5.5.5,4.0


### External ips

In [7]:
def is_external(ip):
    try:
        return not ipaddress.ip_address(ip).is_private
    except:
        return False

df['is_external'] = df['ip_address'].apply(is_external)
external_access = df[df['is_external'] == True]
external_access

Unnamed: 0,timestamp,user_id,action,ip_address,is_external
9,2025-09-01 08:22:00,user2,view_report,92.123.45.13,True
10,2025-09-01 08:27:00,user9,view_report,151.101.1.69,True
14,2025-09-01 08:43:00,user11,download_file,93.184.216.34,True
19,2025-09-01 09:06:00,user12,login_success,8.8.8.8,True
22,2025-09-01 09:19:00,user9,change_settings,104.244.42.1,True
...,...,...,...,...,...
403,2025-09-03 14:05:18,user2,login_failed,5.5.5.5,True
404,2025-09-03 14:05:52,user3,login_success,5.5.5.5,True
405,2025-09-03 14:07:25,user19,login_failed,5.5.5.5,True
414,2025-09-06 09:00:00,user16,login_success,185.34.100.25,True


### Geo-hops

In [9]:
#take only the first part of the ip address, ip can change because Wi-fi disconnection
def get_network_prefix(ip):
    return ".".join(str(ip).split('.')[:2])

#sort by user first and then by time
df = df.sort_values(['user_id', 'timestamp'])

#gives the row the data from the row above (where and when the user connected last)
df['ip_prefix'] = df['ip_address'].apply(get_network_prefix)
df['prev_ip'] = df.groupby('user_id')['ip_address'].shift(1)
df['prev_prefix'] = df.groupby('user_id')['ip_prefix'].shift(1)
df['prev_timestamp'] = df.groupby('user_id')['timestamp'].shift(1)

#calculates the time_difference between the connections
df['time_diff'] = (df['timestamp'] - df['prev_timestamp']).dt.total_seconds() / 60

#if the ip changed, changed place and the time difference is small then it suspicious
geo_hops = df[
    (df['ip_prefix'] != df['prev_prefix']) &
    (df['time_diff'] < 15) &
    (df['prev_prefix'].notna())
].copy()
print(f"{geo_hops.shape[0]} rows")
geo_hops[['timestamp', 'user_id', 'prev_ip', 'ip_address', 'time_diff']].head()

36 rows


Unnamed: 0,timestamp,user_id,prev_ip,ip_address,time_diff
209,2025-09-01 20:40:00,user12,203.0.113.75,192.168.1.43,12.0
342,2025-09-02 05:04:00,user12,192.168.1.35,93.184.216.34,9.0
394,2025-09-02 08:30:00,user12,151.101.1.69,10.0.0.7,1.0
79,2025-09-01 12:34:00,user13,93.184.216.34,192.168.1.15,9.0
81,2025-09-01 12:42:00,user14,104.244.42.1,192.168.1.52,9.0


### Create report

In [11]:
import json

def create_anomaly_entry(row, anomaly_type, rationale):
    return {
        "timestamp": str(row['timestamp']),
        "user_id": row['user_id'],
        "ip_address": row['ip_address'],
        "type": anomaly_type,
        "rationale": rationale
    }

final_report = []

for _, row in brute_force_anomalies.iterrows():
    msg = f"Detected {int(row['fail_count'])} failed logins from this IP within 10 minutes."
    final_report.append(create_anomaly_entry(row, "Brute Force Attempt", msg))

for _, row in geo_hops.iterrows():
    msg = f"User logged in from {row['ip_address']} shortly after {row['prev_ip']} (Time diff: {row['time_diff']:.1f} mins)."
    final_report.append(create_anomaly_entry(row, "Potential Geo-hop", msg))

final_report = sorted(final_report, key=lambda x: x['timestamp'])

with open('security_anomalies_report.json', 'w') as f:
    json.dump(final_report, f, indent=4)

print(f"Report generated successfully with {len(final_report)} entries.")

Report generated successfully with 44 entries.
