# Module 7 – Log Parsing Lab
## Applying Python to Cybersecurity


In [67]:
import pandas as pd
from datetime import datetime
from pandas.io.formats.style import Styler

## Task 1 – Load the Dataset

In [68]:
# Load dataset
df = pd.read_csv('auth_logs.csv')
df['timestamp'] = pd.to_datetime(df['timestamp'])

# Display first 5 rows
print("First 5 rows:")
display(df.head().style.hide(axis='index'))

# Check data types
print("\nData types:")
print(df.dtypes)

# Basic shape info
print(f"\nShape: {df.shape[0]} rows, {df.shape[1]} columns")
print(f"Event types present: {df['event_type'].unique().tolist()}")

First 5 rows:


timestamp,user,ip_address,event_type
2025-02-14 13:11:00,charlie,172.16.0.8,LOGIN_FAILED
2025-02-14 00:17:00,alice,203.0.113.5,LOGIN_FAILED
2025-02-14 19:48:00,frank,10.0.0.5,LOGIN_SUCCESS
2025-02-14 12:25:00,alice,192.168.1.20,LOGIN_SUCCESS
2025-02-14 20:03:00,david,192.168.1.20,LOGIN_SUCCESS



Data types:
timestamp     datetime64[us]
user                     str
ip_address               str
event_type               str
dtype: object

Shape: 500 rows, 4 columns
Event types present: ['LOGIN_FAILED', 'LOGIN_SUCCESS', 'FILE_ACCESS', 'PASSWORD_CHANGE']


## Task 2 – Feature Extraction

In [69]:
# --- Failed logins per user ---
failed_logins = df[df['event_type'] == 'LOGIN_FAILED'].groupby('user').size().reset_index(name='failed_logins')
print("Failed logins per user:")
print(failed_logins.to_string(index=False))

# --- Login attempts per IP ---
attempts_per_ip = df[df['event_type'].str.startswith('LOGIN')].groupby('ip_address').size().reset_index(name='login_attempts')
attempts_per_ip = attempts_per_ip.sort_values('login_attempts', ascending=False)
print("\nLogin attempts per IP:")
print(attempts_per_ip.to_string(index=False))

# --- Login attempts per hour ---
df['hour'] = df['timestamp'].dt.hour
attempts_per_hour = df[df['event_type'].str.startswith('LOGIN')].groupby('hour').size().reset_index(name='login_attempts')
print("\nLogin attempts per hour:")
print(attempts_per_hour.to_string(index=False))

Failed logins per user:
   user  failed_logins
  alice             50
    bob             18
charlie             17
  david             18
    eva             20
  frank             19

Login attempts per IP:
  ip_address  login_attempts
192.168.1.20              93
  172.16.0.8              92
192.168.1.10              89
    10.0.0.5              87
 203.0.113.5              85

Login attempts per hour:
 hour  login_attempts
    0              17
    1              20
    2              16
    3              21
    4              17
    5              20
    6              18
    7              25
    8              22
    9              13
   10              15
   11              18
   12              18
   13              16
   14              15
   15              20
   16              23
   17              20
   18              19
   19              23
   20              18
   21              15
   22              14
   23              23


## Task 3 – Rule-Based Detection

In [70]:
# --- Rule 1: More than 5 failed login attempts per user ---
fail_counts = df[df['event_type'] == 'LOGIN_FAILED'].groupby('user').size().reset_index(name='fail_count')
brute_force_suspects = fail_counts[fail_counts['fail_count'] > 5]

print("Users with more than 5 failed login attempts:")
print(brute_force_suspects.to_string(index=False))

# --- Rule 2: Login activity between 2AM–4AM ---
suspicious_hours = df[(df['timestamp'].dt.hour >= 2) & (df['timestamp'].dt.hour < 4)]

print(f"\nLogin activity between 2AM–4AM ({len(suspicious_hours)} events):")
print(suspicious_hours[['timestamp', 'user', 'ip_address', 'event_type']].to_string(index=False))

Users with more than 5 failed login attempts:
   user  fail_count
  alice          50
    bob          18
charlie          17
  david          18
    eva          20
  frank          19

Login activity between 2AM–4AM (41 events):
          timestamp    user   ip_address      event_type
2025-02-14 03:32:00   frank     10.0.0.5   LOGIN_SUCCESS
2025-02-14 02:42:00 charlie 192.168.1.10   LOGIN_SUCCESS
2025-02-14 02:20:00     bob     10.0.0.5   LOGIN_SUCCESS
2025-02-14 02:54:00     eva 192.168.1.10    LOGIN_FAILED
2025-02-14 02:58:00 charlie  203.0.113.5    LOGIN_FAILED
2025-02-14 02:32:00   frank 192.168.1.10    LOGIN_FAILED
2025-02-14 03:32:00     eva   172.16.0.8    LOGIN_FAILED
2025-02-14 03:17:00   frank     10.0.0.5   LOGIN_SUCCESS
2025-02-14 02:16:00   alice 192.168.1.20    LOGIN_FAILED
2025-02-14 02:04:00   frank  203.0.113.5    LOGIN_FAILED
2025-02-14 03:26:00   alice   172.16.0.8    LOGIN_FAILED
2025-02-14 02:29:00     bob 192.168.1.20 PASSWORD_CHANGE
2025-02-14 02:22:00     eva 

## Task 4 – Analytical Questions
Answer:
- Which user had the most failed logins?
- Which IP generated the most login attempts?
- Are there signs of brute-force behavior?

In [71]:
# --- Which user had the most failed logins? ---
top_fail_user = failed_logins.sort_values('failed_logins', ascending=False).iloc[0]
display(f"User with most failed logins: {top_fail_user['user']} ({int(top_fail_user['failed_logins'])} failures)")

# --- Which IP generated the most login attempts? ---
top_ip = attempts_per_ip.iloc[0]
display(f"IP with most login attempts: {top_ip['ip_address']} ({int(top_ip['login_attempts'])} attempts)")

# --- Are there signs of brute-force behavior? ---
display("Brute-force suspects (>5 failed logins):")
if len(brute_force_suspects) > 0:
    display(brute_force_suspects.reset_index(drop=True).style.hide(axis='index'))
    display("Conclusion: Yes — the above user(s) show signs of brute-force behavior.")
else:
    display("None found. No clear brute-force behavior detected.")

'User with most failed logins: alice (50 failures)'

'IP with most login attempts: 192.168.1.20 (93 attempts)'

'Brute-force suspects (>5 failed logins):'

user,fail_count
alice,50
bob,18
charlie,17
david,18
eva,20
frank,19


'Conclusion: Yes — the above user(s) show signs of brute-force behavior.'

## Final Reflection
Write 1–2 paragraphs summarizing suspicious patterns observed.

The log data from 2025-02-14 shows clear signs of a coordinated brute-force attack across multiple users and IP addresses. Alice stood out individually with 50 failed login attempts, well above any reasonable threshold, but the broader picture is more concerning — several IP addresses each logged over 80 attempts, suggesting an automated tool systematically cycling through credentials rather than a single targeted effort.

This pattern is typical of credential stuffing or a botnet-driven attack, where rotating IPs are used to sidestep basic rate-limiting, and the sheer volume suggests the attacker wasn't particularly concerned with staying under the radar. The 2AM–4AM activity spike adds further weight to this conclusion, as that level of authentication traffic during off-hours is rarely legitimate and is a well-known fingerprint of automated attack tools running unattended. Altogether, the evidence points to the environment being under active and deliberate attack on this date.