In [None]:
import pandas as pd
import re
from collections import defaultdict
from tqdm import tqdm

# Input / output paths
RAW_LOG = 'log-attack.txt'  # raw log file, read line by line
OUT_CSV = 'ssh_logs_processed.csv'  # destination of the per-IP summary table

# Classification thresholds: an IP is labelled 'bruteforce' only when BOTH
# of its counts reach their threshold (the comparison is inclusive, >=).
FAIL_COUNT_THRESHOLD = 50  # minimum number of failure lines for the IP
OCC_COUNT_THRESHOLD = 100  # minimum number of log lines attributed to the IP

In [None]:
# Parse the raw log: bucket each line under the first IPv4-looking token it contains.
ip_pattern = re.compile(r'\b\d{1,3}(?:\.\d{1,3}){3}\b')
# Leading-timestamp pattern; compiled here but not used by the cells shown.
time_pattern = re.compile(r'^\s*([0-9T:\-\+]+)')

ip_events = defaultdict(list)  # ip -> stripped log lines in which it was the first match
all_lines = []                 # every raw line, untouched, in file order
with open(RAW_LOG, 'r', encoding='utf-8', errors='ignore') as log_file:
    for line in log_file:
        all_lines.append(line)
        ip_match = ip_pattern.search(line)
        if ip_match is None:
            # Lines without any address are kept in all_lines only.
            continue
        ip_events[ip_match.group()].append(line.strip())

In [None]:
# Compute occ_count and fail_count for every source IP.

# Substrings that mark a log line as a failed authentication attempt.
FAILURE_MARKERS = ('Failed password', 'authentication failure', 'Invalid user')
# Declared explicitly so the frame keeps its columns even when ip_events is empty;
# otherwise pd.DataFrame([]) would have no columns at all.
SUMMARY_COLUMNS = ['ip', 'occ_count', 'fail_count', 'status']

rows = []
for ip, events in ip_events.items():
    occ_count = len(events)  # number of log lines attributed to this IP
    # A line counts once even if it contains several markers.
    fail_count = sum(
        1 for msg in events
        if any(marker in msg for marker in FAILURE_MARKERS)
    )
    rows.append({
        'ip': ip,
        'occ_count': occ_count,
        'fail_count': fail_count,
        # NOTE(review): every row receives the constant 200; what it encodes
        # is not evident from this notebook - confirm before relying on it.
        'status': 200,
    })

df_processed = pd.DataFrame(rows, columns=SUMMARY_COLUMNS)

In [None]:
# Classification
def classify_row(row):
    """Label one per-IP summary row.

    Args:
        row: mapping-like object (e.g. a DataFrame row) exposing
            'fail_count' and 'occ_count'.

    Returns:
        'bruteforce' when fail_count >= FAIL_COUNT_THRESHOLD and
        occ_count >= OCC_COUNT_THRESHOLD, otherwise 'other'.
    """
    if row['fail_count'] >= FAIL_COUNT_THRESHOLD and row['occ_count'] >= OCC_COUNT_THRESHOLD:
        return 'bruteforce'
    return 'other'


# Column-wise equivalent of classify_row. It avoids a Python call per row and
# always yields a Series, including for a zero-row frame, where a row-wise
# apply would not. Keep this rule in sync with classify_row.
is_bruteforce = (
    (df_processed['fail_count'] >= FAIL_COUNT_THRESHOLD)
    & (df_processed['occ_count'] >= OCC_COUNT_THRESHOLD)
)
df_processed['classification'] = is_bruteforce.map({True: 'bruteforce', False: 'other'})

In [None]:
# Save: write the per-IP summary to disk without the positional index column.
df_processed.to_csv(path_or_buf=OUT_CSV, index=False)
print('Saved to CSV:', OUT_CSV)

Saved to CSV: ssh_logs_processed.csv


In [None]:
# Show the first 30 raw log lines exactly as read (each keeps its own newline).
print('raw preview')
raw_head = all_lines[:30]
print(''.join(raw_head))

raw preview
2025-10-18T08:21:51+07:00 TDC-250818413 sshd[748510]: pam_unix(sshd:auth): check pass; user unknown
2025-10-18T08:21:51+07:00 TDC-250818413 sshd[748510]: pam_unix(sshd:auth): authentication failure; logname= uid=0 euid=0 tty=ssh ruser= rhost=45.159.112.142
2025-10-18T08:21:54+07:00 TDC-250818413 sshd[748510]: Failed password for invalid user user from 45.159.112.142 port 33580 ssh2
2025-10-18T08:21:55+07:00 TDC-250818413 sshd[748510]: Received disconnect from 45.159.112.142 port 33580:11: Bye Bye [preauth]
2025-10-18T08:21:55+07:00 TDC-250818413 sshd[748510]: Disconnected from invalid user user 45.159.112.142 port 33580 [preauth]
2025-10-18T08:21:57+07:00 TDC-250818413 sshd[748513]: pam_unix(sshd:auth): authentication failure; logname= uid=0 euid=0 tty=ssh ruser= rhost=185.169.6.22  user=root
2025-10-18T08:21:57+07:00 TDC-250818413 sshd[748515]: Invalid user admin from 192.145.169.91 port 60232
2025-10-18T08:21:58+07:00 TDC-250818413 sshd[748515]: pam_unix(sshd:auth): check

In [None]:
# Show the first 30 rows of the per-IP summary as plain text.
print('dataset preview')
summary_head = df_processed.head(30)
print(summary_head)

dataset preview
                 ip  occ_count  fail_count  status classification
0    45.159.112.142       1351         806     200     bruteforce
1      185.169.6.22       2493        1381     200     bruteforce
2    192.145.169.91       3703        2139     200     bruteforce
3      45.140.17.97          3           0     200          other
4    49.248.175.215       2828        1695     200     bruteforce
5      87.106.90.97        163         108     200     bruteforce
6   190.129.122.185         90          52     200          other
7     171.254.92.61        276         207     200     bruteforce
8     185.102.75.74        111          83     200     bruteforce
9     82.156.52.230          1           0     200          other
10    114.96.103.55          4           2     200          other
11    78.128.112.74         20          15     200          other
12   195.178.110.15          2           0     200          other
13    197.243.67.27         15           0     200          