In [5]:
import pandas as pd
import re
from datetime import datetime
import numpy as np
from urllib.parse import urlparse

# Output schema: each per-source frame is padded with any of these columns it
# lacks and then reordered to exactly this list before concatenation.
# 'dest_domain' appears once only (an earlier duplicate entry was removed).
REQUIRED_COLUMNS = [
    'timestamp', 'user', 'action', 'description',
    'protocol', 'url', 'method',
    'source_ip', 'dest_ip', 'dest_domain', 'client_ip',
    'source_port', 'dest_port',
    'severity', 'host', 'rule_name', 'app', 'category',
    'bytes_sent', 'bytes_received',
    'threat_id', 'user_agent', 'eventid',
    'workstation', 'privilege', 'object', 'accessmask',
    'target', 'status', 'reason',
    'process', 'parentprocess', 'fileaccessed',
    'alert', 'machine', 'os',
    'details', 'resource',
    'hostname',  # consumed later to derive 'domain', then dropped
    'location', 'facility', 'code',
    'message', 'event', 'file', 'verdict',
    'log_type',  # source tag written by clean_csv_log
]

def clean_csv_log(file_path, log_type):
    """Load one CSV log export and tag every row with its source type.

    Rows that are entirely empty are removed, column names are trimmed,
    lower-cased and have spaces replaced by underscores, and a ``log_type``
    column holding ``log_type`` is set on every row.

    Args:
        file_path: Path of the CSV file to read.
        log_type: Label stored in the ``log_type`` column.

    Returns:
        The loaded frame, or an empty frame whose columns are
        REQUIRED_COLUMNS when reading/normalising fails (the error is
        printed rather than raised).
    """
    def _normalise(name):
        return name.strip().lower().replace(' ', '_')

    try:
        return (
            pd.read_csv(file_path)
            .dropna(how='all')
            .rename(columns=_normalise)
            .assign(log_type=log_type)
        )
    except Exception as e:
        # Best effort: report the problem and hand back an empty, schema-shaped frame.
        print(f"[!] Error reading {file_path}: {e}")
        return pd.DataFrame(columns=REQUIRED_COLUMNS)

def normalize_timestamp(ts, formats=(
        '%Y-%m-%d %H:%M:%S',
        '%Y-%m-%dT%H:%M:%S',
        '%Y-%m-%d %H:%M:%S.%f',
        '%Y-%m-%dT%H:%M:%S.%f',
)):
    """Convert a timestamp string to ISO-8601 text.

    The original implementation accepted only ``YYYY-MM-DD HH:MM:SS`` and hid
    every error behind ``except Exception``; already-ISO, fractional-second or
    whitespace-padded values were silently turned into None. This version
    tries each entry of ``formats`` in order and only treats a format
    mismatch (ValueError) as "try the next one".

    Args:
        ts: Raw timestamp value. Anything that is not a ``str`` (None, NaN,
            numbers) yields None, as before.
        formats: ``strptime`` patterns to try, first match wins. The first
            default entry is the format the original code accepted.

    Returns:
        ``datetime.isoformat()`` text for the first matching format, or None
        when no format matches.
    """
    if not isinstance(ts, str):
        return None

    text = ts.strip()
    for fmt in formats:
        try:
            return datetime.strptime(text, fmt).isoformat()
        except ValueError:
            # Not this layout; fall through to the next candidate.
            continue
    return None

def clean_ip_column(ip):
    """Return ``ip`` unchanged when it is a dotted-quad IPv4 address, else NaN.

    Compared with a plain shape check, each octet must also be in 0..255, and
    the whole string must match (a trailing newline no longer slips through
    the ``$`` anchor).

    Args:
        ip: Candidate value; non-strings are always rejected.

    Returns:
        The original string when valid, otherwise ``np.nan``.
    """
    if not isinstance(ip, str):
        return np.nan

    parts = re.fullmatch(r'([0-9]{1,3})\.([0-9]{1,3})\.([0-9]{1,3})\.([0-9]{1,3})', ip)
    if parts is None:
        return np.nan

    # Shape alone would let values such as 999.1.1.1 through.
    if any(int(octet) > 255 for octet in parts.groups()):
        return np.nan
    return ip

def clean_domain_column(domain):
    """Return the lower-cased domain when it matches the expected shape, else NaN.

    The accepted shape is one or more letters, digits, dots or hyphens,
    followed by a dot and an alphabetic suffix of at least two letters.
    Non-string inputs always yield ``np.nan``.
    """
    if not isinstance(domain, str):
        return np.nan

    looks_like_domain = re.match(r'^[a-zA-Z0-9.-]+\.[a-zA-Z]{2,}$', domain) is not None
    return domain.lower() if looks_like_domain else np.nan

def map_ad_columns(df):
    """Populate ``client_ip`` on an AD log frame from the first available IP column.

    The candidates are checked in the order ``client_ip``, ``client_address``,
    ``ip_address``, ``ip``; when none exists ``client_ip`` is set to None.
    The ``client_ip`` column is written onto the frame that was passed in.

    Returns:
        The result of ``df.rename`` with a mapping of ``user``,
        ``description`` and ``timestamp`` onto themselves, i.e. a frame with
        the same column names as ``df`` after ``client_ip`` was set.
    """
    candidates = ('client_ip', 'client_address', 'ip_address', 'ip')
    available = [name for name in candidates if name in df.columns]
    df['client_ip'] = df[available[0]] if available else None

    # Every mapped name currently maps to itself, so no label actually changes.
    passthrough = ('user', 'description', 'timestamp')
    renames = {name: name for name in passthrough if name in df.columns}
    return df.rename(columns=renames)

def _safe_hostname(url):
    """Return the host part of ``url``, or None when it is missing or unparseable."""
    if pd.isnull(url):
        return None
    try:
        return urlparse(str(url)).hostname
    except ValueError:
        # urlparse rejects some malformed netlocs (e.g. an unbalanced '[');
        # one bad log line must not abort the whole run.
        return None


def map_proxy_columns(df):
    """Derive ``client_ip`` and ``hostname`` columns for a proxy log frame.

    ``client_ip`` is the first dotted-quad found in the first existing column
    among ``client``, ``client_ip``, ``source_ip``, ``ip`` (None for every row
    when none of them exists). When a ``url`` column exists, ``hostname`` is
    set to the parsed host of each URL; missing or malformed URLs give None
    instead of raising.

    The frame is modified in place and also returned.
    """
    ip_fields = ['client', 'client_ip', 'source_ip', 'ip']
    source_field = next((field for field in ip_fields if field in df.columns), None)

    if source_field is None:
        df['client_ip'] = None
    else:
        df['client_ip'] = (
            df[source_field].astype(str).str.extract(r'(\d{1,3}(?:\.\d{1,3}){3})')[0]
        )

    if 'url' in df.columns:
        df['hostname'] = df['url'].apply(_safe_hostname)

    return df

def map_firewall_columns(df):
    """Split a firewall ``dest_ip`` column into ``dest_domain`` and a cleaned ``dest_ip``.

    String values that do not look like a dotted quad are copied into
    ``dest_domain`` (everything else becomes NaN there); ``dest_ip`` itself is
    then passed through ``clean_ip_column``. Frames without a ``dest_ip``
    column are returned untouched. The frame is modified in place.
    """
    if 'dest_ip' not in df.columns:
        return df

    def _domain_or_nan(value):
        if not isinstance(value, str):
            return np.nan
        if re.match(r'^(\d{1,3}\.){3}\d{1,3}$', value):
            return np.nan
        return value

    raw_destinations = df['dest_ip']
    df['dest_domain'] = raw_destinations.apply(_domain_or_nan)
    df['dest_ip'] = raw_destinations.apply(clean_ip_column)
    return df

# Define your input files
log_files = {
    "firewall": "./logs/firewall_logs.csv",
    "proxy": "./logs/proxy_logs.csv",
    "ad": "./logs/ad_logs.csv",
    "edr": "./logs/edr_logs.csv",
    "iam": "./logs/iam_logs.csv",
    "mail": "./logs/mail_logs.csv",
    "router": "./logs/router_logs.csv",
    "user": "./logs/user.csv",
    "siem": "./logs/siem_logs.csv"
}

# Load each source, apply its source-specific mapping, and bring it to the
# shared REQUIRED_COLUMNS schema.
dataframes = []
for log_type, file_path in log_files.items():
    df = clean_csv_log(file_path, log_type)

    # Unreadable or header-only files contribute nothing.
    if df.empty:
        continue

    if log_type == 'ad':
        df = map_ad_columns(df)
    elif log_type == 'proxy':
        df = map_proxy_columns(df)
    elif log_type == 'firewall':
        df = map_firewall_columns(df)
    # Other sources need no mapping. Their own client_ip / dest_domain values
    # (if any) are kept: overwriting them with None discarded real data, and
    # missing columns are added just below while values are validated later.

    # Ensure all required columns exist
    for col in REQUIRED_COLUMNS:
        if col not in df.columns:
            df[col] = None

    df = df[REQUIRED_COLUMNS]  # Reorder and limit columns
    dataframes.append(df)

# pd.concat([]) fails with an unhelpful message; say what actually went wrong.
if not dataframes:
    raise ValueError("No log file could be loaded; nothing to normalize")

# Concatenate all processed logs
final_dataset = pd.concat(dataframes, ignore_index=True)

# Normalize timestamps
final_dataset['timestamp'] = final_dataset['timestamp'].apply(normalize_timestamp)

# Clean IP fields
for ip_col in ['source_ip', 'dest_ip', 'client_ip']:
    final_dataset[ip_col] = final_dataset[ip_col].apply(clean_ip_column)

# Clean domain fields ('domain' is derived from the hostname column)
final_dataset['domain'] = final_dataset['hostname'].apply(clean_domain_column)
final_dataset['dest_domain'] = final_dataset['dest_domain'].apply(clean_domain_column)

# Drop temporary hostname column
final_dataset = final_dataset.drop(columns=['hostname'])

# Save final output
final_dataset.to_csv("normalized_logs.csv", index=False)
print("[✓] All logs cleaned, normalized, and saved to normalized_logs.csv")

[✓] All logs cleaned, normalized, and saved to normalized_logs.csv


  final_dataset = pd.concat(dataframes, ignore_index=True)


In [2]:
import pandas as pd

# Load the CSV file
df = pd.read_csv('normalized_logs.csv')

# Parse 'timestamp' to datetime; entries that cannot be parsed become NaT
df['timestamp'] = pd.to_datetime(df['timestamp'], errors='coerce')

# Drop rows whose 'timestamp' could not be parsed (NaT values)
df = df.dropna(subset=['timestamp'])

# Sort chronologically. mergesort is a stable algorithm, so rows that share a
# timestamp keep the order they had in the input file instead of being
# shuffled by the default (non-stable) sort.
df_sorted = df.sort_values(by='timestamp', kind='mergesort')

# Save the sorted DataFrame to a new CSV file
output_file_name = 'logs_sorted_by_time.csv'
df_sorted.to_csv(output_file_name, index=False)

print(f"\nLogs sorted by time saved to {output_file_name}")


Logs sorted by time saved to logs_sorted_by_time.csv


In [3]:
import pandas as pd
import numpy as np

# Load dataset
df = pd.read_csv("logs_sorted_by_time.csv")

# Normalize column names
df.columns = df.columns.str.strip().str.lower().str.replace(' ', '_')

# Drop empty rows and columns
df.dropna(how='all', inplace=True)
df.dropna(axis=1, how='all', inplace=True)

# Convert timestamp before the string clean-up below, so the value is parsed
# as written and parsing never depends on how a lower-cased ISO 't' separator
# is treated.
if 'timestamp' in df.columns:
    df['timestamp'] = pd.to_datetime(df['timestamp'], errors='coerce')
    # .copy(): later column assignments must target an independent frame,
    # not a filtered view of the original.
    df = df[df['timestamp'].notna()].copy()

# Clean strings (skip these fields)
# Missing cells stay missing: a blanket astype(str) turns NaN into the literal
# text 'nan', which the fillna() defaults further down would never replace.
exclude_clean = ['user_agent', 'url', 'fileaccessed', 'message']
for col in df.select_dtypes(include='object').columns:
    if col not in exclude_clean:
        cleaned = df[col].astype(str).str.strip().str.lower()
        df[col] = cleaned.where(df[col].notna())

# Standardize event IDs (-1 marks a missing identifier)
for field in ['eventid', 'threat_id']:
    if field in df.columns:
        df[field] = df[field].replace(['', ' ', 'nan', 'none', 'None', None], pd.NA).fillna(-1)
        try:
            df[field] = df[field].astype(int)
        except (ValueError, TypeError):
            # Non-numeric identifiers are kept as they are.
            pass

# IP cleanup: keep the first dotted quad found, '0.0.0.0' when there is none
ip_fields = ['source_ip', 'dest_ip', 'client_ip']
for col in ip_fields:
    if col in df.columns:
        df[col] = df[col].astype(str).str.extract(r'(\d{1,3}(?:\.\d{1,3}){3})')[0].fillna('0.0.0.0')

# Port cleanup: non-numeric, missing and 0 all end up as -1
for port in ['source_port', 'dest_port']:
    if port in df.columns:
        df[port] = pd.to_numeric(df[port], errors='coerce').fillna(0).astype(int)
        df[port] = df[port].replace(0, -1)

# Bytes cleanup: non-numeric or missing counts become 0
for col in ['bytes_sent', 'bytes_received']:
    if col in df.columns:
        df[col] = pd.to_numeric(df[col], errors='coerce').fillna(0).astype(int)

# Normalize categorical values
cat_fields = ['action', 'severity', 'verdict', 'log_type', 'category', 'protocol']
for col in cat_fields:
    if col in df.columns:
        df[col] = df[col].replace({'none': np.nan, 'na': np.nan, 'null': np.nan}).fillna('unknown')

# Fill general missing values (keys that are not columns are ignored)
df = df.fillna({
    'user': 'unknown',
    'app': 'unknown',
    'rule_name': 'unknown',
    'host': 'unknown',
    'workstation': 'unknown',
    'privilege': 'unknown',
    'event': 'unknown',
    'status': 'unknown',
    'alert': 'none',
    'machine': 'unknown',
    'os': 'unknown',
    'domain': 'unknown',
    'location': 'unknown',
    'reason': 'unspecified',
    'accessmask': 'unknown',
})

# Fill text fields
text_fill = {
    'url': 'unknown_url',
    'user_agent': 'unknown_agent',
    'fileaccessed': 'none',
    'message': 'no_message',
}
for col, default in text_fill.items():
    if col in df.columns:
        df[col] = df[col].fillna(default).replace('nan', default).replace('', default)

# Rename columns to consistent format
# Short source names on the left, descriptive output names on the right.
# Keys that are not columns of df are simply ignored by rename().
column_rename_map = {
    # time, identifiers, provenance
    'timestamp': 'event_time', 'eventid': 'event_id', 'threat_id': 'threat_identifier',
    'user': 'user_account', 'severity': 'event_severity', 'log_type': 'log_source_type',
    # addresses, ports, protocol
    'source_ip': 'source_ip_address', 'dest_ip': 'destination_ip_address',
    'client_ip': 'client_ip_address', 'dest_domain': 'destination_domain',
    'source_port': 'source_port_number', 'dest_port': 'destination_port_number',
    'protocol': 'network_protocol',
    # decision and classification
    'action': 'security_action', 'verdict': 'security_verdict',
    'category': 'event_category', 'app': 'application_name',
    'rule_name': 'detection_rule',
    # volume
    'bytes_sent': 'bytes_sent_total', 'bytes_received': 'bytes_received_total',
    # request details
    'url': 'resource_url', 'method': 'http_method', 'user_agent': 'user_agent_string',
    'description': 'event_description', 'fileaccessed': 'file_accessed_path',
    # host, account and object context
    'host': 'host_name', 'workstation': 'workstation_name',
    'privilege': 'user_privilege_level', 'object': 'affected_object',
    'accessmask': 'access_mask', 'target': 'target_resource',
    'status': 'event_status', 'reason': 'status_reason',
    'process': 'process_name', 'parentprocess': 'parent_process_name',
    'alert': 'alert_type', 'machine': 'machine_name', 'os': 'operating_system',
    # remaining source-specific fields
    'event': 'raw_event_data', 'file': 'file_path', 'facility': 'log_facility',
    'code': 'status_code', 'message': 'log_message', 'details': 'event_details',
    'resource': 'resource_name', 'location': 'event_location',
    'domain': 'network_domain',
}
df = df.rename(columns=column_rename_map)

# Reorder columns logically
# Preferred layout, one line per theme; columns not listed here are appended
# afterwards in their current order.
ordered_columns = [
    'event_time', 'log_source_type', 'event_id', 'threat_identifier',            # event metadata
    'user_account', 'user_privilege_level', 'host_name', 'workstation_name',     # user/device context
    'machine_name', 'operating_system',
    'source_ip_address', 'source_port_number',                                   # network context
    'destination_ip_address', 'destination_port_number',
    'client_ip_address', 'destination_domain', 'network_domain', 'network_protocol',
    'security_action', 'event_severity', 'event_status', 'status_reason',        # security action
    'event_category', 'detection_rule', 'security_verdict', 'alert_type',
    'resource_url', 'http_method', 'user_agent_string', 'resource_name',         # resource access
    'affected_object', 'target_resource', 'file_accessed_path', 'file_path',
    'process_name', 'parent_process_name', 'access_mask',                        # process/activity
    'event_description', 'event_details', 'log_message', 'raw_event_data',       # logging & raw data
    'log_facility', 'status_code', 'event_location',
    'bytes_sent_total', 'bytes_received_total',                                  # data volume
    'application_name',                                                          # extras
]

# Keep only existing columns
present_columns = set(df.columns)
existing_ordered_columns = [name for name in ordered_columns if name in present_columns]
already_placed = set(existing_ordered_columns)
remaining_columns = [name for name in df.columns if name not in already_placed]

# Apply the layout, then remove duplicates
df = df[existing_ordered_columns + remaining_columns].drop_duplicates()

# Save to file
df.to_csv("dataset.csv", index=False)
print("✅ Logs cleaned, renamed, and saved as 'dataset.csv'")


✅ Logs cleaned, renamed, and saved as 'dataset.csv'


In [1]:
import pandas as pd
import random
from datetime import datetime, timedelta

# Read the cleaned baseline dataset; the synthetic attack rows are appended to it below
original_path = "./dataset.csv"
df = pd.read_csv(filepath_or_buffer=original_path)

# Helper: draw random timestamps from a closed time window
def random_time(start_time, end_time, count):
    """Return ``count`` random datetimes between ``start_time`` and ``end_time``.

    Both bounds are ``"%Y-%m-%d %H:%M:%S"`` strings and are inclusive. Each
    result is the start plus a whole number of seconds drawn with
    ``random.randint``; the list is not sorted.
    """
    layout = "%Y-%m-%d %H:%M:%S"
    window_start = datetime.strptime(start_time, layout)
    window_end = datetime.strptime(end_time, layout)
    span_seconds = int((window_end - window_start).total_seconds())

    samples = []
    for _ in range(count):
        offset = random.randint(0, span_seconds)
        samples.append(window_start + timedelta(seconds=offset))
    return samples

# Predefined attacks to simulate
def generate_attack_logs(num_each=10):
    """Build a frame of synthetic attack events.

    Three scenarios (phishing, brute force, lateral movement) each contribute
    ``num_each`` rows. Rows are emitted interleaved (phishing, brute force,
    lateral movement, phishing, ...). Event times come from ``random_time``
    over 2025-05-26 09:00-11:00; the user and byte counts are drawn with the
    ``random`` module.

    Returns:
        A DataFrame with ``3 * num_each`` rows.
    """
    # (source type, threat label, users, description, app, message, sent range, received range)
    scenarios = [
        ("firewall", "phishing",
         ['noura.benali', 'ismail.rahimi', 'yassir.ouattara'],
         "User clicked on suspicious email link", "http",
         "Access to known phishing domain detected",
         (200, 1000), (1000, 4000)),
        ("edr", "brute_force",
         ['saif.alqahtani', 'hajar.ait', 'amin.elk'],
         "Multiple failed login attempts", "ssh",
         "Account locked after repeated login failures",
         (100, 300), (50, 150)),
        ("ad", "lateral_movement",
         ['karim.dadi', 'mohamed.hassan', 'nadia.bouk'],
         "Suspicious internal remote login", "rdp",
         "Unusual RDP access across subnets",
         (300, 1200), (400, 1500)),
    ]

    # One pool of timestamps; scenario k uses the k-th slice of num_each entries.
    times = random_time("2025-05-26 09:00:00", "2025-05-26 11:00:00", num_each * 3)

    logs = []
    for i in range(num_each):
        for slot, (source, threat, users, description, app, message, sent, received) in enumerate(scenarios):
            logs.append({
                "event_time": times[i + slot * num_each],
                "log_source_type": source,
                "event_id": -1,
                "threat_identifier": threat,
                "user_account": random.choice(users),
                "event_description": description,
                "application_name": app,
                "log_message": message,
                "bytes_sent_total": random.randint(sent[0], sent[1]),
                "bytes_received_total": random.randint(received[0], received[1]),
            })

    return pd.DataFrame(logs)

# Fix the RNG state so the synthetic attack rows (times, users, byte counts)
# are identical on every Restart & Run All.
random.seed(42)

# Generate and merge with original data
attack_logs_df = generate_attack_logs(num_each=20)
full_dataset = pd.concat([df, attack_logs_df], ignore_index=True)

# Fill NaNs with a default string to avoid correlation issues
# NOTE: "N/A" is one of the markers pd.read_csv treats as missing by default,
# so a later read_csv of this file turns these cells back into NaN unless it
# is called with keep_default_na=False.
full_dataset = full_dataset.fillna("N/A")

# Save the result
attack_path = "./dataset_with_attacks.csv"
full_dataset.to_csv(attack_path, index=False)

attack_path


  full_dataset.fillna("N/A", inplace=True)


'./dataset_with_attacks.csv'

In [2]:
import pandas as pd

# Load dataset, parse event times (unparseable -> NaT) and add the 5-minute
# window each event falls into
df = pd.read_csv("dataset_with_attacks.csv").assign(
    event_time=lambda frame: pd.to_datetime(frame['event_time'], errors='coerce'),
    time_bucket=lambda frame: frame['event_time'].dt.floor('5min'),
)


def _non_null_events(descriptions):
    """Return the non-missing descriptions of one group as a plain list."""
    return list(descriptions.dropna())


# Group by user and time window
grouped = df.groupby(['user_account', 'time_bucket'])

# List of all events for each group
summary = (
    grouped['event_description']
    .apply(_non_null_events)
    .reset_index()
    .rename(columns={'event_description': 'events_in_5min'})
)

# Correlation Rule Logic
def classify_threat(events):
    """Label a group of event descriptions with the first matching threat type.

    The descriptions are joined with spaces and lower-cased, then checked
    against keyword rules in priority order: phishing, brute_force,
    lateral_movement. ``"normal"`` is returned when no keyword is found.
    """
    haystack = ' '.join(events).lower()

    # Order matters: the first rule with any keyword present wins.
    rules = (
        ("phishing", ("phishing", "suspicious email")),
        ("brute_force", ("failed login", "account locked")),
        ("lateral_movement", ("rdp access", "remote login")),
    )
    for label, keywords in rules:
        if any(keyword in haystack for keyword in keywords):
            return label
    return "normal"

# Label every (user, 5-minute window) group with the correlation rule
summary['correlated_threat'] = summary['events_in_5min'].map(classify_threat)

# Write the correlation summary to disk
correlation_output = "user_activity_correlation_5min.csv"
summary.to_csv(correlation_output, index=False)
print("Saved to user_activity_correlation_5min.csv")


Saved to user_activity_correlation_5min.csv
