In [1]:
# ============================================================
# 🔧 Synthetic Dataset Generator: login_activity.csv
# Creates realistic login logs for analysis in Pandas
# ============================================================

import pandas as pd
import random
from datetime import datetime, timedelta
import numpy as np

# -------------------------------
# Configuration
# -------------------------------
num_records = 200  # Number of base rows; change to 500+ for a bigger dataset
output_file = "login_activity.csv"  # CSV path, relative to the working directory

# Seed both RNG streams used by this cell so the random draws are repeatable
# on every Restart & Run All. Timestamps are still offset from datetime.now(),
# so their values (and the hour-dependent status probabilities) vary with
# when the cell is run.
SEED = 42
random.seed(SEED)     # random.choice / random.randint / random.sample
np.random.seed(SEED)  # np.random.choice (and pandas sampling when no random_state is given)

usernames = ["alice", "bob", "charlie", "david", "eve", "frank", "grace", "heidi", "ivan", "judy"]
locations = ["India", "USA", "UK", "Germany", "Australia", "Singapore", "Brazil"]
# Order matters: random_status() pairs these positionally with its p=[...] weights.
statuses = ["Success", "Failed"]

# Function to generate random IPs
def random_ip():
    """Return a dotted-quad string whose four octets are each drawn from 1..255."""
    octets = []
    for _ in range(4):
        # randint is inclusive on both ends, so 0 is never produced.
        octets.append(str(random.randint(1, 255)))
    return ".".join(octets)

# Generate random timestamps within the last 7 days
def random_timestamp(end=None, days=7):
    """Return a random datetime within the ``days`` days leading up to ``end``.

    The result is ``end - days`` plus a whole number of minutes drawn
    uniformly from ``[0, days * 24 * 60]`` (both ends inclusive), so it keeps
    the seconds and microseconds of ``end``.

    Args:
        end: Upper bound of the window. Defaults to ``datetime.now()`` at call
            time, which gives the original "last 7 days" behaviour.
        days: Window length in whole days. Defaults to 7.

    Returns:
        A ``datetime`` in ``[end - days, end]``.

    Raises:
        ValueError: If ``days`` is negative (``random.randint`` receives an
            empty range).
    """
    if end is None:
        end = datetime.now()
    start_date = end - timedelta(days=days)
    random_minutes = random.randint(0, days * 24 * 60)
    return start_date + timedelta(minutes=random_minutes)

# Introduce a much higher chance of failure during odd hours (0–5 AM, i.e. 00:00–05:59)
def random_status(hour):
    """Draw one entry of ``statuses`` with hour-dependent weights.

    For hours 0 through 5 (inclusive) the weights are [0.3, 0.7]; for every
    other hour they are [0.8, 0.2]. The weights pair positionally with
    ``statuses``, so with ["Success", "Failed"] the second weight is the
    failure probability.
    """
    off_hours = 0 <= hour <= 5
    weights = [0.3, 0.7] if off_hours else [0.8, 0.2]  # 70% vs 20% Failed
    return np.random.choice(statuses, p=weights)

# -------------------------------
# Generate the dataset
# -------------------------------
# Build one dict per login event. Keyword arguments are evaluated left to
# right, so the random draws happen in the same order as the columns appear.
records = []
for _ in range(num_records):
    timestamp = random_timestamp()
    hour = timestamp.hour  # drives the success/failure weighting below
    record = dict(
        username=random.choice(usernames),
        timestamp=timestamp.strftime("%Y-%m-%d %H:%M:%S"),
        source_ip=random_ip(),
        location=random.choice(locations),
        status=random_status(hour),
    )
    records.append(record)

# Create DataFrame
# One row per generated record; the frame gets a default 0..n-1 index, so the
# integer positions sampled below are also valid .loc labels.
df = pd.DataFrame(records)

# Optional: insert some duplicate & missing values for cleaning practice
if num_records > 50:
    # Blank out 3 random usernames and 3 random locations (rows are sampled
    # independently, so the two sets may overlap).
    df.loc[random.sample(range(len(df)), 3), "username"] = np.nan
    df.loc[random.sample(range(len(df)), 3), "location"] = np.nan
    # Add 2 duplicate rows. ignore_index=True renumbers the result 0..n-1;
    # without it the appended copies keep their source labels and the index
    # contains duplicates, which makes later label-based lookups ambiguous.
    df = pd.concat([df, df.sample(2)], ignore_index=True)

# Save to CSV (index is not written, so the file has only the data columns)
df.to_csv(output_file, index=False)
print(f"✅ Dataset '{output_file}' created successfully with {len(df)} records!")

# Display first few rows
df.head()

✅ Dataset 'login_activity.csv' created successfully with 202 records!


Unnamed: 0,username,timestamp,source_ip,location,status
0,david,2026-01-04 17:30:55,136.103.14.3,Singapore,Success
1,judy,2026-01-07 12:34:55,227.133.121.123,Brazil,Success
2,frank,2026-01-02 20:14:55,97.116.137.169,Brazil,Success
3,bob,2026-01-02 15:19:55,220.134.69.41,India,Success
4,grace,2026-01-01 23:40:55,45.200.96.161,Singapore,Success
