In [5]:
import pandas as pd
import numpy as np

# Set seed for reproducibility
np.random.seed(42)

# Define possible categories
education_levels = [
    "None", "Primary", "Lower Secondary", "Upper Secondary",
    "Diploma/Technical", "Undergraduate", "Postgraduate"
]
education_probs = [0.05, 0.2, 0.25, 0.25, 0.1, 0.1, 0.05]

income_brackets = ["<5000", "5000-10000", "10000-20000", ">20000"]
marital_statuses = ["Married", "Unmarried", "Divorced", "Widowed"]
yes_no = ["Yes", "No"]
employment_statuses = ["Employed", "Unemployed", "Part-time", "Student", "Homemaker"]
housing_situations = ["Own", "Rent", "Shelter", "Homeless", "With relatives"]

# Function to simulate one row of data
def generate_row():
    age = np.random.randint(18, 50)
    education = np.random.choice(education_levels, p=education_probs)
    income = np.random.choice(income_brackets, p=[0.3, 0.3, 0.25, 0.15])
    marital_status = np.random.choice(marital_statuses, p=[0.6, 0.2, 0.1, 0.1])
    number_of_children = np.random.poisson(1.5) if marital_status != "Unmarried" else 0

    # Only assume partner if married, divorced, widowed, or has_partner flag
    has_partner = "Yes" if marital_status != "Unmarried" else np.random.choice(yes_no, p=[0.4, 0.6])
    partner_alcoholic = np.random.choice(yes_no, p=[0.4, 0.6]) if has_partner == "Yes" else "No"

    has_support_system = np.random.choice(yes_no, p=[0.5, 0.5])
    past_violence = np.random.choice(yes_no, p=[0.35, 0.65])
    mental_health_issues = np.random.choice(yes_no, p=[0.3, 0.7])
    employment_status = np.random.choice(employment_statuses, p=[0.4, 0.3, 0.1, 0.1, 0.1])
    housing_situation = np.random.choice(housing_situations, p=[0.3, 0.4, 0.1, 0.05, 0.15])
    disability = np.random.choice(yes_no, p=[0.1, 0.9])
    self_substance_abuse = np.random.choice(yes_no, p=[0.2, 0.8])
    previous_reports = np.random.poisson(0.5)

    # Enhanced risk score
    risk_score = (
        (education in ["None", "Primary", "Lower Secondary"]) * 1 +
        (income == "<5000") * 1 +
        (partner_alcoholic == "Yes") * 2 +
        (has_support_system == "No") * 1 +
        (past_violence == "Yes") * 2 +
        (mental_health_issues == "Yes") * 1 +
        (employment_status == "Unemployed") * 1 +
        (housing_situation in ["Shelter", "Homeless"]) * 1 +
        (disability == "Yes") * 1 +
        (self_substance_abuse == "Yes") * 2 +
        (previous_reports >= 2) * 2
    )

    violence_occurred = 1 if risk_score >= 5 else 0

    return [
        age, education, income, marital_status, number_of_children, has_partner,
        partner_alcoholic, has_support_system, past_violence, mental_health_issues,
        employment_status, housing_situation, disability, self_substance_abuse,
        previous_reports, violence_occurred
    ]

# Generate 100,000 rows
rows = [generate_row() for _ in range(150000)]
columns = [
    "age", "education", "income", "marital_status", "number_of_children", "has_partner",
    "partner_alcoholic", "has_support_system", "past_violence", "mental_health_issues",
    "employment_status", "housing_situation", "disability", "self_substance_abuse",
    "previous_reports", "violence_occurred"
]

# Create DataFrame
df = pd.DataFrame(rows, columns=columns)

# Save to CSV
df.to_csv("domestic_violence_data.csv", index=False)

print("✅ Final dataset with 100,000 rows saved as 'domestic_violence_data.csv'")

✅ Final dataset with 100,000 rows saved as 'domestic_violence_data.csv'
