In [1]:
import pandas as pd
import numpy as np
import random

# Total number of records
num_records = 500
# Percentage of rows to be dirty
dirty_percentage = 0.20
num_dirty_rows = int(num_records * dirty_percentage)
num_clean_rows = num_records - num_dirty_rows

# --- Define Lists for Clean and Dirty Data ---

# Clean values
clean_gender = ["Male", "Female"]
clean_insurance_status = ["Active", "Lapsed", "Pending"]
clean_policy_type = ["Basic", "Premium", "Family", "Gold"]
clean_occupation = [
    "Software Engineer", "Doctor", "Sales Representative", "Teacher", "Nurse",
    "Engineer", "Accountant", "Lawyer", "Scientist", "Manager", "Retired",
    "Student", "Office Worker", "Manual Labor"
]
clean_chronic = ["None", "Hypertension", "Diabetes", "Asthma", "Heart Disease", "Arthritis"]
clean_smoker = ["Yes", "No", "Former"]
clean_alcohol = ["None", "Low", "Moderate", "High"]

# Dirty values (to be sprinkled in)
dirty_gender = ["male", "female", "M", "F", " Other", "Male ", np.nan, "Missing"]
dirty_insurance_status = ["active", " lapsed", "Pending ", np.nan, "NA"]
dirty_policy_type = ["Basic ", "premium", "family", np.nan, "Not Available"]
dirty_dentist_visits = ["0", "1", "One", "Two", "None", np.nan, 50, -1, "missing"]
dirty_checkups = ["0", "1", "One", "Zero", "None", np.nan, 10, "NA"]
dirty_occupation = [" office worker", "Doctor ", "retired", np.nan, "Missing"]
dirty_chronic = ["none", "hypertension", "Diabtes", "Asthma ", "Heart Disase", np.nan, "NA"]
dirty_smoker = ["Yes", "No", "yes", "no", " YES", "No ", np.nan, "missing"]
dirty_alcohol = ["none", "low", " moderate", "High ", "0-2", "3-5", np.nan, "NA"]
dirty_age = [150, 5, -10, np.nan]
dirty_years = [-2, 100, np.nan]

# --- Generate Clean Data ---
clean_data = {
    'customer_id': range(10001, 10001 + num_clean_rows),
    'age': np.random.randint(18, 85, size=num_clean_rows),
    'gender': [random.choice(clean_gender) for _ in range(num_clean_rows)],
    'insurance_status': [random.choice(clean_insurance_status) for _ in range(num_clean_rows)],
    'policy_type': [random.choice(clean_policy_type) for _ in range(num_clean_rows)],
    'years_with_company': [random.randint(0, 40) for _ in range(num_clean_rows)],
    'dentist_visits_per_year': [random.choice([0, 1, 2, 3]) for _ in range(num_clean_rows)],
    'annual_checkups': [random.choice([0, 1]) for _ in range(num_clean_rows)],
    'occupational_category': [random.choice(clean_occupation) for _ in range(num_clean_rows)],
    'chronic_condition': [random.choice(clean_chronic) for _ in range(num_clean_rows)],
    'smoker_status': [random.choice(clean_smoker) for _ in range(num_clean_rows)],
    'alcohol_consumption_weekly': [random.choice(clean_alcohol) for _ in range(num_clean_rows)]
}
df_clean = pd.DataFrame(clean_data)

# --- Generate Dirty Data ---
# Start with clean data and make it dirty
dirty_data = {
    'customer_id': range(10001 + num_clean_rows, 10001 + num_records),
    'age': np.random.randint(18, 85, size=num_dirty_rows),
    'gender': [random.choice(clean_gender) for _ in range(num_dirty_rows)],
    'insurance_status': [random.choice(clean_insurance_status) for _ in range(num_dirty_rows)],
    'policy_type': [random.choice(clean_policy_type) for _ in range(num_dirty_rows)],
    'years_with_company': [random.randint(0, 40) for _ in range(num_dirty_rows)],
    'dentist_visits_per_year': [random.choice([0, 1, 2, 3]) for _ in range(num_dirty_rows)],
    'annual_checkups': [random.choice([0, 1]) for _ in range(num_dirty_rows)],
    'occupational_category': [random.choice(clean_occupation) for _ in range(num_dirty_rows)],
    'chronic_condition': [random.choice(clean_chronic) for _ in range(num_dirty_rows)],
    'smoker_status': [random.choice(clean_smoker) for _ in range(num_dirty_rows)],
    'alcohol_consumption_weekly': [random.choice(clean_alcohol) for _ in range(num_dirty_rows)]
}
df_dirty = pd.DataFrame(dirty_data)

# List of columns to make dirty
columns_to_dirty = [
    'age', 'gender', 'insurance_status', 'policy_type', 'years_with_company',
    'dentist_visits_per_year', 'annual_checkups', 'occupational_category',
    'chronic_condition', 'smoker_status', 'alcohol_consumption_weekly'
]

dirty_value_map = {
    'age': dirty_age,
    'gender': dirty_gender,
    'insurance_status': dirty_insurance_status,
    'policy_type': dirty_policy_type,
    'years_with_company': dirty_years,
    'dentist_visits_per_year': dirty_dentist_visits,
    'annual_checkups': dirty_checkups,
    'occupational_category': dirty_occupation,
    'chronic_condition': dirty_chronic,
    'smoker_status': dirty_smoker,
    'alcohol_consumption_weekly': dirty_alcohol
}

# Iterate over each dirty row and introduce 1-3 dirty values
for i in range(num_dirty_rows):
    # Choose how many columns to make dirty in this row (e.g., 1 to 3)
    num_errors_in_row = random.randint(1, 3)
    # Choose which columns to make dirty
    cols_to_corrupt = random.sample(columns_to_dirty, num_errors_in_row)

    for col in cols_to_corrupt:
        # Get a random dirty value for that column
        dirty_val = random.choice(dirty_value_map[col])
        # Set the value in the DataFrame
        df_dirty.at[i, col] = dirty_val

# --- Combine and Save ---
df_final = pd.concat([df_clean, df_dirty])
# Shuffle the final dataframe
df_final = df_final.sample(frac=1).reset_index(drop=True)

# Save to CSV
output_filename = 'insurance_customer_data_20_percent_dirty.csv'
df_final.to_csv(output_filename, index=False)

print(f"Created '{output_filename}' with {num_records} records.")
print(f"Approximately {num_dirty_rows} rows contain intentionally dirty data.")
print(df_final.head())

Created 'insurance_customer_data_20_percent_dirty.csv' with 500 records.
Approximately 100 rows contain intentionally dirty data.
   customer_id   age  gender insurance_status policy_type  years_with_company  \
0        10476  49.0    Male          Pending      family                27.0   
1        10383  53.0    Male          Pending       Basic                21.0   
2        10454  82.0    Male           Active      Family               100.0   
3        10053  73.0    Male           Active        Gold                 7.0   
4        10446  72.0  Female          Pending       Basic                -2.0   

  dentist_visits_per_year annual_checkups occupational_category  \
0                     0.0               1         office worker   
1                       2               1         Office Worker   
2                     0.0               1               Manager   
3                       0               1              Engineer   
4                     2.0              10       

  df_dirty.at[i, col] = dirty_val
  df_dirty.at[i, col] = dirty_val


In [None]:
output_filename = 'insurance_customer_data_20_percent_dirty.csv'

```text?code_stdout&code_event_index=2
Created 'insurance_customer_data_20_percent_dirty.csv' with 500 records.
Approximately 100 rows contain intentionally dirty data.
   customer_id   age  gender insurance_status policy_type  years_with_company dentist_visits_per_year annual_checkups occupational_category chronic_condition smoker_status alcohol_consumption_weekly
0        10032  55.0  Female           Lapsed       Basic                22.0                       3               0                 Nurse         Arthritis            No                       None
1        10407  67.0    Male          Pending       Basic                40.0                       0               1               Doctor            Asthma            Yes                        Low
2        10207  55.0  Female          Pending     Premium                35.0                       1               1               Teacher            Asthma           Yes                        Low
3        10425  57.0    Male           Active       Basic                 8.0                     One               0               Manager      Hypertension        Former                   moderate
4        10139  45.0  Female          Pending        Gold                 2.0                       2               1                Doctor      Hypertension           Yes                        Low

```

[file-tag: code-generated-file-0-1762892115778704321]

Here you go\! I've created the new CSV file with a more controlled amount of "dirtiness."

The file is named **`insurance_customer_data_20_percent_dirty.csv`**.

I've set it up so that approximately 20% of the rows (100 out of 500 records) contain one or more intentional errors, such as:

  * **Invalid values:** (e.g., ages like 150 or -10, negative years)
  * **Inconsistent text:** (e.g., "Male", "male", "M")
  * **Missing data:** (e.g., `NaN`, "NA", "missing")
  * **Mixed types:** (e.g., "One" instead of 1)
  * **Formatting issues:** (e.g., " moderate", "Doctor ")

The remaining 80% of the data is clean. This should give you a much more manageable dataset to work with for your transformations.