In [14]:
from datetime import date
import pandas as pd
import numpy as np

def random_dob_from_age(age, today=None):
    """Return a random date of birth for someone who is exactly ``age`` years old.

    The previous implementation fixed the birth year to ``today.year - age`` and
    then drew any month/day in that year.  Whenever the drawn birthday fell
    later in the calendar year than today, the person had not yet had their
    birthday and was really ``age - 1`` years old.  This version samples
    uniformly from the exact window of birth dates that yield ``age`` today.

    Parameters
    ----------
    age : int or integral float
        Age in completed years.  Converted with ``int()`` so float values such
        as ``30.0`` (what pandas yields for an integer column containing NaNs)
        are accepted; NaN raises ``ValueError``.
    today : datetime.date, optional
        Reference date used to interpret ``age``.  Defaults to ``date.today()``.

    Returns
    -------
    datetime.date
        A date ``d`` such that a person born on ``d`` is ``age`` years old on
        ``today``.

    Raises
    ------
    ValueError
        If ``age`` cannot be converted to an integer or the resulting year is
        outside the range supported by ``datetime.date``.
    """
    if today is None:
        today = date.today()
    age = int(age)

    def years_before(day, years):
        # Same month/day `years` earlier; Feb 29 falls back to Feb 28 when the
        # target year is not a leap year.
        try:
            return day.replace(year=day.year - years)
        except ValueError:
            return day.replace(year=day.year - years, day=28)

    # Latest possible birthday: the person turns `age` today.
    latest = years_before(today, age).toordinal()
    # Earliest possible birthday: one day after the date that would make the
    # person `age + 1` today.
    earliest = years_before(today, age + 1).toordinal() + 1

    # np.random.randint excludes the upper bound, hence the + 1.
    return date.fromordinal(int(np.random.randint(earliest, latest + 1)))

# Load the raw customer sample.
df = pd.read_csv('Customers_Names_Sample_DB.csv')

# Keep the first row for each distinct "Full Name".  The explicit .copy()
# gives df_first its own data, so the column assignments below neither emit
# pandas' SettingWithCopyWarning nor risk writing into a temporary object.
df_first = df.drop_duplicates(subset='Full Name', keep='first').copy()

# Kept for backward compatibility with any later cell that reads `today`;
# nothing in this cell uses it.
today = pd.Timestamp.today().normalize()

# Derive a random date of birth from each customer's "Age".
df_first["DateOfBirth"] = df_first["Age"].apply(random_dob_from_age)

# Split "Full Name" on the first space only: column 0 is the first name and
# column 1 is everything after it.  reindex guarantees that column 1 exists
# (filled with NaN) even when no name in the file contains a space, which
# would otherwise raise a KeyError on df_first_names[1].
df_first_names = (
    df_first["Full Name"]
    .str.split(' ', n=1, expand=True)
    .reindex(columns=[0, 1])
)
df_first["First Name"] = df_first_names[0]
df_first["Last Name"] = df_first_names[1]

# Columns carried forward into the customer table.
res = df_first[[
    "Customer ID",
    "First Name",
    "Last Name",
    "MSISDN",
    "Gender",
    "DateOfBirth"
]]

# Optional SQLite export, left disabled:
# res.to_sql('customers', 'sqlite:///customers.db', if_exists='replace', index=False)

# Source header -> output header for the address columns that are kept.
address_column_names = {
    "FULL_ADDRESS": "Home Address",
    "CITY": "City",
    "ZIP5": "Postal Code",
    "HOUSEHOLD_ID": "HouseholdId",
}

# Read the address file and apply the renames in one step.
cus_addresses = pd.read_csv('Phone_ID_Database_Examples.csv').rename(
    columns=address_column_names
)

# Restrict to the four renamed columns, in the order declared above.
addresses = cus_addresses[list(address_column_names.values())]

# Give the customers a fresh 0..n-1 index so the column-wise concat below
# pairs rows by position.
res = res.reset_index(drop=True)

# Draw one address row per customer, with replacement and no fixed seed, and
# reset its index to the same 0..n-1 range.
random_addresses = addresses.sample(n=len(res), replace=True).reset_index(drop=True)

# Attach the sampled address columns to the customer columns and write the result.
final_df = pd.concat([res, random_addresses], axis="columns")
final_df.to_csv("full_customers_data.csv", index=False)

In [None]:
import pandas as pd

dfFromCsv1 = pd.read_csv('Phone_ID_Database_Examples.csv')
dfFromFullCusCsv = pd.read_csv('full_customers_data.csv')

# Name pools.  Missing values are dropped first: a NaN first name used to
# crash on .capitalize(), and a NaN last name was formatted into the output
# as the literal text "nan".  First names are capitalised once here rather
# than twice per generated row.
first_names = dfFromCsv1['FIRST'].dropna().astype(str).str.capitalize()
last_names = dfFromFullCusCsv['Last Name'].dropna().astype(str)

# One new name per row of the customer file.
name_count = len(dfFromFullCusCsv)

# Draw all first and last names in a single vectorised sample each (with
# replacement) instead of calling .sample(n=1) twice per row in a Python loop.
sampled_first = first_names.sample(n=name_count, replace=True).to_numpy()
sampled_last = last_names.sample(n=name_count, replace=True).to_numpy()

# Create a list of new names by randomly combining first and last names.
new_names = [
    f"{random_first_name} {random_last_name}"
    for random_first_name, random_last_name in zip(sampled_first, sampled_last)
]

new_names_df = pd.DataFrame({'Full Name': new_names})
new_names_df.to_csv('new_full_names.csv', index=False)

In [None]:
import pandas as pd
import numpy as np
import random, string
from datetime import datetime, timedelta

# Load the customer table produced earlier.
df = pd.read_csv('full_customers_data.csv')

# Parse the date-of-birth column; values that cannot be parsed become NaT.
df["DateOfBirth"] = pd.to_datetime(df["DateOfBirth"], errors='coerce')

# Build the pools of distinct, non-missing values that the synthetic rows
# are drawn from: one plain Python list per source column.
first_names, last_names, cities, postal_codes, households = (
    df[column].dropna().unique().tolist()
    for column in ('First Name', 'Last Name', 'City', 'Postal Code', 'HouseholdId')
)

def generate_customer_id(existing_ids):
    """Return a random 13-digit customer ID string that is not in ``existing_ids``.

    The ID is the digit ``'1'`` followed by 12 random digits, mirroring the
    format of IDs such as 1000974618162.

    Parameters
    ----------
    existing_ids : container
        IDs already in use.  It may hold strings, integers, or both: the
        candidate is checked in both forms.  Previously only the string form
        was checked, so IDs loaded from a CSV as integers never matched and
        duplicates could be returned.

    Returns
    -------
    str
        A new ID.  The caller is responsible for adding it to
        ``existing_ids``.
    """
    while True:
        cid = '1' + ''.join(random.choices(string.digits, k=12))
        # Check the str form (IDs generated by this function) and the int form
        # (IDs parsed from a numeric CSV column).
        if cid not in existing_ids and int(cid) not in existing_ids:
            return cid

def generate_msisdn(existing_numbers):
    """Return a random 10-digit MSISDN string that is not in ``existing_numbers``.

    The number is ``'07'`` followed by 8 random digits.

    Parameters
    ----------
    existing_numbers : container
        Numbers already in use.  It may hold strings, integers, or both.  The
        candidate is checked as a string and as an integer (which drops the
        leading zero), because a numeric CSV column is parsed to integers and
        would never match a string candidate.
        NOTE(review): this assumes stored numbers use the same national
        ``07...`` format; confirm against the source data.

    Returns
    -------
    str
        A new number.  The caller is responsible for adding it to
        ``existing_numbers``.
    """
    while True:
        number = '07' + ''.join(random.choices(string.digits, k=8))
        # Check both representations so int-typed existing numbers are honoured.
        if number not in existing_numbers and int(number) not in existing_numbers:
            return number

def random_dob(min_age=18, max_age=75, today=None):
    """Return a random date of birth for an age drawn from ``[min_age, max_age]``.

    The previous implementation subtracted ``365 * age`` days from today.
    That ignores leap days, so the result was a few days short of ``age`` full
    years: a "minimum 18" customer was in fact still 17.  Every date also
    shared today's month and day.  This version picks the age uniformly and
    then a uniformly random birth date within the exact one-year window that
    yields that age.

    Parameters
    ----------
    min_age, max_age : int
        Inclusive bounds for the age in completed years.
    today : datetime.date or datetime.datetime, optional
        Reference date for computing the age.  Defaults to the current date.

    Returns
    -------
    datetime.date
        A date such that a person born on it is between ``min_age`` and
        ``max_age`` (inclusive) years old on ``today``.

    Raises
    ------
    ValueError
        If ``min_age > max_age`` (raised by ``random.randint``).
    """
    if today is None:
        today = datetime.today()
    if isinstance(today, datetime):
        # Work with plain dates so the return type is always datetime.date.
        today = today.date()

    def years_before(day, years):
        # Same month/day `years` earlier; Feb 29 falls back to Feb 28 when the
        # target year is not a leap year.
        try:
            return day.replace(year=day.year - years)
        except ValueError:
            return day.replace(year=day.year - years, day=28)

    age = random.randint(min_age, max_age)
    # Latest birthday: turns `age` today.  Earliest: one day after the date
    # that would already make the person `age + 1`.
    latest = years_before(today, age)
    earliest = years_before(today, age + 1) + timedelta(days=1)

    # random.randint is inclusive on both ends.
    return earliest + timedelta(days=random.randint(0, (latest - earliest).days))

# Generation of the new dataset: append `target_rows` synthetic customers to
# the customers loaded from the CSV.
target_rows = 5000
synthetic_rows = []

# Identifiers already in use, so generated ones do not collide with them.
existing_ids = set(df['Customer ID'])
existing_msisdn = set(df['MSISDN'])

for _ in range(target_rows):
    # Each attribute is drawn independently from the pool of observed values,
    # so city / postal code / household combinations are not guaranteed to
    # correspond to a real address.
    first = random.choice(first_names)
    last = random.choice(last_names)
    city = random.choice(cities)
    postal = random.choice(postal_codes)
    household = random.choice(households)

    dob = random_dob()
    customer_id = generate_customer_id(existing_ids)
    msisdn = generate_msisdn(existing_msisdn)
    # Register the new identifiers so later iterations cannot reuse them.
    existing_ids.add(customer_id)
    existing_msisdn.add(msisdn)

    # NOTE(review): columns of `df` that are not set here (e.g. "Gender",
    # "Home Address") end up as NaN for the synthetic rows after the concat.
    synthetic_rows.append({
        "Customer ID": customer_id,
        "First Name": first,
        "Last Name": last,
        "DateOfBirth": dob,
        "MSISDN": msisdn,
        "City": city,
        "Postal Code": postal,
        "HouseholdId": household
    })

synthetic_df = pd.DataFrame(synthetic_rows)

# random_dob() returns datetime.date objects, while df["DateOfBirth"] is a
# datetime64 column.  Concatenating the two as-is produces an object column
# that the CSV writer formats inconsistently ("YYYY-MM-DD 00:00:00" for the
# original rows, "YYYY-MM-DD" for the synthetic ones).  Converting first keeps
# a single datetime64 column.
synthetic_df["DateOfBirth"] = pd.to_datetime(synthetic_df["DateOfBirth"])

final_df = pd.concat([df, synthetic_df], ignore_index=True)
print("Total Rows in Final Dataset:", len(final_df))

final_df.to_csv('synthetic_5000_customers_data.csv', index=False)