In [21]:
import pandas as pd
import random
from faker import Faker
import pycountry

In [22]:
# Single seed shared by Faker and the stdlib random module so that a
# fresh run of the notebook regenerates the same dataset.
SEED = 42

fake = Faker()
Faker.seed(SEED)
random.seed(SEED)

In [27]:
# Get random country names (with typo possibilities)
def get_country_with_typos():
    """Return a country name for one synthetic client record.

    A single uniform draw decides which of three outcomes is produced:

    * draw < 0.1          -> the Faker country with its last character
                             replaced by a *different* lowercase letter
                             (simulated typo);
    * 0.1 <= draw < 0.12  -> "Iran" or "North korea" (sanctioned country);
    * otherwise           -> the Faker country, unchanged.

    Returns:
        str: the (possibly corrupted or overridden) country name.
    """
    country = fake.country()
    typo_chance = random.random()
    if typo_chance < 0.1:
        # Leave out the current last character; otherwise the "typo" could
        # re-pick the same letter and silently return the correct name.
        last_char = country[-1:]
        replacements = [c for c in "abcdefghijklmnopqrstuvwxyz" if c != last_char]
        return country[:-1] + random.choice(replacements)
    elif typo_chance < 0.12:
        return random.choice(["Iran", "North korea"])  # Sanctioned country
    return country


In [28]:
# Generate unique client ID
_issued_client_ids = set()  # every ID already returned by generate_client_id()


def generate_client_id():
    """Return a client ID ('CL' followed by six digits) not issued before.

    Independent random draws from 900,000 possible values collide
    surprisingly often (birthday effect), so each issued ID is remembered
    and a colliding draw is simply repeated.

    Returns:
        str: an ID between 'CL100000' and 'CL999999' that this function has
        not returned since the registry above was created.

    Raises:
        RuntimeError: if every ID in the range has already been issued.
    """
    if len(_issued_client_ids) >= 900000:
        # Without this guard the retry loop below would never terminate.
        raise RuntimeError("All client IDs in the range CL100000-CL999999 have been issued")
    while True:
        client_id = 'CL' + str(random.randint(100000, 999999))
        if client_id not in _issued_client_ids:
            _issued_client_ids.add(client_id)
            return client_id

In [29]:
# Aadhaar number generator (for Indian clients only)
def generate_aadhaar():
    """Return a random Aadhaar-style number: three space-separated groups of four digits."""
    aadhaar_template = '#### #### ####'  # each '#' is filled with a random digit by Faker
    return fake.numerify(aadhaar_template)

# PAN generator (Indian tax ID)
def generate_pan():
    """Return a random PAN-style string: five letters, four digits, one letter, all upper case."""
    pan_template = '?????####?'  # '?' -> random letter, '#' -> random digit
    raw_pan = fake.bothify(pan_template)
    return raw_pan.upper()


In [33]:
# Build the sample records (one dict per synthetic client)
def _sometimes_missing(threshold, make_value):
    """Draw one uniform number; return make_value() if it exceeds threshold, else None.

    make_value is only called after the draw succeeds, so no extra random
    state is consumed for fields that end up missing.
    """
    if random.random() > threshold:
        return make_value()
    return None


data = []
for _ in range(500):
    country = get_country_with_typos()
    is_indian = (country == 'India')

    # Dict values are evaluated top to bottom, which keeps the order of the
    # random/Faker draws (and therefore the seeded output) stable.
    row = {
        'Full_name': _sometimes_missing(0.05, lambda: fake.name()),
        'Client ID': generate_client_id(),
        'Date of Birth': fake.date_of_birth(minimum_age=18, maximum_age=75).isoformat(),
        'Country': country,
        'ID Number': _sometimes_missing(0.1, lambda: fake.bothify(text='??######')),
        'ID Type': random.choice(['Passport', 'Driver License', 'National ID']),
        'Onboarding Date': fake.date_between(start_date='-3y', end_date='today').isoformat(),
        'KYC Status': _sometimes_missing(0.1, lambda: random.choice(['Complete', 'Incomplete'])),
        'Account Type': random.choice(['Retail', 'Corporate', 'HNI']),

        # Indian KYC fields: no draw is made at all for non-Indian clients
        'Aadhaar Number': _sometimes_missing(0.1, generate_aadhaar) if is_indian else None,
        'PAN': _sometimes_missing(0.15, generate_pan) if is_indian else None,
    }

    data.append(row)
        

In [34]:
# Turn the list of record dicts into a DataFrame, then export it as CSV
# (index=False keeps the positional row index out of the file).
output_csv = "kyc_onboarding_data.csv"
df = pd.DataFrame(data)
df.to_csv(output_csv, index=False)

In [35]:
# Preview the first five generated records
df.head(n=5)

Unnamed: 0,Full_name,Client ID,Date of Birth,Country,ID Number,ID Type,Onboarding Date,KYC Status,Account Type,Aadhaar Number,PAN
0,Dr. Julie Miller,CL148759,1991-01-07,Togo,tN137189,National ID,2022-09-22,Incomplete,Corporate,,
1,Richard Harrison,CL241176,1984-05-22,United Kingdom,sk795654,Passport,2022-08-26,Complete,HNI,,
2,Nicole Acosta,CL606390,1987-11-11,Puerto Rico,dQ407929,Passport,2023-09-29,Complete,HNI,,
3,Alyssa Washington,CL789461,1958-02-18,Sudan,rr814429,Driver License,2023-08-06,Incomplete,Corporate,,
4,Lindsey Kent,CL899407,1999-10-16,American Samoa,Fl275243,Driver License,2024-09-29,Incomplete,Corporate,,
