In [7]:
import logging
import pandas as pd
from faker import Faker
import random
import string

logging.basicConfig(format='%(asctime)s %(levelname)s:%(name)s:%(message)s')
logging.getLogger().setLevel(20)

In [8]:
#instantiating the faker object
fake = Faker()

In [9]:
#creating a dictionary to generate 11 numbers with null probability
def generate_11_digit_numbers(count=1, null_probability=0.2):
    """
    Generate random 11-digit numbers using Faker, with some null values
    
    Args:
        count (int): Number of random numbers to generate
        null_probability (float): Probability of generating a null value (between 0 and 1)
        
    Returns:
        list: List of 11-digit numbers as int or None
    """
    fake : Faker()
    numbers : []
    
    for _ in range(count):
        # Randomly decide whether to generate a null value
        if random.random() < null_probability:
            numbers.append(None)
        else:
            # Generate number between 10000000000 and 99999999999
            number : fake.random_number(digits=11, fix_len=True)
            numbers.append(number)
    
    return numbers

In [10]:
def generate_11_digit_number(null_probability=0.2):
    """
    Generate a random 11-digit number using Faker, can be null depending on probability
    
    Args:
        null_probability (float): Probability of generating a null value (between 0 and 1)
        
    Returns:
        int: a single 11-digit number as int or None
    """
    fake = Faker()
    # Randomly decide whether to generate a null value
    if random.random() < null_probability:
        return None
    else:
        # Generate number between 10000000000 and 99999999999
        number = fake.random_number(digits=11, fix_len=True)
        return number

In [11]:
def generate_nigerian_passport_id(null_probability=0.2):
    """
    Generate random Nigerian passport IDs matching pattern ^[A-Z]{1,2}[0-9]{7}$
    
    Args:
        null_probability (float): Probability of generating a null value (between 0 and 1)
        
    Returns:
        str: single passport ID as strings or None
    """
    fake = Faker()
    
    if random.random() < null_probability:
        return None
    else:
        # Randomly decide between 1 or 2 letters
        letter_count = random.randint(1, 2)
        # Generate random uppercase letters
        letters = ''.join(random.choices(string.ascii_uppercase, k=letter_count))
        # Generate 7 random digits
        numbers = ''.join(str(random.randint(0, 9)) for _ in range(7))
        passport_id = f"{letters}{numbers}"
        return passport_id

In [12]:
def generate_voters_card(null_probability=0.2):
    """
    Generate a single voter's card number matching pattern AAA/BBB/CC/DDDDD
    Format: ^[A-Z]{3}/\d{3}/\d{2}/\d{5}$
    
    Args:
        null_probability (float): Probability of generating None instead of an ID
    
    Returns:
        str or None: A voter's card number or None
    """
    if random.random() < null_probability:
        return None
        
    first_part = ''.join(random.choices(string.ascii_uppercase, k=3))
    second_part = ''.join(str(random.randint(0, 9)) for _ in range(3))
    third_part = ''.join(str(random.randint(0, 9)) for _ in range(2))
    fourth_part = ''.join(str(random.randint(0, 9)) for _ in range(5))
    
    return f"{first_part}/{second_part}/{third_part}/{fourth_part}"

def generate_tax_id(null_probability=0.2):
    """
    Generate a single tax ID (TIN) matching pattern DDDDDDDDDD
    Format: ^\d{10}$
    
    Args:
        null_probability (float): Probability of generating None instead of an ID
    
    Returns:
        str or None: A tax ID number or None
    """
    if random.random() < null_probability:
        return None
        
    return ''.join(str(random.randint(0, 9)) for _ in range(10))

def generate_cac_number(null_probability=0.2):
    """
    Generate a single CAC number matching pattern RC/NNNNNNN or BN/NNNNNNN
    Format: ^(RC|BN)/\d{7}$
    
    Args:
        null_probability (float): Probability of generating None instead of an ID
    
    Returns:
        str or None: A CAC registration number or None
    """
    if random.random() < null_probability:
        return None
        
    prefix = random.choice(['RC', 'BN'])
    numbers = ''.join(str(random.randint(0, 9)) for _ in range(7))
    
    return f"{prefix}/{numbers}"

def generate_drivers_license(null_probability=0.2):
    """
    Generate a single driver's license number matching pattern A-BBBBBCC-DDDDDDD
    Format: ^[A-Z]-[A-Z]{5}[0-9]{2}-[0-9]{7}$
    
    Args:
        null_probability (float): Probability of generating None instead of an ID
    
    Returns:
        str or None: A driver's license number or None
    """
    if random.random() < null_probability:
        return None
        
    first_letter = random.choice(string.ascii_uppercase)
    middle_letters = ''.join(random.choices(string.ascii_uppercase, k=5))
    middle_numbers = ''.join(str(random.randint(0, 9)) for _ in range(2))
    last_numbers = ''.join(str(random.randint(0, 9)) for _ in range(7))
    
    return f"{first_letter}-{middle_letters}{middle_numbers}-{last_numbers}"


In [13]:
# customer_id = fake.random_number(digits=11, fix_len=True)
# occupation = fake.job()
# nin = generate_11_digit_number(0.4)
# passport = generate_nigerian_passport_id(0.5)
# country_of_birth = fake.country()
# marital_status = fake.random_choices(elements=("Single","Married","Widowed"))
# drivers_license = generate_drivers_license()
# voters_card = generate_voters_card()
# tax_id = generate_tax_id()
# cac_number = generate_cac_number()
# gender = fake.passport_gender()
# postal_code = fake.postal_code()

AttributeError: 'Generator' object has no attribute 'postal_code'

In [17]:
def generate_customer_data(count=1):
    """
    Generate a DataFrame with random customer data
    
    Args:
        count (int): Number of customer records to generate
        
    Returns:
        pandas.DataFrame: DataFrame containing customer records
    """
    fake = Faker()
    
    # Initialize empty lists for each column
    data = {
        'customer_id': [],
        'occupation': [],
        'nin': [],
        'passport': [],
        'country_of_birth': [],
        'marital_status': [],
        'drivers_license': [],
        'voters_card': [],
        'tax_id': [],
        'cac_number': [],
        'gender': [],
        'postal_code': []
    }
    
    # Generate data for each row
    for _ in range(count):
        data['customer_id'].append(str(fake.random_number(digits=11, fix_len=True)))
        data['occupation'].append(fake.job())
        data['nin'].append(generate_11_digit_number(null_probability=0.4))
        data['passport'].append(generate_nigerian_passport_id(null_probability=0.5))
        data['country_of_birth'].append(fake.country())
        data['marital_status'].append(random.choice(["Single", "Married", "Widowed"]))
        data['drivers_license'].append(generate_drivers_license())
        data['voters_card'].append(generate_voters_card())
        data['tax_id'].append(generate_tax_id())
        data['cac_number'].append(generate_cac_number())
        data['gender'].append(fake.random_element(elements=('M', 'F')))
        data['postal_code'].append(fake.postal_code())
    
    # Create DataFrame
    df = pd.DataFrame(data)
    
    return df

In [18]:
trial_df = generate_customer_data(5) 

TypeError: 'int' object is not subscriptable

In [None]:
trial_df