In [0]:

from pyspark.sql.functions import col, lit, rand, round
from pyspark.sql.types import IntegerType, StringType, DoubleType, StructType, StructField, DateType
import random, re
from datetime import datetime, timedelta
import numpy as np 



# Number of records
num_records = 1000000

# Function to generate random age
def random_date_of_birth():
    """
    Generate a random date in the past between the given start date and today.
    
    Parameters:
    start_date (str): The earliest possible date in the format 'YYYY-MM-DD' (default is '1900-01-01').
    
    Returns:
    str: Random date in the format 'YYYY-MM-DD'.
    """
    
    # Get today's date
    end_date = datetime.now()
    
    # Generate a random number of days to add to the start_date
    random_days = int(np.round(np.random.normal(14855.5, 6209, 1)[0]))
    

    # Generate the random date
    random_date = end_date - timedelta(days=random_days)
    
    # Return the random date in the format 'YYYY-MM-DD'
    return random_date.strftime('%Y-%m-%d')


# Function to generate random gender
def random_sex():
    return random.choice(["Male", "Female"])

# Function to generate random employment status
def random_employment_status():
     return random.choice(["Employed", "Unemployed", "Self-Employed", "Student"])

# Function to generate random education level
def random_education_level():
    return random.choice(["No Formal Education", "GCSE", "A-Level", "Undergraduate Degree", "PostGraduate Degree", "Doctorate"])

# Function to generate random household income
def random_household_income(employment_status):
    if employment_status == "Student" or employment_status == "Unemployed":
        return random.randint(0, 30000)
    elif employment_status == "Retired":
        return random.randint(20000, 60000)
    else:
        return random.randint(30000, 1000000)
    
#function to generate random employment income
def employment_income(employment_status):
    """
    Generates a random generated income based on an exponential distribution of UK wages Jan 24 (ONS stats) based on employment status
    Parameters:
        employment_status (str): ["Employed", "Unemployed", "Self-Employed", "Student"]

     Returns: 
     int: Random generated income based on employment status    
     """
    percentiles = [0.10, 0.25, 0.50, 0.75, 0.90, 0.95, 0.99]
    values = [770, 1361, 2333, 3588, 5411, 7302, 15136]
     # Calculate the rate parameter λ for each percentile
    lambdas = [-np.log(1 - p) / x for p, x in zip(percentiles, values)]

    # Average λ to use for the exponential distribution
    lambda_avg = np.mean(lambdas)

    if employment_status in ("Employed", "Self-Employed"):
        return  (np.random.exponential(1 / lambda_avg)) * 12
    else:   
        return 0.0

# Function to generate random marital status
def random_marital_status():
    return random.choice(["Single", "Cohabiting", "Married", "Divorced", "Widowed"])

# Function to generate random number of children

def random_number_of_children():
    return np.random.poisson(1.49)


# Function to generate random passport number
def random_passport_number():
    return chr(random.randint(65, 90)) + ''.join(random.choices('0123456789', k=8))

def has_passport(probability_no_passport=0.135):
    """
    Determine if an individual has a passport based on a given probability.

    Parameters:
    probability_no_passport (float): The probability that an individual does not have a passport (default is 0.25).

    Returns:
    bool: True if the individual has a passport, False otherwise.
    """
    if random.random() > probability_no_passport:
        return random_passport_number()
    else: return None

# Function to generate random industry
def random_industry():
    return random.choice ([ "Agriculture","Manufacturing", "Construction", "Transport", "Retail", "Finance", "Healthcare", "Education", "Government", "Other"])


# Function to generate random postcode
def random_postcode():
    return random.choice(['AB', 'BC', 'CD', 'DE', 'EF','FG',]) + str(random.randint(1, 9)) + ' ' + str(random.randint(1, 9)) + random.choice(['A', 'B', 'C', 'D', 'E', 'F', 'G', 'H', 'J', 'K', 'L', 'M', 'N', 'P', 'R', 'S', 'T', 'U', 'V', 'W', 'X', 'Y', 'Z']) + random.choice(['A', 'B', 'C', 'D', 'E', 'F', 'G', 'H', 'J', 'K', 'L', 'M', 'N', 'P', 'R', 'S', 'T', 'U', 'V', 'W', 'X', 'Y', 'Z'])



# Generate dataset
data = [(i + 1,
         random_date_of_birth(),
         random_sex(),
         random_employment_status(),
         employment_income(random_employment_status()),
         random_education_level(),
         random_household_income(random_employment_status()),
         random_marital_status(),
         random_number_of_children(),
          random_postcode(),
          has_passport(),
          random_industry(),
        ) for i in range(num_records)]



# Define schema
schema = StructType([
    StructField("ID", IntegerType(), False),
    StructField("DoB", StringType(), False),
    StructField("Sex", StringType(), False),
    StructField("Employment_Status", StringType(), False),
    StructField("Employment_Income", DoubleType(), False),
    StructField("Education_Level", StringType(), False),
    StructField("Household_Income", IntegerType(), False),
    StructField("Marital_Status", StringType(), False),
    StructField("Number_of_Children", IntegerType(), False),
    StructField("Postcode", StringType(), False),
    StructField("Passport_Number", StringType(), True),
    StructField("Industry", StringType(), True)
])

# Create DataFrame
df = spark.createDataFrame(data, schema)

display(df)



ID,DoB,Sex,Employment_Status,Employment_Income,Education_Level,Household_Income,Marital_Status,Number_of_Children,Postcode,Passport_Number,Industry
1,2004-12-24,Male,Student,28863.97490643857,PostGraduate Degree,421757,Cohabiting,4,FG4 9VS,C28138393,Retail
2,1965-04-28,Male,Self-Employed,0.0,PostGraduate Degree,23305,Married,5,EF2 3PH,G55664872,Finance
3,1989-08-30,Female,Unemployed,4045.947576196831,Doctorate,871351,Widowed,2,EF7 9DS,M35717163,Manufacturing
4,1956-12-09,Male,Self-Employed,0.0,PostGraduate Degree,21921,Single,0,AB3 8JC,,Construction
5,1990-11-29,Male,Unemployed,13223.855838810368,GCSE,2157,Single,0,BC8 1FG,P26833494,Other
6,1982-05-07,Female,Student,109909.60331417852,Undergraduate Degree,21825,Divorced,2,FG7 2RD,D68848435,Government
7,2022-12-07,Male,Employed,3812.725910970947,A-Level,856676,Widowed,2,EF1 7LN,P91562333,Education
8,1972-04-08,Female,Student,78262.78233014455,PostGraduate Degree,18779,Divorced,1,BC9 7UH,P15553614,Retail
9,2009-12-06,Male,Unemployed,44495.47185021456,GCSE,536300,Married,1,EF8 4FH,X54129166,Healthcare
10,1960-09-28,Male,Self-Employed,0.0,No Formal Education,137610,Widowed,2,EF1 2RH,C56932496,Government
