Generating Synthetic Financial Data using Microsoft Semantic Kernel.

In [None]:
import semantic_kernel as sk
from semantic_kernel.connectors.ai.open_ai import AzureChatCompletion, OpenAIChatCompletion
from IPython.display import display, Markdown

# Model requested when talking to OpenAI directly.
# The previously pinned snapshot "gpt-3.5-turbo-0301" has been retired by OpenAI,
# so requests against it are rejected; the rolling "gpt-3.5-turbo" alias is used instead.
OPENAI_CHAT_MODEL = "gpt-3.5-turbo"

kernel = sk.Kernel()

# Set to True to register an Azure OpenAI deployment instead of OpenAI.
useAzureOpenAI = False

if useAzureOpenAI:
    # Deployment name, API key and endpoint come from the *_from_dot_env helper
    # (a local .env file, going by the helper's name).
    deployment, api_key, endpoint = sk.azure_openai_settings_from_dot_env()
    kernel.add_text_completion_service("azureopenai", AzureChatCompletion(deployment, endpoint, api_key))
else:
    # API key and organisation id come from the *_from_dot_env helper.
    api_key, org_id = sk.openai_settings_from_dot_env()
    kernel.add_text_completion_service("openai", OpenAIChatCompletion(OPENAI_CHAT_MODEL, api_key, org_id))

print("A kernel is now ready.")


In [None]:
# Prompt template for the table-filling task. Every {{$name}} placeholder is
# expected to be supplied through the context passed when the function is run.
prompt = """
You are a healthcare professional, working as {{$job}}, in {{$department_name}} department of {{$hospital_name}} hospital.
This hospital is located in {{$country}}. Your task is to fill 10 rows in the following table with the specified headers:
{{$template}}

Here’s a sample data entry:
{{$sample_data}}

Here are special instructions to fill up the template:
{{$instructions}}

The output rows in csv:
"""

# Sampling settings forwarded to the completion request.
completion_settings = {
    "max_tokens": 1000,
    "temperature": 0.1,
    "top_p": 0.5,
}

# Semantic function shared by all data-generation cells below.
function = kernel.create_semantic_function(
    prompt_template=prompt,
    description="Simulates the job of a Dataentry personnel",
    **completion_settings,
)

In [None]:

# HR Payroll
job="HR"
department_name="Cardiology"
hospital_name= "XYZ Hospital,Bengaluru,Karnataka"
country= "India"
template="'Employee ID', 'Designation', 'Region', 'Country', 'Contract', "
sample_data= "'12345', '150', 'Riyadh', 'Saudi', 'Agency'"
instructions="""
    Use randomization techniques or data generation tools to populate the template with synthetic data.
    Ensure that the data adheres to realistic ranges (e.g., salaries within industry norms, valid job titles, and plausible start dates, demographics of the country).
    Maintain consistency (e.g., consistent formatting for names, consistent department names, and valid email formats).
    Validate the data to avoid duplicates or errors.
"""
context = kernel.create_new_context()
context["job"] = job
context["department_name"] = department_name
context["hospital_name"] = hospital_name
context["country"] = country
context["template"] = template
context["sample_data"] = sample_data
context["instructions"] = instructions

result = await kernel.run_async(function,input_context=context ) 
str(result)






In [None]:
# Supplies
job="Admin"
department_name="Laboratory"
hospital_name= "XYZ Hospital,Bengaluru,Karnataka"
country= "India"
template="'Department', 'Item Name', 'Supplier Category', 'Supplier Number'"
sample_data= "'Facilities', 'Natraj Pencil', 'Stationary', '123456'"
instructions="""
    Use randomization methods to generate synthetic data. For numerical fields (e.g., Supplier Number), ensure uniqueness.
    Ensure that the data adheres to realistic ranges (e.g., salaries within industry norms, valid job titles, and plausible start dates, demographics of the country).
    Maintain consistency (e.g., consistent formatting for names, consistent department names, and valid email formats).
    Validate the data to avoid duplicates or errors.
"""
context = kernel.create_new_context()
context["job"] = job
context["department_name"] = department_name
context["hospital_name"] = hospital_name
context["country"] = country
context["template"] = template
context["sample_data"] = sample_data
context["instructions"] = instructions

result = await kernel.run_async(function,input_context=context ) 
str(result)








In [None]:
# Contracts
job="Admin"
department_name="Laboratory"
hospital_name= "XYZ Hospital,Bengaluru,Karnataka"
country= "India"
template="'Description','Supplier Name'"
sample_data= "'Floor Cleaning','CleanTech'"
instructions="""
    Use randomization methods to generate synthetic data. For categorical fields (e.g., Supplier Name), ensure uniqueness.
    Ensure that the data adheres to realistic ranges (e.g., salaries within industry norms, valid job titles, and plausible start dates, demographics of the country).
    Maintain consistency (e.g., consistent formatting for names, consistent department names, and valid email formats).
    Validate the data to avoid duplicates or errors.
"""
context = kernel.create_new_context()
context["job"] = job
context["department_name"] = department_name
context["hospital_name"] = hospital_name
context["country"] = country
context["template"] = template
context["sample_data"] = sample_data
context["instructions"] = instructions

result = await kernel.run_async(function,input_context=context ) 
str(result)

In [None]:
# Assets
job="Admin"
department_name="Digital Medical Equipments & IT"
hospital_name= "XYZ Hospital,Bengaluru,Karnataka"
country= "India"
template="'Asset ID', 'Description', 'Purchase Date'"
sample_data= "'111','Lenovo IdeaPad Slim 1 Intel Core Celeron N4020  HD (8GB/512GB SSD)','12/03/2024'"
instructions="""
    Use randomization methods to generate synthetic data. For categorical fields (e.g., Supplier Name), ensure uniqueness.
    Ensure that the data adheres to realistic ranges (e.g., salaries within industry norms, valid job titles, and plausible start dates, demographics of the country).
    Maintain consistency (e.g., consistent formatting for names, consistent department names, and valid email formats).
    Validate the data to avoid duplicates or errors.
"""
context = kernel.create_new_context()
context["job"] = job
context["department_name"] = department_name
context["hospital_name"] = hospital_name
context["country"] = country
context["template"] = template
context["sample_data"] = sample_data
context["instructions"] = instructions

result = await kernel.run_async(function,input_context=context ) 
str(result)