In [3]:
from langchain_openai import ChatOpenAI
from langchain_core.pydantic_v1 import BaseModel, Field
from langchain_core.output_parsers import JsonOutputParser
from langchain_core.prompts import PromptTemplate
from typing import List
import pandas as pd
import numpy as np
import json

In [4]:
# Chat model client used by the chain below. It targets a custom base_url
# (host THOTH.local, port 1234) instead of the default OpenAI endpoint.
# NOTE(review): presumably a local OpenAI-compatible server, which would explain
# the placeholder api_key "na" — confirm before pointing this at a real service.
llm = ChatOpenAI(base_url="http://THOTH.local:1234/v1/", api_key="na", temperature=0.5)

In [5]:
# Specification of a single dataset column, as the model is asked to produce it.
# NOTE(review): documented with `#` comments rather than a class docstring on
# purpose — pydantic is expected to copy a model docstring into the generated
# schema, which would change the schema text fed to the prompt. Confirm before
# converting these comments to a docstring.
class DatasetCol(BaseModel):
    # Column label; create_dataframe uses it as the DataFrame column key.
    name: str = Field(description="The name of the column.")
    # Declared value type. Informational only: create_dataframe does not cast to it.
    type: str = Field(description="The type of the column. One of 'int', 'float', 'str', 'bool'.")
    description: str = Field(description="A description of the column.")
    # False -> generated in create_dataframe's first pass; True -> second pass,
    # where the function receives a dict of the columns built so far.
    dependent: bool = Field(description="Whether this column is dependent on other columns.")
    # Source text of a callable; create_dataframe passes it to eval().
    # NOTE(review): independent columns are called with the row *index*, not a
    # row dict, so a function that indexes `row[...]` fails there and yields None.
    # NOTE(review): the description below contains typos ("paramter",
    # "accessbile"); it is runtime prompt text, so it is left untouched here.
    function: str = Field(description="A python function that generates the data for this column. The function takes parameters and returns the value for the column in that row. The function takes the row as a paramter with each column accessbile by name (Eg: row[\"name\"]). Independent variables cannot use any other variables in the dataset in their function including themselves.")

# Top-level response model: a dataset design expressed as a list of column specs.
# (Kept as `#` comments, not a docstring, so the generated schema text is unchanged.)
class DatasetGen(BaseModel):
    columns: List[DatasetCol] = Field(description="The columns of the dataset.")

# Build a JSON output parser bound to the DatasetGen model. This line only
# creates the parser; it is used later both to derive the schema text for the
# prompt (parser_to_prompt_schema) and as the final stage of the chain.
parser = JsonOutputParser(pydantic_object=DatasetGen)

In [6]:
def parser_to_prompt_schema(parser):
    """Serialise the parser's model schema as a JSON string for use in a prompt.

    The schema is obtained from ``parser._get_schema(parser.pydantic_object)``.
    Its top-level ``"title"`` and ``"type"`` entries are left out; every other
    entry is kept in its original order. The schema object returned by the
    parser is not modified.

    Args:
        parser: Object exposing ``pydantic_object`` and ``_get_schema``.

    Returns:
        str: ``json.dumps`` of the trimmed schema (double-quoted, valid JSON).
    """
    dropped_keys = ("title", "type")
    full_schema = parser._get_schema(parser.pydantic_object)

    # Filter into a fresh dict so the parser's own schema stays untouched.
    prompt_schema = {
        key: value
        for key, value in full_schema.items()
        if key not in dropped_keys
    }
    return json.dumps(prompt_schema)

In [7]:
def try_eval(fn_str, *params):
    """Evaluate ``fn_str`` as a Python expression and call the result.

    Args:
        fn_str: Source text of an expression that evaluates to a callable,
            e.g. ``"lambda row: row + 1"``.
        *params: Positional arguments forwarded to that callable.

    Returns:
        The callable's return value, or ``None`` if evaluating the expression
        or calling the result raises an ``Exception``.
    """
    # SECURITY: eval() executes arbitrary code. In this notebook fn_str is
    # model-generated text, so only run it against a model/endpoint you trust.
    try:
        return eval(fn_str)(*params)
    except Exception:
        # Deliberately best-effort: a broken generated function produces a
        # missing value instead of aborting the column. Catching Exception
        # (not a bare except) keeps KeyboardInterrupt/SystemExit working, so
        # the cell can still be interrupted.
        return None


def create_dataframe(data_template, n):
    """Build an ``n``-row DataFrame from a column-spec template.

    Args:
        data_template: Mapping with a ``'columns'`` list. Each entry is a
            mapping with ``'name'``, ``'dependent'`` (bool) and ``'function'``
            (source text of a one-argument callable).
        n: Number of rows to generate.

    Returns:
        pandas.DataFrame with one column per spec: independent columns first,
        then dependent columns, each group in template order.

    Raises:
        Exception: anything raised while evaluating or calling a *dependent*
            column's function propagates to the caller. Failures in
            independent columns are turned into ``None`` by ``try_eval``.
    """
    columns = data_template['columns']
    data = {}

    # Pass 1: independent columns. The function is called with the row index
    # (an int), not a row dict; any failure becomes None via try_eval.
    for column in columns:
        if not column['dependent']:
            fn_str = column['function']
            data[column['name']] = [try_eval(fn_str, i) for i in range(n)]

    # Pass 2: dependent columns. Each function receives a dict of every column
    # built so far, including dependent columns produced earlier in this pass.
    for column in columns:
        if column['dependent']:
            fn_str = column['function']
            # Snapshot the rows once per column instead of rebuilding the
            # DataFrame for every row (which made this pass quadratic in n).
            if data:
                rows = pd.DataFrame(data).to_dict('records')
            else:
                # No columns exist yet: hand each function an empty row rather
                # than failing with IndexError on an empty record list.
                rows = [{} for _ in range(n)]
            # SECURITY: same eval() caveat as try_eval — fn_str is generated text.
            data[column['name']] = [eval(fn_str)(rows[i]) for i in range(n)]

    return pd.DataFrame(data)


In [8]:
# Prompt text sent to the model. It has two placeholders:
#   {subject} - supplied per call via chain.invoke(...)
#   {schema}  - pre-filled once, below, through partial_variables
# NOTE(review): "containing 20-30 columns each" reads oddly for a single
# dataset; it is runtime prompt text, so it is left as written — confirm intent.
template = '''
You are a helpful assistant that answers in JSON.
Generate an artificial medical research dataset containing 20-30 columns each.
The subject of this dataset is: {subject}.
For each column provide a python function using numpy to generate the data.

Here's the json schema you must adhere to:
<schema>
{schema}
</schema>
'''
# Bind the schema now so callers only have to provide "subject".
# parser_to_prompt_schema(parser) runs once, when this cell executes.
prompt = PromptTemplate(
    template=template,
    input_variables=["subject"],
    partial_variables={"schema": parser_to_prompt_schema(parser)},
)


In [9]:
# Compose the pipeline: render the prompt -> call the model -> parse the reply.
chain = prompt | llm | parser
# Single model call. The result is indexed by "columns" below and is later
# passed to create_dataframe, so it is expected to be a dict with that key.
data_template = chain.invoke({"subject": "blindness in diabetes"})

# Show the generated column specs in full (no cell-text truncation, no row
# limit). These are global pandas options and remain in effect for later cells.
pd.set_option('display.max_colwidth', None)
pd.set_option('display.max_rows', None)
display(pd.DataFrame(data_template["columns"]))

Unnamed: 0,name,type,description,dependent,function
0,Patient_ID,int,Unique identifier for each patient.,False,lambda row: int(row['Patient_ID'])
1,Age,int,The age of the patient in years.,False,"lambda row: np.random.randint(18, 80)"
2,Gender,str,The gender of the patient (Male/Female).,False,"lambda row: np.random.choice(['M', 'F'])"
3,Diabetes_Type,str,Type of diabetes the patient has (Type 1/Type 2).,False,"lambda row: np.random.choice(['T1', 'T2'])"
4,Blood_Glucose_Level,float,The blood glucose level of the patient in mg/dL.,False,"lambda row: np.random.uniform(70, 300)"
5,HbA1c,float,The HbA1c level of the patient (in %).,False,"lambda row: np.random.uniform(4, 10)"
6,Blood_Pressure_Systolic,int,The systolic blood pressure of the patient in mmHg.,False,"lambda row: np.random.randint(80, 140)"
7,Blood_Pressure_Diastolic,int,The diastolic blood pressure of the patient in mmHg.,False,"lambda row: np.random.randint(40, 90)"
8,Cholesterol_Level,float,The cholesterol level of the patient in mg/dL.,False,"lambda row: np.random.uniform(100, 300)"
9,Smoker,bool,Whether the patient is a smoker or not.,False,"lambda row: np.random.choice([True, False])"


In [10]:
# Materialise 100 synthetic rows from the generated column specs.
# NOTE(review): no random seed is set anywhere above and the generated
# functions shown in the previous output draw from np.random, so this table
# differs on every run. The empty Patient_ID column in the output is consistent
# with its generated function indexing `row[...]` while independent columns are
# called with an integer index (the failure is swallowed and becomes None).
df = create_dataframe(data_template, 100)
display(df)

Unnamed: 0,Patient_ID,Age,Gender,Diabetes_Type,Blood_Glucose_Level,HbA1c,Blood_Pressure_Systolic,Blood_Pressure_Diastolic,Cholesterol_Level,Smoker,...,Retinopathy_Status,Age_at_Onset,Duration_of_Blindness,Treatment_Type,Treatment_Duration,Follow_up_Visits,Visual_Aid_Use,Eye_Examination_Date,Diabetes_Control,Complications
0,,43,M,T2,216.194095,5.625996,134,85,202.955529,True,...,No,,,,,,,,,No
1,,65,M,T2,70.439844,6.087691,125,74,131.95473,False,...,No,,,,,,,,,No
2,,61,M,T1,148.560235,5.151963,126,61,153.481332,True,...,No,,,Insulin,5.0,,,,,No
3,,71,F,T1,138.006827,8.613487,115,71,247.591967,True,...,No,,,Insulin,3.0,,,,Good,No
4,,25,F,T2,272.665293,5.827095,115,53,126.982635,False,...,No,,,,,,,,,No
5,,25,F,T2,160.193434,5.42464,80,50,282.857252,True,...,No,,,,,,,,,No
6,,38,F,T1,125.579127,7.449622,98,52,228.385735,True,...,No,,,Insulin,8.0,,,,,No
7,,46,M,T2,273.330431,6.039712,110,72,263.90071,True,...,No,,,,,,,,,No
8,,21,M,T1,146.633127,9.294553,127,69,125.37949,False,...,No,,,Insulin,3.0,,,,Good,No
9,,74,F,T1,150.443148,9.838818,132,47,283.304995,False,...,No,,,Oral Medication,4.0,,,,Fair,No
