In [1]:
from langchain_openai import ChatOpenAI
from langchain_core.pydantic_v1 import BaseModel, Field
from langchain_core.output_parsers import JsonOutputParser
from langchain_core.prompts import PromptTemplate
from typing import List
import pandas as pd
import numpy as np
import json
import warnings

In [2]:
# Chat model client pointed at an OpenAI-compatible endpoint on the local network.
# NOTE(review): "na" looks like a placeholder key for a server that does not check it — confirm.
llm = ChatOpenAI(
    base_url="http://THOTH.local:1234/v1/",
    api_key="na",
    temperature=0.5,
)

In [9]:
# Blueprint for one generated column, as requested from the LLM.
# DatasetCol is nested in DatasetGen, whose schema is pasted into the prompt via
# `parser`, so the Field descriptions below double as instructions to the model.
# NOTE(review): documented with `#` comments on purpose — a class docstring would
# presumably surface in the generated schema and change the prompt; confirm.
class DatasetCol(BaseModel):
    name: str = Field(description="The name of the column.")
    type: str = Field(description="The type of the column. One of 'int', 'float', 'str', 'bool'.")
    description: str = Field(description="A description of the column.")
    # Decides which generation pass in create_df_from_blueprint handles the column.
    dependent: bool = Field(description="Whether this column is dependent on other columns. Dependent variables are generated by functions that take one or more independent variables as input and then transform them to get the final value.")
    outcome_col: bool = Field(description="Whether this column is an outcome column.")
    # Source text of a callable; it is evaluated with eval() by try_eval.
    function: str = Field(description="A python function that generates the data for this column. The function takes parameters and returns the value for the column in that row. The function takes the row as a paramter with each column accessbile by name (Eg: row[\"name\"]). Independent variables cannot use any other variables in the dataset in their function including themselves.")

# Top-level object the LLM must return: the list of column blueprints.
class DatasetGen(BaseModel):
    columns: List[DatasetCol] = Field(description="The columns of the dataset.")

# Set up a JSON output parser. DatasetGen is attached as `pydantic_object` so that
# parser_to_prompt_schema can derive the schema text injected into the prompt template.
parser = JsonOutputParser(pydantic_object=DatasetGen)

In [10]:
def parser_to_prompt_schema(parser):
    """Render the parser's pydantic schema as a JSON string for use in a prompt.

    The top-level "title" and "type" entries are dropped; every other entry is
    kept in its original order. The dict returned by the parser is not modified.
    """
    full_schema = parser._get_schema(parser.pydantic_object)
    dropped_keys = ("title", "type")
    trimmed = {key: value for key, value in full_schema.items() if key not in dropped_keys}
    # json.dumps keeps the JSON shown to the model well-formed with double quotes.
    return json.dumps(trimmed)

In [11]:
def try_eval(fn_str, *params):
    """Evaluate `fn_str` as a Python expression and call the result with `params`.

    Parameters
    ----------
    fn_str : str
        Source of an expression that evaluates to a callable (e.g. a lambda).
    *params
        Positional arguments forwarded to that callable.

    Returns
    -------
    The callable's return value, or None if evaluating or calling it raises.

    SECURITY: `fn_str` is executed with `eval`. In this notebook the strings come
    from model output, i.e. untrusted text; only run this in an environment where
    arbitrary code execution is acceptable.
    """
    try:
        return eval(fn_str)(*params)
    except Exception as exc:
        # Keep the best-effort contract (None on failure) but surface the reason;
        # a bare `except:` hid every error and also swallowed KeyboardInterrupt.
        warnings.warn(
            f"try_eval failed for {fn_str!r}: {type(exc).__name__}: {exc}",
            stacklevel=2,
        )
        return None
def create_df_from_blueprint(data_template, n):
    """Build an `n`-row DataFrame by evaluating each column's generator function.

    Parameters
    ----------
    data_template : dict
        Blueprint with a 'columns' list. Each entry must provide 'name',
        'dependent' (bool) and 'function' (source text of a one-argument callable).
    n : int
        Number of rows to generate.

    Returns
    -------
    pandas.DataFrame
        Independent columns first, then dependent columns, each group in
        blueprint order. A cell is None when its function failed (see try_eval).

    Notes
    -----
    Independent functions are called with the integer row index. Dependent
    functions are called with a dict of the values generated so far for that
    row, so a dependent column can read dependent columns listed before it.
    """
    columns = data_template['columns']
    data = {}

    # Pass 1: independent columns, one call per row index.
    for column in columns:
        if not column['dependent']:
            data[column['name']] = [try_eval(column['function'], i) for i in range(n)]

    # Pass 2: dependent columns, one call per row record.
    for column in columns:
        if column['dependent']:
            # Snapshot the rows once per column; rebuilding the DataFrame for every
            # row made this pass quadratic in n.
            if data:
                rows = pd.DataFrame(data).to_dict('records')
            else:
                # No columns generated yet: hand each function an empty row instead
                # of indexing into an empty record list (IndexError).
                rows = [{} for _ in range(n)]
            data[column['name']] = [try_eval(column['function'], row) for row in rows]

    return pd.DataFrame(data)


In [15]:
# Prompt asking the model for a dataset blueprint.
# {subject} is supplied at invoke time; {schema} is pre-filled below with the JSON
# schema string produced by parser_to_prompt_schema(parser).
datagen_template = '''
You are a helpful assistant that answers in JSON.
Generate an artificial medical research dataset containing 30 columns.
The subject of this dataset is: {subject}.
For each column provide a python lambda function using numpy to generate the data.

2-3 of these columns should be an outcome variable. This variable should be generated by a function that takes two or more columns as input and then transforms them to get the final value.

Here's the json schema you must adhere to:
<schema>
{schema}
</schema>
'''
# Bind the schema once so callers only need to pass "subject".
datagen_prompt = PromptTemplate(
    template=datagen_template,
    input_variables=["subject"],
    partial_variables={"schema": parser_to_prompt_schema(parser)},
)


In [19]:
# Pipeline: fill the prompt -> call the model -> parse the reply as JSON.
datagen_chain = datagen_prompt | llm | parser
datagen_blueprint = datagen_chain.invoke({"subject": "age at first stroke"})

# Lift the column-width limit so the generated function strings are shown in full.
pd.set_option('display.max_colwidth', None)
#pd.set_option('display.max_rows', None)
display(pd.DataFrame(datagen_blueprint["columns"]))

Unnamed: 0,name,type,description,dependent,outcome_col,function
0,patient_id,str,A unique identifier for each patient.,False,False,lambda row: str(row['patient_id'])
1,age_at_first_stroke,int,The age at which the patient experienced their first stroke.,True,False,lambda row: int(row['age'])
2,gender,str,The gender of the patient (Male or Female).,False,False,lambda row: 'Male' if row['gender'] == 1 else 'Female'
3,hypertension,bool,Whether the patient has hypertension.,True,False,lambda row: bool(row['blood_pressure'] > 140)
4,diabetes,bool,Whether the patient has diabetes.,True,False,lambda row: bool(row['blood_sugar'] > 126)
5,smoker,bool,Whether the patient is a smoker.,True,False,lambda row: bool(row['cigarettes_per_day'] > 0)
6,BMI,float,The body mass index of the patient.,True,False,lambda row: row['weight'] / (row['height'] ** 2)
7,stroke_severity,int,"The severity of the stroke, on a scale from 1 to 5.",True,False,lambda row: int(row['NIHSS_score'] / 2)
8,stroke_type,str,The type of stroke (Ischemic or Hemorrhagic).,True,False,lambda row: 'Ischemic' if row['stroke_subtype'] == 1 else 'Hemorrhagic'
9,time_to_treatment,int,The time in minutes from the onset of stroke symptoms to treatment.,True,False,lambda row: int(row['treatment_time'] * 60)


In [20]:
# Materialise 100 rows from the blueprint. A cell is None/NaN when its generator
# function raised inside try_eval, so an all-empty frame means every function failed.
raw_data = create_df_from_blueprint(datagen_blueprint, 100)
display(raw_data)

Unnamed: 0,patient_id,gender,patient_id_2,gender_2,age_at_first_stroke,hypertension,diabetes,smoker,BMI,stroke_severity,...,hypertension_2,diabetes_2,smoker_2,BMI_2,stroke_severity_2,stroke_type_2,time_to_treatment_2,age_at_treatment_2,outcome_variable_1_2,outcome_variable_2_2
0,,,,,,,,,,,...,,,,,,,,,,
1,,,,,,,,,,,...,,,,,,,,,,
2,,,,,,,,,,,...,,,,,,,,,,
3,,,,,,,,,,,...,,,,,,,,,,
4,,,,,,,,,,,...,,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
95,,,,,,,,,,,...,,,,,,,,,,
96,,,,,,,,,,,...,,,,,,,,,,
97,,,,,,,,,,,...,,,,,,,,,,
98,,,,,,,,,,,...,,,,,,,,,,


In [None]:
import numpy as np
import scipy.stats as stats

def best_cont_dist(col):
    """Pick the continuous distribution that best fits a numeric column.

    Each candidate is fitted with its own `fit` method and scored with the
    Kolmogorov-Smirnov statistic against the data; the lowest statistic wins.
    A candidate whose fit or test raises is reported with `print` and skipped.

    Parameters
    ----------
    col : pandas.Series
        Values to fit. Missing values are dropped and the rest are cast to float.

    Returns
    -------
    dict
        "best_fit_distribution": name of the winning distribution ("" if none),
        "best_fit_params": tuple returned by its `fit` (() if none),
        "best_fit_error": its KS statistic (np.inf if none).
    """
    values = col.dropna().astype(float)

    best_name = ""
    best_params = ()
    best_error = np.inf

    # Nothing to fit: return the "no fit" result directly rather than letting every
    # candidate fail and print an error.
    if values.empty:
        return {
            "best_fit_distribution": best_name,
            "best_fit_params": best_params,
            "best_fit_error": best_error,
        }

    candidates = (
        stats.uniform,
        stats.norm,
        stats.expon,
        stats.gamma,
        stats.lognorm,
        stats.beta,
    )

    for distribution in candidates:
        try:
            params = distribution.fit(values)
            # KS statistic between the data and the fitted distribution.
            error = stats.kstest(values, distribution.name, args=params).statistic
        except Exception as e:
            print(f"Error fitting {distribution.name}: {e}")
            continue

        if error < best_error:
            best_name = distribution.name
            best_params = params
            best_error = error

    return {
        "best_fit_distribution": best_name,
        "best_fit_params": best_params,
        "best_fit_error": best_error,
    }

In [None]:
def summarize_dataframe(df):
    """Summarise every column of a DataFrame (numeric and categorical).

    The result is a list of plain dicts, intended as compact input for a large
    language model that has to select relevant columns for a specific question.

    Parameters
    ----------
    df : pandas.DataFrame

    Returns
    -------
    list of dict
        One entry per column, in column order, with keys:
        'name'    - the column label,
        'type'    - the dtype as a string,
        'summary' - `describe()` statistics for numeric (non-bool) columns,
                    value counts for bool and all other columns.
    """
    summary = []

    for column in df.columns:
        series = df[column]

        # Bool columns count as numeric in pandas, so test for them first and
        # summarise them by value counts like categorical data.
        if series.dtype == 'bool' or not pd.api.types.is_numeric_dtype(series):
            column_stats = series.value_counts().to_dict()
        else:
            column_stats = series.describe().to_dict()

        # TODO: distribution fitting (best_cont_dist) is currently not wired in;
        # the warnings-suppression block that wrapped it was a no-op and was removed.
        summary.append({
            'name': column,
            'type': str(series.dtype),
            'summary': column_stats,
        })

    return summary
# Summarise the generated data; the bare `summary` expression renders it as the cell output.
summary = summarize_dataframe(raw_data)
summary
# Disabled: relies on a "dist" key that summarize_dataframe does not currently produce.
#display(pd.DataFrame([(bp["name"], bp["function"], bf) for (bp, bf) in zip(datagen_blueprint["columns"], [s["dist"] for s in summary])]))

In [None]:
# Prompt for the column-selection step: the model reviews a dataset summary and
# picks independent / dependent variables for a model analysis.
# NOTE(review): the template still carries the "provide a python function ... to
# generate the data" line and is filled with the dataset-generation schema, and it
# has no placeholder for the summary itself — all of this looks copied from the
# generation prompt and needs a follow-up before the prompt is usable.
dataread_template = '''
You are a helpful assistant that answers in JSON.

Your task is to review a summary of a dataset and return which columns should be included as independent and dependent variables in a machine learning model analysis.

The subject of this dataset is: {subject}.
For each column provide a python function using numpy to generate the data.

Here's the json schema you must adhere to:
<schema>
{schema}
</schema>
'''
# Build the prompt from dataread_template under its own name. The original cell
# rebuilt `datagen_prompt` from `datagen_template`, so this template was never used.
dataread_prompt = PromptTemplate(
    template=dataread_template,
    input_variables=["subject"],
    partial_variables={"schema": parser_to_prompt_schema(parser)},
)


In [None]:
# Scratch check of a blueprint-style generator: ignores its argument and is True
# when a uniform draw from [0, 1) falls below 0.1.
(lambda row: np.random.rand() < 0.1)(None)

In [None]:
from interpret.glassbox import ExplainableBoostingClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import roc_auc_score

In [None]:
# Target column for the models fitted below.
# NOTE(review): assumes the generated blueprint contains a column with this name — confirm.
outcome_col = 'quality_of_life'

# Drop columns that are entirely missing, then every row that still has a missing value.
data = (
    raw_data
    .dropna(axis='columns', how='all')
    .dropna(axis='index', how='any')
)

# Split into target and features.
y = data[outcome_col]
X = data.drop(columns=[outcome_col])

In [None]:
# Hold out 20% of the rows for evaluation (fixed seed for a repeatable split).
# The features are `X` from the previous cell; `raw_X` was never defined and
# raised NameError on a fresh kernel.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.20, random_state=0)

In [None]:
# Fit an Explainable Boosting classifier on the training split.
ebm = ExplainableBoostingClassifier()
ebm.fit(X_train, y_train)

# Score on the held-out split using the probability in column 1.
# NOTE(review): taking column 1 presumes a binary outcome — confirm.
auc = roc_auc_score(y_test, ebm.predict_proba(X_test)[:, 1])
print("AUC: {:.3f}".format(auc))

In [None]:
# Comparison model: a single classification tree trained and scored on the same split.
from interpret.glassbox import ClassificationTree
dt = ClassificationTree(random_state=0)
dt.fit(X_train, y_train)

# Overwrites `auc` from the EBM cell with the tree's score.
auc = roc_auc_score(y_test, dt.predict_proba(X_test)[:, 1])
print("AUC: {:.3f}".format(auc))

In [None]:

# Load the UCI "adult" data file over HTTP. It is read with header=None, so the
# column names are assigned by hand below.
# NOTE(review): `df` is not referenced by any later cell visible here — confirm it is still needed.
df = pd.read_csv(
    "https://archive.ics.uci.edu/ml/machine-learning-databases/adult/adult.data",
    header=None)
df.columns = [
    "Age", "WorkClass", "fnlwgt", "Education", "EducationNum",
    "MaritalStatus", "Occupation", "Relationship", "Race", "Gender",
    "CapitalGain", "CapitalLoss", "HoursPerWeek", "NativeCountry", "Income"
]

In [None]:
from interpret import set_visualize_provider
from interpret.provider import InlineProvider
# Select interpret's inline provider for the explanations displayed with show().
set_visualize_provider(InlineProvider())
from sklearn.model_selection import train_test_split
from sklearn.metrics import roc_auc_score

from interpret.glassbox import ExplainableBoostingClassifier, ExplainableBoostingRegressor
from interpret import show

In [None]:
# Seed numpy's global RNG and the split so this cell is repeatable.
seed = 42
np.random.seed(seed)
# Re-split X / y (defined in an earlier cell); this replaces the earlier train/test variables.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.20, random_state=seed)

# Rebinds `ebm`, previously the classifier, to a regressor fitted on the new split.
ebm = ExplainableBoostingRegressor()
ebm.fit(X_train, y_train)

In [None]:
# Display the global explanation of the most recently fitted `ebm`.
show(ebm.explain_global())