In [1]:
from langchain_openai import ChatOpenAI
from langchain_core.pydantic_v1 import BaseModel, Field
from langchain_core.output_parsers import JsonOutputParser
from langchain_core.prompts import PromptTemplate
from typing import List
import pandas as pd
import numpy as np
import json
import warnings
from langchain_core.messages import HumanMessage, SystemMessage

from interpret import set_visualize_provider
from interpret.provider import InlineProvider
set_visualize_provider(InlineProvider())
from sklearn.model_selection import train_test_split
from sklearn.metrics import roc_auc_score

from interpret.glassbox import ExplainableBoostingClassifier, ExplainableBoostingRegressor
from interpret import show
pd.set_option('display.max_columns', None)

In [2]:
llm = ChatOpenAI(base_url="http://THOTH.local:1234/v1/", api_key="na", temperature=0.5)

In [3]:
datagen_template = '''
Generate an artificial medical research dataset containing 50 columns.
The subject of this dataset is: {subject}.

Generate a total of 100 rows and return the dataset as a CSV object.
'''
datagen_prompt = PromptTemplate(
    template=datagen_template,
    input_variables=["subject"],
)
datagen_chain = datagen_prompt | llm
datagen_output = datagen_chain.invoke({"subject": "age at first stroke"})


In [4]:
from io import StringIO

In [5]:
#extract the CSV data from the blueprint
resp = datagen_output.content
#extract the part of the string surrounded by ```csv
csv_start = resp.find("```csv")
csv_end = resp.find("```", csv_start+1)
csv_data = resp[csv_start+6:csv_end]

In [6]:
resp

'Here is a CSV object with 50 columns and 100 rows representing data on age at first stroke:\n\n```csv\nAge,Gender,Race,Ethnicity,Smoker,Alcohol_Consumption,Blood_Pressure,Pulse_Rate,Cholesterol,HDL_Cholesterol,Trial,Medication1,Medication2,Medication3,Fasting_Glucose,Diabetes,Hypertension,Hyperlipidemia,Atrial_Fibrillation,Heart_Disease,Stroke_Type,Stroke_Severity,Time_to_Onset,Mobility,Living_Arrangements,Cognitive_Function,Depression,Sleep_Disorder,Anxiety,Pain,Activity_Level,Body_Mass_Index,Blood_Glucose,Fasting_Bilirubin,Prothrombin_Time,Partial_Thromboplastin_Time,International_Normalized_Ratio,Age_at_Onset_of_First_Stroke\n25,Male,Caucasian,Non-Hispanic White,No,Light,120/80,70,180,60,Trial1,Painkiller,N/A,N/A,100,0,0,0,0,Ischemic,Severe,2 years,Independent,Living with family,Normal,Asymptomatic,Mild,Insomnia,Low,3.5,95,1.2,11.8,1.4,1\n45,Female,African American,Non-Hispanic Black,Yes,Heavy,140/90,80,200,50,Trial2,N/A,Acetaminophen,N/A,110,0,1,0,0,Hemorrhagic,Moderate,5 years,De

In [7]:
#parse the CSV data into a pandas dataframe
raw_data = pd.read_csv(StringIO(csv_data))
raw_data

Unnamed: 0,Age,Gender,Race,Ethnicity,Smoker,Alcohol_Consumption,Blood_Pressure,Pulse_Rate,Cholesterol,HDL_Cholesterol,Trial,Medication1,Medication2,Medication3,Fasting_Glucose,Diabetes,Hypertension,Hyperlipidemia,Atrial_Fibrillation,Heart_Disease,Stroke_Type,Stroke_Severity,Time_to_Onset,Mobility,Living_Arrangements,Cognitive_Function,Depression,Sleep_Disorder,Anxiety,Pain,Activity_Level,Body_Mass_Index,Blood_Glucose,Fasting_Bilirubin,Prothrombin_Time,Partial_Thromboplastin_Time,International_Normalized_Ratio,Age_at_Onset_of_First_Stroke
0,25,Male,Caucasian,Non-Hispanic White,No,Light,120/80,70.0,180.0,60.0,Trial1,Painkiller,,,100.0,0.0,0.0,0.0,0.0,Ischemic,Severe,2 years,Independent,Living with family,Normal,Asymptomatic,Mild,Insomnia,Low,3.5,95.0,1.2,11.8,1.4,1.0,,,
1,45,Female,African American,Non-Hispanic Black,Yes,Heavy,140/90,80.0,200.0,50.0,Trial2,,Acetaminophen,,110.0,0.0,1.0,0.0,0.0,Hemorrhagic,Moderate,5 years,Dependent,Living alone,Impaired,Anxious,Severe,Sleep Apnea,High,3.9,105.0,1.4,12.2,1.6,2.0,,,
2,68,Male,Asian,Pacific Islander,No,Light,130/85,65.0,190.0,70.0,Trial3,,,,120.0,0.0,1.0,1.0,0.0,Ischemic,Mild,7 years,Independent,Living with spouse,Normal,Symptomatic,Moderate,OCD,Low,3.2,90.0,1.1,11.5,1.3,3.0,,,
3,...,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,


In [8]:
import numpy as np
import scipy.stats as stats

def best_cont_dist(col):
    col = col.dropna()  # Remove any missing values
    col = col.astype(float)  # Convert to float
    # Calculate basic statistics
    mean = np.mean(col)
    median = np.median(col)
    std_dev = np.std(col)
    
    # Fit different distributions and find the best fit
    distributions = [
        stats.uniform,  # Uniform distribution
        stats.norm,  # Normal distribution
        stats.expon,  # Exponential distribution
        stats.gamma,  # Gamma distribution
        stats.lognorm,  # Log-normal distribution
        stats.beta,  # Beta distribution
    ]
    
    best_fit = None
    best_fit_name = ""
    best_fit_params = ()
    best_fit_error = np.inf
    
    for distribution in distributions:
        try:
            # Fit the distribution to the data
            params = distribution.fit(col)
            
            # Calculate the error between the fitted distribution and the data
            error = stats.kstest(col, distribution.name, args=params).statistic
            
            # Update the best fit if the error is lower
            if error < best_fit_error:
                best_fit = distribution
                best_fit_name = distribution.name
                best_fit_params = params
                best_fit_error = error
        except Exception as e:
            print(f"Error fitting {distribution.name}: {e}")
    
    return {
        "best_fit_distribution": best_fit_name,
        "best_fit_params": best_fit_params,
        "best_fit_error": best_fit_error
    }

In [9]:
#create a function which summarizes every column in a pandas dataframe (both numeric and categorical) and returns a json array which can then be used by a large language model to select relevant columns for answering a specific question
def summarize_dataframe(df):
    summary = []
    
    # Iterate over each column in the dataframe
    for column in df.columns:
        column_summary = {}
        column_summary['name'] = column
        column_summary['type'] = str(df[column].dtype)

        # Check if the column is boolean
        if df[column].dtype == 'bool':
            column_summary['summary'] = df[column].value_counts().to_dict()
            column_summary["dist"] = "Binary"                

        # Check if the column is numeric
        elif pd.api.types.is_numeric_dtype(df[column]):
            column_summary['summary'] = df[column].describe().to_dict()
            with warnings.catch_warnings():
                warnings.simplefilter('ignore')
                column_summary["dist"] = best_cont_dist(df[column])
                        
        else:
            column_summary['summary'] = df[column].value_counts().to_dict()
            column_summary["dist"] = None
        
        summary.append(column_summary)
    
    return summary
summary = summarize_dataframe(raw_data)
summary

Error fitting uniform: zero-size array to reduction operation minimum which has no identity
Error fitting norm: attempt to get argmax of an empty sequence
Error fitting expon: zero-size array to reduction operation minimum which has no identity
Error fitting gamma: zero-size array to reduction operation minimum which has no identity
Error fitting lognorm: zero-size array to reduction operation minimum which has no identity
Error fitting beta: zero-size array to reduction operation minimum which has no identity
Error fitting gamma: Optimization converged to parameters that are outside the range allowed by the distribution.
Error fitting gamma: Optimization converged to parameters that are outside the range allowed by the distribution.
Error fitting uniform: zero-size array to reduction operation minimum which has no identity
Error fitting norm: attempt to get argmax of an empty sequence
Error fitting expon: zero-size array to reduction operation minimum which has no identity
Error fitti

[{'name': 'Age',
  'type': 'object',
  'summary': {'25': 1, '45': 1, '68': 1, '...': 1},
  'dist': None},
 {'name': 'Gender',
  'type': 'object',
  'summary': {'Male': 2, 'Female': 1},
  'dist': None},
 {'name': 'Race',
  'type': 'object',
  'summary': {'Caucasian': 1, 'African American': 1, 'Asian': 1},
  'dist': None},
 {'name': 'Ethnicity',
  'type': 'object',
  'summary': {'Non-Hispanic White': 1,
   'Non-Hispanic Black': 1,
   'Pacific Islander': 1},
  'dist': None},
 {'name': 'Smoker',
  'type': 'object',
  'summary': {'No': 2, 'Yes': 1},
  'dist': None},
 {'name': 'Alcohol_Consumption',
  'type': 'object',
  'summary': {'Light': 2, 'Heavy': 1},
  'dist': None},
 {'name': 'Blood_Pressure',
  'type': 'object',
  'summary': {'120/80': 1, '140/90': 1, '130/85': 1},
  'dist': None},
 {'name': 'Pulse_Rate',
  'type': 'float64',
  'summary': {'count': 3.0,
   'mean': 71.66666666666667,
   'std': 7.637626158259734,
   'min': 65.0,
   '25%': 67.5,
   '50%': 70.0,
   '75%': 75.0,
   'max'

In [10]:
class DataGuruSchema(BaseModel):
    predictor_cols: List[str] = Field(description="Dataset columns which will be used to predict the outcome variables.")
    output_cols: List[str] = Field(description="Dataset columns which will be predicted by the input columns.")
    excluded_cols: List[str] = Field(description="Dataset columns which will not be used in the model.")

# Set up a parser + inject instructions into the prompt template.
parser = JsonOutputParser(pydantic_object=DataGuruSchema)

In [11]:
def parser_to_prompt_schema(parser):
    schema = {k: v for k, v in parser._get_schema(parser.pydantic_object).items()}

    # Remove extraneous fields.
    reduced_schema = schema
    if "title" in reduced_schema:
        del reduced_schema["title"]
    if "type" in reduced_schema:
        del reduced_schema["type"]
    # Ensure json in context is well-formed with double quotes.
    schema_str = json.dumps(reduced_schema)

    return schema_str

In [12]:
class ResearchQuestionSchema(BaseModel):
    question: str = Field(description="The research question to be answered by the model.")
    outcome: str = Field(description="A single outcome column from the dataset to be predicted by the model.")
    predictors: List[str] = Field(description="The input columns to be used to predict the outcome column. Try to include as many columns as possible to improve the model's accuracy.")
    model_type: List[str] = Field(description="The sklearn model types to be used for prediction. May include multiple to find the best option. Options include: 'LinearRegression', 'LogisticRegression', 'RandomForestClassifier', 'RandomForestRegressor', 'GradientBoostingClassifier', 'GradientBoostingRegressor', 'SVC', 'KNeighborsClassifier', 'KNeighborsRegressor', 'DecisionTreeClassifier', 'DecisionTreeRegressor', etc")
class ResearcherSchema(BaseModel):
    topics: List[ResearchQuestionSchema] = Field(description="A list of research questions to be answered by the model.")
# Set up a parser + inject instructions into the prompt template.
researcher_parser = JsonOutputParser(pydantic_object=ResearcherSchema)

In [13]:
summary

[{'name': 'Age',
  'type': 'object',
  'summary': {'25': 1, '45': 1, '68': 1, '...': 1},
  'dist': None},
 {'name': 'Gender',
  'type': 'object',
  'summary': {'Male': 2, 'Female': 1},
  'dist': None},
 {'name': 'Race',
  'type': 'object',
  'summary': {'Caucasian': 1, 'African American': 1, 'Asian': 1},
  'dist': None},
 {'name': 'Ethnicity',
  'type': 'object',
  'summary': {'Non-Hispanic White': 1,
   'Non-Hispanic Black': 1,
   'Pacific Islander': 1},
  'dist': None},
 {'name': 'Smoker',
  'type': 'object',
  'summary': {'No': 2, 'Yes': 1},
  'dist': None},
 {'name': 'Alcohol_Consumption',
  'type': 'object',
  'summary': {'Light': 2, 'Heavy': 1},
  'dist': None},
 {'name': 'Blood_Pressure',
  'type': 'object',
  'summary': {'120/80': 1, '140/90': 1, '130/85': 1},
  'dist': None},
 {'name': 'Pulse_Rate',
  'type': 'float64',
  'summary': {'count': 3.0,
   'mean': 71.66666666666667,
   'std': 7.637626158259734,
   'min': 65.0,
   '25%': 67.5,
   '50%': 70.0,
   '75%': 75.0,
   'max'

In [14]:
researcher_template = '''
You are a helpful assistant that answers in JSON.

Your task is to review a summary of a dataset and develop a list of 3-5 research questions to be answered using the available data.

Below is a summary of the dataset in JSON format:
{summary}

Here's the json schema you must adhere to for your response:
<schema>
{schema}
</schema>
'''

researcher_prompt = PromptTemplate(
    template=researcher_template,
    input_variables=["subject", "summary"],
    partial_variables={"schema": parser_to_prompt_schema(researcher_parser)},
)
researcher_chain = researcher_prompt | llm | parser
researcher_output = researcher_chain.invoke({"summary": json.dumps({"columns": summary})})
display(researcher_output)

{'topics': [{'question': 'What are the factors that contribute to high blood pressure among the study participants?',
   'outcome': 'Blood_Pressure',
   'predictors': ['Age',
    'Gender',
    'Race',
    'Ethnicity',
    'Smoker',
    'Alcohol_Consumption'],
   'model_type': ['LinearRegression']},
  {'question': 'Is there a correlation between cholesterol levels and the risk of heart disease?',
   'outcome': 'Heart_Disease',
   'predictors': ['Cholesterol',
    'HDL_Cholesterol',
    'Age',
    'Gender',
    'Race',
    'Ethnicity'],
   'model_type': ['LogisticRegression']},
  {'question': 'Can we predict the likelihood of developing diabetes based on certain health factors?',
   'outcome': 'Diabetes',
   'predictors': ['Fasting_Glucose', 'Age', 'Gender', 'Race', 'Ethnicity'],
   'model_type': ['RandomForestClassifier']}]}

In [15]:
reasearch_results = []

for topic in researcher_output["topics"]:
    print(f"Research Question: {topic['question']}")
    print(f"Outcome: {topic['outcome']}")
    print(f"Predictors: {topic['predictors']}")

    clean_data = raw_data.dropna(subset=topic["predictors"] + [topic["outcome"]])
    
    X = clean_data[topic["predictors"]]
    y = clean_data[topic["outcome"]]
    
    # Split the data into training and testing sets
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

    if pd.api.types.is_numeric_dtype(y):
        print("Generating a predictor for a continuous value")
        ebm = ExplainableBoostingRegressor()
        ebm.fit(X_train, y_train)
        print("Score:", ebm.score(X_test, y_test))
        reasearch_results.append({"topic": topic, "model": ebm})
    else:
        ebm = ExplainableBoostingClassifier()
        ebm.fit(X_train, y_train)
        print("Score:", ebm.score(X_test, y_test))
        reasearch_results.append({"topic": topic, "model": ebm})

Research Question: What are the factors that contribute to high blood pressure among the study participants?
Outcome: Blood_Pressure
Predictors: ['Age', 'Gender', 'Race', 'Ethnicity', 'Smoker', 'Alcohol_Consumption']




Score: 0.0
Research Question: Is there a correlation between cholesterol levels and the risk of heart disease?
Outcome: Heart_Disease
Predictors: ['Cholesterol', 'HDL_Cholesterol', 'Age', 'Gender', 'Race', 'Ethnicity']




Score: 0.0
Research Question: Can we predict the likelihood of developing diabetes based on certain health factors?
Outcome: Diabetes
Predictors: ['Fasting_Glucose', 'Age', 'Gender', 'Race', 'Ethnicity']
Generating a predictor for a continuous value
Score: nan




In [16]:
dataguru_template = '''
You are a helpful assistant that answers in JSON.

Your task is to review a summary of a dataset and return which columns should be included as predictor and outcome variables in a machine learning model analysis.
Columns that are irrelevant should be excluded.
There may be more than one outcome column.
All columns must be categorized into one of the three categories: predictor, outcome, or excluded. Each column must be assigned to only one of these categories.

The subject of your investigation is: {subject}.

Below is a summary of the dataset in JSON format:
{summary}

Here's the json schema you must adhere to for your response:
<schema>
{schema}
</schema>
'''

dataguru_prompt = PromptTemplate(
    template=dataguru_template,
    input_variables=["subject", "summary"],
    partial_variables={"schema": parser_to_prompt_schema(parser)},
)

dataguru_chain = dataguru_prompt | llm | parser
dataguru_ouptut = dataguru_chain.invoke({"subject": "age at first stroke", "summary": json.dumps({"columns": summary})})
dataguru_ouptut


{'predictor_cols': ['Age',
  'Gender',
  'Race',
  'Ethnicity',
  'Smoker',
  'Alcohol_Consumption',
  'Blood_Pressure',
  'Pulse_Rate',
  'Cholesterol',
  'HDL_Cholesterol',
  'Trial',
  'Medication1',
  'Medication2',
  'Fasting_Glucose',
  'Diabetes',
  'Hypertension',
  'Hyperlipidemia',
  'Atrial_Fibrillation',
  'Heart_Disease',
  'Stroke_Type',
  'Stroke_Severity',
  'Time_to_Onset',
  'Mobility',
  'Living_Arrangements',
  'Cognitive_Function',
  'Depression',
  'Sleep_Disorder',
  'Anxiety',
  'Pain',
  'Activity_Level',
  'Body_Mass_Index',
  'Blood_Glucose',
  'Fasting_Bilirubin',
  'Prothrombin_Time'],
 'output_cols': ['Age_at_Onset_of_First_Stroke'],
 'excluded_cols': []}

In [22]:
clean_data = raw_data.dropna(axis=1, how="all").dropna()
X = clean_data[dataguru_ouptut["predictor_cols"]]
y = clean_data[dataguru_ouptut["output_cols"][0]]

KeyError: 'Age_at_Onset_of_First_Stroke'

In [None]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.20, random_state=0)

if pd.api.types.is_numeric_dtype(y):
    ebm = ExplainableBoostingRegressor()
    ebm.fit(X_train, y_train)
else:
    ebm = ExplainableBoostingClassifier()
    ebm.fit(X_train, y_train)
    print("Score:", ebm.score(X_test, y_test))

ValueError: With n_samples=0, test_size=0.2 and train_size=None, the resulting train set will be empty. Adjust any of the aforementioned parameters.

In [None]:
ebm.score(X_train, y_train)

In [None]:
show(ebm.explain_global())