# Train an IntegratedML model on Readmission Dataset
## Use JDBC to connect to InterSystems IRIS database
This Notebook demonstrates:
- Using the JayDeBeApi Python library to connect to InterSystems IRIS
- Creating views to segment data into training and test sets
- Defining and training an IntegratedML model to predict marketing campaign responses
- Comparing the resulting model's predictions to data in the test set (that the model was not trained on)
- Using the IntegratedML "VALIDATE MODEL" command to calculate accuracy metrics on the test set data

In [1]:
!conda activate
!pip3 install jupyterthemes

/bin/sh: 1: conda: not found


In [2]:
!jt -r

Reset css and font defaults in:
/root/.jupyter/custom &
/root/.local/share/jupyter/nbextensions


In [3]:
from IPython.core.display import display, HTML
display(HTML("<style>.container { width:100% !important; }</style>"))

!ls /tf/intersystems-jdbc-3.1.0.jar
RUN apt-get update && \
    DEBIAN_FRONTEND=noninteractive \
    apt-get -y install default-jre-headless && \
    apt-get clean && \
    rm -rf /var/lib/apt/lists/*

In [4]:
!pip install JayDeBeApi



In [5]:
!pip list JPype1

Package                Version
---------------------- -----------
absl-py                0.9.0
asn1crypto             0.24.0
astunparse             1.6.3
attrs                  19.3.0
backcall               0.1.0
bleach                 3.1.5
cachetools             4.1.0
certifi                2020.4.5.1
chardet                3.0.4
cryptography           2.1.4
cycler                 0.10.0
decorator              4.4.2
defusedxml             0.6.0
entrypoints            0.3
gast                   0.3.3
google-auth            1.14.2
google-auth-oauthlib   0.4.1
google-pasta           0.2.0
grpcio                 1.28.1
h5py                   2.10.0
idna                   2.6
importlib-metadata     1.6.0
ipykernel              5.1.1
ipython                7.14.0
ipython-genutils       0.2.0
ipywidgets             7.5.1
JayDeBeApi             1.2.3
jedi                   0.17.0
Jinja2                 2.11.2
joblib                 1.1.0
JPype1                

### 1. Set environment variables, if necessary

In [6]:
#import os
#os.environ['JAVA_HOME']='C:\Progra~1\Java\jdk1.8.0_241'
#os.environ['CLASSPATH'] = 'C:\interSystems\IRIS20194\dev\java\lib\JDK18\intersystems-jdbc-3.0.0.jar'
#os.environ['HADOOP_HOME']='C:\hadoop\bin'  #winutil binary must be in Hadoop's Home

### 2. Get jdbc connection and cursor

In [7]:

import jaydebeapi
url = "jdbc:IRIS://irisimlsvr:1972/USER" #"jdbc:IRIS://172.17.0.1:8091/USER"
driver = 'com.intersystems.jdbc.IRISDriver'
user = "SUPERUSER"
password = "SYS"
#libx = "C:/InterSystems/IRIS20194/dev/java/lib/JDK18"
#jarfile = "C:/InterSystems/IRIS20194/dev/java/lib/JDK18/intersystems-jdbc-3.0.0.jar"
jarfile = "./intersystems-jdbc-3.1.0.jar"

In [8]:
conn = jaydebeapi.connect(driver, url, [user, password], jarfile)
curs = conn.cursor()

### 3. specify the source data table

In [9]:
dataTable = 'Patient.Readmission'

### 4. Execute a query and display results in Pandas DataFrame

In [10]:
import pandas as pd
from IPython.display import display

df = pd.read_sql("select TOP 20 * from %s" % dataTable, conn)
display(df)

Unnamed: 0,AdmitReason,Com_ANY_MALIGNANCY_INCLUDING_LYMPHOMA_AND_LEUKEMIA_EXCEPT_MALIGNANT_NEOPLASM_OF_SKIN,Com_CEREBROVASCULAR_DISEASE,Com_CHRONIC_PULMONARY_DISEASE,Com_DEMENTIA,Com_DIABETES_WITHOUT_CHRONIC_COMPLICATION,Com_HEMIPLEGIA_OR_PARAPLEGIA,Com_MILD_LIVER_DISEASE,Com_MYOCARDIAL_INFARCTION,Com_PERIPHERY_VASCULAR_DISEASE,...,MxNumAdmitsSixMonth,MxNumAdmitsThreeMonth,MxNumAdmitsTwelveMonth,MxNumDiagOneMonth,MxNumDiagTwelveMonth,MxSmoker,MxStartDateDayOfMonth,MxStartDateDayOfWeek,MxStartDateMonth,MxWillReAdmit
0,SNOMED Code,0,0,0,0,0,0,0,0,0,...,0,0,0,1,1,0,10,7,7,0
1,SNOMED Code,0,0,0,0,1,0,0,0,0,...,0,0,0,1,1,0,10,2,12,0
2,SNOMED Code,0,0,0,0,0,0,0,0,0,...,0,0,0,1,2,0,1,3,10,0
3,SNOMED Code,0,0,0,0,0,0,0,0,1,...,0,0,0,0,1,0,31,5,8,1
4,SNOMED Code,0,0,0,0,0,0,0,0,1,...,1,1,1,1,2,0,4,2,9,0
5,Encounter Inpatient,0,0,0,0,0,0,0,0,0,...,0,0,0,1,1,0,12,5,8,0
6,SNOMED Code,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,5,6,8,1
7,SNOMED Code,0,0,0,0,0,0,0,0,0,...,1,1,1,1,1,0,12,6,8,0
8,Encounter Inpatient,0,0,0,0,0,0,0,0,0,...,0,0,0,1,1,0,11,3,3,0
9,Encounter Inpatient,0,0,0,0,0,0,0,0,0,...,0,0,1,1,2,0,15,2,9,0


In [11]:
# Show number rows
df1 = pd.read_sql("SELECT COUNT(*) FROM %s" % dataTable, conn)
display(df1)

Unnamed: 0,Aggregate_1
0,14047


### Cleaning before retrying

In [12]:
curs.execute("DROP VIEW Patient.ReadmissionTraining")
curs.execute("DROP VIEW Patient.ReadmissionPredict")
curs.execute("DROP MODEL ReadmissionModel")

### 5. Make some views to split training and testing datasets

In [13]:
df1 = pd.read_sql("SELECT COUNT(*) FROM %s WHERE  ID>=11000" % dataTable, conn)
display(df1)

Unnamed: 0,Aggregate_1
0,3048


In [14]:
# Training set view
curs.execute("CREATE VIEW Patient.ReadmissionTraining AS SELECT * FROM Patient.Readmission WHERE ID<11000")
# Prediction set
curs.execute("CREATE VIEW Patient.ReadmissionPredict AS SELECT * FROM Patient.Readmission WHERE ID>=11000")

### 6. Look at Data

In [15]:
display(pd.read_sql("select TOP 20 * from Patient.ReadmissionTraining", conn))

Unnamed: 0,AdmitReason,Com_ANY_MALIGNANCY_INCLUDING_LYMPHOMA_AND_LEUKEMIA_EXCEPT_MALIGNANT_NEOPLASM_OF_SKIN,Com_CEREBROVASCULAR_DISEASE,Com_CHRONIC_PULMONARY_DISEASE,Com_DEMENTIA,Com_DIABETES_WITHOUT_CHRONIC_COMPLICATION,Com_HEMIPLEGIA_OR_PARAPLEGIA,Com_MILD_LIVER_DISEASE,Com_MYOCARDIAL_INFARCTION,Com_PERIPHERY_VASCULAR_DISEASE,...,MxNumAdmitsSixMonth,MxNumAdmitsThreeMonth,MxNumAdmitsTwelveMonth,MxNumDiagOneMonth,MxNumDiagTwelveMonth,MxSmoker,MxStartDateDayOfMonth,MxStartDateDayOfWeek,MxStartDateMonth,MxWillReAdmit
0,SNOMED Code,0,0,0,0,0,0,0,0,0,...,0,0,0,1,1,0,10,7,7,0
1,SNOMED Code,0,0,0,0,1,0,0,0,0,...,0,0,0,1,1,0,10,2,12,0
2,SNOMED Code,0,0,0,0,0,0,0,0,0,...,0,0,0,1,2,0,1,3,10,0
3,SNOMED Code,0,0,0,0,0,0,0,0,1,...,0,0,0,0,1,0,31,5,8,1
4,SNOMED Code,0,0,0,0,0,0,0,0,1,...,1,1,1,1,2,0,4,2,9,0
5,Encounter Inpatient,0,0,0,0,0,0,0,0,0,...,0,0,0,1,1,0,12,5,8,0
6,SNOMED Code,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,5,6,8,1
7,SNOMED Code,0,0,0,0,0,0,0,0,0,...,1,1,1,1,1,0,12,6,8,0
8,Encounter Inpatient,0,0,0,0,0,0,0,0,0,...,0,0,0,1,1,0,11,3,3,0
9,Encounter Inpatient,0,0,0,0,0,0,0,0,0,...,0,0,1,1,2,0,15,2,9,0


In [16]:
display(pd.read_sql("SELECT COUNT(*) rowCount FROM Patient.ReadmissionTraining", conn))

Unnamed: 0,rowCount
0,10999


In [17]:
df.dtypes

AdmitReason                                                                                object
Com_ANY_MALIGNANCY_INCLUDING_LYMPHOMA_AND_LEUKEMIA_EXCEPT_MALIGNANT_NEOPLASM_OF_SKIN        int64
Com_CEREBROVASCULAR_DISEASE                                                                 int64
Com_CHRONIC_PULMONARY_DISEASE                                                               int64
Com_DEMENTIA                                                                                int64
Com_DIABETES_WITHOUT_CHRONIC_COMPLICATION                                                   int64
Com_HEMIPLEGIA_OR_PARAPLEGIA                                                                int64
Com_MILD_LIVER_DISEASE                                                                      int64
Com_MYOCARDIAL_INFARCTION                                                                   int64
Com_PERIPHERY_VASCULAR_DISEASE                                                              int64
Com_RENAL_DISEASE   

In [18]:
len(df.dtypes)

45

### 7. Choose ML framework with "SET ML CONFIGURATION"


In [19]:
curs.execute("SET ML CONFIGURATION %AutoML")

### 8. Create and Train an IntegratedML Model using default settings
IntegratedML only needs a model name, the name of the column that is the target column to predict, and a table (or SELECT query to specify input columns.

This is a simple DDL query that executes immediately.

In [20]:
res = curs.execute("CREATE MODEL ReadmissionModel PREDICTING (MxWillReAdmit) FROM Patient.ReadmissionTraining")

Now that the model is defined, you can TRAIN it, which invokes the AutoML machine learning procedure.

In [21]:
curs.execute("TRAIN MODEL ReadmissionModel")

Once that finishes, you can see some information about the model in the "ML_TRAINED_MODELS" table.

In [22]:
df3 = pd.read_sql("SELECT * FROM INFORMATION_SCHEMA.ML_TRAINED_MODELS", conn)
display(df3)

Unnamed: 0,MODEL_NAME,TRAINED_MODEL_NAME,PROVIDER,TRAINED_TIMESTAMP,MODEL_TYPE,MODEL_INFO
0,bc,bc_t1,AutoML,2022-04-22 07:54:38.592000,classification,"ModelType:TensorFlow Neural Network, Package:T..."
1,CampaignModel,CampaignModel_t1,AutoML,2022-04-22 08:01:09.859000,classification,"ModelType:TensorFlow Neural Network, Package:T..."
2,ReadmissionModel1,ReadmissionModel1_t1,AutoML,2022-04-22 08:15:12.216000,classification,"ModelType:Random Forest, Package:sklearn, Prob..."
3,ReadmissionModel,ReadmissionModel_t1,AutoML,2022-04-22 10:22:01.750000,classification,"ModelType:Random Forest, Package:sklearn, Prob..."


### 9. Compare model output to data it has not seen yet
Now you can use SQL to SELECT data from another table, run the IntegratedML model on this new data, and see how well the predictions match the data!

In [23]:
df4 = pd.read_sql("SELECT PREDICT(ReadmissionModel) AS PredictedReadmit, \
                  MxWillReAdmit AS ActualReadmit FROM Patient.ReadmissionPredict", conn)
display(df4)

Unnamed: 0,PredictedReadmit,ActualReadmit
0,1,1
1,1,1
2,0,0
3,1,0
4,1,0
...,...,...
3043,0,0
3044,0,0
3045,0,0
3046,0,0


### 10. VALIDATE MODEL command calculates accuracy metrics
You can certainly take that output above and calculate the accuracy using a standard formula, but IntegratedML has a built-in function to do that!

Each time you run the command "VALIDATE MODEL..." it generates a set of metrics calculated on the data passed into the query. Since this table can be a bit difficult to read in its raw form we use a simple "pivot" call to arrange the data.

In [24]:
curs.execute("VALIDATE MODEL ReadmissionModel FROM Patient.ReadmissionPredict")
df5 = pd.read_sql("SELECT * FROM INFORMATION_SCHEMA.ML_VALIDATION_METRICS", conn)
df6 = df5.pivot(index='VALIDATION_RUN_NAME', columns='METRIC_NAME', values='METRIC_VALUE')
display(df6)

METRIC_NAME,Accuracy,F-Measure,Precision,Recall
VALIDATION_RUN_NAME,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
CampaignModel_t1_v1,0.6,0.2,0.81,0.11
ReadmissionModel1_t1_v1,0.83,0.62,0.66,0.59
ReadmissionModel_t1_v1,0.82,0.58,0.63,0.54


### 9. Query that highlights incorrectly predicted rows

This query retrieves encounters the model incorrectly scored, along with the probability estimate of the prediction.

In [25]:
display(pd.read_sql('''
SELECT
	PROBABILITY(ReadmissionModel FOR '1') ReadmissionProbability,
	PREDICT(ReadmissionModel) PredictedReadmission,
	MxWillReAdmit ActualReadmission,
	*
FROM
	Patient.ReadmissionPredict
WHERE
	MxWillReAdmit = 1
	AND MxWillReAdmit != PREDICT(ReadmissionModel)
''', conn))

Unnamed: 0,ReadmissionProbability,PredictedReadmission,ActualReadmission,AdmitReason,Com_ANY_MALIGNANCY_INCLUDING_LYMPHOMA_AND_LEUKEMIA_EXCEPT_MALIGNANT_NEOPLASM_OF_SKIN,Com_CEREBROVASCULAR_DISEASE,Com_CHRONIC_PULMONARY_DISEASE,Com_DEMENTIA,Com_DIABETES_WITHOUT_CHRONIC_COMPLICATION,Com_HEMIPLEGIA_OR_PARAPLEGIA,...,MxNumAdmitsSixMonth,MxNumAdmitsThreeMonth,MxNumAdmitsTwelveMonth,MxNumDiagOneMonth,MxNumDiagTwelveMonth,MxSmoker,MxStartDateDayOfMonth,MxStartDateDayOfWeek,MxStartDateMonth,MxWillReAdmit
0,0.246878,0,1,Encounter for problem,1,0,1,0,0,0,...,5,3,11,0,0,0,22,4,10,1
1,0.246878,0,1,Encounter for problem,1,0,1,0,0,0,...,5,3,11,0,0,0,19,4,11,1
2,0.150000,0,1,SNOMED Code,0,0,0,0,0,0,...,0,0,0,1,2,0,22,3,1,1
3,0.020000,0,1,SNOMED Code,0,0,0,0,0,0,...,0,0,0,1,1,0,3,4,11,1
4,0.000000,0,1,SNOMED Code,0,0,1,0,0,0,...,0,0,0,1,1,0,3,7,2,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
326,0.120000,0,1,Emergency Room admission (procedure),0,0,0,0,0,0,...,3,3,3,2,4,0,14,5,11,1
327,0.170000,0,1,Emergency Room admission (procedure),0,0,0,0,0,0,...,4,4,4,2,5,0,14,7,12,1
328,0.150000,0,1,Emergency Room admission (procedure),0,0,0,0,0,0,...,5,3,5,2,6,0,13,2,1,1
329,0.230000,0,1,Emergency Room admission (procedure),0,0,0,0,0,0,...,0,0,0,0,0,0,14,7,12,1


### 9. Query that pulls recent Encounters and filters by those with high probability of readmission

In [26]:
display(pd.read_sql('''
SELECT 
	PROBABILITY(ReadmissionModel FOR '1') ReadmissionProbability,
	PREDICT(ReadmissionModel) PredictedReadmission,
	*
FROM
	Patient.ReadmissionPredict
WHERE
	MxEncounterEndYear >=2019 AND
	MxEndDateMonth >= 12 AND
    PROBABILITY(ReadmissionModel FOR '1') >= 0.25
    ''', conn))

Unnamed: 0,ReadmissionProbability,PredictedReadmission,AdmitReason,Com_ANY_MALIGNANCY_INCLUDING_LYMPHOMA_AND_LEUKEMIA_EXCEPT_MALIGNANT_NEOPLASM_OF_SKIN,Com_CEREBROVASCULAR_DISEASE,Com_CHRONIC_PULMONARY_DISEASE,Com_DEMENTIA,Com_DIABETES_WITHOUT_CHRONIC_COMPLICATION,Com_HEMIPLEGIA_OR_PARAPLEGIA,Com_MILD_LIVER_DISEASE,...,MxNumAdmitsSixMonth,MxNumAdmitsThreeMonth,MxNumAdmitsTwelveMonth,MxNumDiagOneMonth,MxNumDiagTwelveMonth,MxSmoker,MxStartDateDayOfMonth,MxStartDateDayOfWeek,MxStartDateMonth,MxWillReAdmit
0,0.315704,0,Encounter for problem,1,0,0,0,0,0,0,...,5,2,13,0,5,0,24,3,12,0
1,0.33,0,Inpatient Visit,0,0,0,0,0,0,0,...,0,0,0,1,1,0,30,7,11,1
2,0.4,0,Inpatient Follow-Up,0,0,0,0,0,0,0,...,1,1,1,1,1,0,8,1,12,0
