## Imports and Helper Functions

In [8]:
import sys
import os
# Locations of the Trane checkout and of the formatted datasets. The defaults are
# the original machine-specific paths; set TRANE_REPO_PATH / TRANE_DATASETS_PATH in
# the environment to run this notebook elsewhere without editing the cell.
sys.path.append(os.environ.get('TRANE_REPO_PATH', '/Users/Alexander/Documents/Trane/Trane__HDI_REPO'))
# os.path.join(..., '') guarantees a trailing separator, because the data-loading
# cell builds file names by plain concatenation (path_to_datasets + 'ER.csv').
path_to_datasets = os.path.join(
    os.environ.get('TRANE_DATASETS_PATH', '../Trane__Local_Misc/Formatted Datasets/Saudi ER/'), '')
import pandas as pd
import trane
import json
import random
import datetime
import pickle
import featuretools as ft
import numpy as np
import matplotlib.pyplot as plt
from sklearn import linear_model, decomposition, datasets
from sklearn.metrics import accuracy_score
from sklearn import metrics
import scikitplot as skplt

def save_obj(obj, name):
    """Pickle ``obj`` to ``pickled_objects/<name>.pkl``.

    The target directory (relative to the current working directory) is
    created on demand, so the first save in a fresh checkout no longer fails
    because ``pickled_objects/`` is missing.

    Args:
        obj: Any picklable object.
        name: File stem, without the ``.pkl`` extension.
    """
    import os  # local import: the notebook's import cell does not provide os

    target = 'pickled_objects/' + name + '.pkl'
    # dirname() rather than the bare folder name so a stem containing a
    # sub-directory ("run1/model") also gets its parent created.
    os.makedirs(os.path.dirname(target), exist_ok=True)
    with open(target, 'wb') as f:
        pickle.dump(obj, f, pickle.HIGHEST_PROTOCOL)
def load_obj(name):
    """Read back the object stored in ``pickled_objects/<name>.pkl``.

    Args:
        name: File stem, without the ``.pkl`` extension.

    Returns:
        The unpickled object.

    Note:
        Unpickling can execute arbitrary code; only load files you trust.
    """
    pickle_path = 'pickled_objects/' + name + '.pkl'
    with open(pickle_path, 'rb') as handle:
        restored = pickle.load(handle)
    return restored
def generate_probs_and_nl(entity_id_column,
                            label_generating_column,
                            time_column,
                            table_meta,
                            filter_column,
                            is_pick_random_problems = True,
                            num_problems = None):
    """Generate prediction problems with Trane and describe them in plain English.

    Args:
        entity_id_column: Name of the entity id column.
        label_generating_column: Name of the column labels are derived from.
        time_column: Name of the time column.
        table_meta: Table metadata object handed to Trane.
        filter_column: Name of the column used by the filter operations.
        is_pick_random_problems: When True, shuffle the generated problems
            (using the global ``random`` state) before truncating; when False,
            keep the generator's order.
        num_problems: Maximum number of problems to keep. ``None`` falls back
            to the notebook-level ``NUM_PROBLEMS_TO_GENERATE`` constant, which
            must then be defined before the call. Zero or a negative value
            keeps no problems.

    Returns:
        Tuple ``(probs, nl_descrips)``: the kept problems and whatever
        ``trane.generate_nl_description`` returns for them.

    Side effects:
        Calls ``trane.prediction_problems_to_json_file`` with the file name
        "prediction_problems.json" (presumably writing that file to the
        current working directory; confirm against the Trane version in use).
    """
    if num_problems is None:
        num_problems = NUM_PROBLEMS_TO_GENERATE

    generator = trane.PredictionProblemGenerator(
        table_meta, entity_id_column, label_generating_column, time_column, filter_column)
    all_probs = list(generator.generate())

    if is_pick_random_problems:
        random.shuffle(all_probs)

    # A clamped slice instead of append-then-break: the old loop only checked
    # the limit after appending, so a limit of 0 returned every problem.
    probs = all_probs[:max(num_problems, 0)]

    # Called for its side effect only; the return value was never used.
    trane.prediction_problems_to_json_file(
        probs, table_meta, entity_id_column, label_generating_column, time_column, "prediction_problems.json")

    nl_descrips = trane.generate_nl_description(
        probs, table_meta, entity_id_column, label_generating_column, time_column, trane.ConstantIntegerCutoffTimes(0))
    return probs, nl_descrips
def convert(str, format = None):
    """Parse a date/time string into a ``datetime.datetime``.

    Args:
        str: The text to parse.
        format: ``strptime`` format string. The default of ``None`` is
            rejected by ``strptime`` with a TypeError, so callers need to
            supply a format in practice.

    Returns:
        The parsed ``datetime.datetime``.
    """
    parse = datetime.datetime.strptime
    parsed = parse(str, format)
    return parsed
def file_to_table_meta(filepath):
    """Build a ``trane.TableMeta`` from a JSON file.

    Args:
        filepath: Path to the JSON file to parse.

    Returns:
        The ``trane.TableMeta`` constructed from the parsed JSON content.
    """
    # The context manager closes the handle deterministically; the previous
    # open(filepath).read() left closing to the garbage collector.
    with open(filepath) as meta_file:
        meta_dict = json.load(meta_file)
    return trane.TableMeta(meta_dict)

## Load Data

In [16]:
#NOTE:
# The following pairs of dataframes have NO matching patient_ids:
# er_data_df, inp_data_df
# odp_data_df, inp_data_df
# The following pairs of dataframes DO have matching patient_ids:
# er_data_df, odp_data_df
#ER Data
# Malformed rows in ER.csv are skipped (with a warning) instead of aborting the load.
# pandas 1.3 replaced error_bad_lines with on_bad_lines and pandas 2.0 removed the old
# keyword; older pandas rejects the new keyword with a TypeError, so fall back to the
# original call there.
try:
    er_data_df = pd.read_csv(path_to_datasets + 'ER.csv', on_bad_lines = 'warn')
except TypeError:
    er_data_df = pd.read_csv(path_to_datasets + 'ER.csv', error_bad_lines = False)
er_data_meta = file_to_table_meta(path_to_datasets + 'ER_table_meta.json')
#INP Data
inp_data_df = pd.read_csv(path_to_datasets + 'INP.csv')
inp_data_meta = file_to_table_meta(path_to_datasets + 'INP_table_meta.json')
#ODP Data
odp_data_df = pd.read_csv(path_to_datasets + 'ODP.csv')
odp_data_meta = file_to_table_meta(path_to_datasets + 'ODP_table_meta.json')


## Generate Problems
#### Note: The analysis moves forward with the ER dataframe only. The other dataframes contain very similar information, and the tables are not linked to one another in the relational manner that Trane expects.

In [35]:
NUM_PROBLEMS_TO_GENERATE = 50
# generate_probs_and_nl shuffles the candidate problems with the global `random`
# module by default; seeding first makes the selected subset reproducible across
# kernel restarts.
RANDOM_SEED = 0
random.seed(RANDOM_SEED)
entity_id_column = 'PATIENT_ID'
label_generating_column = 'WORK_ENTITY'
time_column = 'TIME_ARRIVED'
table_meta = er_data_meta
filter_column = 'SEX'
probs, nl_descrips = generate_probs_and_nl(entity_id_column, label_generating_column,
                     time_column, table_meta, filter_column)

In [38]:
# Show every generated problem: a 1-based header, the problem object's printed
# form, then the matching natural-language description.
for number, problem in enumerate(probs, start=1):
    header = "Problem {}'s object form and natural language description: ".format(number)
    print(header)
    print(problem)
    print(nl_descrips[number - 1])

Problem 1's object form and natural language description: 
AllFilterOp(SEX)->ExpRowOp(WORK_ENTITY)->DiffTransformationOp(WORK_ENTITY)->SumAggregationOp(WORK_ENTITY)
For each PATIENT_ID, predict the sum of the fluctuation of the exp of WORK_ENTITY, after TIME_ARRIVED 0.
Problem 2's object form and natural language description: 
AllFilterOp(SEX)->ExpRowOp(WORK_ENTITY)->DiffTransformationOp(WORK_ENTITY)->LastAggregationOp(WORK_ENTITY)
For each PATIENT_ID, predict the last the fluctuation of the exp of WORK_ENTITY, after TIME_ARRIVED 0.
Problem 3's object form and natural language description: 
AllFilterOp(SEX)->NeqRowOp(WORK_ENTITY)->ObjectFrequencyTransformationOp(WORK_ENTITY)->FirstAggregationOp(WORK_ENTITY)
For each PATIENT_ID, predict the firstNone WORK_ENTITY is not equal to 0, after TIME_ARRIVED 0.
Problem 4's object form and natural language description: 
AllFilterOp(SEX)->IdentityRowOp(WORK_ENTITY)->ObjectFrequencyTransformationOp(WORK_ENTITY)->LastAggregationOp(WORK_ENTITY)
For e