In [10]:
import copy
import re
import warnings
from collections import Counter

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
import treelib
from scipy import stats

from SignalTemporalLogic.STLFactory import STLFactory

plt.rc('font', size=12)

# warnings.filterwarnings('ignore')
%matplotlib inline


In [12]:
#Helper Functions

def loadClientRules(popSize, dataFilename, maxConsecutiveMissing=1000):
    """Load the rule files of the first `popSize` clients that can be loaded.

    Client files are looked up as ``dataFilename + <num> + "Rules.txt"`` for
    num = 1, 2, 3, ...  A client number whose file fails to load is skipped
    and does not count towards `popSize`.

    Parameters
    ----------
    popSize : int
        Number of successfully loaded client files to collect.
    dataFilename : str
        Path prefix placed directly in front of the client number.
    maxConsecutiveMissing : int, optional
        Give up after this many client numbers in a row fail to load.
        Without a bound the search never terminates when fewer than
        `popSize` rule files exist.

    Returns
    -------
    (list, list)
        ``(uniqueTrees, clientRules)``: the rule trees with duplicates removed
        (two trees are duplicates when their ``toString()`` forms are equal
        after mapping ``>=`` to ``>`` and ``<=`` to ``<``; the first occurrence
        is kept), and the flat list of every rule string from every loaded
        client, duplicates included.
    """
    clientRules = []
    clientTrees = []
    num = 1
    clientsAdded = 0
    missesInARow = 0
    while clientsAdded < popSize:
        fileName = dataFilename + repr(num) + "Rules.txt"
        fileFound, trees, rls = loadRuleSet(num, fileName)
        if fileFound:
            clientsAdded += 1
            missesInARow = 0
            clientTrees.extend(trees)
            clientRules.extend(rls)
        else:
            missesInARow += 1
            if missesInARow >= maxConsecutiveMissing:
                # Stop instead of looping forever over non-existent files.
                print("Stopped after %d consecutive missing client files; loaded %d of %d clients"
                      % (missesInARow, clientsAdded, popSize))
                break

        num += 1

    # Keep one tree per distinct (relop-normalised) rule string; a set gives
    # constant-time membership checks instead of rescanning a list.
    seenRules = set()
    uniqueTrees = []
    for t in clientTrees:
        strRl = t.toString()
        strRl = re.sub('>=', '>', strRl)
        strRl = re.sub('<=', '<', strRl)

        if strRl not in seenRules:
            seenRules.add(strRl)
            uniqueTrees.append(t)

    return uniqueTrees, clientRules


def loadRuleSet(num, textfile):
    """Read one client's rule file and build a formula tree for every line.

    Each line is passed to ``STLFactory.constructFormulaTree``; a line that
    starts with "(" and whose last character before the line terminator is ")"
    has that outer pair of parentheses removed first.  ``getFormulaNoParams()``
    is then called on the tree (its return value is unused here).

    Parameters
    ----------
    num : int
        Client number, used only in the printed messages.
    textfile : str
        Path of the rule file to read.

    Returns
    -------
    (bool, list, list)
        ``(ok, ruleTrees, ruleSet)``.  `ok` is False when the file could not
        be opened/read or a line could not be processed; in that case the
        lists hold whatever was collected before the failure.  `ruleSet` holds
        each tree's ``toString()`` with ``>=`` mapped to ``>`` and ``<=`` to ``<``.
    """
    ruleSet = []
    ruleTrees = []
    stlFac = STLFactory()
    try:
        # The context manager closes the file even when parsing raises.
        with open(textfile, "r") as file:
            for line in file:
                # line[-2] assumes the line ends with a newline character.
                if line[0] == "(" and line[-2] == ")":
                    line = line[1:-2] + "\n"

                rule = stlFac.constructFormulaTree(line)
                rule.getFormulaNoParams()

                ruleTrees.append(rule)

                #fix relop for string rule
                strRl = rule.toString()
                strRl = re.sub('>=', '>', strRl)
                strRl = re.sub('<=', '<', strRl)
                ruleSet.append(strRl)

    except OSError:
        # Missing or unreadable file: same message as before.
        print("File not found for Client %d" % (num) )
        return False, ruleTrees, ruleSet

    except Exception as err:
        # Still skip the client, but do not mislabel a parse failure as a
        # missing file, and let KeyboardInterrupt/SystemExit propagate.
        print("Could not load rules for Client %d from %s: %r" % (num, textfile, err))
        return False, ruleTrees, ruleSet

    return True, ruleTrees, ruleSet
            


## Load Client Rules

In [13]:
# Configuration for loading the client rule files
popSize = 100  # number of clients to load (passed to loadClientRules)
dataFilename = "../Data/ICU/Best/"  # path prefix; client files are "<prefix><num>Rules.txt"
# dataFilename = "../Data/Sepsis/Best/"

In [14]:
# Load Client Rule Trees and Text Rule Structures
# clientTrees: rule trees with duplicates removed; clientRules: every rule string from every loaded client
clientTrees, clientRules = loadClientRules(popSize, dataFilename)

File not found for Client 8
File not found for Client 81


In [15]:
#Make dataframe of rules and their counts
# Columns are built by name so an empty rule list yields an empty frame
# instead of a column-count mismatch error.
# NOTE: "Rule Count" counts every occurrence in clientRules, not distinct
# clients, so "Percent of Population" can exceed 100 (e.g. 230.0 below).
ruleCounts = Counter(clientRules)
clientDF = pd.DataFrame({
    "Rule": list(ruleCounts.keys()),
    "Rule Count": list(ruleCounts.values()),
})
clientDF['Percent of Population'] = clientDF['Rule Count'] / popSize * 100
clientDF = clientDF.sort_values("Rule Count", ascending=False)
clientDF

Unnamed: 0,Rule,Rule Count,Percent of Population
69,"((MET > 0.000) U[0,0] (death = 0.000))",230,230.0
79,"G[0,0]((n_evts < 0.000 -> LOS > 0.000))",195,195.0
62,"F[0,0]((BLOOD_UREA_NITROGEN < 0.000 & CREATINI...",77,77.0
102,"G[0,0]((hr > 0.000 & Pulse > 0.000))",46,46.0
619,"((Mort > 0.000) U[0,0] (y = 0.000))",8,8.0
...,...,...,...
1766,"F[0,0]((CREATININE < 0.000 -> y = 0.000))",1,1.0
1768,"G[0,0]((CHLORIDE > 0.000 | GLUCOSE < 0.000))",1,1.0
1769,"F[0,0]((BLOOD_UREA_NITROGEN < 0.000 -> direct ...",1,1.0
1771,"G[0,0]((s2_hr < 0.000 & s8_hr > 0.000))",1,1.0


## Load LDP Ruleset

In [16]:
# Load the LDP ruleset from a text file: one rule per line.
resultsFilename = "../Results/ICU_Ruleset_MCTS_Baseline.txt"
# resultsFilename = "../Results/ICU_Ruleset_MCTS_Baseline_8000pts_100iters.txt"
# resultsFilename = "../Results/Sepsis_Ruleset_MCTS_Baseline_1000pts_100iters.txt"

ldpTrees = []
ldpRules = []

stlFac = STLFactory()
# The context manager closes the file even if a line fails to parse.
with open(resultsFilename, "r") as file:
    for line in file:
        # Drop one pair of enclosing parentheses; line[-2] assumes the line
        # ends with a newline character.
        if line[0] == "(" and line[-2] == ")":
            line = line[1:-2] + "\n"

        rule = stlFac.constructFormulaTree(line)
        rule.getFormulaNoParams()

        ldpTrees.append(rule)
        # Unlike the client rules, these strings are stored without the
        # >= / <= normalisation.
        ldpRules.append(rule.toString())

ldpRules

['ALT_GPT > 0.000',
 'CREATININE > 0.000',
 'ALKALINE_PHOSPHATASE > 0.000',
 'CHLORIDE > 0.000',
 'CO > 0.000',
 'Temp > 0.000',
 'F[0,0](SBP > 0.000)',
 'F[0,0](CREATININE > 0.000)',
 'PARTIAL_THROMBOPLASTIN_TIME > 0.000',
 'CALCIUM > 0.000',
 'F[0,0](srr > 0.000)',
 'death > 0.000',
 'F[0,0](cosen > 0.000)',
 'tte > 0.000',
 'Pulse > 0.000',
 'F[0,0](GLUCOSE > 0.000)',
 'F[0,0](dfa > 0.000)',
 'BLOOD_UREA_NITROGEN > 0.000',
 'F[0,0](MAGNESIUM > 0.000)',
 's_hr > 0.000',
 'F[0,0](ICU_Pt_Days > 0.000)',
 'F[0,0](CO > 0.000)',
 'SBP > 0.000',
 'AST_GOT > 0.000',
 'F[0,0](CHLORIDE > 0.000)',
 'F[0,0](SpO > 0.000)',
 'SpO > 0.000',
 'n_edrk > 0.000',
 'F[0,0](lds > 0.000)',
 'ICU_Pt_Days > 0.000',
 'ALBUMIN > 0.000',
 'TROPONIN_I > 0.000',
 'F[0,0](Temp > 0.000)',
 'F[0,0](POTASSIUM > 0.000)',
 'edrk > 0.000',
 'Mort > 0.000',
 'F[0,0](Mort > 0.000)',
 'F[0,0](Resp > 0.000)',
 'LOS > 0.000',
 'y > 0.000',
 'F[0,0](LOS > 0.000)',
 'WHITE_BLOOD_CELL_COUNT > 0.000',
 'F[0,0](O_Flow > 0.000)'

In [26]:
#TODO here- add 
# Load the LDP ruleset from a CSV file instead of the text file.
# This rebinds resultsFilename, ldpTrees, ldpRules and stlFac.
resultsFilename = "../Results/ICU_Ruleset_MCTS_Baseline.csv"
ldpDF = pd.read_csv(resultsFilename)

ldpTrees = []
ldpRules = []

stlFac = STLFactory()
# NOTE(review): the rule text is passed without a trailing "\n" here, while
# getCoverage appends "\n" before parsing -- confirm the parser accepts both.
for r in ldpDF['Rule']:
    rule = stlFac.constructFormulaTree(r)
    rule.getFormulaNoParams()

    ldpTrees.append(rule)
    ldpRules.append(rule.toString())


ldpDF

Unnamed: 0.1,Unnamed: 0,Rule,Percent Count


In [17]:
#Get count of the number of true structures matched in client rules

def getTemplateNodes(temp):
    """Return the items of a depth-first, sorted walk of `temp` with digits removed.

    `temp` must provide ``expand_tree(mode=..., sorting=...)``; every item it
    yields has all characters 0-9 stripped before being collected.
    """
    digits = re.compile('[0-9]')
    walk = temp.expand_tree(mode=treelib.Tree.DEPTH, sorting=True)
    return [digits.sub('', nodeId) for nodeId in walk]

def findRuleMatch(template, clientTrees):
    """Report whether any client tree matches `template`.

    A client tree matches when (1) every variable returned by
    ``template.getAllVars()`` is also in the client's ``getAllVars()`` and
    (2) ``nodeListMatch`` accepts the template's digit-stripped node list
    against the client's digit-stripped node list.

    Parameters
    ----------
    template : rule tree
        Tree to look for; must provide ``getAllVars()`` and ``expand_tree()``.
    clientTrees : iterable of rule trees
        Candidate trees with the same interface.

    Returns
    -------
    bool
        True as soon as one client tree matches, otherwise False.
    """
    ldpNodes = getTemplateNodes(template)
    ldpVars = template.getAllVars()

    for c in clientTrees:
        # Cheap filter first: the client must mention every template variable.
        clVars = c.getAllVars()
        if not all(v in clVars for v in ldpVars):
            continue

        # Structural check, using the same node extraction as for the template.
        if nodeListMatch(ldpNodes, getTemplateNodes(c)):
            return True  # found match

    return False

# check for match  between two lists of template nodes + client nodes
def nodeListMatch(tempList, cList):
    """Check whether `tempList` occurs in `cList` as an ordered subsequence.

    "LT" is treated as "LE" and "GT" as "GE" in both lists before comparing.
    Elements of `tempList` must appear in `cList` in the same order, but not
    necessarily next to each other.  Neither input list is modified.

    Parameters
    ----------
    tempList : list of str
        Template node names to look for.
    cList : list of str
        Client node names to search in.

    Returns
    -------
    bool
        True when every template node is found in order (also for an empty
        `tempList`), otherwise False.
    """
    relopAlias = {"LT": "LE", "GT": "GE"}
    wanted = [relopAlias.get(x, x) for x in tempList]

    # One shared iterator over the client nodes: each `in` test consumes the
    # iterator up to and including the match, so the next template node can
    # only be found after the previous one.
    remaining = (relopAlias.get(x, x) for x in cList)
    return all(node in remaining for node in wanted)

def getCoverage(ldpTrees, clientRules):
    """Count how many LDP rule trees have a match among the client rules.

    Parameters
    ----------
    ldpTrees : iterable of rule trees
        Learned rule trees to check.
    clientRules : list of str
        Client rule strings; each is parsed (with a trailing newline added)
        into a tree with ``STLFactory.constructFormulaTree``.

    Returns
    -------
    pandas.DataFrame
        One row with the columns "Total Client Rules", "Found Rules",
        "Non Rules" and "Precision" (found / (found + non)).  Precision is NaN
        when `ldpTrees` is empty.
    """
    #From client rules, first make client trees
    # A local factory is used so the function does not depend on a global
    # `stlFac` created in some other notebook cell.
    clientTrees = []
    if clientRules:
        factory = STLFactory()
        clientTrees = [factory.constructFormulaTree(ruleText + "\n") for ruleText in clientRules]

    #Calculate num true rules, num false rules and precision (true rules / total rules found)
    foundRules = 0
    nonRules = 0

    for l in ldpTrees:
        if findRuleMatch(l, clientTrees): #check structural match --> will count partial matches as a full match
            foundRules += 1
        else:
            nonRules += 1

    # Guard the ratio: with no LDP rules there is nothing to divide by.
    totalChecked = foundRules + nonRules
    precision = foundRules / totalChecked if totalChecked else float("nan")

    lst = [len(clientRules), foundRules, nonRules, precision]
    df = pd.DataFrame([lst], columns=["Total Client Rules", "Found Rules", "Non Rules", "Precision"])

    return df


## Get Coverage of Rules Above Threshold

In [23]:
thresh = 1 # minimum 'Percent of Population' value a client rule needs to be kept

# Keep the client rules at or above the threshold (the comparison is >=)
df = clientDF[clientDF['Percent of Population'] >= thresh]
clientThreshRules = df['Rule'].tolist()

# Coverage of the LDP rule trees against the kept client rule strings
getCoverage(ldpTrees, clientThreshRules)

Unnamed: 0,Total Client Rules,Found Rules,Non Rules,Precision
0,4085,92,20,0.821429


## Calculate Rules / Query
Found rules / total queries

In [20]:
# 92 = "Found Rules" from the coverage table above; 50 is presumably the total
# number of queries -- TODO confirm and derive both from variables.
92 / 50

1.84

In [None]:
# Display the client rule-count table again for reference
clientDF

## Calculate Likelihood of Finding Rule given Client Percent count
Graph showing the likelihood of learning a rule versus the number of clients that actually have the rule.

In [None]:
# Sum of the normal log-densities stats.norm.logpdf(y, pred, std_dev); `stats` is scipy.stats.
# NOTE(review): y, pred and std_dev are not defined in any visible cell and this
# cell is unexecuted (In [None]) -- define them before running.
LL = np.sum(stats.norm.logpdf(y, pred, std_dev))

Unnamed: 0,Rule,Rule Count,Percent of Population
69,"((MET > 0.000) U[0,0] (death = 0.000))",230,230.0
79,"G[0,0]((n_evts < 0.000 -> LOS > 0.000))",195,195.0
62,"F[0,0]((BLOOD_UREA_NITROGEN < 0.000 & CREATINI...",77,77.0
102,"G[0,0]((hr > 0.000 & Pulse > 0.000))",46,46.0
619,"((Mort > 0.000) U[0,0] (y = 0.000))",8,8.0
...,...,...,...
1766,"F[0,0]((CREATININE < 0.000 -> y = 0.000))",1,1.0
1768,"G[0,0]((CHLORIDE > 0.000 | GLUCOSE < 0.000))",1,1.0
1769,"F[0,0]((BLOOD_UREA_NITROGEN < 0.000 -> direct ...",1,1.0
1771,"G[0,0]((s2_hr < 0.000 & s8_hr > 0.000))",1,1.0


## Calculate Likelihood of Finding Rule given Client Percent count
Graph showing the likelihood of learning a rule versus the number of clients that actually have the rule.

In [None]:
# Sum of the normal log-densities stats.norm.logpdf(y, pred, std_dev); `stats` is scipy.stats.
# NOTE(review): y, pred and std_dev are not defined in any visible cell and this
# cell is unexecuted (In [None]) -- define them before running.
LL = np.sum(stats.norm.logpdf(y, pred, std_dev))