### Prism Algorithm

In [204]:
# Relative path of the CSV dataset that the loading cell below passes to pd.read_csv.
data_file = "../datasets/covid_categorical_good.csv"

In [205]:
def get_index_of_attribute(attributes, target):
    """Return the position of the first entry of ``attributes`` equal to ``target``.

    When nothing matches, 0 is returned, so a miss cannot be told apart
    from a match at position 0.
    """
    first_match = next(
        (position for position, candidate in enumerate(attributes) if target == candidate),
        0,  # fallback used when no entry matches
    )
    return first_match



def get_attribute_at_index(attributes, index):
    """Return the element of ``attributes`` stored at position ``index``."""
    selected = attributes[index]
    return selected


def _term_holds(entry, term):
    """Return whether ``entry`` satisfies one rule term.

    A term is ``[column, value]`` (equality test) or ``[column, value, op]``
    with ``op`` being ``">="`` or ``"<"`` (numeric comparison).
    """
    cell = entry[term[0]]
    if len(term) == 3:  # only numeric terms carry a comparison operator
        if term[2] == ">=":
            return cell >= term[1]
        return cell < term[1]
    return cell == term[1]


def construct_rule(dataset, class_label, attributes, acc_t, cov_t):
    """Greedily build one conjunctive rule that predicts ``class_label``.

    Every distinct (column, value) combination seen in ``dataset`` is a
    candidate term: numeric values (``int``/``float``) yield a ``>=`` and a
    ``<`` term, every other value yields an equality term.  Terms are added
    one at a time, always taking the candidate that gives the highest
    accuracy, until no candidate strictly improves the accuracy while
    keeping the coverage above ``cov_t``.

    Parameters:
        dataset: sequence of rows; the last element of each row is its class
            label, the preceding elements are attribute values (assumed
            hashable, as they are de-duplicated through a set).
        class_label: the class the rule should predict.
        attributes: column headers, one per row element.  Only the number of
            headers is used; the last one is taken to be the class column.
        acc_t: minimum accuracy the first term must reach for a rule to be
            started at all.
        cov_t: a candidate is only eligible if it classifies strictly more
            than ``cov_t`` rows correctly.

    Returns:
        ``(rule, accuracy, coverage, class_label)`` where ``rule`` is a list
        of terms ``[column_index, value]`` / ``[column_index, value, op]``,
        ``accuracy`` is correct/covered and ``coverage`` is the number of
        correctly classified rows, both measured on ``dataset`` for the rule
        that is returned.  An empty rule is reported with accuracy 0 and
        coverage 0.
    """
    feature_count = len(attributes) - 1  # the last column holds the class label

    # Collect the candidate terms in order of first appearance.  Terms keep the
    # column index itself, so no name -> index lookup is needed afterwards.
    candidates = []
    seen = set()
    for column in range(feature_count):
        for entry in dataset:
            value = entry[column]
            is_numeric = isinstance(value, (int, float))
            key = (column, value, is_numeric)
            if key in seen:
                continue
            seen.add(key)
            if is_numeric:
                candidates.append([column, value, ">="])
                candidates.append([column, value, "<"])
            else:
                candidates.append([column, value])

    rule = []
    rule_acc = 0  # accuracy of the rule built so far
    rule_cov = 0  # coverage of the rule built so far
    covered = dataset  # rows satisfying every term already in the rule

    while True:
        best_index = -1
        best_acc = rule_acc
        best_cov = rule_cov

        # Only rows already covered by the rule can satisfy rule + candidate,
        # so each candidate is evaluated on that subset alone.
        for index, term in enumerate(candidates):
            total = 0
            correct = 0
            for entry in covered:
                if _term_holds(entry, term):
                    total += 1
                    if entry[-1] == class_label:
                        correct += 1
            accuracy = correct / total if total else 0
            # Strict comparisons: the first candidate reaching the best
            # accuracy wins, and coverage must exceed the threshold.
            if accuracy > best_acc and correct > cov_t:
                best_index = index
                best_acc = accuracy
                best_cov = correct

        # Stop when nothing improves the rule or the best candidate is not
        # accurate enough; report the statistics of the rule actually returned.
        if best_index == -1 or best_acc < acc_t:
            return rule, rule_acc, rule_cov, class_label

        chosen = candidates.pop(best_index)  # a term is used at most once
        rule.append(chosen)
        rule_acc = best_acc
        rule_cov = best_cov
        covered = [entry for entry in covered if _term_holds(entry, chosen)]

In [206]:
def _rule_covers(entry, rule):
    """Return True when ``entry`` satisfies every term of ``rule`` (an empty rule covers every entry)."""
    for term in rule:
        cell = entry[term[0]]
        if isinstance(term[1], (int, float)):  # numeric terms carry ">=" or "<" in term[2]
            if term[2] == ">=":
                if not cell >= term[1]:
                    return False
            elif not cell < term[1]:
                return False
        elif not cell == term[1]:
            return False
    return True


def prism(col_headers, data, acc_thresh, cov_thresh, class_labels=None):
    """Learn a list of rules by repeatedly calling ``construct_rule``.

    After each rule is built, every row it covers (whatever that row's class)
    is dropped from the working set, and the next rule is learned on what
    remains.  The loop ends once fewer than ``cov_thresh`` rows are left or
    the working set is empty.  An empty rule covers every row, so it always
    ends the search.

    Parameters:
        col_headers: column names, one per row element.
        data: list of rows.  It is read but never modified.
        acc_thresh: accuracy threshold forwarded to ``construct_rule``.
        cov_thresh: coverage threshold forwarded to ``construct_rule``; also
            the minimum number of remaining rows needed to keep going.
        class_labels: class labels to build rules for, in order.  Defaults to
            ``['alive']``.

    Returns:
        ``(rules, acc_and_cov, final_class_labels)``: three parallel lists
        holding each rule (with column indices replaced by column names),
        its ``[accuracy, coverage]`` pair and the class label it targets.

    Raises:
        ValueError: if ``class_labels`` is given but empty.
    """
    if class_labels is None:
        class_labels = ['alive']
    elif len(class_labels) == 0:
        # Without a label no rule is ever built and the loop could not progress.
        raise ValueError("class_labels must not be empty")

    rules = []
    acc_and_cov = []  # [accuracy, coverage] of each rule
    final_class_labels = []  # class predicted by each rule
    remaining = data  # only ever rebound to filtered copies, never mutated

    # The explicit emptiness check keeps the loop finite when cov_thresh <= 0.
    while len(remaining) > 0 and len(remaining) >= cov_thresh:
        for class_label in class_labels:
            rule, acc, cov, cls_label = construct_rule(remaining, class_label, col_headers, acc_thresh, cov_thresh)
            rules.append(rule)
            acc_and_cov.append([acc, cov])
            final_class_labels.append(cls_label)

            # Drop every row the new rule covers before learning the next one.
            remaining = [entry for entry in remaining if not _rule_covers(entry, rule)]

            if len(remaining) == 0:
                break

    # Replace column indices with column names to make the output readable.
    for rule in rules:
        for term in rule:
            term[0] = get_attribute_at_index(col_headers, term[0])

    return rules, acc_and_cov, final_class_labels
    

In [207]:
import pandas as pd
# Load the dataset and drop every row that has at least one missing value.
data = pd.read_csv(data_file).dropna(how="any")

# Column names and rows as plain Python lists, which is what prism() is given.
columns_list = data.columns.to_numpy().tolist()
data_rows = data.to_numpy().tolist()

rules, accuracy_and_coverage, target_class_label = prism(
    col_headers=columns_list,
    data=data_rows,
    acc_thresh=0.7,
    cov_thresh=50000,
)

In [208]:
# Print each learned rule with its [accuracy, coverage] pair and its target class label.
for position, rule in enumerate(rules):
    stats = accuracy_and_coverage[position]
    label = target_class_label[position]
    print("rule: ", rule, ", accuracy and coverage: ", stats, ", targeted class label: ", label)

rule:  [['age', 34, '<'], ['hypertension', 'no'], ['diabetes', 'no'], ['age', 6, '>='], ['copd', 'no']] , accuracy and coverage:  [0.988745409311693, 50076] , targeted class label:  alive
rule:  [['age', 44, '<'], ['renal_chronic', 'no'], ['imm_supr', 'no']] , accuracy and coverage:  [0.9608285676032362, 50235] , targeted class label:  alive
rule:  [['age', 56, '<'], ['copd', 'no'], ['age', 34, '>=']] , accuracy and coverage:  [0.8890111840937334, 50078] , targeted class label:  alive
rule:  [] , accuracy and coverage:  [0, 0] , targeted class label:  alive


<p>
Depending on the accuracy and coverage thresholds you set, you get different rules. Most of the time, though, the rules matched my intuition: if you are young and do not suffer from any chronic illness, the likelihood of dying is minimal. For example, with a coverage threshold of 50,000 and an accuracy threshold of 0.7, the top rule (accuracy 0.9887, coverage 50076) was: aged at least 6 and below 34, with no hypertension, no diabetes, and no COPD. This makes sense to me, as I know from the news that if you don't suffer from any prior medical problems, COVID most likely won't kill you.
    
I also found it interesting that if I set the outcome to 'dead', no rules with decent accuracy and coverage are generated, implying that COVID deaths stem from a variety of causes (since no single rule can predict death with high accuracy).
</p>