### Prism Algorithm

In [6]:
# Relative path of the CSV dataset that the data-loading cell passes to pd.read_csv.
data_file = "../datasets/covid_categorical_good.csv"

In [7]:
def get_index_of_attribute(attributes, target):
    """Return the position of the first entry of ``attributes`` equal to ``target``.

    When no entry matches, 0 is returned, so a missing name cannot be told
    apart from a match at the first position.
    """
    matching_positions = (
        position for position, name in enumerate(attributes) if target == name
    )
    return next(matching_positions, 0)




def get_attribute_at_index(attributes, index):
    """Return the entry of ``attributes`` stored at position ``index``."""
    entry = attributes[index]
    return entry




def construct_rule(dataset, class_label, attributes, acc_t, cov_t):
    """Greedily grow one conjunctive rule that predicts ``class_label``.

    Starting from an empty rule, every round tries to extend the rule with each
    attribute-value pair that is not yet part of it. The extension with the
    highest accuracy is kept, provided that its accuracy is strictly higher
    than the current rule's, its coverage is strictly greater than ``cov_t``
    and its accuracy is at least ``acc_t``. Growth stops as soon as no
    extension qualifies.

    Args:
        dataset: list of rows (one per person). The last entry of a row is the
            outcome that is compared with ``class_label``.
        class_label: outcome value the rule should predict.
        attributes: column names, in the same order as the entries of a row.
        acc_t: minimum accuracy (correctly classified rows / matched rows).
        cov_t: a rule must classify strictly more than this many rows correctly.

    Returns:
        A tuple ``(rule, accuracy, coverage, class_label)``. ``rule`` is a list
        of ``[column_index, value]`` conditions. ``accuracy`` and ``coverage``
        always describe the returned rule; both are 0 when the rule is empty.
    """
    # Rows are matched with ">=" on this column (age) and with "==" on every
    # other column. prism() uses the same convention when it removes covered
    # rows, so the two must stay in sync.
    age_column = 1

    # All possible attribute-value pairs (excluding the values for attribute = age)
    attribute_value_pairs = [['sex', 'male'], ['sex', 'female'], ['diabetes', 'yes'],
                             ['diabetes', 'no'],
                             ['copd', 'yes'], ['copd', 'no'], ['asthma', 'yes'], ['asthma', 'no'], ['imm_supr', 'yes'],
                             ['imm_supr', 'no'], ['hypertension', 'yes'], ['hypertension', 'no'],
                             ['cardiovascular', 'yes'], ['cardiovascular', 'no'], ['obesity', 'yes'], ['obesity', 'no'],
                             ['renal_chronic', 'yes'], ['renal_chronic', 'no'], ['tobacco', 'yes'],
                             ['tobacco', 'no']]

    # Distinct numeric cells, in first-seen order, become the candidate age
    # values. A dict gives O(1) membership tests while preserving that order.
    # NOTE(review): every numeric cell of every column is collected here, not
    # only the age column - this presumes age is the only numeric column.
    ages = {}
    for person in dataset:
        for attr_val in person:
            if isinstance(attr_val, (int, float)) and attr_val not in ages:
                ages[attr_val] = None
    attribute_value_pairs.extend(['age', age] for age in ages)

    # Resolve every attribute name to its column index once, up front.
    candidates = [
        (get_index_of_attribute(attributes, name), value)
        for name, value in attribute_value_pairs
    ]

    rule = []
    rule_acc = 0  # accuracy of `rule`
    rule_cov = 0  # coverage of `rule`
    # Rows that satisfy every condition already in `rule`. Only these rows can
    # satisfy an extension of the rule, so candidates are scored against them
    # instead of re-checking the whole rule on the whole dataset.
    covered = dataset

    while True:
        best_index = -1
        best_acc = rule_acc
        best_cov = rule_cov

        for index, (column, value) in enumerate(candidates):
            is_age = column == age_column
            matched = 0  # rows that satisfy rule + candidate
            correct = 0  # ... of which the outcome equals class_label
            for person in covered:
                cell = person[column]
                if (cell >= value) if is_age else (cell == value):
                    matched += 1
                    if person[-1] == class_label:
                        correct += 1

            if matched == 0:
                continue  # accuracy would be 0, which can never be an improvement

            accuracy = correct / matched
            if accuracy > best_acc and correct > cov_t:
                best_acc = accuracy
                best_cov = correct
                best_index = index

        # Stop when nothing improves on the current rule, or when the best
        # extension is still below the accuracy threshold. The statistics of a
        # rejected extension are discarded so that the returned numbers always
        # belong to the returned rule.
        if best_index == -1 or best_acc < acc_t:
            return rule, rule_acc, rule_cov, class_label

        # Accept the extension: it leaves the candidate pool and joins the rule.
        column, value = candidates.pop(best_index)
        rule.append([column, value])
        rule_acc = best_acc
        rule_cov = best_cov
        if column == age_column:
            covered = [person for person in covered if person[column] >= value]
        else:
            covered = [person for person in covered if person[column] == value]

In [8]:
def prism(col_headers, data, acc_thresh, cov_thresh):
    """Learn a list of rules by repeatedly calling ``construct_rule``.

    After each rule is built, every row that satisfies all of its conditions
    is dropped and the next rule is learned from the rows that are left. The
    loop ends when no rows remain or fewer than ``cov_thresh`` rows remain.

    Args:
        col_headers: column names, in the same order as the entries of a row.
        data: list of rows. It is only read; the caller's list is not modified.
        acc_thresh: accuracy threshold forwarded to ``construct_rule``.
        cov_thresh: coverage threshold forwarded to ``construct_rule``; also
            the minimum number of remaining rows needed to try another rule.

    Returns:
        A tuple ``(rules, acc_and_cov, final_class_labels)`` of three parallel
        lists: each rule as a list of ``[column_name, value]`` conditions, its
        ``[accuracy, coverage]`` pair, and the class label it predicts.
    """
    # Column compared with ">=" (age); all other columns are compared with "==".
    # This mirrors the matching convention used inside construct_rule.
    age_column = 1

    def covers(rule, row):
        # True when `row` satisfies every condition of `rule`
        # (an empty rule therefore covers every row).
        return all(
            row[column] >= value if column == age_column else row[column] == value
            for column, value in rule
        )

    rules = []  # rules
    acc_and_cov = []  # accuracy and coverage of our rules
    final_class_labels = []  # what the RHS of our rules are
    class_labels = ['alive']

    # `remaining` is only ever rebound to a freshly filtered list, never
    # modified in place, so the caller's `data` stays intact.
    remaining = data

    # The explicit emptiness check guarantees termination even when
    # cov_thresh <= 0 (an empty list would otherwise still satisfy ">=").
    while len(remaining) > 0 and len(remaining) >= cov_thresh:
        for class_label in class_labels:
            rule, acc, cov, cls_label = construct_rule(
                remaining, class_label, col_headers, acc_thresh, cov_thresh
            )
            rules.append(rule)
            acc_and_cov.append([acc, cov])
            final_class_labels.append(cls_label)

            # Keep only the rows the new rule does not cover.
            remaining = [row for row in remaining if not covers(rule, row)]

            if len(remaining) == 0:  # nothing left to learn from
                break

    # Replace column indices by column names to make the output more readable.
    for rule in rules:
        for condition in rule:
            condition[0] = get_attribute_at_index(col_headers, condition[0])

    return rules, acc_and_cov, final_class_labels
    

In [10]:
import pandas as pd
# Read the CSV and discard every row that contains a missing value.
data = pd.read_csv(data_file).dropna(how="any")

# Plain Python lists: one list per row, plus the list of column names.
data_rows = data.to_numpy().tolist()
columns_list = data.columns.to_numpy().tolist()

# Run PRISM with an accuracy threshold of 0.7 and a coverage threshold of 60000.
rules, accuracy_and_coverage, target_class_label = prism(
    columns_list,
    data_rows,
    acc_thresh=0.7,
    cov_thresh=60000,
)

In [12]:
# Print every learned rule next to its [accuracy, coverage] pair and target label.
for position, learned_rule in enumerate(rules):
    print(
        "rule: ", learned_rule,
        ", accuracy and coverage: ", accuracy_and_coverage[position],
        ", targeted class label: ", target_class_label[position],
    )

rule:  [['hypertension', 'no'], ['sex', 'female'], ['diabetes', 'no'], ['copd', 'no'], ['imm_supr', 'no'], ['renal_chronic', 'no'], ['cardiovascular', 'no'], ['age', 2]] , accuracy and coverage:  [0.9588582111015242, 66679] , targeted class label:  alive
rule:  [['hypertension', 'no'], ['diabetes', 'no'], ['obesity', 'no'], ['copd', 'no'], ['renal_chronic', 'no'], ['imm_supr', 'no'], ['cardiovascular', 'no']] , accuracy and coverage:  [0.9134423726657148, 64616] , targeted class label:  alive
rule:  [['age', 0]] , accuracy and coverage:  [0.774702154626109, 61124] , targeted class label:  alive


<p>
Depending on the accuracy and coverage thresholds you set, you get different rules. Most of the time, however, these rules matched my intuition: if you do not suffer from any chronic illness, the likelihood of dying is minimal. For example, with a coverage threshold of 60,000 and an accuracy threshold of 0.7, the top rule (with an accuracy of 0.959 and a coverage of 66679) was that you are female, have an age value of 2 or higher, and don't have hypertension, diabetes, copd, imm_supr, chronic renal issues, or cardiovascular issues. This makes sense to me and follows my intuition, as I know from the news that if you don't suffer from any prior medical problems, covid most likely won't kill you. 
    
I also found it interesting that if I set the outcome as 'dead', then no rules with decent accuracy and coverage are generated, implying that covid deaths stem from a variety of reasons (since no single rule can predict, with a high accuracy, your death). 
</p>