In [1]:
class Rule:
    def __init__ (self, attributes, values, classification, accuracy):
        self.attributes = attributes
        self.values = values
        self.classification = classification
        self.accuracy = accuracy
    def print_self(self):
        print("attr", self.attributes)
        print("values", self.values)
        print("acc", self.accuracy)

In [2]:
from pandas.api.types import is_numeric_dtype
def refine_rows(data, rule):
    rows = data.copy()
    for i in range(len(rule.attributes)):
        if(is_numeric_dtype(rows[rule.attributes[i]])):
            rows = rows[rows[rule.attributes[i]] >= rule.values[i]] #for numeric attribute interpret as greater than or equal to
        else:
            rows = rows[rows[rule.attributes[i]] == rule.values[i]]
    return rows

In [3]:
def find_accuracy(data, rule, c, attr):
    rows = refine_rows(data, rule)
    if len(rows) == 0:
        return 0
    acc = len(rows[rows[attr] == c])/len(rows) #number correct over total rows
    return acc

In [4]:
def find_coverage(data, rule):
    rows = refine_rows(data, rule)
    return len(rows) #return length of rows left after rule is applied

In [5]:
def remove_rows(data, rule):
    rows = refine_rows(data, rule)
    return data[~data.index.isin(rows.index)]

In [6]:
def learn_one_rule(data, attr, classes, acc_thresh=0.9, cvg_thresh=20):
    all_class_rules = []
    #iterate over all possible classifications
    for c in classes:
        rule = Rule([],[],None,0)
        rule.accuracy = find_accuracy(data, rule, c, attr[-1])
        rule.classification = c
        rules_list = []
        rows = data[data[attr[-1]] == c]
        temp = attr[:-1].copy()
        i = len(temp)
        while rule.accuracy < acc_thresh:
            # find all attribute-value pairs
            for a in temp: 
                vals = rows[a].unique() #values of an attribute
                for v in vals:
                    att = rule.attributes.copy()
                    att.append(a)
                    vv = rule.values.copy()
                    vv.append(v)
                    temp_rule = Rule(att, vv, rule.classification, None) #new attribute value pairs are added to list
                    temp_rule.accuracy = find_accuracy(data, temp_rule, c, attr[-1])
                    rules_list.append(temp_rule) #all new possible rules added to list
            
            #find rule with best accuracy
            best_rule = rules_list[0]
            for r in rules_list:
                if r.accuracy >= best_rule.accuracy and find_coverage(data, r) >= cvg_thresh:
                    if r.accuracy == best_rule.accuracy: #coverage breaks ties
                        if find_coverage(data, r) < find_coverage(data, best_rule):
                            continue
                    best_rule = r
            
            #set this rule to the best rule and remove from attribute list
            if best_rule.accuracy > rule.accuracy or (best_rule.accuracy == rule.accuracy and find_coverage(data, best_rule) > find_coverage(data, rule)):
                rule = best_rule
                if rule.attributes[-1] in temp:
                    temp.remove(rule.attributes[-1])
            
            #break while loop if attribute list is empty OR if you have already explored all attribute combinations
            i-=1
            if i < 0 or len(temp) == 0:
                break
        all_class_rules.append(rule)
    
    #find best rule out of rules for all classes
    best_rule = all_class_rules[0]
    for r in all_class_rules:
        if r.accuracy >= best_rule.accuracy and find_coverage(data, r) >= cvg_thresh:
            if r.accuracy == best_rule.accuracy:
                if find_coverage(data, r) < find_coverage(data, best_rule):
                    continue
            best_rule = r
    
    return best_rule, remove_rows(data, best_rule)
    #remove rows covered by R from E
    #return remaining rows
    
"""
Procedure learn-one-rule (set E):
  For each class C
    Initialize EC to all instances with class label C
    Create a rule R with an empty left-hand side that predicts class C
    Until R is perfect (or there are no more attributes to use) do
        For each attribute Ai not mentioned in R, and each attr. value vj,
            consider adding the condition Ai = vj to the LHS of R
        Select Ak = vm to maximize the accuracy: correct/all
            (break ties by choosing the condition with the largest correct)
        Add condition Ak = vm to the LHS of rule R
    Remove the instances covered by R from E
    Return remaining instances

"""

'\nProcedure learn-one-rule (set E):\n  For each class C\n    Initialize EC to all instances with class label C\n    Create a rule R with an empty left-hand side that predicts class C\n    Until R is perfect (or there are no more attributes to use) do\n        For each attribute Ai not mentioned in R, and each attr. value vj,\n            consider adding the condition Ai = vj to the LHS of R\n        Select Ak = vm to maximize the accuracy: correct/all\n            (break ties by choosing the condition with the largest correct)\n        Add condition Ak = vm to the LHS of rule R\n    Remove the instances covered by R from E\n    Return remaining instances\n\n'

In [7]:
import pandas as pd
data = pd.read_csv("../../ml_datasets/covid_categorical_good.csv")
data = data.dropna(how="any")
data_rows = data.to_numpy().tolist()
columns_list = data.columns.to_numpy().tolist()

In [8]:
rows = data.copy()
i = len(rows)/20 #divide by 20 since each rule will cover at least 20 elements
while len(rows) != 0:
    rule, rows = learn_one_rule(rows, columns_list, ['dead', 'alive']) #learn one rule then repeat algorithm on remaining rows
    if i < 0 or rule.accuracy < 0.9: #break after rules stop having good accuracy
        break
    rule.print_self()
    print("covg", find_coverage(data, rule))
    print("-->", rule.classification) 
    i -= 1

attr ['hypertension']
values ['no']
acc 0.9118543984283984
covg 175108
--> alive
attr ['asthma', 'imm_supr', 'sex']
values ['yes', 'yes', 'female']
acc 0.9047619047619048
covg 110
--> alive
attr ['asthma', 'diabetes', 'sex', 'tobacco']
values ['yes', 'no', 'female', 'yes']
acc 0.9615384615384616
covg 170
--> alive
attr ['asthma', 'sex', 'tobacco']
values ['yes', 'female', 'yes']
acc 0.9545454545454546
covg 230
--> alive
attr ['asthma', 'diabetes', 'sex', 'obesity', 'cardiovascular', 'age']
values ['yes', 'no', 'female', 'no', 'no', 71]
acc 0.9130434782608695
covg 63
--> alive
attr ['asthma', 'diabetes', 'sex', 'obesity', 'copd', 'cardiovascular', 'renal_chronic']
values ['yes', 'no', 'female', 'no', 'no', 'no', 'no']
acc 0.9014778325123153
covg 2031
--> alive
attr ['diabetes', 'sex', 'asthma', 'obesity', 'renal_chronic', 'copd', 'cardiovascular', 'age']
values ['no', 'female', 'yes', 'yes', 'no', 'no', 'no', 66]
acc 0.9090909090909091
covg 33
--> alive


# Summary #
<p> With an accuracy threshold of 0.9 and a coverage threshold of 20, many rules were produced as combinations of different attributes.  I was suprised that many patients who had asthma were not affected by Covid. Additionally, I was surprised that age wasn't a large factor.  Also, a large percentage of females who smoke tobacco and had asmtha were reported alive.  The top rules that were produced with accuracy > 0.9 were:</p>
<ul>
    <li>if hypertension=no --> alive</li>
    <li>if asthma=yes, imm_supr=yes, sex=female --> alive</li>
    <li>if asthma=yes, diabetes=no, sex=female. tobacco=yes --> alive</li>
    <li>if asthma=yes, sex=female, tobacco=yes --> alive</li>
    <li>if asthma=yes, diabetes=no, sex=female, obesity=no, cardiovascular=no, age>=71 --> alive</li>
    <li>if asthma=yes, diabetes=no, sex=female, obesity=no, copd=no, cardiovascular=no, renal_chronic=no --> alive</li>
    <li>if diabetes=no, sex=female, asthma=yes, obesity=yes, renal_chronic=no, copd=no, cardiovascular=no, age>=66 --> alive</li>
    
I was surprised to find that a rule could not be found for the death classification with these values for accuracy and coverage thresholds.