In [40]:
import pandas as pd


data = {
    'age': ['30-39', '40-49', '50-59', '60-69'],
    'menopause': ['premeno', 'ge40', 'ge40', 'ge40'],
    'tumor-size': ['15-19', '20-24', '10-14', '25-29'],
    'inv-nodes': ['0-2', '3-5', '0-2', '0-2'],
    'node-caps': ['no', 'yes', 'no', 'no'],
    'deg-malig': [2, 3, 1, 2],
    'breast': ['left', 'right', 'left', 'right'],
    'breast-quad': ['left_low', 'central', 'right_up', 'left_up'],
    'irradiat': ['no', 'yes', 'no', 'yes'],
    'Class': ['no-recurrence-events', 'recurrence-events',
              'no-recurrence-events', 'recurrence-events']
}

df = pd.DataFrame(data)

In [41]:
def evaluate_rule(condition : dict, X : pd.DataFrame, y : pd.Series) :
	"""
	Evaluate a rule's coverage and accuracy.
	Input: condition (dict, e.g., {'tumor-size': '15-19', 'node-caps': 'no'}),
	X (pandas DataFrame), y (pandas Series)
	Output: (coverage, accuracy) tuple
	"""
	# get entries that satisfy the condition
	indices = pd.Series(True, index=X.index)
	for feature, value in condition.items() :
		indices &= (X[feature] == value)


	X_filtered = X[indices]
	y_filtered = y[indices]

	if len(X_filtered) == 0 :
		return (0.0, 0.0)

	coverage = len(X_filtered) / len(X)
	y_major = y_filtered.value_counts().idxmax()
	accuracy = (y_filtered == y_major).sum().item() / len(y_filtered)

	return (coverage, accuracy)

X_all = df.drop(columns=['Class'])
y_all = df['Class']

# test case
print(evaluate_rule({'tumor-size': '15-19', 'node-caps': 'no'}, X_all, y_all))

(0.25, 1.0)


In [42]:
def learn_rule(X : pd.DataFrame, y : pd.Series, min_converage : float) :
	"""
	Learn one best rule that maximizes accuracy with at least min_coverage coverage.
	Input: X, y, min_coverage (int)
	Output: rule (dict), e.g., {'tumor-size': '15-19', 'node-caps': 'no'} â†’ 'no-recurrence-events'
	"""
	attributes = X.columns.tolist()
	best_rule = None
	best_acc = 0.0

	def search_rules(cur_cond : dict, cur_attr_idx : int) :
		nonlocal best_rule, best_acc, X, y

		if cur_attr_idx >= len(attributes) :
			return
		
		if len(cur_cond) > 0 :
			coverage, acc = evaluate_rule(cur_cond, X, y)
			if coverage >= min_converage and acc > best_acc :
				best_acc = acc
				best_rule = cur_cond.copy()
		
		search_rules(cur_cond, cur_attr_idx + 1)
		attr = attributes[cur_attr_idx]
		for val in X[attr].unique() :
			cur_cond[attr] = val
			search_rules(cur_cond, cur_attr_idx + 1)
			del cur_cond[attr]
	
	search_rules({}, 0)

	return best_rule

# test case
print(learn_rule(X_all, y_all, 0.5))

{'breast': 'left'}


In [43]:
def sequential_covering(X : pd.DataFrame, y : pd.Series, min_coverage : float) :
	"""
	Learn a set of rules using the sequential covering algorithm.
	Input: X, y, min_coverage (float)
	Output: list of rules (list of dicts)
	"""
	rules = []
	X_remaining = X.copy()
	y_remaining = y.copy()

	while True :
		if len(X_remaining) == 0 :
			break
		rule = learn_rule(X_remaining, y_remaining, min_coverage)
		if rule is None :
			break

		# remove covered instances
		indices = pd.Series(True, index=X_remaining.index)
		for feature, value in rule.items() :
			indices &= (X_remaining[feature] == value)

		y_major = y_remaining[indices].value_counts().idxmax()
		
		rules.append((rule, y_major))

		X_remaining = X_remaining[~indices]
		y_remaining = y_remaining[~indices]

	return rules

for rule, label in sequential_covering(X_all, y_all, 0.5) :
	print(f"Rule: {rule} -> {label}")

Rule: {'breast': 'left'} -> no-recurrence-events
Rule: {'breast-quad': 'central'} -> recurrence-events
Rule: {'breast-quad': 'left_up'} -> recurrence-events
