In [None]:
%matplotlib inline
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

In [None]:
X = np.array([[0, 0], [1, 0], [0, 1], [1, 1]])
y = (X[:,0] + X[:,1] == 1).astype(int)

In [None]:
for i in [0, 1]:
    plt.scatter(X[y == i][:,0], X[y == i][:,1], marker=['o', 'x'][i], color=['b', 'k'][i],
    s=1000)
    plt.xlabel('Switch 0')
    plt.ylabel('Switch 1')

In [None]:
# Fix the global NumPy seed so the draws below are reproducible.
np.random.seed(0)
# Ten binary labels, each equal to 1 with probability 0.5.
y_simple = np.random.binomial(1, .5, size=10)
# One-column feature matrix whose single feature is an exact copy of the label.
X_simple = np.array([[e] for e in y_simple])

In [None]:
count = (X_simple[:,0][y_simple == 0] == 0).sum()
print(f"In {count} instances, both the switch and the light are off")

In [None]:
count = (X_simple[:,0][y_simple == 1] == 1).sum()
print(f"In {count} instances, both the switch and the light are on")

In [None]:
def get_co_occurrence(X, y, col=0):
    """Count how often each (label, feature value) pair occurs.

    Only the values 0 and 1 are counted, for both the label and the feature.

    Returns a 2x2 integer array `table` in which `table[i][j]` is the number
    of rows with `y == i` whose entry in column `col` of `X` equals `j`.
    Rows are therefore indexed by label and columns by feature value.
    """
    feature = X[:, col]
    table = np.zeros((2, 2), dtype=int)
    for label in (0, 1):
        feature_given_label = feature[y == label]
        for state in (0, 1):
            table[label, state] = np.count_nonzero(feature_given_label == state)
    return table
M = get_co_occurrence(X_simple, y_simple)
# Sanity-check the diagonal: M[0][0] counts rows where label and feature are
# both 0, and M[1][1] counts rows where both are 1.
assert M[0][0] == 3
assert M[1][1] == 7
print(M)

In [None]:
X_simple

In [None]:
X_simple = np.vstack([X_simple, [1]])

In [None]:
y_simple

In [None]:
y_simple = np.hstack([y_simple, [0]])

In [None]:
M = get_co_occurrence(X_simple, y_simple)

In [None]:
M

In [None]:
bulb_probs = M[0] / M[0].sum()
print("When the switch is set to 0, the bulb state probabilities are:")
print(bulb_probs)

In [None]:
bulb_probs = M[1] / M[1].sum()
print("When the switch is set to 1, the bulb state probabilities are:")
print(bulb_probs)
prediction = ['off', 'on'][bulb_probs.argmax()]
accuracy = bulb_probs.max()
print(f"\nWe assume the bulb is {prediction} with "
f"{100 * accuracy:.0f}% accuracy")

In [None]:
M.sum(axis=1)

In [None]:
accuracies = [.75, 1.]
total_accuracy = np.average(accuracies, weights=M.sum(axis=1))

In [None]:
total_accuracy

In [None]:
def train_if_else(X, y, feature_col=0, feature_name='feature'):
    """Train a one-feature if/else classifier on binary data.

    For each value (0 or 1) of column `feature_col`, the predicted class is
    the class most frequently observed together with that value.

    Parameters:
        X: 2D array; only column `feature_col` is used.
        y: 1D array of labels aligned with the rows of `X`.
        feature_col: index of the feature column to condition on.
        feature_name: variable name used in the generated statement.

    Returns:
        A tuple `(if_else, total_accuracy)`. `if_else` is Python source text
        assigning `prediction`; it collapses to a single assignment when both
        branches predict the same class. `total_accuracy` is the fraction of
        training rows the statement classifies correctly.

    Raises:
        ZeroDivisionError: from `np.average` when `X` has no rows.
    """
    # get_co_occurrence returns counts[label][feature value]. Transpose so each
    # row holds the class counts for ONE feature value, which is what an
    # "if feature == v" branch needs.
    counts = get_co_occurrence(X, y, col=feature_col).T
    branch_sizes = counts.sum(axis=1)
    pred0, pred1 = counts.argmax(axis=1)
    if pred0 == pred1:
        if_else = f"prediction = {pred0}"
    else:
        # Built line by line so the emitted text is valid, consistently
        # indented Python regardless of how this function itself is indented.
        if_else = "\n".join([
            f"if {feature_name} == 0:",
            f"    prediction = {pred0}",
            "else:",
            f"    prediction = {pred1}",
        ])
    # A feature value that never occurs contributes zero weight; give it a
    # placeholder accuracy of 0.0 instead of 0/0 = NaN poisoning the average.
    accuracies = [counts[i].max() / branch_sizes[i] if branch_sizes[i] else 0.0
                  for i in [0, 1]]
    total_accuracy = np.average(accuracies, weights=branch_sizes)
    return if_else, total_accuracy

In [None]:
if_else, accuracy = train_if_else(X_simple, y_simple, feature_name='switch')
print(if_else)
print(f"\nThis statement is {100 * accuracy:.0f}% accurate.")

In [None]:
feature_names = [f"switch{i}" for i in range(2)]

In [None]:
for i, name in enumerate(feature_names):
    _, accuracy = train_if_else(X, y, feature_col=i, feature_name=name)
    print(f"The model trained on {name} is {100 * accuracy:.0f}% " "accurate.")

In [None]:
is_off = X[:, 0] == 0

In [None]:
is_off

In [None]:
X_switch0_off = X[is_off]
y_switch0_off = y[is_off]

In [None]:
X_switch0_off

In [None]:
y_switch0_off

In [None]:
X_switch0_off = np.delete(X_switch0_off, 0, axis=1)

In [None]:

X_switch0_off

In [None]:
results = train_if_else(X_switch0_off, y_switch0_off, feature_name='switch1')

In [None]:
results

In [None]:
switch0_off_model, off_accuracy = results

In [None]:
off_accuracy

In [None]:
def filter_X_y(X, y, filter_col=0, condition=0):
    """Keep only the rows whose `filter_col` entry equals `condition`.

    Returns `(X_filtered, y_filtered)`. The filter column is removed from
    `X_filtered`, since it is constant across the kept rows.
    """
    keep = X[:, filter_col] == condition
    kept_rows = X[keep]
    return np.delete(kept_rows, filter_col, axis=1), y[keep]

In [None]:
X_switch0_on, y_switch0_on = filter_X_y(X, y, filter_col=0, condition=1)

In [None]:
results = train_if_else(X_switch0_on, y_switch0_on, feature_name='switch1')

In [None]:
switch0_on_model, on_accuracy = results

In [None]:
on_accuracy

In [None]:
switch0_on_model

In [None]:
def combine_if_else(if_else_a, if_else_b, feature_name='feature'):
    """Nest two if/else statements under a test on `feature_name`.

    `if_else_a` becomes the body of the `{feature_name} == 0` branch and
    `if_else_b` the body of the `else` branch. The result is built line by
    line so that the only indentation in the output is the four spaces added
    by `add_indent`; the returned text is valid Python source.
    """
    return "\n".join([
        f"if {feature_name} == 0:",
        add_indent(if_else_a),
        "else:",
        add_indent(if_else_b),
    ])


def add_indent(if_else):
    """Prefix every line of `if_else` with four spaces."""
    return '\n'.join([4 * ' ' + line for line in if_else.split('\n')])

In [None]:
nested_model = combine_if_else(switch0_off_model, switch0_on_model, feature_name='switch0')

In [None]:
nested_model

In [None]:
accuracies = [off_accuracy, on_accuracy]
weights = [y_switch0_off.size, y_switch0_on.size]
total_accuracy = np.average(accuracies, weights=weights)
print(f"Our total accuracy is {100 * total_accuracy:.0f}%")

In [None]:
X, y

In [None]:
def split(X, y, feature_col=0, condition=0):
    """Partition the data on whether column `feature_col` equals `condition`.

    Returns the list `[X_a, X_b, y_a, y_b]`. The `_a` arrays hold the rows
    that match `condition`, the `_b` arrays hold all other rows, and the split
    column is removed from both feature matrices.
    """
    matches = X[:, feature_col] == condition
    X_a = np.delete(X[matches], feature_col, axis=1)
    X_b = np.delete(X[~matches], feature_col, axis=1)
    return [X_a, X_b, y[matches], y[~matches]]

In [None]:
X_a, X_b, y_a, y_b = split(X, y)

In [None]:
assert np.array_equal(X_a, X_switch0_off)
assert np.array_equal(X_b, X_switch0_on)

In [None]:
def train_nested_if_else(X, y, split_col=0,
    feature_names=('feature0', 'feature1')):
    """Train a two-level if/else model on a two-feature binary dataset.

    A single-feature model on `split_col` is trained first. If it is not
    perfect, the data is split on `split_col`, a single-feature model on the
    other column is trained for each half, and the two are nested. The nested
    model is returned only if it is strictly more accurate.

    Parameters:
        X: 2D feature array; the `1 - split_col` lookup below assumes exactly
            two columns.
        y: 1D array of labels aligned with the rows of `X`.
        split_col: index (0 or 1) of the outer feature.
        feature_names: names of the two features, indexed like the columns of
            `X`. The default names are distinct so the generated statement can
            tell the features apart, and immutable so the default cannot be
            modified between calls.

    Returns:
        A tuple `(model, accuracy)` with the chosen statement text and its
        accuracy on the training data.
    """
    split_name = feature_names[split_col]
    simple_model, simple_accuracy = train_if_else(X, y, split_col, split_name)
    if simple_accuracy == 1.0:
        # One feature already classifies everything; nesting cannot improve it.
        return (simple_model, simple_accuracy)
    X_a, X_b, y_a, y_b = split(X, y, feature_col=split_col)
    # With two features, the inner feature is simply "the other one".
    in_name = feature_names[1 - split_col]
    if_else_a, accuracy_a = train_if_else(X_a, y_a, feature_name=in_name)
    if_else_b, accuracy_b = train_if_else(X_b, y_b, feature_name=in_name)
    nested_model = combine_if_else(if_else_a, if_else_b, split_name)
    accuracies = [accuracy_a, accuracy_b]
    # Weight each branch by the number of rows routed to it.
    nested_accuracy = np.average(accuracies, weights=[y_a.size, y_b.size])
    if nested_accuracy > simple_accuracy:
        return (nested_model, nested_accuracy)
    return (simple_model, simple_accuracy)

In [None]:
feature_names = ['switch0', 'switch1']
model, accuracy = train_nested_if_else(X, y, feature_names=feature_names)
print(model)
print(f"\nThis statement is {100 * accuracy:.0f}% accurate.")

In [None]:
# Fix the global NumPy seed; the order of the draws below determines the data.
np.random.seed(1)
# 100 binary labels, each equal to 1 with probability 0.6.
y_rain = np.random.binomial(1, .6, size=100)
# Each feature copies the label with the stated probability and flips it
# otherwise: 0.95 for is_wet, 0.6 for is_fall.
is_wet = [e if np.random.binomial(1, 0.95) else 1 - e for e in y_rain]
is_fall = [e if np.random.binomial(1, 0.6) else 1 - e for e in y_rain]

In [None]:
X_rain = np.array([is_fall, is_wet]).T

In [None]:
X_rain

In [None]:
feature_names = ['is_autumn', 'is_wet']
model, accuracy = train_nested_if_else(X_rain, y_rain,
feature_names=feature_names)
print(model)
print(f"\nThis statement is {100 * accuracy:.0f}% accurate.")

In [None]:
model, accuracy = train_nested_if_else(X_rain, y_rain, split_col=1,
feature_names=feature_names)

In [None]:
print(model)
print(f"\nThis statement is {100 * accuracy:.0f}% accurate.")

In [None]:
prob_rain = y_rain.sum()/y_rain.size

In [None]:
print(f"It rains in {100 * prob_rain:.0f}% of our observations.")

In [None]:
# split() returns [X_a, X_b, y_a, y_b]; keep only the two label arrays.
# y_fall_a holds the labels of rows where column 0 equals 0, y_fall_b the rest.
y_fall_a, y_fall_b = split(X_rain, y_rain, feature_col=0)[-2:]
for i, y_fall in enumerate([y_fall_a, y_fall_b]):
    # Share of 1-labels within this group.
    prob_rain = y_fall.sum() / y_fall.size
    # i == 0 is the group where the feature is 0, i == 1 the remaining rows.
    state = ['not autumn', 'autumn'][i]
    print(f"It rains {100 * prob_rain:.0f}% of the time when it is "
    f"{state}")

In [None]:
y_wet_a, y_wet_b = split(X_rain, y_rain, feature_col=1)[-2:]
for i, y_wet in enumerate([y_wet_a, y_wet_b]):
    prob_rain = y_wet.sum() / y_wet.size
    state = ['not wet', 'wet'][i]
    print(f"It rains {100 * prob_rain:.0f}% of the time when it is "
    f"{state}")

In [None]:
X_rain

In [None]:
def get_class_distribution(y):
    """Return the two-element array `[1 - p, p]` with `p = y.sum() / y.size`.

    When `y` holds 0/1 labels, `p` is the fraction of ones, so the result is
    the class distribution `[P(0), P(1)]`.
    """
    positive_fraction = y.sum() / y.size
    negative_fraction = 1 - positive_fraction
    return np.array([negative_fraction, positive_fraction])

In [None]:
def plot_vector(v, label, linestyle='-', color='b'):
    """Plot the 2D vector `v` as a line segment from the origin to `(v[0], v[1])`."""
    x_end, y_end = v[0], v[1]
    plt.plot([0, x_end], [0, y_end], label=label, linestyle=linestyle, c=color)

In [None]:
classes = [y_fall_a, y_fall_b, y_wet_a, y_wet_b]

In [None]:
distributions = [get_class_distribution(y) for y in classes]
labels = ['Not Autumn', 'Autumn', 'Not Wet', 'Wet']
colors = ['y', 'g', 'k', 'b']
linestyles = ['-.', ':', '-', '--']
for tup in zip(distributions, labels, colors, linestyles):
    vector, label, color, linestyle = tup
    plot_vector(vector, label, linestyle=linestyle, color=color)
plt.legend()
plt.xlabel('Probability Not Rain')
plt.ylabel('Probability Rain')
plt.axis('equal')

In [None]:
prob_rain = np.arange(0, 1.001, 0.01)

In [None]:
vectors = [np.array([1 - p, p]) for p in prob_rain]
magnitudes = [np.linalg.norm(v) for v in vectors]

In [None]:
magnitudes

In [None]:
square_magnitudes = [v @ v for v in vectors]
plt.plot(prob_rain, magnitudes, label='Magnitude')
plt.plot(prob_rain, square_magnitudes, label='Squared Magnitude',
linestyle='--')
plt.xlabel('Probability of Rain')
plt.axvline(0.5, color='k', label='Perfect Balance', linestyle=':')
plt.legend()

In [None]:
gini_impurities = [1 - (v @ v) for v in vectors]
plt.plot(prob_rain, gini_impurities)
plt.xlabel('Probability of Rain')
plt.ylabel('Gini Impurity')

In [None]:
def compute_impurity(y_a, y_b):
    """Return the size-weighted mean Gini impurity of two label partitions.

    The impurity of one partition is `1 - v @ v`, where `v` is its class
    distribution from `get_class_distribution`. Each partition is weighted by
    its own number of labels.

    Raises:
        ZeroDivisionError: from `np.average` when both partitions are empty.
    """
    partitions = [y_a, y_b]
    impurities = []
    for labels in partitions:
        if labels.size == 0:
            # An empty partition has zero weight; give it a placeholder
            # impurity so 0/0 = NaN does not poison the weighted mean.
            impurities.append(0.0)
            continue
        v = get_class_distribution(labels)
        impurities.append(1 - v @ v)
    # Weights must come from the arguments themselves, never from a
    # notebook-level variable that happens to be in scope.
    weights = [labels.size for labels in partitions]
    return np.average(impurities, weights=weights)
fall_impurity = compute_impurity(y_fall_a, y_fall_b)
wet_impurity = compute_impurity(y_wet_a, y_wet_b)
print(f"When we split on Autumn, the Impurity is {fall_impurity:0.2f}.")
print(f"When we split on Wetness, the Impurity is {wet_impurity:0.2f}.")

In [None]:
def sort_feature_indices(X, y):
    """Return the column indices of `X` ordered from lowest to highest impurity.

    The impurity of a column is the value of `compute_impurity` for the two
    label groups produced by `split` on that column.
    """
    def impurity_of(col):
        y_a, y_b = split(X, y, feature_col=col)[-2:]
        return compute_impurity(y_a, y_b)

    columns = range(X.shape[1])
    impurity_by_col = [impurity_of(col) for col in columns]
    return sorted(columns, key=impurity_by_col.__getitem__)
indices = sort_feature_indices(X_rain, y_rain)
top_feature = feature_names[indices[0]]
print(f"The feature with the minimal impurity is: '{top_feature}'")

In [None]:
def simulate_weather():
    is_fall = np.random.binomial(1, .25)
    is_cloudy = np.random.binomial(1, [.3, .7][is_fall])
    rained_today = np.random.binomial(1, [.05, .4][is_cloudy])
    if rained_today:
        rains_tomorrow = np.random.binomial(1, .5)
    else:
        rains_tomorrow = np.random.binomial(1, [.05, .15][is_fall])
    features = [rained_today, is_fall, is_cloudy]
    return features, rains_tomorrow

In [None]:
np.random.seed(0)
X_train, y_train = [], []
for _ in range(1000):
    features, rains_tomorrow = simulate_weather()
    X_train.append(features)
    y_train.append(rains_tomorrow)

In [None]:
X_rain = np.array(X_train)
y_rain = np.array(y_train)

In [None]:
X_rain

In [None]:
y_rain

In [None]:
feature_names = ['rained_today', 'is_fall', 'is_cloudy']

In [None]:
indices = sort_feature_indices(X_rain, y_rain)

In [None]:
indices

In [None]:
print(f"Features sorted by Gini Impurity:")
print([feature_names[i] for i in indices])

In [None]:
skip_index = indices[-1]

In [None]:
X_subset = np.delete(X_rain, skip_index, axis=1)

In [None]:
name_subset = np.delete(feature_names, skip_index)

In [None]:
name_subset

In [None]:
indices[0]

In [None]:
skip_index

In [None]:
split_col = indices[0] if indices[0] < skip_index else indices[0] - 1

In [None]:
model, accuracy = train_nested_if_else(X_subset, y_rain, split_col=split_col, feature_names=name_subset)

In [None]:
model

In [None]:
print(f"\nThis statement is {100 * accuracy:.0f}% accurate.")

In [None]:
X_a, X_b, y_a, y_b = split(X_rain, y_rain, feature_col=indices[0])

In [None]:
name_subset = np.delete(feature_names, indices[0])

In [None]:
X_a

In [None]:
split_col = sort_feature_indices(X_a, y_a)[0]
model_a, accuracy_a = train_nested_if_else(X_a, y_a, split_col=split_col, feature_names=name_subset)

In [None]:
print("If it is not autumn, then the following nested model is " f"{100 * accuracy_a:.0f}% accurate.\n\n{model_a}")

In [None]:
split_col = sort_feature_indices(X_b, y_b)[0]
model_b, accuracy_b = train_nested_if_else(X_b, y_b, split_col=split_col, feature_names=name_subset)
print("If it is autumn, then the following nested model is "
f"{100 * accuracy_b:.0f}% accurate.\n\n{model_b}")

In [None]:
nested_model = combine_if_else(model_a, model_b, feature_names[indices[0]])
print(nested_model)
accuracies = [accuracy_a, accuracy_b]
accuracy = np.average(accuracies, weights=[y_a.size, y_b.size])
print(f"\nThis statement is {100 * accuracy:.0f}% accurate.")

In [None]:
def train(X, y, feature_names):
    """Recursively train a nested if/else model over all columns of `X`.

    Two candidates are compared. The "simple" model is trained with the
    highest-impurity feature dropped. The "nested" model splits on the
    lowest-impurity feature and trains a sub-model on each half. The nested
    model is returned only when it is strictly more accurate.

    Returns a tuple `(model, accuracy)` with the statement text and its
    accuracy on the training data.
    """
    # Base case: one feature left, so fit a plain if/else on it.
    if X.shape[1] == 1:
        return train_if_else(X, y, feature_name=feature_names[0])

    ranked = sort_feature_indices(X, y)
    best_col, worst_col = ranked[0], ranked[-1]

    # Candidate 1: ignore the least informative feature.
    simple_model, simple_accuracy = train(
        np.delete(X, worst_col, axis=1), y, np.delete(feature_names, worst_col)
    )
    if simple_accuracy == 1.0:
        return (simple_model, simple_accuracy)

    # Candidate 2: branch on the most informative feature, then recurse.
    remaining_names = np.delete(feature_names, best_col)
    X_a, X_b, y_a, y_b = split(X, y, feature_col=best_col)
    model_a, accuracy_a = train(X_a, y_a, remaining_names)
    model_b, accuracy_b = train(X_b, y_b, remaining_names)
    total_accuracy = np.average([accuracy_a, accuracy_b],
                                weights=[y_a.size, y_b.size])
    nested_model = combine_if_else(model_a, model_b, feature_names[best_col])

    if total_accuracy > simple_accuracy:
        return (nested_model, total_accuracy)
    return (simple_model, simple_accuracy)

In [None]:
model, accuracy = train(X_rain, y_rain, feature_names)
print(model)
print(f"\nThis statement is {100 * accuracy:.0f}% accurate.")

In [None]:
from sklearn.tree import DecisionTreeClassifier

In [None]:
clf = DecisionTreeClassifier()
clf.fit(X, y)

In [None]:
from sklearn.tree import plot_tree

In [None]:
feature_names = ['Switch 0', 'Switch 1']
class_names = ['Off', 'On']

In [None]:
plot_tree(clf, feature_names=feature_names, class_names=class_names)

In [None]:
from sklearn.tree import export_text
text_tree = export_text(clf, feature_names=feature_names)

In [None]:
print(text_tree)

In [None]:
np.random.seed(1)
feature = np.random.normal(size=1000)

In [None]:
y = (feature >= .7).astype('int')

In [None]:
thresholds = np.arange(0, 1, .001)

In [None]:
gini_impurities = []

In [None]:
# Score every candidate threshold by the impurity of the two label groups it
# creates. Results are appended to the list created in the previous cell, so
# re-running only this cell makes that list longer than `thresholds`.
for threshold in thresholds:
    # Labels of the points strictly below / at-or-above the threshold.
    y_left = y[feature < threshold]
    y_right = y[feature >= threshold]
    impurity = compute_impurity(y_left, y_right)
    gini_impurities.append(impurity)

In [None]:
best_thresh = thresholds[np.argmin(gini_impurities)]
print(f"impurity is minimized at a threshold of {best_thresh:.02f}")
plt.plot(thresholds, gini_impurities)
plt.axvline(best_thresh, c='k', linestyle='--')
plt.xlabel('Threshold')
plt.ylabel('impurity')

In [None]:
np.random.seed(0)
from sklearn.datasets import load_wine
X, y = load_wine(return_X_y=True)
clf.fit(X, y)
feature_names = load_wine().feature_names
text_tree = export_text(clf, feature_names=feature_names)

In [None]:
print(text_tree)

In [None]:
clf = DecisionTreeClassifier(max_depth=2)
clf.fit(X, y)
text_tree = export_text(clf, feature_names=feature_names)
print(text_tree)

In [None]:
clf.feature_importances_

In [None]:
for i in np.argsort(clf.feature_importances_)[::-1]:
    feature = feature_names[i]
    importance = clf.feature_importances_[i]
    if importance == 0:
        break
    print(f"'{feature}' has an importance score of {importance:0.2f}")

In [None]:
from sklearn.datasets import load_breast_cancer

In [None]:
data = load_breast_cancer()

In [None]:
feature_names = data.feature_names

In [None]:
feature_names

In [None]:
num_features = len(feature_names)

In [None]:
num_features

In [None]:
num_classes = len(data.target_names)

In [None]:
num_classes

In [None]:
print(f"The cancer dataset contains the following {num_classes} classes:")
print(data.target_names)
print(f"\nIt contains these {num_features} features:")
print(feature_names)

In [None]:
X, y = load_breast_cancer(return_X_y=True)

In [None]:
clf = DecisionTreeClassifier()

In [None]:
clf.fit(X, y)

In [None]:
for i in np.argsort(clf.feature_importances_)[::-1]:
    feature = feature_names[i]
    importance = clf.feature_importances_[i]
    if round(importance, 2) == 0:
        break
    print(f"'{feature}' has an importance score of {importance:0.2f}")

In [None]:
# Column of X with the highest importance in the fitted tree.
index = clf.feature_importances_.argmax()
plt.hist(X[y==0][:, index], label='Malignant', bins='auto')
plt.hist(X[y==1][:, index], label='Benign', bins='auto')
# Label the axis with the feature that is actually plotted, so the label stays
# correct whichever feature the tree ranks first.
plt.xlabel(feature_names[index].title())
plt.legend()

In [None]:
from sklearn.metrics import accuracy_score

In [None]:
accuracy = accuracy_score(y, clf.predict(X))

In [None]:
print("Our classifier has memorized the training data with "
f"{100 * accuracy:.0f}% accuracy.")

In [None]:
from sklearn.model_selection import train_test_split

In [None]:
np.random.seed(0)
X_train, X_test, y_train, y_test = train_test_split(X, y,)

In [None]:
clf = DecisionTreeClassifier()
clf.fit(X_train, y_train)

In [None]:
accuracy = accuracy_score(y_test, clf.predict(X_test))

In [None]:
accuracy

In [None]:
forest = [DecisionTreeClassifier() for _ in range(100)]

In [None]:
def bootstrap(X, y):
    """Resample the rows of `X` and `y` with replacement.

    Row indices are drawn from the global NumPy random state. The same indices
    are applied to both arrays, so every sampled feature row keeps its label.
    The output has as many rows as the input.
    """
    row_count = X.shape[0]
    sampled_rows = np.random.choice(range(row_count), size=row_count, replace=True)
    return X[sampled_rows], y[sampled_rows]

In [None]:
np.random.seed(1)
X_train_new, y_train_new = bootstrap(X_train, y_train)
assert X_train.shape == X_train_new.shape
assert y_train.size == y_train_new.size
assert not np.array_equal(X_train, X_train_new)
assert not np.array_equal(y_train, y_train_new)

In [None]:
np.random.seed(1)
features_train, classes_train = [], []
for _ in range(100):
    X_train_new, y_train_new = bootstrap(X_train, y_train)
    features_train.append(X_train_new)
    classes_train.append(y_train_new)

In [None]:
np.random.seed(1)
# Each tree sees roughly sqrt(number of features) randomly chosen columns.
sample_size = int(X.shape[1] ** 0.5)
assert sample_size == 5
# Derive the column count, sample size and tree count from the data and the
# forest instead of repeating the literals 30, 5 and 100.
feature_indices = [np.random.choice(range(X.shape[1]), sample_size, replace=False)
                   for _ in range(len(forest))]

In [None]:
for i, index_subset in enumerate(feature_indices):
    features_train[i] = features_train[i][:, index_subset]
for index in [0, 99]:
    index_subset = feature_indices[index]
    names = feature_names[index_subset]
    print(f"\nRandom features utilized by Tree {index}:")
    print(names)

In [None]:
for i, clf_tree in enumerate(forest):
    clf_tree.fit(features_train[i], classes_train[i])

In [None]:
features_train[0].shape

In [None]:
from collections import Counter
feature_vector = X_test[0]
votes = []
for i, clf_tree in enumerate(forest):
    # Each tree was trained on its own column subset, so it must be queried
    # with the same columns.
    index_subset = feature_indices[i]
    vector_subset = feature_vector[index_subset]
    prediction = clf_tree.predict([vector_subset])[0]
    votes.append(prediction)
class_to_votes = Counter(votes)
# Use a distinct loop name so the `votes` list above is not overwritten by an
# integer count.
for class_label, vote_count in class_to_votes.items():
    print(f"We counted {vote_count} votes for class {class_label}.")
top_class = class_to_votes.most_common(1)[0][0]
print(f"\nClass {top_class} has received the plurality of the votes.")

In [None]:
true_label = y_test[0]
print(f"The true class of the data-point is {true_label}.")

In [None]:
predictions = []

In [None]:
for i, clf_tree in enumerate(forest):
    index_subset = feature_indices[i]
    prediction = clf_tree.predict(X_test[:, index_subset])
    predictions.append(prediction)

In [None]:
predictions = np.array(predictions)

In [None]:
predictions.shape

In [None]:
predictions

In [None]:
y_pred = [Counter(predictions[:,i]).most_common()[0][0] for i in range(y_test.size)]

In [None]:
accuracy = accuracy_score(y_test, y_pred)

In [None]:
accuracy

In [None]:
# Seed the global NumPy RNG before fitting; no random_state is passed to the
# forest.
np.random.seed(1)
from sklearn.ensemble import RandomForestClassifier
clf_forest = RandomForestClassifier()
clf_forest.fit(X_train, y_train)
y_pred = clf_forest.predict(X_test)
# accuracy_score is documented as (y_true, y_pred); keep that order, matching
# the earlier accuracy_score calls in this notebook.
accuracy = accuracy_score(y_test, y_pred)
print("The forest has predicted the validation outputs with " f"{100 * accuracy:.0f}% accuracy")

In [None]:
# Seed the global NumPy RNG before fitting; no random_state is passed to the
# forest.
np.random.seed(1)
clf_forest = RandomForestClassifier(n_estimators=10)
clf_forest.fit(X_train, y_train)
y_pred = clf_forest.predict(X_test)
# accuracy_score is documented as (y_true, y_pred); keep that order, matching
# the earlier accuracy_score calls in this notebook.
accuracy = accuracy_score(y_test, y_pred)
print("The 10-tree forest has predicted the validation outputs with " f"{100 * accuracy:.0f}% accuracy")

In [None]:
for i in np.argsort(clf_forest.feature_importances_)[::-1][:3]:
    feature = feature_names[i]
    importance = clf_forest.feature_importances_[i]
    print(f"'{feature}' has an importance score of {importance:0.2f}")