In [1]:
import pandas as pd 
import numpy as np
import warnings

warnings.simplefilter(action='ignore', category=FutureWarning)

In [589]:
column_names = ["buying", "maint", "doors", "persons", "lug_boot", "safety", "label"]
train_data = pd.read_csv('car-4/train.csv',names=column_names)
test_data = pd.read_csv('car-4/test.csv',names=column_names)

In [590]:
train_data.head()

Unnamed: 0,buying,maint,doors,persons,lug_boot,safety,label
0,low,vhigh,4,4,big,med,acc
1,low,high,5more,4,med,high,vgood
2,vhigh,med,2,2,big,high,unacc
3,high,high,2,2,small,high,unacc
4,vhigh,low,3,2,big,low,unacc


In [591]:
# Define the entropy
def entropy(target_col):
    value_counts = target_col.value_counts(normalize=True)
    temp_entropy = -np.sum(value_counts * np.log2(value_counts))
    return temp_entropy

In [592]:
# Define the majority error
def majority_error(target_col):
    value_counts = target_col.value_counts(normalize=True)
    return 1 - np.max(value_counts)

In [593]:
# Define the gini index
def gini_index(target_col):
    counts = target_col.value_counts()
    probabilities = counts / len(target_col)
    return 1 - np.sum(probabilities**2)

In [594]:
def info_gain(data, split_attribute_name, target_name, criterion="entropy"):
    criterion_map = {
        "entropy": entropy,
        "majority_error": majority_error,
        "gini_index": gini_index
    }
    if criterion not in criterion_map:
        raise ValueError("Invalid criterion provided")

    total_impurity = criterion_map[criterion](data[target_name])

    weighted_impurity = 0
    for value, group in data.groupby(split_attribute_name):
        weighted_impurity += (len(group) / len(data)) * criterion_map[criterion](group[target_name])

    return total_impurity - weighted_impurity

In [595]:
def get_majority_class(data, target_attribute_name):
    return data[target_attribute_name].value_counts().idxmax()

In [596]:
def ID3(data, original_data, features, target_attribute_name="label", parent_node_class=None,
        max_depth=None, depth=0, criterion="entropy"):
    unique_targets = np.unique(data[target_attribute_name])
    if len(unique_targets) == 1:
        return unique_targets[0]
    
    if len(data) == 0:
        return get_majority_class(original_data, target_attribute_name)

    if len(features) == 0 or (max_depth and depth == max_depth):
        return parent_node_class
    
    parent_node_class = get_majority_class(data, target_attribute_name)
    
    gains = [info_gain(data, feature, target_attribute_name, criterion) for feature in features]
    best_feature = features[np.argmax(gains)]
    tree = {best_feature: {}}

    for value in np.unique(data[best_feature]):
        best_val = data[best_feature] == value
        sub_data = data.where(best_val).dropna()
        remaining_features = [feat for feat in features if feat != best_feature]
        subtree = ID3(
            data=sub_data,
            original_data=original_data,
            features=remaining_features,
            target_attribute_name=target_attribute_name,
            parent_node_class=parent_node_class,
            max_depth=max_depth,
            depth=depth + 1,
            criterion=criterion
        )

        tree[best_feature][value] = subtree

    return tree

In [597]:
def predict(query, tree, default=1):
    current_layer = tree
    while isinstance(current_layer, dict):
        key = next(iter(current_layer))
        value = query.get(key)
        if value is not None:
            current_layer = current_layer[key].get(value, default)
        else:
            return default
    return current_layer

In [598]:
# Function to compute accuracy using lists
def test(data, tree, target):
    queries = data.drop(target, axis=1).to_dict(orient="records")
    predicted_values = [predict(query, tree) for query in queries]
    correct_predictions = sum(1 for actual, predicted in zip(data[target], predicted_values) if actual == predicted)
    return (correct_predictions / len(data)) * 100

In [599]:
# Function to compute error
def compute_error(tree, train_data, test_data, target):
    train_accuracy = test(train_data, tree, target)
    test_accuracy = test(test_data, tree, target)
    return 100 - train_accuracy, 100 - test_accuracy

In [600]:
columns = ["Criterion", "Depth", "Training Error", "Test Error"]

# Create an empty DataFrame with the defined columns
results_df = pd.DataFrame(columns=columns)

criterions = ["entropy", "majority_error", "gini_index"]
max_depths = list(range(1, 7))

for criterion in criterions:
    for max_depth in max_depths:
        tree = ID3(train_data, train_data, list(train_data.columns[:-1]), max_depth=max_depth, criterion=criterion)
        train_error, test_error = compute_error(tree, train_data, test_data, "label")

        # Append the results directly to the DataFrame
        results_df = results_df.append({
            "Criterion": criterion,
            "Depth": max_depth,
            "Training Error": train_error,
            "Test Error": test_error
        }, ignore_index=True)

average_errors = results_df.groupby('Criterion').mean()[['Training Error', 'Test Error']]
print(average_errors)


                Training Error  Test Error
Criterion                                 
entropy              16.500000   20.032051
gini_index           16.616667   19.963370
majority_error       17.433333   21.932234


In [601]:
# Question 3
columns = ["age", "job", "marital", "education", "default", "balance", "housing", "loan", "contact", "day_of_week", "month", "duration", "campaign", "pdays", "previous", "poutcome", "y"]

train_data = pd.read_csv('bank-4/train.csv', names=columns, header=None)
test_data = pd.read_csv('bank-4/test.csv', names=columns, header=None)
train_missing_data = pd.read_csv('bank-4/train.csv', names=columns, header=None)
test_missing_data = pd.read_csv('bank-4/test.csv', names=columns, header=None)

train_data.head()

Unnamed: 0,age,job,marital,education,default,balance,housing,loan,contact,day_of_week,month,duration,campaign,pdays,previous,poutcome,y
0,41,services,married,secondary,no,0,yes,no,unknown,5,may,114,2,-1,0,unknown,no
1,48,blue-collar,single,secondary,no,312,yes,yes,cellular,3,feb,369,2,-1,0,unknown,no
2,55,technician,married,secondary,no,1938,no,yes,cellular,18,aug,193,1,386,3,success,yes
3,54,admin.,married,tertiary,no,59,yes,no,cellular,10,jul,268,1,-1,0,unknown,no
4,34,management,single,tertiary,no,2646,no,no,cellular,14,apr,142,1,-1,0,unknown,yes


In [602]:
test_data.head()

Unnamed: 0,age,job,marital,education,default,balance,housing,loan,contact,day_of_week,month,duration,campaign,pdays,previous,poutcome,y
0,41,management,single,secondary,no,764,no,no,cellular,12,jun,230,2,-1,0,unknown,no
1,39,blue-collar,married,secondary,no,49,yes,no,cellular,14,may,566,1,370,2,failure,no
2,60,retired,married,primary,no,0,no,no,telephone,30,jul,130,3,-1,0,unknown,no
3,31,entrepreneur,single,tertiary,no,247,yes,yes,unknown,2,jun,273,1,-1,0,unknown,no
4,26,student,single,unknown,no,2020,no,no,telephone,28,jan,42,3,-1,0,unknown,no


In [603]:
def ID3_numeric(data, original_data, features, target_attribute_name="y", parent_node_class=None,
        max_depth=None, depth=0, criterion="entropy"):
    unique_targets = np.unique(data[target_attribute_name])
    if len(unique_targets) == 1:
        return unique_targets[0]

    if len(data) == 0:
        return get_majority_class(original_data, target_attribute_name)

    if len(features) == 0 or (max_depth and depth == max_depth):
        return parent_node_class

    parent_node_class = get_majority_class(data, target_attribute_name)

    gains = [info_gain(data, feature, target_attribute_name, criterion) for feature in features]
    best_feature = features[np.argmax(gains)]
    tree = {best_feature: {}}

    for value in np.unique(data[best_feature]):
        best_val = data[best_feature] == value
        sub_data = data.where(best_val).dropna()
        remaining_features = [feat for feat in features if feat != best_feature]
        subtree = ID3(
            data=sub_data,
            original_data=original_data,
            features=remaining_features,
            target_attribute_name=target_attribute_name,
            parent_node_class=parent_node_class,
            max_depth=max_depth,
            depth=depth + 1,
            criterion=criterion
        )

        tree[best_feature][value] = subtree

    return tree

In [604]:
max_depths = list(range(1, 17))
criterions = ["entropy", "majority_error", "gini_index"]
# Converting string values to numbers for computing the median and then categorizing based on the median
variables = ["age", "balance", "day_of_week", "duration", "campaign", "previous", "pdays"]
train_data_copy = train_data.copy()

for variable in variables:
    median = train_data_copy[variable].median()
    train_data[variable] = np.where(train_data[variable] >= median, "high", "low")
    test_data[variable] = np.where(test_data[variable] >= median, "high", "low")

results_df = pd.DataFrame(columns=columns)
for criterion in criterions:
    for max_depth in max_depths:
        tree = ID3_numeric(train_data, train_data, list(train_data.columns[:-1]), max_depth=max_depth, criterion=criterion)
        train_error, test_error = compute_error(tree, train_data, test_data, "y")

        results_df = results_df.append({
            "Criterion": criterion,
            "Depth": max_depth,
            "Training Error": train_error,
            "Test Error": test_error
        }, ignore_index=True)

average_errors = results_df.groupby('Criterion').mean()[['Training Error', 'Test Error']]
print(average_errors)
    

                Training Error  Test Error
Criterion                                 
entropy                 4.5950    16.30500
gini_index              4.4100    17.14375
majority_error          5.7975    16.81000


In [606]:
unknown_columns = ["job", "education", "contact", "poutcome"]
test_missing_data = test_missing_data.copy()
train_missing_data = train_missing_data.copy()
for cat in unknown_columns:
    majority_value = train_missing_data[train_missing_data[cat]!="unknown"][cat].mode()[0]
    train_missing_data[cat] = train_missing_data[cat].replace("unknown", majority_value)
for variable in variables:
    median = train_missing_data[variable].median()
    train_missing_data[variable] = np.where(train_missing_data[variable] >= median, "high", "low")
    test_missing_data[variable] = np.where(test_missing_data[variable] >= median, "high", "low")
    
results = []
for criterion in criterions:
    for max_depth in max_depths:
        tree = ID3_numeric(train_missing_data, train_missing_data, train_missing_data.columns[:-1],  max_depth=max_depth, criterion=criterion)
        train_error, test_error = compute_error(tree, train_missing_data, test_missing_data, "y")
        results.append((criterion, max_depth, train_error, test_error))

        results_df = results_df.append({
            "Criterion": criterion,
            "Depth": max_depth,
            "Training Error": train_error,
            "Test Error": test_error
        }, ignore_index=True)

average_errors = results_df.groupby('Criterion').mean()[['Training Error', 'Test Error']]
print(average_errors)

                Training Error  Test Error
Criterion                                 
entropy               4.955625   36.976875
gini_index            4.816875   51.901875
majority_error        6.121250   51.668125
