In [2]:
import os
import pandas as pd

In [None]:
# Show the kernel's current working directory; the dataset is loaded further
# down through a relative path ('../data/...'), which resolves against it.
os.getcwd()

In [None]:
# Read the semicolon-separated dataset, then display its first rows as a preview.
mushrooms = pd.read_csv('../data/secondary_data.csv', sep=';')
mushrooms.head()

In [None]:
# List the dtype pandas assigned to each column of the loaded frame.
mushrooms.dtypes

In [None]:
# `shape` is a (rows, columns) pair, so both dimensions are unpacked in one step.
n_rows, n_columns = mushrooms.shape

print(f'There are {n_rows} rows and {n_columns} columns.')

In [None]:
# Report whether at least one cell anywhere in the frame is null.
if mushrooms.isnull().any().any():
    print("There are missing values in the dataframe.")
else:
    print("There are no missing values.")

In [None]:
# Count the missing entries of every column, then display only the columns
# that actually contain missing values.
nan_counts = mushrooms.isna().sum()
nan_counts.loc[nan_counts > 0]

In [None]:
# Count each label explicitly. Deriving the poisonous count as
# "total rows minus edible rows" would silently count every row whose label is
# not 'e' (a missing value or an unexpected label) as poisonous.
edible_count = (mushrooms['class'] == 'e').sum()
poisonous_count = (mushrooms['class'] == 'p').sum()
print(f"Number of edible mushrooms: {edible_count}\nNumber of poisonous mushrooms: {poisonous_count}")

In [3]:
import pandas as pd
import numpy as np
from UDFs import DecisionTreeClassifier, train_test_partition, zero_one_loss, accuracy_metric

# Read the semicolon-separated dataset from disk
mushrooms = pd.read_csv('../data/secondary_data.csv', sep=';')

# Turn the target into a binary code: 'e' -> 0, 'p' -> 1
mushrooms['class'] = mushrooms['class'].map({'e': 0, 'p': 1})

# Missing values are deliberately kept, so work on a plain copy of the frame.
# (Alternative that was tried: drop incomplete columns with
#  mushrooms.dropna(axis=1).)
mushrooms_1 = mushrooms.copy()

# Feature matrix (every column except the target) and target vector
X = mushrooms_1.drop(columns='class').values
y = mushrooms_1['class'].values

# Hold out 30% of the rows as a test set; the seed fixes the partition
X_train, X_test, y_train, y_test = train_test_partition(
    X, y, test_size=0.3, random_state=42
)

# Build the tree and fit it on the training portion only
tree = DecisionTreeClassifier(
    max_depth=10,
    criterion="gini",
    n_features="log2",
    n_quantiles=10,
)
tree.fit(X_train, y_train)

# Training-set predictions and the resulting zero-one loss
y_train_predicted = tree.predict(X_train)
loss = zero_one_loss(y_train, y_train_predicted)
print(f"Zero-one loss (training set): {loss}")

# Test-set predictions
y_test_predicted = tree.predict(X_test)

# Accuracy on the training set, then on the test set
accuracy_train = accuracy_metric(y_train, y_train_predicted)
print(f"Accuracy (training set): {accuracy_train:.2f}")

accuracy_test = accuracy_metric(y_test, y_test_predicted)
print(f"Accuracy (test set): {accuracy_test:.2f}")

Fitting the model...
Considering 4 features at each split.
Considering thresholds for feature index 17: ['g' 'k' 'n' 'p' 'r' 'u' 'w']
Considering thresholds for feature index 9: [4.796 9.062000000000001 13.328000000000001 17.594 21.87
 26.146000000000004 30.682000000000002 35.668 46.23800000000001]
Considering thresholds for feature index 18: ['d' 'g' 'h' 'l' 'm' 'p' 'u' 'w']
Considering thresholds for feature index 12: ['b' 'e' 'f' 'g' 'k' 'l' 'n' 'o' 'p' 'r' 'u' 'w' 'y']
Best feature index: 9
Best threshold value: 9.062000000000001
Considering thresholds for feature index 14: ['k' 'n' 'w']
Considering thresholds for feature index 1: ['b' 'c' 'f' 'o' 'p' 's' 'x']
Considering thresholds for feature index 11: ['f' 'g' 'h' 'i' 'k' 's' 't' 'y']
Considering thresholds for feature index 17: ['k' 'n' 'p' 'r' 'u' 'w']
Best feature index: 11
Best threshold value: k
Considering thresholds for feature index 2: ['d' 'e' 'g' 'h' 'i' 'k' 'l' 's' 't' 'w' 'y']
Considering thresholds for feature index

In [4]:
import pandas as pd
import numpy as np
from UDFs import DecisionTreeClassifier, k_fold_partition, zero_one_loss, accuracy_metric

# Load the data
mushrooms = pd.read_csv('../data/secondary_data.csv', delimiter = ';')

# Encode the target variable as 0-1
mushrooms['class'] = mushrooms['class'].map({'e': 0, 'p': 1})

# Missing values are deliberately kept, so work on a plain copy of the frame.
# (Alternative that was tried: drop incomplete columns with
#  mushrooms.dropna(axis=1).)
mushrooms_1 = mushrooms.copy()

# Create X and y
X = mushrooms_1.drop('class', axis=1).values
y = mushrooms_1['class'].values

# Initialize the classifier
# NOTE(review): the same instance is re-fitted on every fold; this assumes
# fit() discards the previously learned tree -- confirm in UDFs.
tree = DecisionTreeClassifier(max_depth=10, criterion="gini", n_features="log2", n_quantiles=10)

# Per-fold zero-one losses (measured on the training portion of each fold)
losses = []

# Per-fold accuracies (measured on the held-out portion of each fold)
accuracies = []

# k-fold cross-validation: each item is a (train_indices, test_indices) pair
fold_indices = k_fold_partition(X, random_state=42)

for train_indices, test_indices in fold_indices:

    X_train, y_train = X[train_indices], y[train_indices]
    X_test, y_test = X[test_indices], y[test_indices]

    tree.fit(X_train, y_train)

    # Loss on the rows the tree was just fitted on
    y_train_predicted = tree.predict(X_train)
    loss = zero_one_loss(y_train, y_train_predicted)
    losses.append(loss)

    # Accuracy on the rows held out from fitting
    y_test_predicted = tree.predict(X_test)
    accuracy = accuracy_metric(y_test, y_test_predicted)
    accuracies.append(accuracy)

# Calculate mean 0-1 loss and accuracy
mean_loss = np.mean(losses)
mean_accuracy = np.mean(accuracies)

# Fixed: this line used to print `accuracies` under the loss label, so the
# per-fold losses were never reported.
print("K-Fold Cross-Validation Zero-One Loss:", losses)
print("Mean Zero-One Loss:", mean_loss)

print("K-Fold Cross-Validation Accuracy:", accuracies)
print("Mean Accuracy:", mean_accuracy)
    

Fitting the model...
Considering 4 features at each split.
Considering thresholds for feature index 17: ['g' 'k' 'n' 'p' 'r' 'u' 'w']
Considering thresholds for feature index 9: [4.9510000000000005 9.362 13.773000000000001 18.184 22.615000000000002
 27.026000000000003 31.657000000000004 36.936 49.63300000000001]
Considering thresholds for feature index 18: ['d' 'g' 'h' 'l' 'm' 'p' 'u' 'w']
Considering thresholds for feature index 12: ['b' 'e' 'f' 'g' 'k' 'l' 'n' 'o' 'p' 'r' 'u' 'w' 'y']
Best feature index: 9
Best threshold value: 9.362
Considering thresholds for feature index 14: ['k' 'n' 'w']
Considering thresholds for feature index 9: [1.422 2.304 3.1860000000000004 4.0680000000000005 4.95 5.832000000000001
 6.714 7.596 8.478000000000002]
Considering thresholds for feature index 18: ['d' 'g' 'h' 'l' 'm' 'p']
Considering thresholds for feature index 17: ['k' 'n' 'p' 'r' 'u' 'w']
Best feature index: 9
Best threshold value: 3.1860000000000004
Considering thresholds for feature index 11: