By Jack O'Brien

# Q1

In [None]:
import pandas as pd
from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score

# Load the dataset. Passing `names` makes pandas treat the first row as data
# (no header row) and label the columns with the given names.
column_names = ["X1", "X2", "Y"]
data = pd.read_csv("w3classif.csv", names=column_names)


def train_test(data, test_size, random_state=None):
    """Fit a depth-3 decision tree on one random train/test split of ``data``.

    Parameters
    ----------
    data : pandas.DataFrame
        Must contain a ``"Y"`` column, used as the target. Every other column
        is used as a feature.
    test_size : float or int
        Forwarded unchanged to ``train_test_split`` (e.g. ``0.3`` for a 70:30
        train/test split).
    random_state : int or None, default None
        Seed forwarded to both ``train_test_split`` and the classifier so a
        single call is reproducible. ``None`` leaves both unseeded.

    Returns
    -------
    tuple
        ``((X_train, y_train), (X_test, y_test), train_loss, test_loss)``,
        where each loss is ``1 - accuracy`` on the corresponding subset.
    """
    # Split into features and target
    X = data.drop("Y", axis=1)
    y = data["Y"]

    # Shuffle and split according to `test_size`
    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=test_size, random_state=random_state
    )

    # Create the decision tree classifier. The seed is an explicit argument:
    # the previous code read a name `i` that is not defined in this scope.
    clf = DecisionTreeClassifier(max_depth=3, random_state=random_state)

    # Fit the model on the training data
    clf.fit(X_train, y_train)

    # Make predictions on the training and test data
    y_pred_train = clf.predict(X_train)
    y_pred_test = clf.predict(X_test)

    # Loss is the misclassification rate, i.e. 1 - accuracy
    train_loss = 1 - accuracy_score(y_train, y_pred_train)
    test_loss = 1 - accuracy_score(y_test, y_pred_test)

    return (X_train, y_train), (X_test, y_test), train_loss, test_loss


def train_test_m_times(data, m, test_size=0.3, seed=0):
    """Repeat ``train_test`` ``m`` times and collect the results.

    Parameters
    ----------
    data : pandas.DataFrame
        Dataset passed to ``train_test`` on every repetition.
    m : int
        Number of repetitions.
    test_size : float or int, default 0.3
        Forwarded to ``train_test``.
    seed : int or None, default 0
        Base seed. Repetition ``i`` uses ``seed + i``, so repetitions get
        different splits while the whole experiment stays reproducible.
        Pass ``None`` to leave every repetition unseeded.

    Returns
    -------
    tuple of lists
        ``(train_sets, test_sets, train_losses, test_losses)``, each of
        length ``m``, in repetition order.
    """
    # Create empty lists to store the results
    train_sets = []
    test_sets = []
    train_losses = []
    test_losses = []

    for i in range(m):
        # Distinct seed per repetition; None disables seeding altogether
        rep_seed = None if seed is None else seed + i
        train, test, train_loss, test_loss = train_test(
            data, test_size, random_state=rep_seed
        )

        # Append the results to the lists
        train_sets.append(train)
        test_sets.append(test)
        train_losses.append(train_loss)
        test_losses.append(test_loss)

    return train_sets, test_sets, train_losses, test_losses


# Q1: ten repetitions using the default test_size of train_test_m_times
train_sets, test_sets, train_losses, test_losses = train_test_m_times(data, m=10)

FileNotFoundError: ignored

# Q2

In [None]:
import numpy as np

def print_losses(train_losses, test_losses):
    """Print each list of losses followed by its mean, train first, then test.

    Parameters
    ----------
    train_losses, test_losses : sequence of float
        Per-repetition losses; each is printed as-is and averaged with
        ``np.mean``.
    """
    for label, losses in (("Train", train_losses), ("Test", test_losses)):
        print(f"{label} Losses:", losses)
        print(f"{label} loss avg =", np.mean(losses))

# Q2: report the per-repetition and average losses from the Q1 runs
print_losses(train_losses=train_losses, test_losses=test_losses)

Train Losses: [0.025000000000000022, 0.025000000000000022, 0.02857142857142858, 0.042857142857142816, 0.03214285714285714, 0.02857142857142858, 0.021428571428571463, 0.017857142857142905, 0.03214285714285714, 0.0357142857142857]
Train loss avg = 0.028928571428571435
Test Losses: [0.050000000000000044, 0.07499999999999996, 0.06666666666666665, 0.04166666666666663, 0.025000000000000022, 0.01666666666666672, 0.06666666666666665, 0.05833333333333335, 0.06666666666666665, 0.025000000000000022]
Test loss avg = 0.04916666666666667


# Q3

In [None]:
# 90:10 split: 10% of the rows are held out for testing
results_90 = train_test_m_times(data, 10, test_size=0.1)
train_sets, test_sets, train_losses_90, test_losses_90 = results_90
print_losses(train_losses_90, test_losses_90)

print("~~~~")

# 50:50 split; train_sets / test_sets are rebound to this run's splits
results_50 = train_test_m_times(data, 10, test_size=0.5)
train_sets, test_sets, train_losses_50, test_losses_50 = results_50
print_losses(train_losses_50, test_losses_50)

Train Losses: [0.036111111111111094, 0.047222222222222276, 0.036111111111111094, 0.03888888888888886, 0.03888888888888886, 0.03888888888888886, 0.030555555555555558, 0.04166666666666663, 0.033333333333333326, 0.04166666666666663]
Train loss avg = 0.038333333333333316
Test Losses: [0.09999999999999998, 0.0, 0.09999999999999998, 0.025000000000000022, 0.025000000000000022, 0.07499999999999996, 0.025000000000000022, 0.050000000000000044, 0.050000000000000044, 0.050000000000000044]
Test loss avg = 0.05000000000000001
~~~~
Train Losses: [0.020000000000000018, 0.015000000000000013, 0.015000000000000013, 0.010000000000000009, 0.010000000000000009, 0.025000000000000022, 0.020000000000000018, 0.020000000000000018, 0.040000000000000036, 0.030000000000000027]
Train loss avg = 0.020500000000000018
Test Losses: [0.06999999999999995, 0.06499999999999995, 0.050000000000000044, 0.08999999999999997, 0.06499999999999995, 0.06499999999999995, 0.06499999999999995, 0.06000000000000005, 0.06499999999999995, 

# Q4

In [None]:
# Sample standard deviation (ddof=1) of each loss array, in the same order
# as before: 50:50 train/test first, then 90:10 train/test.
labelled_losses = (
    ("train_losses_50", train_losses_50),
    ("test_losses_50", test_losses_50),
    ("train_losses_90", train_losses_90),
    ("test_losses_90", test_losses_90),
)
for label, losses in labelled_losses:
    print(f"Unbiased standard deviation of {label}:", np.std(losses, ddof=1))

Unbiased standard deviation of train_losses_50: 0.009264628073124873
Unbiased standard deviation of test_losses_50: 0.010540925533894572
Unbiased standard deviation of train_losses_90: 0.00468485579284205
Unbiased standard deviation of test_losses_90: 0.03333333333333332


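So there is a trade-off between the variance of the training loss and the variance of the test loss. With the 90:10 split the training loss is more stable (standard deviation ≈ 0.005 versus ≈ 0.009 for 50:50), but the test loss is much noisier (≈ 0.033 versus ≈ 0.011), because each test loss is estimated from far fewer points. The 50:50 split does the opposite.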

# Q5

In [None]:
from sklearn.model_selection import cross_val_score

# Split the data into X and y: every column but the last is a feature,
# the last column is the target
X = data.iloc[:, :-1]
y = data.iloc[:, -1]

# Create a decision tree classifier. The seed is fixed so that re-running
# the cell reproduces the same scores (the Q1 classifier is seeded as well).
dtc = DecisionTreeClassifier(max_depth=3, random_state=0)

# Perform 10-fold cross validation; `scores` holds one score per fold
# (accuracy, the default score for a classifier)
scores = cross_val_score(dtc, X, y, cv=10)

# Calculate the mean and standard deviation of the cross-validation error,
# where error = 1 - score; ddof=1 gives the sample standard deviation
mean_error = 1 - np.mean(scores)
std_error = np.std(1 - scores, ddof=1)

print(f"Mean cross-validation error: {mean_error:.4f}")
print(f"Standard deviation of cross-validation error: {std_error:.4f}")

Mean cross-validation error: 0.0600
Standard deviation of cross-validation error: 0.0293
