In [6]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeRegressor, plot_tree

In [2]:
def load_dataset(file_path, num_attributes=2, num_classes=2):
    """Read a CSV file and split its columns into an input and a target array.

    The first ``num_attributes`` columns form the input matrix; the
    ``num_classes`` columns that follow immediately form the target matrix.
    Any further columns in the file are ignored.

    Parameters
    ----------
    file_path : str, path-like or file-like
        Passed straight to ``pandas.read_csv``.
    num_attributes : int, default 2
        Number of leading columns returned as ``X``.
    num_classes : int, default 2
        Number of columns, directly after the attributes, returned as ``y``.

    Returns
    -------
    X : numpy.ndarray, shape (n_rows, num_attributes)
    y : numpy.ndarray, shape (n_rows, num_classes)

    Raises
    ------
    ValueError
        If the file has fewer than ``num_attributes + num_classes`` columns.
        Positional slicing would otherwise silently return a narrower (or
        empty) ``y`` and the mistake would only surface later, during fitting.
    """
    data = pd.read_csv(file_path)

    n_required = num_attributes + num_classes
    if data.shape[1] < n_required:
        raise ValueError(
            f"Expected at least {n_required} columns "
            f"({num_attributes} attributes + {num_classes} targets), "
            f"but the file has only {data.shape[1]}."
        )

    X = data.iloc[:, :num_attributes].values
    y = data.iloc[:, num_attributes:n_required].values
    return X, y

In [3]:
# Dataset selection: the system name and the sample count together determine
# the folder and file name of the CSV to load.
n_samples = 500000
sys_name = "navigation_old"
csv_path = f"Dataset/{sys_name}/{sys_name}_{n_samples}/data_{sys_name}_{n_samples}.csv"

# First 4 columns -> inputs X, next 4 columns -> targets y.
X, y = load_dataset(
    csv_path,
    num_attributes=4,
    num_classes=4,
)

In [4]:
# Display the input matrix returned by load_dataset (the leading attribute columns of the CSV).
X

array([[ 2.1892 ,  2.4081 , -0.49971, -0.74031],
       [ 1.7434 ,  1.0898 , -0.10954, -0.41258],
       [ 1.0537 ,  0.13005,  0.93942, -0.14472],
       ...,
       [ 2.4862 ,  1.4989 , -0.75639,  0.54465],
       [ 1.352  ,  0.87072, -0.68432, -0.90102],
       [ 2.4397 ,  2.1971 , -0.22278, -0.79128]])

In [5]:
# Display the target matrix returned by load_dataset (the columns that follow the attributes).
y

array([[ 2.1451 ,  2.3306 , -0.44092, -0.77409],
       [ 1.7419 ,  1.0445 , -0.01462, -0.45312],
       [ 1.1482 ,  0.11716,  0.94498, -0.1289 ],
       ...,
       [ 2.4205 ,  1.5353 , -0.65719,  0.36334],
       [ 1.3018 ,  0.78931, -0.50192, -0.81411],
       [ 2.4202 ,  2.1154 , -0.19575, -0.81685]])

In [7]:
# Hold out 10% of the rows for testing. The split is seeded so that a fresh
# "Restart & Run All" reproduces the same partition (and therefore the same tree).
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.1, random_state=42)
print(f" Shape of X_Training = {X_train.shape} \n Shape of X_Testing = {X_test.shape}")
print(f" Shape of Y_Training = {y_train.shape} \n Shape of Y_Testing = {y_test.shape}")

# Depth-limited multi-output regression tree. random_state fixes how ties
# between equally good splits are broken, so the fitted tree is deterministic.
tree = DecisionTreeRegressor(max_depth=5, random_state=42)
tree.fit(X_train, y_train)

 Shape of X_Training = (449999, 4) 
 Shape of X_Testing = (50000, 4)
 Shape of Y_Training = (449999, 4) 
 Shape of Y_Testing = (50000, 4)


In [8]:
# Collect the size statistics of the fitted tree first, then report them.
numLeaves = tree.get_n_leaves()
tree_depth = tree.get_depth()
node_total = tree.tree_.node_count

print(f"Number of leaves: {numLeaves}")
print(f"Total depth of tree: {tree_depth}")
print(f"Number of nodes: {node_total}")


Number of leaves: 32
Total depth of tree: 5
Number of nodes: 63


In [None]:
# Get pruning path.
# The path is computed from the data passed in, so only the training split is
# used here: passing the full (X, y) would let the held-out rows influence the
# candidate alphas and every tree fitted below.
path = tree.cost_complexity_pruning_path(X_train, y_train)
ccp_alphas, impurities = path.ccp_alphas, path.impurities

# Floating-point round-off can leave tiny negative values in the path, but
# ccp_alpha must be non-negative, so clamp them to zero.
ccp_alphas = np.clip(ccp_alphas, 0.0, None)

# Train one pruned tree per candidate alpha (training split only).
trees = []
for ccp_alpha in ccp_alphas:
    clf = DecisionTreeRegressor(random_state=42, ccp_alpha=ccp_alpha)
    clf.fit(X_train, y_train)
    trees.append(clf)

# TODO: score every tree in `trees` on held-out data and pick the best alpha.

In [None]:
import numpy as np
import matplotlib.pyplot as plt
from sklearn.tree import DecisionTreeRegressor
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error

# 1. Train/Val/Test split (80% / 10% / 10%): 20% is held out first and then
#    halved into a validation set and a test set.
X_train, X_temp, y_train, y_temp = train_test_split(X, y, test_size=0.2, random_state=42)
X_val, X_test, y_val, y_test = train_test_split(X_temp, y_temp, test_size=0.5, random_state=42)

# 2. Train full tree and get pruning path
tree = DecisionTreeRegressor(random_state=42)
tree.fit(X_train, y_train)

path = tree.cost_complexity_pruning_path(X_train, y_train)
# Round-off can produce tiny negative alphas, which are not valid ccp_alpha
# values; clamp them to zero and drop the duplicates this may create so no
# tree is fitted twice.
ccp_alphas = np.unique(np.clip(path.ccp_alphas, 0.0, None))
# NOTE(review): the path of an unpruned tree can contain a very large number
# of alphas; if the loop below is too slow, consider evaluating only a subset.

# 3. Train/prune with different alphas and evaluate on validation set
val_mse = []
for ccp_alpha in ccp_alphas:
    clf = DecisionTreeRegressor(random_state=42, ccp_alpha=ccp_alpha)
    clf.fit(X_train, y_train)
    y_val_pred = clf.predict(X_val)
    val_mse.append(mean_squared_error(y_val, y_val_pred))

# 4. Find best alpha (lowest validation MSE; ties resolve to the smallest alpha)
best_alpha = ccp_alphas[np.argmin(val_mse)]

# 5. Retrain final tree on Train+Val
# Both arrays are stacked row-wise. The targets are 2-D (one column per
# output), so np.hstack would try to join them column-wise and fail because
# the train and validation splits have different numbers of rows.
X_trainval = np.concatenate([X_train, X_val], axis=0)
y_trainval = np.concatenate([y_train, y_val], axis=0)

final_tree = DecisionTreeRegressor(random_state=42, ccp_alpha=best_alpha)
final_tree.fit(X_trainval, y_trainval)

# 6. Test evaluation (the test split was not used for fitting or for selecting alpha)
y_test_pred = final_tree.predict(X_test)
test_mse = mean_squared_error(y_test, y_test_pred)

print("Best alpha:", best_alpha)
print("Test MSE:", test_mse)

# 7. Plot validation curve
plt.figure(figsize=(8, 5))
plt.plot(ccp_alphas, val_mse, marker="o", drawstyle="steps-post")
plt.axvline(best_alpha, color="red", linestyle="--", label=f"Best α = {best_alpha:.5f}")
plt.xlabel("ccp_alpha (complexity parameter)")
plt.ylabel("Validation MSE")
plt.title("Validation Curve for Cost Complexity Pruning")
plt.legend()
plt.show()
