# Decision Tree

In [20]:
# Load the Apple data set: restore the variable `df` that was persisted
# earlier with IPython's %store magic (e.g. by another notebook or kernel).
# The later cells read `df`, so it must have been stored beforehand.
%store -r df

In [24]:
# Import der benötigten Bibliotheken
import numpy as np
from sklearn.model_selection import train_test_split, GridSearchCV, RepeatedKFold
from sklearn.tree import DecisionTreeRegressor
from sklearn.metrics import mean_absolute_error, mean_squared_error, mean_absolute_percentage_error
from sklearn.preprocessing import MinMaxScaler

In [25]:
# Separate the regression target ("close_next") from the feature columns.
y = df["close_next"]
X = df.drop("close_next", axis = 1)

# Hold out 20 % of all rows as the final test set.
# NOTE(review): train_test_split shuffles rows by default. If df is a
# chronologically ordered price series, rows from the future end up in the
# training split -- confirm that a random split is intended here.
X_train_test, X_test, y_train_test, y_test = train_test_split(
    X, y, test_size = 0.2, random_state = 42
)

# Split the remaining rows 75/25 into training and validation data
# (i.e. 60 % / 20 % of the full data set).
X_train, X_val, y_train, y_val = train_test_split(
    X_train_test, y_train_test, test_size = 0.25, random_state = 42
)

# Fit the min-max scaler on the training features only, then apply that
# same scaling to the training, validation and test features.
sc = MinMaxScaler().fit(X_train)
X_train_scaled = sc.transform(X_train)
X_val_scaled = sc.transform(X_val)
X_test_scaled = sc.transform(X_test)

### Decision Tree:

In [26]:
# Decision Tree 1

# Grid search over the tree's size/regularisation parameters, scored with
# repeated k-fold cross-validation on the (scaled) training data.
# A fixed random_state on both the estimator and the CV splitter makes the
# selected hyperparameters and every score below reproducible across runs
# (42 is the same seed the train/validation/test split uses).
dt = GridSearchCV(
    DecisionTreeRegressor(random_state = 42),
    param_grid = {
        "max_depth": [3, 5, 10, 20],
        "min_samples_split": [2, 5, 10],
        "min_samples_leaf": [1, 3, 5, 10],
    },
    cv = RepeatedKFold(random_state = 42),
    n_jobs = -1,
)

dt.fit(X_train_scaled, y_train)

# Report the best hyperparameters found by the search.
best_params = dt.best_params_
print("Best params DecisionTreeRegressor: ", best_params)

# GridSearchCV refits the best configuration on the full training data by
# default (refit=True), so reuse that fitted model instead of training an
# identical tree a second time.
dt_best_params = dt.best_estimator_

# R^2 on the validation data.
dt_val_score = dt_best_params.score(X_val_scaled, y_val)
print("R^2 Score DecisionTreeRegressor: {:.2f}%".format(dt_val_score * 100))


# R^2 on the held-out test data.
dt_test_score = dt_best_params.score(X_test_scaled, y_test)
print("test R^2 score DecisionTreeRegressor: {:.2f}%".format(dt_test_score * 100))

# Predictions for the test data, used by the error metrics below.
y_pred = dt_best_params.predict(X_test_scaled)

# Mean absolute error
mae = mean_absolute_error(y_test, y_pred)

# Mean squared error
mse = mean_squared_error(y_test, y_pred)

# Root mean squared error
rmse = np.sqrt(mse)

# Mean absolute percentage error. scikit-learn returns this as a fraction
# (0.03 means 3 %), so it is scaled by 100 when printed.
mape = mean_absolute_percentage_error(y_test, y_pred)

print("Mean Absolute Error (MAE) Decision Tree: {:.2f}".format(mae))
print("Mean Squared Error (MSE) Decision Tree: {:.2f}".format(mse))
print("Root Mean Squared Error (RMSE) Decision Tree: {:.2f}".format(rmse))
print("Mean Absolute Percentage Error (MAPE) Decision Tree: {:.2f}%".format(mape * 100))

Best params DecisionTreeRegressor:  {'max_depth': 10, 'min_samples_leaf': 10, 'min_samples_split': 2}
R^2 Score DecisionTreeRegressor: 99.90%
test R^2 score DecisionTreeRegressor: 99.94%
Mean Absolute Error (MAE) Decision Tree: 0.29
Mean Squared Error (MSE) Decision Tree: 0.82
Root Mean Squared Error (RMSE) Decision Tree: 0.91
Mean Absolute Percentage Error (MAPE) Decision Tree: 0.03


### Decision Tree Beispiel mit komplexerem GridSearchCV:

In [None]:
# DecisionTree 2 (example)

# GridSearchCV with repeated 5-fold cross-validation over a much larger grid.
#
# NOTE: this grid is illustrative. It spans about 28.5 million parameter
# combinations; with RepeatedKFold's default 5 splits x 10 repeats that is
# roughly 1.4 billion fits, so an exhaustive run is not practical as is.
#
# "auto" is intentionally not listed for max_features: it was removed in
# scikit-learn 1.3 and, for DecisionTreeRegressor, meant the same as None
# (use all features), so every candidate using it would fail to fit.
#
# NOTE(review): the "poisson" criterion presumably requires a non-negative
# target -- confirm that y_train satisfies this.
#
# A fixed random_state on the estimator and the CV splitter keeps the search
# reproducible (this matters in particular for splitter="random").
dt = GridSearchCV(
    DecisionTreeRegressor(random_state = 42),
    param_grid = {
        "criterion": ["squared_error", "friedman_mse", "absolute_error", "poisson"],
        "splitter": ["best", "random"],
        "max_depth": [None, 5, 10, 15, 20],
        "min_samples_split": range(2, 21),
        "min_samples_leaf": range(1, 21),
        "min_weight_fraction_leaf": [0.0, 0.1, 0.2, 0.3, 0.4],
        "max_features": [None, "sqrt", "log2"],
        "max_leaf_nodes": [5, 10, 15, 20, 25],
        "min_impurity_decrease": [0.0, 0.01, 0.02, 0.03, 0.04],
        "ccp_alpha": [0.0, 0.1, 0.2, 0.3, 0.4],
    },
    cv = RepeatedKFold(random_state = 42),
    n_jobs = -1,
)

dt.fit(X_train_scaled, y_train)

# Report the best hyperparameters found by the search.
best_params = dt.best_params_
print("Best params DecisionTreeRegressor: ", best_params)

# GridSearchCV refits the best configuration on the full training data by
# default (refit=True), so reuse that fitted model instead of training an
# identical tree a second time.
dt_best_params = dt.best_estimator_

# R^2 on the validation data.
dt_val_score = dt_best_params.score(X_val_scaled, y_val)
print("R^2 Score DecisionTreeRegressor: {:.2f}%".format(dt_val_score * 100))


# R^2 on the held-out test data.
dt_test_score = dt_best_params.score(X_test_scaled, y_test)
print("test R^2 score DecisionTreeRegressor: {:.2f}%".format(dt_test_score * 100))

# Predictions for the test data, used by the error metrics below.
y_pred = dt_best_params.predict(X_test_scaled)

# Mean absolute error
mae = mean_absolute_error(y_test, y_pred)

# Mean squared error
mse = mean_squared_error(y_test, y_pred)

# Root mean squared error
rmse = np.sqrt(mse)

# Mean absolute percentage error. scikit-learn returns this as a fraction
# (0.03 means 3 %), so it is scaled by 100 when printed.
mape = mean_absolute_percentage_error(y_test, y_pred)

print("Mean Absolute Error (MAE) DecisionTreeRegressor: {:.2f}".format(mae))
print("Mean Squared Error (MSE) DecisionTreeRegressor: {:.2f}".format(mse))
print("Root Mean Squared Error (RMSE) DecisionTreeRegressor: {:.2f}".format(rmse))
print("Mean Absolute Percentage Error (MAPE) Decision Tree: {:.2f}%".format(mape * 100))