# Decision Tree

In [3]:
# Load our libraries
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

# Use a nicer style for plots
plt.style.use("seaborn-v0_8-muted")

# Import the regression tree from scikit-learn and a plotting helper
from sklearn.tree import DecisionTreeRegressor, plot_tree
# Import our train_test_split helper
from sklearn.model_selection import train_test_split

## Prepare Dataset into target and features and split them into test train Validation

In [29]:
target_data = pd.read_csv("/Users/rubenstark/Documents/GitHub/Its-Wekk/4 - Data/2 - Ruben/Final_Target_Data_Combined_resid_Trend")

#features Dataset muss noch angepasst werden
features_data = pd.read_csv("/Users/rubenstark/Documents/GitHub/Its-Wekk/4 - Data/2 - Ruben/Working_DataFrame.csv")

In [30]:
# Split our data intro features and targets
# Teile das Dataset in Features und Zielvariable
y = target_data["PM10_Combined_Trend_Residual"]  # Zielvariable
X = features_data.drop(columns=["Datum"])  # Alle Spalten außer der Zielvariable

X.head(10)

# Daten splitten
X_train_val, X_test, y_train_val, y_test = train_test_split(X, y, test_size=0.2, random_state=72)
X_train, X_val, y_train, y_val = train_test_split(X_train_val, y_train_val, test_size=0.125, random_state=72)  # 10% von Gesamt

print("Train Size:", len(X_train))
print("Validation Size:", len(X_val))
print("Test Size:", len(X_test))

Train Size: 17213
Validation Size: 2459
Test Size: 4919


## Decision Tree Regressor

In [6]:
from sklearn.model_selection import TimeSeriesSplit
from sklearn.metrics import mean_squared_error
import numpy as np
from sklearn.tree import DecisionTreeRegressor

# Expanding Cross-Validation (5 Splits)
tscv = TimeSeriesSplit(n_splits=5)
results = []

# Use the initial 50% of the data for training (optional, but keeps your requirement in mind)
train_size = int(0.5 * len(X_train))
X_train_init = X_train.iloc[:train_size]
y_train_init = y_train.iloc[:train_size]

# Perform expanding window cross-validation
for fold, (train_index, val_index) in enumerate(tscv.split(X_train_init)):
    # Expanding training data with each fold
    X_train_fold = X_train_init.iloc[train_index]
    y_train_fold = y_train_init.iloc[train_index]
    
    # Validation data stays fixed (next chunk after the training data)
    X_val_fold = X_train_init.iloc[val_index]
    y_val_fold = y_train_init.iloc[val_index]
    
    # Train the model
    model = DecisionTreeRegressor()
    model.fit(X_train_fold, y_train_fold)
    
    # Predict and calculate MSE
    y_pred = model.predict(X_val_fold)
    mse = mean_squared_error(y_val_fold, y_pred)
    results.append(mse)
    print(f"Fold {fold + 1}: MSE = {mse}")

# Average MSE across all folds
average_mse = np.mean(results)
print(f"Durchschnittlicher MSE über alle Folds: {average_mse}")

Fold 1: MSE = 79.5971247860948
Fold 2: MSE = 63.290904665250295
Fold 3: MSE = 77.86892305593526
Fold 4: MSE = 88.8806167261738
Fold 5: MSE = 58.723342934462394
Durchschnittlicher MSE über alle Folds: 73.6721824335833


## Use ccp_alpha (and other parameters) to optimize the Decision Tree for example when it comes to overfitting

[`DecisionTreeRegressor`](https://scikit-learn.org/stable/modules/generated/sklearn.tree.DecisionTreeRegressor.html)

In [7]:
# Expanding Cross Validation mit 5 Folds
tscv = TimeSeriesSplit(n_splits=5)

# Liste zur Speicherung der Ergebnisse
results_ccp = []

# Use the initial 50% of the data for training (optional, but keeps your requirement in mind)
train_size = int(0.5 * len(X_train))
X_train_init = X_train.iloc[:train_size]
y_train_init = y_train.iloc[:train_size]

# Perform expanding window cross-validation
for fold, (train_index, val_index) in enumerate(tscv.split(X_train_init)):
    # Expanding training data with each fold
    X_train_fold = X_train_init.iloc[train_index]
    y_train_fold = y_train_init.iloc[train_index]
    
    # Validation data stays fixed (next chunk after the training data)
    X_val_fold = X_train_init.iloc[val_index]
    y_val_fold = y_train_init.iloc[val_index]
    
    # Modell trainieren
    tree_ccp = DecisionTreeRegressor(ccp_alpha=0.01)
    tree_ccp.fit(X_train_fold, y_train_fold)
    
    # Vorhersagen machen
    y_pred_ccp = tree_ccp.predict(X_val_fold)
    
    # Berechne den Fehler
    mse_ccp = mean_squared_error(y_val_fold, y_pred_ccp)
    results_ccp.append(mse_ccp)
    
    print(f"Fold {fold + 1}: MSE = {mse_ccp}")

# Durchschnittlichen Fehler über alle Folds berechnen
average_mse_ccp = np.mean(results_ccp)
print(f"Durchschnittlicher MSE über alle Folds: {average_mse_ccp}")


Fold 1: MSE = 76.4155031002755
Fold 2: MSE = 61.37171437294417
Fold 3: MSE = 73.21213339051128
Fold 4: MSE = 89.6744388735687
Fold 5: MSE = 52.38129933082086
Durchschnittlicher MSE über alle Folds: 70.6110178136241


## Crossvalidation to find best alpha 

 Geht nicht mit so vielen Features

In [19]:
# Import our tools for model selection
from sklearn.model_selection import cross_validate, KFold

In [None]:
from sklearn.model_selection import cross_val_score, KFold
from sklearn.tree import DecisionTreeRegressor
import numpy as np

# Definiere die Werte für ccp_alpha (Cost Complexity Pruning)
alphas = np.logspace(-4, 0, 50)  # Werte zwischen 10^-4 und 10^0

# Initialisiere Cross-Validation
cv = KFold(n_splits=5, shuffle=True, random_state=72)

# Speicher für Ergebnisse
scores = []  # Durchschnittliche MSE für jedes ccp_alpha
scores_std = []  # Standardabweichung der Scores für Stabilitätsanalyse

# Cross-Validation für jedes ccp_alpha
for alpha in alphas:
    # Decision Tree mit aktuellem ccp_alpha-Wert
    tree_cv = DecisionTreeRegressor(ccp_alpha=alpha, random_state=72)
    
    # Negative MSE, da cross_val_score maximiert; wir wollen minimieren
    mse_scores = cross_val_score(tree_cv, X_train, y_train, cv=cv, scoring="neg_mean_squared_error")
    
    # Durchschnittlichen MSE speichern (negativ, daher multiplizieren mit -1)
    scores.append(-mse_scores.mean())
    scores_std.append(mse_scores.std())

# Optimiere ccp_alpha: Der Wert mit dem niedrigsten MSE
optimal_alpha = alphas[np.argmin(scores)]

print(f"Optimales ccp_alpha: {optimal_alpha}")

In [None]:
# Expanding Cross Validation mit 5 Folds
tscv = TimeSeriesSplit(n_splits=5)

# Liste zur Speicherung der Ergebnisse
results_ccp = []

# Use the initial 50% of the data for training (optional, but keeps your requirement in mind)
train_size = int(0.5 * len(X_train))
X_train_init = X_train.iloc[:train_size]
y_train_init = y_train.iloc[:train_size]

# Perform expanding window cross-validation
for fold, (train_index, val_index) in enumerate(tscv.split(X_train_init)):
    # Expanding training data with each fold
    X_train_fold = X_train_init.iloc[train_index]
    y_train_fold = y_train_init.iloc[train_index]
    
    # Validation data stays fixed (next chunk after the training data)
    X_val_fold = X_train_init.iloc[val_index]
    y_val_fold = y_train_init.iloc[val_index]
    
    # Modell trainieren
    tree_ccp = DecisionTreeRegressor(ccp_alpha=optimal_alpha)
    tree_ccp.fit(X_train_fold, y_train_fold)
    
    # Vorhersagen machen
    y_pred_ccp = tree_ccp.predict(X_val_fold)
    
    # Berechne den Fehler
    mse_ccp = mean_squared_error(y_val_fold, y_pred_ccp)
    results_ccp.append(mse_ccp)
    
    print(f"Fold {fold + 1}: MSE = {mse_ccp}")

# Durchschnittlichen Fehler über alle Folds berechnen
average_mse_ccp = np.mean(results_ccp)
print(f"Durchschnittlicher MSE über alle Folds: {average_mse_ccp}")


Fold 1: MSE = 76.4155031002755
Fold 2: MSE = 61.37171437294417
Fold 3: MSE = 73.21213339051128
Fold 4: MSE = 89.6744388735687
Fold 5: MSE = 52.38129933082086
Durchschnittlicher MSE über alle Folds: 70.6110178136241


## Bagging

Bagging (Bootstrap Aggregating) ist eine Technik, um die Stabilität und Genauigkeit von Machine-Learning-Algorithmen zu verbessern, insbesondere bei Modellen wie Entscheidungsbäumen, die anfällig für hohe Varianz sind. Es basiert auf dem Bootstrapping-Prinzip, bei dem mehrere Trainingssets durch Zufallsstichproben mit Zurücklegen erzeugt werden.

Jeder Baum wird auf einem dieser zufälligen Datensets trainiert, und die Vorhersagen der B Modelle werden durch Mittelung kombiniert. Mathematisch reduziert Bagging die Varianz der Modelle, weil unabhängige Fehler über die Modelle hinweg geglättet werden. So wird die Vorhersage insgesamt stabiler und robuster gegen Variationen in den Trainingsdaten.

Das Ziel ist, Vorhersagefehler durch Mittelung der Outputs der individuellen Modelle zu minimieren, was insgesamt zu einer besseren Modellleistung führt.

In [8]:
# Import the regression tree from scikit-learn and a plotting helper
from sklearn.tree import DecisionTreeRegressor, plot_tree
# Import our train_test_split helper
from sklearn.model_selection import train_test_split
# Import the mean_squared_error function under the alias mse
from sklearn.metrics import mean_squared_error as mse
# Import the resampling helper
from sklearn.utils import resample
# Import the sklearn implementation of bagging
from sklearn.ensemble import BaggingRegressor

# Create a bagged tree estimator with B=100 trees
bagged_trees = BaggingRegressor(DecisionTreeRegressor(), n_estimators=100)

# Fit the bagged estimator and compute the MSE on the training set
bagged_trees.fit(X_train, y_train)

# Compute the predictions on the training and test sets
y_pred_train_bag = bagged_trees.predict(X_train)
y_pred_test_bag = bagged_trees.predict(X_test)

print("Train MSE: ", mean_squared_error(y_train, y_pred_train_bag))
print("Test MSE : ", mean_squared_error(y_test, y_pred_test_bag))

# Expanding Cross Validation mit 5 Folds
tscv = TimeSeriesSplit(n_splits=5)

# Liste zur Speicherung der Ergebnisse
results_bag = []

# Use the initial 50% of the data for training (optional, but keeps your requirement in mind)
train_size = int(0.5 * len(X_train))
X_train_init = X_train.iloc[:train_size]
y_train_init = y_train.iloc[:train_size]

# Perform expanding window cross-validation
for fold, (train_index, val_index) in enumerate(tscv.split(X_train_init)):
    # Expanding training data with each fold
    X_train_fold = X_train_init.iloc[train_index]
    y_train_fold = y_train_init.iloc[train_index]
    
    # Validation data stays fixed (next chunk after the training data)
    X_val_fold = X_train_init.iloc[val_index]
    y_val_fold = y_train_init.iloc[val_index]
    
    # Modell trainieren
    bagged_trees.fit(X_train_fold, y_train_fold)
    
    # Vorhersagen machen
    y_pred = bagged_trees.predict(X_val_fold)
    
    # Berechne den Fehler
    mse = mean_squared_error(y_val_fold, y_pred)
    results_bag.append(mse)
    
    print(f"Fold {fold + 1}: MSE = {mse}")

# Durchschnittlichen Fehler über alle Folds berechnen
average_mse_bag = np.mean(results_bag)
print(f"Durchschnittlicher MSE über alle Folds: {average_mse_bag}")


Train MSE:  4.037403449694427
Test MSE :  30.57131299022022
Fold 1: MSE = 34.2032890000025
Fold 2: MSE = 33.61181298958969
Fold 3: MSE = 45.50541730236658
Fold 4: MSE = 30.16984503957023
Fold 5: MSE = 25.242130725349284
Durchschnittlicher MSE über alle Folds: 33.74649901137566


## Random Forrest

Random Forests erweitern Bagging, indem sie jedem Baum eine zusätzliche Zufallskomponente hinzufügen. Jeder Baum wird mit einem bootstrap-Sample der Trainingsdaten trainiert, wobei nur ein zufälliger Teil der Features für die Konstruktion des Baums verwendet wird. Dadurch unterscheidet sich Random Forests von klassischem Bagging, bei dem alle Features verfügbar sind.

Die zufällige Auswahl der Features reduziert die Korrelation zwischen den Bäumen und verbessert die Generalisierung des Modells. Üblicherweise wird die Anzahl der verwendeten Features  m  so gewählt, dass  m \approx \sqrt{p} , wobei  p  die Gesamtzahl der Features ist. Wenn  m = p  gesetzt wird, ist Random Forest gleichbedeutend mit einem Bagging-Modell.

In [10]:
# Import the random forest regressor
from sklearn.ensemble import RandomForestRegressor

# Initialize the random forest regressor
rf = RandomForestRegressor(n_estimators=100, max_features="sqrt")

# Fit the model on the training data
rf.fit(X_train, y_train)

# Compute the predictions on the training and test sets
y_pred_train_rf = rf.predict(X_train)
y_pred_test_rf = rf.predict(X_test)

# Print the mean squared error for training and test sets
print("Train MSE: ", mean_squared_error(y_train, y_pred_train_rf))
print("Test MSE : ", mean_squared_error(y_test, y_pred_test_rf))

# Expanding Cross Validation mit 5 Folds
tscv = TimeSeriesSplit(n_splits=5)

# Liste zur Speicherung der Ergebnisse
results_rf = []

# Use the initial 50% of the data for training (optional, but keeps your requirement in mind)
train_size = int(0.5 * len(X_train))
X_train_init = X_train.iloc[:train_size]
y_train_init = y_train.iloc[:train_size]

# Perform expanding window cross-validation
for fold, (train_index, val_index) in enumerate(tscv.split(X_train_init)):
    X_train_fold, X_val_fold = X_train_init.iloc[train_index], X_train_init.iloc[val_index]
    y_train_fold, y_val_fold = y_train_init.iloc[train_index], y_train_init.iloc[val_index]
    
    # Modell trainieren
    rf.fit(X_train_fold, y_train_fold)
    
    # Vorhersagen machen
    y_pred = rf.predict(X_val_fold)
    
    # Berechne den Fehler
    fold_mse = mean_squared_error(y_val_fold, y_pred)
    results_rf.append(fold_mse)
    
    print(f"Fold {fold + 1}: MSE = {fold_mse}")

# Durchschnittlichen Fehler über alle Folds berechnen
average_mse_rf = np.mean(results_rf)
print(f"Durchschnittlicher MSE über alle Folds: {average_mse_rf}")

Train MSE:  4.566510690777135
Test MSE :  33.17900026871761
Fold 1: MSE = 34.64923660837541
Fold 2: MSE = 34.164589170985934
Fold 3: MSE = 46.85677469923061
Fold 4: MSE = 29.793478737809988
Fold 5: MSE = 27.453165200106437
Durchschnittlicher MSE über alle Folds: 34.58344888330167


Der Unterschied zwischen der Verwendung eines Integers oder eines Floats bei der Angabe von max_features in einem Random Forest Modell (wie in Scikit-learn) liegt in der Bedeutung des Parameters und wie die Anzahl der maximal zu betrachtenden Features berechnet wird:

1. Wenn max_features ein Integer ist:

	•	Der Wert gibt die exakte Anzahl der maximal zu betrachtenden Features an, die bei der Teilung eines Knotens in jedem Decision Tree berücksichtigt werden sollen.
	•	Beispiel: max_features=3 bedeutet, dass 3 Features aus dem gesamten Feature-Set zufällig ausgewählt werden, um die beste Teilung zu bestimmen.

2. Wenn max_features ein Float ist:

	•	Der Wert gibt einen Prozentsatz der verfügbaren Features an, die verwendet werden sollen. Der Float-Wert muss zwischen 0.0 und 1.0 liegen.
	•	Beispiel: max_features=0.5 bedeutet, dass 50 % der Features (aufgerundet) zufällig ausgewählt werden, um die beste Teilung zu bestimmen.
    

# Lagged Variable mit 1h lag

In [11]:
# Ensure 'Datum' column is in datetime format
target_data['Datum'] = pd.to_datetime(target_data['Datum'])

# Create a copy of target_data to apply the offset
lagged_target_variable = target_data.copy()

# Offset von -1 Stunde anwenden
lagged_target_variable['Datum'] = target_data['Datum'] + pd.Timedelta(hours=-1)

lagged_target_variable.head()


Unnamed: 0,Datum,PM10_Combined_Trend_Residual
0,2021-12-31 23:00:00+00:00,75.197962
1,2022-01-01 00:00:00+00:00,51.472071
2,2022-01-01 01:00:00+00:00,32.710483
3,2022-01-01 02:00:00+00:00,24.801767
4,2022-01-01 03:00:00+00:00,9.68366


In [12]:
# Ensure 'Datum' column in features_data is in datetime format
features_data['Datum'] = pd.to_datetime(features_data['Datum'])

# Merge the dataframes
features_data_lagged = pd.merge(features_data, lagged_target_variable, on="Datum", how="left")

features_data_lagged.head(5)

Unnamed: 0,Datum,Rebgassechange,Clarahuuschange,Citychange,Storchenchange,Post Baselchange,Aeschenchange,Bahnhof Südchange,Bad. Bahnhofchange,Europechange,...,84111104v_sumLief,84111104v_sumLW,84111108n_sumPW,84111108n_sumLief,84111108n_sumLW,84111108v_sumPW,84111108v_sumLief,84111108v_sumLW,Gasverbrauch,PM10_Combined_Trend_Residual
0,2022-01-01 00:00:00+00:00,-0.489808,-1.162325,-0.163786,-1.110112,-5.4e-05,-0.258093,-0.245491,-0.158272,-0.15127,...,1.425346,-0.124348,-0.686018,-0.571421,-1.055084,-0.814853,-0.416962,-1.072471,0.294494,51.472071
1,2022-01-01 01:00:00+00:00,-0.489808,-0.193898,-0.043944,-0.525891,-0.191296,-0.000173,-0.00019,-0.000462,-0.000203,...,0.745474,-0.74492,-0.955306,-0.781539,-1.055084,-1.048762,-0.77327,-1.072471,0.357518,32.710483
2,2022-01-01 02:00:00+00:00,-0.279837,-0.000213,-0.011259,-0.058515,-5.4e-05,-0.000173,-0.12284,-0.000462,-0.000203,...,0.575505,-0.74492,-0.937353,-0.571421,-1.055084,-1.057758,-0.416962,-1.072471,0.385414,24.801767
3,2022-01-01 03:00:00+00:00,-0.209846,-0.000213,-0.011259,-0.350625,-5.4e-05,-0.000173,-0.00019,-0.000462,-0.000203,...,0.269563,-0.74492,-1.045068,-0.571421,-1.055084,-1.201702,-0.416962,-1.072471,0.617131,9.68366
4,2022-01-01 04:00:00+00:00,-0.139856,-0.000213,0.021425,0.116751,-5.4e-05,-0.000173,0.245112,-0.000462,0.150864,...,0.575505,-0.331206,-1.063021,-0.781539,-1.055084,-1.255681,-0.654501,-1.072471,1.10986,5.787813


# Models mit Lagged Value trainieren

In [13]:
# Split our data intro features and targets
# Teile das Dataset in Features und Zielvariable
y = target_data["PM10_Combined_Trend_Residual"]  # Zielvariable
L = features_data_lagged.drop(columns=["Datum"])  # Alle Spalten außer der Zielvariable

X.head(10)

# Split into training and test sets
#L_train, L_test, y_train, y_test = train_test_split(L, y, random_state=72)

# Daten splitten
L_train_val, L_test, y_train_val, y_test = train_test_split(L, y, test_size=0.2, random_state=72)
L_train, L_val, y_train, y_val = train_test_split(L_train_val, y_train_val, test_size=0.125, random_state=72)  # 10% von Gesamt

print("Train Size:", len(L_train))
print("Validation Size:", len(L_val))
print("Test Size:", len(L_test))

Train Size: 17213
Validation Size: 2459
Test Size: 4919


## Normaler Decision Tree

In [None]:
import pandas as pd
from sklearn.model_selection import TimeSeriesSplit
from sklearn.metrics import mean_squared_error
import numpy as np

# Expanding Cross Validation mit 5 Folds
tscv = TimeSeriesSplit(n_splits=5)

# Liste zur Speicherung der Ergebnisse
L_results = []

# Expanding Cross Validation durchführen
for fold, (train_index, test_index) in enumerate(tscv.split(L_train)):
    L_train_fold, L_test_fold = L_train.iloc[train_index], L_train.iloc[test_index]
    y_train_fold, y_test_fold = y_train.iloc[train_index], y_train.iloc[test_index]
    
    # Modell trainieren
    L_tree = DecisionTreeRegressor()
    L_tree.fit(L_train_fold, y_train_fold)
    
    # Vorhersagen machen
    L_y_pred = L_tree.predict(L_test_fold)
    
    # Berechne den Fehler
    L_mse = mean_squared_error(y_test_fold, L_y_pred)
    L_results.append(L_mse)
    
    print(f"Fold {fold + 1}: MSE = {L_mse}")

# Durchschnittlichen Fehler über alle Folds berechnen
L_average_mse = np.mean(L_results)
print(f"Durchschnittlicher MSE über alle Folds: {L_average_mse}")

Fold 1: MSE = 14.931603523065984
Fold 2: MSE = 15.967796501081343
Fold 3: MSE = 31.91006327695002
Fold 4: MSE = 38.70804019180852
Fold 5: MSE = 13.701080650340092
Durchschnittlicher MSE über alle Folds: 36.106418329690094


In [14]:
from sklearn.model_selection import TimeSeriesSplit
from sklearn.metrics import mean_squared_error
import numpy as np
from sklearn.tree import DecisionTreeRegressor

# Expanding Cross-Validation (5 Splits)
tscv = TimeSeriesSplit(n_splits=5)
L_results = []

# Use the initial 50% of the data for training (optional, but keeps your requirement in mind)
train_size = int(0.5 * len(X_train))
L_train_init = L_train.iloc[:train_size]
y_train_init = y_train.iloc[:train_size]

# Perform expanding window cross-validation
for fold, (train_index, val_index) in enumerate(tscv.split(L_train_init)):
    # Expanding training data with each fold
    L_train_fold = L_train_init.iloc[train_index]
    y_train_fold = y_train_init.iloc[train_index]
    
    # Validation data stays fixed (next chunk after the training data)
    L_val_fold = L_train_init.iloc[val_index]
    y_val_fold = y_train_init.iloc[val_index]
    
    # Train the model
    L_model = DecisionTreeRegressor()
    L_model.fit(L_train_fold, y_train_fold)
    
    # Predict and calculate MSE
    L_y_pred = L_model.predict(L_val_fold)
    L_mse = mean_squared_error(y_val_fold, L_y_pred)
    L_results.append(L_mse)
    print(f"Fold {fold + 1}: MSE = {L_mse}")

# Average MSE across all folds
L_average_mse = np.mean(L_results)
print(f"Durchschnittlicher MSE über alle Folds: {L_average_mse}")

Fold 1: MSE = 12.459803500757198
Fold 2: MSE = 20.783467309902193
Fold 3: MSE = 28.513282028189558
Fold 4: MSE = 12.090112873967685
Fold 5: MSE = 23.74839218935601
Durchschnittlicher MSE über alle Folds: 19.51901158043453


## Decision Tree mit Optimierung ccp_alpha

In [15]:
import pandas as pd
from sklearn.model_selection import TimeSeriesSplit
from sklearn.metrics import mean_squared_error
import numpy as np

# Expanding Cross Validation mit 5 Folds
tscv = TimeSeriesSplit(n_splits=5)

# Liste zur Speicherung der Ergebnisse
L_results_ccp = []

for fold, (train_index, val_index) in enumerate(tscv.split(L_train_init)):
    # Expanding training data with each fold
    L_train_fold = L_train_init.iloc[train_index]
    y_train_fold = y_train_init.iloc[train_index]
    
    # Validation data stays fixed (next chunk after the training data)
    L_val_fold = L_train_init.iloc[val_index]
    y_val_fold = y_train_init.iloc[val_index]
    
    # Train the model
    L_tree_ccp = DecisionTreeRegressor(ccp_alpha=0.01)
    L_tree_ccp.fit(L_train_fold, y_train_fold)
    
    # Vorhersagen machen
    L_y_pred_ccp = L_tree_ccp.predict(L_test)
    
    # Berechne den Fehler
    L_mse_ccp = mean_squared_error(y_test, L_y_pred_ccp)
    L_results_ccp.append(L_mse)
    
    print(f"Fold {fold + 1}: MSE = {L_mse_ccp}")

# Durchschnittlichen Fehler über alle Folds berechnen
L_average_mse_ccp = np.mean(L_results_ccp)
print(f"Durchschnittlicher MSE über alle Folds: {L_average_mse_ccp}")

Fold 1: MSE = 17.770544497475676
Fold 2: MSE = 16.152672018520786
Fold 3: MSE = 17.408285860065767
Fold 4: MSE = 22.57141699128194
Fold 5: MSE = 15.128690433524028
Durchschnittlicher MSE über alle Folds: 23.74839218935601


## Bagging

In [16]:
# Import the regression tree from scikit-learn and a plotting helper
from sklearn.tree import DecisionTreeRegressor, plot_tree
# Import our train_test_split helper
from sklearn.model_selection import train_test_split
# Import the mean_squared_error function under the alias mse
from sklearn.metrics import mean_squared_error as mse
# Import the resampling helper
from sklearn.utils import resample
# Import the sklearn implementation of bagging
from sklearn.ensemble import BaggingRegressor

B = 100

# Create a bagged tree estimator with B=100 trees
L_bagged_trees = BaggingRegressor(DecisionTreeRegressor(), n_estimators=B)


# Expanding Cross Validation mit 5 Folds
tscv = TimeSeriesSplit(n_splits=5)

# Liste zur Speicherung der Ergebnisse
L_results_bag = []


for fold, (train_index, val_index) in enumerate(tscv.split(L_train_init)):
    # Expanding training data with each fold
    L_train_fold = L_train_init.iloc[train_index]
    y_train_fold = y_train_init.iloc[train_index]
    
    # Validation data stays fixed (next chunk after the training data)
    L_val_fold = L_train_init.iloc[val_index]
    y_val_fold = y_train_init.iloc[val_index]
    
    # Train the model
    L_bagged_trees = BaggingRegressor(DecisionTreeRegressor(), n_estimators=B)
    L_bagged_trees.fit(L_train_fold, y_train_fold)
    
    # Vorhersagen machen
    L_y_pred = L_bagged_trees.predict(L_test)
    
    # Berechne den Fehler
    L_mse = mean_squared_error(y_test, L_y_pred)
    L_results_bag.append(L_mse)
    
    print(f"Fold {fold + 1}: MSE = {L_mse}")

# Durchschnittlichen Fehler über alle Folds berechnen
L_average_mse_bag = np.mean(L_results_bag)
print(f"Durchschnittlicher MSE über alle Folds: {L_average_mse_bag}")

Fold 1: MSE = 13.166000624419446
Fold 2: MSE = 13.06871747298777
Fold 3: MSE = 12.933144241660413
Fold 4: MSE = 13.608447317055052
Fold 5: MSE = 13.194310450416692
Durchschnittlicher MSE über alle Folds: 13.194124021307875


## Random Forrest

In [17]:
# Import the random forest regressor
from sklearn.ensemble import RandomForestRegressor

# Expanding Cross Validation mit 5 Folds
tscv = TimeSeriesSplit(n_splits=5)

# Liste zur Speicherung der Ergebnisse
L_results_rf = []

for fold, (train_index, val_index) in enumerate(tscv.split(L_train_init)):
    # Expanding training data with each fold
    L_train_fold = L_train_init.iloc[train_index]
    y_train_fold = y_train_init.iloc[train_index]
    
    # Validation data stays fixed (next chunk after the training data)
    L_val_fold = L_train_init.iloc[val_index]
    y_val_fold = y_train_init.iloc[val_index]
    
    # Train the model
    L_rf = RandomForestRegressor(n_estimators=B, max_features="sqrt")
    L_rf.fit(L_train_fold, y_train_fold)

    # Vorhersagen machen
    L_y_pred = L_rf.predict(L_test)
    
    # Berechne den Fehler
    L_fold_mse = mean_squared_error(y_test, L_y_pred)
    L_results_rf.append(L_fold_mse)
    
    print(f"Fold {fold + 1}: MSE = {L_fold_mse}")

# Durchschnittlichen Fehler über alle Folds berechnen
L_average_mse_rf = np.mean(L_results_rf)
print(f"Durchschnittlicher MSE über alle Folds: {L_average_mse_rf}")

Fold 1: MSE = 21.729713399365075
Fold 2: MSE = 20.053905971410313
Fold 3: MSE = 18.93191268376251
Fold 4: MSE = 19.249601786386936
Fold 5: MSE = 18.602762424405704
Durchschnittlicher MSE über alle Folds: 19.713579253066104
