# Exercise 7 - DT, gradient boosting and random forest for regression

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn import preprocessing

We use the customer data again. This time we do not want to perform a classification according to high_revenue, but instead directly predict the numerical value turnover (total_sum).

In [3]:
# space
# Load the prepared data set (path is relative to the working directory)
# and preview its first rows.
data_file = 'prepared_data.csv'
df = pd.read_csv(data_file)
df.head()

Unnamed: 0,weekday,daytime,isHoliday,distance,count,startClusterName,startClusterZip,startClusterID,endClusterName,endClusterZip,endClusterID,year,month,total
0,Di,7,Keine Ferien,3,15,Hunedoara,71171,3254026000002,Hunedoara,71171,3254026000007,2021,3,45
1,Mi,17,Keine Ferien,0,10,Turda,80982,3241013050002,Turda,80982,3241013050008,2021,3,0
2,Sa,11,Keine Ferien,3,10,Turda,80982,3241013050007,Turda,80982,3241013030001,2021,3,30
3,Fr,13,Keine Ferien,0,15,Bran,91157,3254028001012,Bran,91157,3254028001004,2021,3,0
4,Fr,18,Keine Ferien,7,10,Bran,91157,3254028001003,Turda,80982,3241013070001,2021,3,70


## 7.1 Handling missing values
First of all, we start with the treatment of missing values.

In [None]:
from sklearn import preprocessing

# Restrict the frame to the attributes used below plus the target total_sum.
selected_columns = ['gender', 'age_first_order', 'user_agent_brand',
                    'user_agent_os', 'campaign', 'pages_visited_avg',
                    'total_sum']
dfFilter = df[selected_columns]

# Impute on an explicit copy so the loaded frame df stays untouched.
dfCopy = dfFilter.copy()

# Replace missing entries with the most frequent value of the same column.
for column in ('gender', 'age_first_order'):
    most_frequent = dfCopy[column].mode()[0]
    dfCopy[column] = dfCopy[column].fillna(most_frequent)
dfCopy.head()

## 7.2 Coding of relevant attributes
The documentation of the algorithms used below states that the input data must be numerical. Try to understand why. Why is a label encoder sufficient here? Why does the data not need to be normalized?

https://scikit-learn.org/stable/modules/classes.html#module-sklearn.tree

https://scikit-learn.org/stable/modules/classes.html#module-sklearn.ensemble

In [None]:
# One encoder object is re-fitted per column; after the loop it holds the
# classes of the last column that was encoded.
labelenc = preprocessing.LabelEncoder()
for column in ('gender', 'user_agent_os', 'user_agent_brand'):
    # Replace each category by its integer code.
    dfCopy[column] = labelenc.fit_transform(dfCopy[column])

# campaign only needs a cast to an integer dtype.
dfCopy["campaign"] = dfCopy["campaign"].astype(int)
dfCopy.head()

In addition, the specialist department would like to discretize the age, as this personal characteristic will soon only be queried in the following intervals ['up to 20', '20-30', '30-40', '40-50', '50-60', '60-70', 'over 70' ] for reasons of data protection and acceptance.

In [None]:
# space
# TODO: discretize 'age_first_order' into the intervals listed above.
# NOTE(review): the cells below read a frame named `dfPrepared` that no visible
# cell creates -- presumably the result of this step must be stored under that name.

## 7.3 Training the algorithms with k-fold cross-validation
In the following, three tree-based methods from the lecture "Supervised Methods Part 2" will be applied. For this purpose, a 10-fold cross-validation is to be applied and then the model is to be evaluated using statistical key figures.
In each case, use the function cross_val_predict(model, x,y,cv=10) for the cross-validation.

In [None]:
from sklearn.model_selection import cross_val_predict, cross_validate
import sklearn.metrics as metrics

# Features: every prepared column except the target.
target_column = 'total_sum'
x = dfPrepared.drop(columns=[target_column])
# NOTE(review): if total_sum holds fractional amounts, this cast discards the
# fractional part of the regression target -- confirm that this is intended.
y = dfPrepared[target_column].astype(int)

#### A1: Decision Tree Regressor
Train the model using `cross_val_predict(XYregressor, x, y, cv=10)`; for the documentation see https://scikit-learn.org/stable/modules/generated/sklearn.model_selection.cross_val_predict.html?highlight=cross_val_predict#sklearn.model_selection.cross_val_predict


In [None]:
# space
# TODO: create the decision tree regressor and run the 10-fold cross-validation.
# The following cells read the names `treeRegressor` (the model) and `y_pred`
# (the cross-validated predictions), so store the results under these names.

Metrics for the evaluation:

In [None]:
# Train as above: the previous cell must have stored the cross-validated
# predictions in y_pred before this cell is run.
mae = metrics.mean_absolute_error(y, y_pred)
mse = metrics.mean_squared_error(y, y_pred)
rmse = np.sqrt(mse)  # square root of the MSE
r2 = metrics.r2_score(y, y_pred)

print("Ergebnisse von sklearn.metrics:\n")
print("R-Squared:", r2)
print()

print("MAE:", mae)
print("MSE:", mse)
print("RMSE:", rmse)

After training the algorithm, you can have a look at the feature importances of this concrete trained model.

In [None]:
from sklearn.tree import DecisionTreeRegressor

# Fit on the complete data set, then read the importances of the fitted tree.
treeRegressor.fit(x, y)
feature_importances = treeRegressor.feature_importances_

# One row per feature, largest importance first.
importance_table = {'Feature': x.columns, 'Importance': feature_importances}
importance_df = pd.DataFrame(importance_table).sort_values(
    by='Importance', ascending=False)

print(importance_df)

`feature_importances_` returns an importance value for each feature; higher values mean the feature is more important for the predictions. This allows you to see which features the decision tree considers important, and you can use these findings for further analyses.

#### A1.1: Decision Tree Regressor - additional parameters
Pruning is a technique used in decision trees to simplify the tree and avoid overfitting. 
Rules for pre-pruning are, for example, a maximum tree depth.

In [None]:
from sklearn.tree import DecisionTreeRegressor

# Pre-pruning: each limit below stops the tree from growing any further.
# random_state is fixed (same seed as the classifier cells of this notebook)
# so that repeated runs build the same tree and report the same scores.
treeRegressor = DecisionTreeRegressor(
    min_samples_split=10,
    min_samples_leaf=5,
    max_depth=10,
    max_leaf_nodes=50,
    random_state=42,
)
# Out-of-fold predictions from a 10-fold cross-validation.
y_pred = cross_val_predict(treeRegressor, x, y, cv=10)

In [None]:
# Score the cross-validated predictions against the true target values.
r2 = metrics.r2_score(y, y_pred)
mae = metrics.mean_absolute_error(y, y_pred)
mse = metrics.mean_squared_error(y, y_pred)
rmse = np.sqrt(mse)  # square root of the MSE

print("Results from sklearn.metrics:\n")
print("R-Squared:", r2, end="\n\n")

# Report the three error measures one per line.
for label, value in (("MAE:", mae), ("MSE:", mse), ("RMSE:", rmse)):
    print(label, value)

In [None]:
from sklearn.tree import DecisionTreeRegressor

# Fit the pruned tree on the complete data set and read its importances.
treeRegressor.fit(x, y)
feature_importances = treeRegressor.feature_importances_

# One row per feature, largest importance first.
importance_table = {'Feature': x.columns, 'Importance': feature_importances}
importance_df = pd.DataFrame(importance_table).sort_values(
    by='Importance', ascending=False)

print(importance_df)

#### A2: Random Forest Regressor
Train the same way as above.

In [None]:
# space
# TODO: create the random forest regressor and run the 10-fold cross-validation.
# The following cells read the names `forestRegressor` (the model) and `y_pred`
# (the cross-validated predictions), so store the results under these names.

In [None]:
# Score the cross-validated predictions against the true target values.
r2 = metrics.r2_score(y, y_pred)
mae = metrics.mean_absolute_error(y, y_pred)
mse = metrics.mean_squared_error(y, y_pred)
rmse = np.sqrt(mse)  # square root of the MSE

print("Ergebnisse von sklearn.metrics:\n")
print("R-Squared:", r2, end="\n\n")

# Report the three error measures one per line.
for label, value in (("MAE:", mae), ("MSE:", mse), ("RMSE:", rmse)):
    print(label, value)

In [None]:
# Fit the forest on the complete data set and read its importances.
forestRegressor.fit(x, y)
feature_importances = forestRegressor.feature_importances_

# One row per feature, largest importance first.
importance_table = {'Feature': x.columns, 'Importance': feature_importances}
importance_df = pd.DataFrame(importance_table).sort_values(
    by='Importance', ascending=False)

print(importance_df)

#### A2.1: Random Forest Regressor - additional parameters

In [None]:
from sklearn.ensemble import RandomForestRegressor

# Forest of 100 pre-pruned trees. random_state is fixed (same seed as the
# RandomForestClassifier cell of this notebook) so that the random sampling
# inside the forest, and therefore the reported scores, are reproducible.
forestRegressor = RandomForestRegressor(
    min_samples_split=10,
    min_samples_leaf=5,
    n_estimators=100,
    max_depth=10,
    max_leaf_nodes=50,
    random_state=42,
)

# Out-of-fold predictions from a 10-fold cross-validation.
y_pred = cross_val_predict(forestRegressor, x, y, cv=10)

In [None]:
# Score the cross-validated predictions against the true target values.
r2 = metrics.r2_score(y, y_pred)
mae = metrics.mean_absolute_error(y, y_pred)
mse = metrics.mean_squared_error(y, y_pred)
rmse = np.sqrt(mse)  # square root of the MSE

print("Results from sklearn.metrics:\n")
print("R-Squared:", r2, end="\n\n")

# Report the three error measures one per line.
for label, value in (("MAE:", mae), ("MSE:", mse), ("RMSE:", rmse)):
    print(label, value)

#### A3: Gradient Boost Trees (Regression)
Train the same way as above.

In [None]:
# space
# TODO: create the gradient boosting regressor and run the 10-fold cross-validation.
# The following cell reads the cross-validated predictions from `y_pred`,
# so store them under that name.

In [None]:
# Score the cross-validated predictions against the true target values.
r2 = metrics.r2_score(y, y_pred)
mae = metrics.mean_absolute_error(y, y_pred)
mse = metrics.mean_squared_error(y, y_pred)
rmse = np.sqrt(mse)  # square root of the MSE

print("Ergebnisse von sklearn.metrics:\n")
print("R-Squared:", r2, end="\n\n")

# Report the three error measures one per line.
for label, value in (("MAE:", mae), ("MSE:", mse), ("RMSE:", rmse)):
    print(label, value)

#### A3.1: Gradient Boost Trees (Regression) - additional Parameters

In [None]:
from sklearn.ensemble import GradientBoostingRegressor

# Boosted ensemble of 100 shallow trees. random_state is fixed (same seed as
# the GradientBoostingClassifier cell of this notebook) so that repeated runs
# report the same scores.
gbRegressor = GradientBoostingRegressor(
    n_estimators=100,
    max_depth=4,
    max_leaf_nodes=50,
    min_samples_leaf=5,
    random_state=42,
)

# Out-of-fold predictions from a 10-fold cross-validation.
y_pred = cross_val_predict(gbRegressor, x, y, cv=10)

In [None]:
# Score the cross-validated predictions against the true target values.
r2 = metrics.r2_score(y, y_pred)
mae = metrics.mean_absolute_error(y, y_pred)
mse = metrics.mean_squared_error(y, y_pred)
rmse = np.sqrt(mse)  # square root of the MSE

print("Results from sklearn.metrics:\n")
print("R-Squared:", r2, end="\n\n")

# Report the three error measures one per line.
for label, value in (("MAE:", mae), ("MSE:", mse), ("RMSE:", rmse)):
    print(label, value)

## 7.4 Interpretation of the quality criteria
Interpret the quality criteria and make a statement on the usability of the models.

## 7.5 Train several models simultaneously and compare them visually
Go through the code below and follow the individual steps. What does the visualization tell you?

In [None]:
import time

# (short name, model) pairs that are evaluated and plotted side by side.
# Every model gets the same fixed random_state so that the scores and the
# scatter plots do not change from one run of the notebook to the next.
estimators = [
    ('DT', DecisionTreeRegressor(min_samples_split=10, min_samples_leaf=5,
                                 random_state=42)),
    ('RF', RandomForestRegressor(min_samples_split=10, min_samples_leaf=5,
                                 n_estimators=100, random_state=42)),
    ('GB', GradientBoostingRegressor(n_estimators=100, max_depth=4,
                                     random_state=42)),
]

In [None]:
def plot_regression_results(ax, y, y_pred, title, scores, elapsed_time):
    """Draw predicted against measured target values on the given axes.

    Parameters
    ----------
    ax : axes object the plot is drawn on.
    y : measured target values; must provide ``min()`` and ``max()``.
    y_pred : predicted values, plotted against ``y``.
    title : str, first line of the axes title.
    scores : text shown as the only legend entry.
    elapsed_time : number of seconds, appended to the title with two decimals.
    """
    lower, upper = y.min(), y.max()

    # Dashed red diagonal: points on this line are predicted exactly.
    ax.plot([lower, upper], [lower, upper], '--r', linewidth=2)
    ax.scatter(y, y_pred, alpha=0.2)

    # Hide the top/right frame lines and keep ticks on the bottom/left only.
    for side in ('top', 'right'):
        ax.spines[side].set_visible(False)
    ax.get_xaxis().tick_bottom()
    ax.get_yaxis().tick_left()
    # Move the remaining frame lines 10 points away from the plotting area.
    for side in ('left', 'bottom'):
        ax.spines[side].set_position(('outward', 10))

    # Both axes cover the range of the measured values.
    ax.set_xlim([lower, upper])
    ax.set_ylim([lower, upper])
    ax.set_xlabel('Measured')
    ax.set_ylabel('Predicted')

    # Zero-sized, invisible rectangle used as the handle of the legend entry
    # that carries the score text.
    legend_handle = plt.Rectangle((0, 0), 0, 0, fc="w", fill=False,
                                  edgecolor='none', linewidth=0)
    ax.legend([legend_handle], [scores], loc='upper left')

    ax.set_title(title + '\n Evaluation in {:.2f} seconds'.format(elapsed_time))


# One panel per estimator, all in a single row.
fig, axs = plt.subplots(1, 3, figsize=(9, 5))
axs = np.ravel(axs)

# Legend text: mean and standard deviation of R^2 and MAE over the folds.
score_template = (r'$R^2={:.2f} \pm {:.2f}$' + '\n' +
                  r'$MAE={:.2f} \pm {:.2f}$')
metric_names = ['r2', 'neg_mean_absolute_error']

for ax, (name, est) in zip(axs, estimators):
    # Only the scoring run is timed. cv is not passed, so scikit-learn's
    # default number of folds is used in both calls.
    start_time = time.time()
    score = cross_validate(est, x, y, scoring=metric_names,
                           n_jobs=-1, verbose=0)
    elapsed_time = time.time() - start_time

    y_pred = cross_val_predict(est, x, y, n_jobs=-1, verbose=0)

    r2_per_fold = score['test_r2']
    neg_mae_per_fold = score['test_neg_mean_absolute_error']
    # The scorer returns the negated MAE; flip the sign of the mean for display.
    label = score_template.format(np.mean(r2_per_fold),
                                  np.std(r2_per_fold),
                                  -np.mean(neg_mae_per_fold),
                                  np.std(neg_mae_per_fold))

    plot_regression_results(ax, y, y_pred, name, label, elapsed_time)

plt.suptitle('Performance Single predictors')
plt.tight_layout()
# Leave room above the panels for the figure title.
plt.subplots_adjust(top=0.8)
plt.show()

# Additional exercise - using trees as classifier

In [None]:
# Independent copy for the classification task, so that the changes made in
# the following cells do not touch dfPrepared.
dfCopy = dfPrepared.copy(deep=True)

In [None]:
# Binary label: 1 where total_sum is above 300, otherwise 0.
revenue_flag = np.where(dfCopy['total_sum'] > 300, 1, 0)
dfCopy['high revenue'] = revenue_flag
# The label was derived from total_sum, so the column must not stay a feature.
dfCopy.drop(columns='total_sum', inplace=True)

In [None]:
# Preview the first rows only, as the other inspection cells of this notebook do.
dfCopy.head()

In [None]:
from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, recall_score


# Features are all columns except the binary label.
X = dfCopy.drop('high revenue', axis=1)
y = dfCopy['high revenue']

# 70/30 hold-out split, stratified on the label, with a fixed seed.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, stratify=y, random_state=42)

# random_state is fixed like in the random forest and gradient boosting
# classifier cells, so that repeated runs build the same tree.
clf = DecisionTreeClassifier(min_samples_split=10, min_samples_leaf=5, max_depth=10, random_state=42)
clf.fit(X_train, y_train)

# Evaluate on the held-out 30 %.
y_pred = clf.predict(X_test)
accuracy = accuracy_score(y_test, y_pred)
tpr = recall_score(y_test, y_pred)

print("Accuracy:", accuracy)
print("True Positive Rate (Recall):", tpr)

In [None]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, recall_score

# Features are all columns except the binary label.
label_column = 'high revenue'
X = dfCopy.drop(columns=label_column)
y = dfCopy[label_column]

# 70/30 hold-out split, stratified on the label, with a fixed seed.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.3, stratify=y, random_state=42)

clf = RandomForestClassifier(
    n_estimators=100,
    min_samples_split=10,
    min_samples_leaf=5,
    max_depth=10,
    random_state=42,
)
clf.fit(X_train, y_train)

# Evaluate on the held-out 30 %.
y_pred = clf.predict(X_test)
accuracy = accuracy_score(y_test, y_pred)
tpr = recall_score(y_test, y_pred)

print("Accuracy:", accuracy)
print("True Positive Rate (Recall):", tpr)

In [None]:
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, recall_score

# Features are all columns except the binary label.
label_column = 'high revenue'
X = dfCopy.drop(columns=label_column)
y = dfCopy[label_column]

# 70/30 hold-out split, stratified on the label, with a fixed seed.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.3, stratify=y, random_state=42)

clf = GradientBoostingClassifier(
    n_estimators=100,
    learning_rate=0.1,
    max_depth=10,
    min_samples_split=10,
    min_samples_leaf=5,
    random_state=42,
)
clf.fit(X_train, y_train)

# Evaluate on the held-out 30 %.
y_pred = clf.predict(X_test)
accuracy = accuracy_score(y_test, y_pred)
tpr = recall_score(y_test, y_pred)

print("Accuracy:", accuracy)
print("True Positive Rate (Recall):", tpr)