In [None]:
import matplotlib.pyplot as plt
import pandas as pd
import seaborn as sns
from sklearn import metrics, tree
from sklearn.compose import make_column_transformer, ColumnTransformer
from sklearn.metrics import classification_report
from sklearn.model_selection import train_test_split
from sklearn.pipeline import make_pipeline, Pipeline
from sklearn.preprocessing import OneHotEncoder
from sklearn.tree import DecisionTreeClassifier

In [None]:
# Load the source table from the parquet file next to the notebook.
df = pd.read_parquet(path='energiematrix_zonder_missing.parquet')

In [None]:
# Print the column summary, then show the first five rows as the cell output.
df.info()
df.head(n=5)

In [None]:
# Restrict the data to the rows where gem2022 equals 'Gemert-Bakel'.
df_gb = df.loc[df['gem2022'].eq('Gemert-Bakel')]
# Full per-column listing including non-null counts for the subset.
df_gb.info(show_counts=True, verbose=True)

In [None]:
# data preparation
# Feature matrix: the first 41 columns of the subset; target: lihk_lilek_nieuw.
# NOTE(review): confirm that lihk_lilek_nieuw is not itself among the first 41
# columns, otherwise the target leaks into the features.
features_gb = df_gb.iloc[:, :41]
target_gb = df_gb.lihk_lilek_nieuw

# train-test split (30% held out, fixed seed so the split is repeatable)
x_train, x_test, y_train, y_test = train_test_split(
    features_gb,
    target_gb,
    random_state=0,
    test_size=0.3
)

# separate preprocessing for categorical and numerical columns
# The column lists are taken from the feature matrix, not from the whole frame:
# the column transformer only ever sees x_train / x_test, so a categorical
# column outside the first 41 (the target, for instance) would make fit() fail
# because the requested column is missing from the input.
num_cols = features_gb.select_dtypes(include="number").columns
cat_cols = features_gb.select_dtypes(include="category").columns
# Use the categories declared on each column's dtype so the one-hot layout does
# not depend on which values happen to land in the training split.
categories = [features_gb[col].cat.categories for col in cat_cols]
preprocess_cat_cols = make_pipeline(
    OneHotEncoder(categories=categories))

# pipeline to prepare full dataset
# Categorical columns are one-hot encoded; every other column is passed through
# unchanged.
prepare_data = make_column_transformer(
    (preprocess_cat_cols, cat_cols),
    remainder="passthrough")

# full pipeline with decision tree
# NOTE(review): the tree has no random_state; set one if repeated runs must
# produce exactly the same tree.
dt = make_pipeline(
    prepare_data,
    DecisionTreeClassifier(max_depth=3)
)

In [None]:
# Bare expression: the notebook renders the (still unfitted) pipeline as the cell output.
dt

In [None]:
# Train the pipeline on the training split and predict the held-out split.
y_pred1 = dt.fit(x_train, y_train).predict(x_test)

# Report each score on the held-out split, one line per metric.
for label, scorer in (
    ("Accuracy:", metrics.accuracy_score),
    ("Precision:", metrics.precision_score),
    ("Recall:", metrics.recall_score),
):
    print(label, scorer(y_test, y_pred1))

In [None]:
# visualize the tree
# dt[:-1] is the pipeline without its final step (the preprocessing part), so
# its output names correspond to the columns the tree was trained on.
feature_names = dt[:-1].get_feature_names_out()
fig = plt.figure(figsize=(25,20))
# Pass a plain list: some scikit-learn releases validate feature_names as a
# list and reject the ndarray returned by get_feature_names_out().
# The return value is bound to _ only to keep it out of the cell output.
_ = tree.plot_tree(dt[1],
                  feature_names=list(feature_names),
                  proportion=True,
                  filled=True)

In [None]:
# Write the tree figure created in the plotting cell to a PNG file.
fig.savefig(fname="Model_GB_2.png")

In [None]:
# create confusion matrix to check performance
confusion_matrix = metrics.confusion_matrix(y_test, y_pred1)
matrix_df = pd.DataFrame(confusion_matrix)

# Apply the seaborn font scaling before any axes exist so it affects the plot.
sns.set(font_scale=1.3)
# Create the figure and its axes together so that figsize applies to the axes
# the heatmap is drawn on. A separate name is used for this figure so the tree
# figure bound to `fig` is left untouched.
fig_cm, ax = plt.subplots(figsize=(10, 7))
sns.heatmap(matrix_df, annot=True, fmt="g", ax=ax, cmap="magma")

ax.set_title('Confusion Matrix Model 1')
ax.set_xlabel('Predicted label')
ax.set_ylabel('True label')
plt.show()

In [None]:
# create classification report to check performance
# classification_report is imported in the notebook's import cell, so this cell
# no longer carries its own import.
print(classification_report(y_test, y_pred1))

In [None]:
# Feature importances of the fitted tree, labelled with the transformed feature
# names and ordered from most to least important, to compare trees across runs.
tree_importances = dt[1].feature_importances_
importance = (
    pd.DataFrame(tree_importances, index=feature_names)
    .sort_values(by=[0], ascending=False)
)
# Show the 25 highest-ranked features.
importance.head(n=25)