In [56]:
from collections import Counter

import joblib
import pandas as pd
import plotly.graph_objects as go
from dash import Dash, dcc, html, Input, Output, callback
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import StandardScaler

# Loading Models

In [57]:
# Load the three previously trained classifiers from disk.
# NOTE(review): joblib.load unpickles these files, and unpickling can execute
# arbitrary code - only load model files that come from a trusted source.
# NOTE(review): the warnings printed by this cell report that the estimators were
# pickled with scikit-learn 1.3.0 but are loaded with 1.3.2; consider re-exporting
# the models or pinning the scikit-learn version.
xgboost = joblib.load('../models/best_model_xgboost.pkl')
random_forest = joblib.load('../models/best_model_random_forest.pkl')
gnb = joblib.load('../models/gnb.pkl')




Trying to unpickle estimator DecisionTreeClassifier from version 1.3.0 when using version 1.3.2. This might lead to breaking code or invalid results. Use at your own risk. For more info please refer to:
https://scikit-learn.org/stable/model_persistence.html#security-maintainability-limitations


Trying to unpickle estimator RandomForestClassifier from version 1.3.0 when using version 1.3.2. This might lead to breaking code or invalid results. Use at your own risk. For more info please refer to:
https://scikit-learn.org/stable/model_persistence.html#security-maintainability-limitations



# Sankey Diagramm vorbereiten

## Preprocessing

In [58]:
# Feature table ("tat": training and test data, going by the file name) and the
# target table.
# NOTE(review): assumes both files are row-aligned - TODO confirm.
tat = pd.read_csv("../data/lucas_organic_carbon_training_and_test_data_NEW.csv")
targets = pd.read_csv("../data/lucas_organic_carbon_target.csv")

In [59]:
# Standardize all columns of the feature table.
# NOTE(review): the scaler is fitted on the complete table, i.e. before the
# train/test split in the next cell, and is not the scaler from the training run -
# confirm that this matches the scaling the loaded models were trained with.
scaler = StandardScaler()
scaled_data = scaler.fit_transform(tat)
# Same scaled values as a DataFrame that keeps the original column names.
scaled_df = pd.DataFrame(scaled_data, columns=tat.columns)

In [60]:
# Hold out 20 % of the scaled rows with a fixed seed.
# NOTE(review): presumably this reproduces the split used when the models were
# trained - TODO confirm.
tat_train, tat_test, targets_train, targets_test = train_test_split(scaled_data, targets, test_size=0.2,
                                                                    random_state=42)

# Predictions of each loaded model on the held-out rows.
y_pred_xgboost = xgboost.predict(tat_test)
y_pred_randomforest = random_forest.predict(tat_test)
y_pred_gnb = gnb.predict(tat_test)

[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed:    0.0s remaining:    0.0s
[Parallel(n_jobs=1)]: Done 450 out of 450 | elapsed:    0.2s finished


In [61]:
# One LabelEncoder instance; the cells below call fit_transform on it once per array.
label_encoder = LabelEncoder()

In [62]:
# Convert the Random Forest and Gaussian NB predictions to integer codes.
# NOTE(review): every fit_transform call re-fits the encoder on that array alone, so
# the codes only line up across columns if each array contains the same set of
# labels - TODO confirm. The XGBoost predictions are not re-encoded here.
y_pred_randomforest = label_encoder.fit_transform(y_pred_randomforest)
y_pred_gnb = label_encoder.fit_transform(y_pred_gnb)

## Create ground truth column

In [63]:
# Encode the held-out targets as integer codes for the 'Ground Truth' column.
# targets_test is a single-column frame, so to_numpy() returns a column vector;
# ravel() flattens it to the 1-D shape LabelEncoder expects, which avoids the
# "column-vector y was passed" warning.
ground_truth = label_encoder.fit_transform(targets_test.to_numpy().ravel())


A column-vector y was passed when a 1d array was expected. Please change the shape of y to (n_samples, ), for example using ravel().



## Create a single dataframe containing model predictions and ground truth
This is the data basis for the Sankey diagram.

In [64]:
# One single-column frame per model, named after the model as shown in the dashboard.
xg_predictions = pd.DataFrame(y_pred_xgboost, columns=['XGBoost'])
random_forest_predictions = pd.DataFrame(y_pred_randomforest, columns=['Random Forest'])
gnb_predictions = pd.DataFrame(y_pred_gnb, columns=['Gaussian Naive Bias'])
# Note: from here on ground_truth is a DataFrame instead of the encoded array.
ground_truth = pd.DataFrame(ground_truth, columns=['Ground Truth'])
# Place the columns side by side; each row holds all codes for one held-out sample.
df = pd.concat([xg_predictions, random_forest_predictions, gnb_predictions, ground_truth], axis=1)
df

Unnamed: 0,XGBoost,Random Forest,Gaussian Naive Bias,Ground Truth
0,4,4,4,4
1,2,2,0,0
2,2,2,2,1
3,4,4,2,1
4,3,3,3,3
...,...,...,...,...
1974,4,4,4,4
1975,4,4,1,4
1976,4,4,3,4
1977,4,4,4,4


## Farben erstellen

In [65]:
# Opaque colours for the nodes, one per target class (0-4).
ncolor0 = 'rgba(230, 159, 0, 1)'
ncolor1 = 'rgba(0, 158, 115, 1)'
ncolor2 = 'rgba(0, 114, 178, 1)'
ncolor3 = 'rgba(213, 94, 0, 1)'
ncolor4 = 'rgba(204, 121, 167, 1)'

# The five node colours repeated four times (20 entries in total).
color_node = [ncolor0, ncolor1, ncolor2, ncolor3, ncolor4] * 4

# Half-transparent variants of the same colours for the links.
lcolor0 = 'rgba(230, 159, 0, 0.5)'
lcolor1 = 'rgba(0, 158, 115, 0.5)'
lcolor2 = 'rgba(0, 114, 178, 0.5)'
lcolor3 = 'rgba(213, 94, 0, 0.5)'
lcolor4 = 'rgba(204, 121, 167, 0.5)'

# The five link colours repeated sixteen times (80 entries in total).
color_link = [lcolor0, lcolor1, lcolor2, lcolor3, lcolor4] * 16

## Beschriftungen

In [66]:
def integer_target_to_string_target(value):
    """Translate an integer target code into its class name.

    Codes 0 to 4 map to 'very_high', 'high', 'moderate', 'low' and 'very_low'
    in that order; every other value yields 'Unknown'.
    """
    class_names = dict(enumerate(('very_high', 'high', 'moderate', 'low', 'very_low')))

    try:
        return class_names[value]
    except KeyError:
        return 'Unknown'


def model_name_to_key_part_prefix(model_name):
    """Return the short key prefix that belongs to a model name.

    Raises:
        ValueError: If ``model_name`` is not one of the four known names.
    """
    known_prefixes = (
        ('XGBoost', 'x'),
        ('Gaussian Naive Bias', 'g'),
        ('Random Forest', 'r'),
        ('Ground Truth', 'gt'),
    )

    for known_name, prefix in known_prefixes:
        if model_name == known_name:
            return prefix

    raise ValueError(f'Unknown model name: {model_name}')

## Funktion zum Plotten des Diagramms

In [67]:
def build_sankey(df, model_name_list):
    """Build a Sankey figure of how samples move between class predictions.

    Every entry of ``model_name_list`` becomes one column of five nodes, one per
    target class. Between each pair of neighbouring entries (A -> B, B -> C, ...)
    one link is created for every (class, class) combination; its value is the
    number of rows of ``df`` that hold this pair of codes.

    Node ids are derived from the position within ``model_name_list``. Selecting
    the same model more than once therefore yields separate columns instead of
    links that fold back onto the first occurrence of that model.

    Args:
        df: DataFrame with one column per model name, holding class codes 0-4.
        model_name_list: Column names of ``df`` in left-to-right display order.

    Returns:
        A ``plotly.graph_objects.Figure`` containing a single Sankey trace.

    Raises:
        ValueError: If a name is rejected by ``model_name_to_key_part_prefix``.
        KeyError: If a column is missing from ``df`` or holds a value outside
            the five class codes.
    """
    number_of_targets = 5
    class_codes = range(number_of_targets)
    model_names = list(model_name_list)

    # Reject unknown model names up front, before the dataframe is touched.
    for model_name in model_names:
        model_name_to_key_part_prefix(model_name)

    # Node i * number_of_targets + c is class c of the i-th selected model.
    node_labels = [
        f"{model_name} {integer_target_to_string_target(code)}"
        for model_name in model_names
        for code in class_codes
    ]

    sources = []
    targets = []
    flow_values = []

    # Flow from each model to the next one in the list.
    for position, (current_model, next_model) in enumerate(zip(model_names, model_names[1:])):
        # Count all (current code, next code) pairs in one pass over the two columns.
        pair_counts = Counter(zip(df[current_model], df[next_model]))

        unexpected = [
            pair for pair in pair_counts
            if pair[0] not in class_codes or pair[1] not in class_codes
        ]
        if unexpected:
            raise KeyError(
                f"Unexpected class codes between {current_model!r} and {next_model!r}: {unexpected}"
            )

        # Emit every combination (zero flows included) with the target class
        # varying fastest. color_link cycles through the five class colours, so
        # this order gives each link the colour of its target class.
        for source_code in class_codes:
            for target_code in class_codes:
                sources.append(position * number_of_targets + source_code)
                targets.append((position + 1) * number_of_targets + target_code)
                flow_values.append(pair_counts[(source_code, target_code)])

    fig = go.Figure(data=[go.Sankey(
        node=dict(
            pad=15,
            thickness=20,
            line=dict(color="black", width=0.5),
            label=node_labels,
            color=color_node
        ),
        link=dict(
            source=sources,
            target=targets,
            value=flow_values,
            color=color_link
        ))])

    return fig

# Application

In [68]:
def _model_dropdown(placeholder, component_id):
    """Return a dropdown that offers the four prediction columns."""
    return dcc.Dropdown(
        ['XGBoost', 'Random Forest', 'Gaussian Naive Bias', 'Ground Truth'],
        placeholder=placeholder,
        id=component_id,
    )


# Dashboard: a title, four model selectors and a container for the rendered diagram.
app = Dash(__name__)
app.layout = html.Div([
    html.H1("Sankey Diagram Dashboard"),
    _model_dropdown('Select Model 1', 'dropdown_1'),
    _model_dropdown('Select Model 2', 'dropdown_2'),
    _model_dropdown('Select Model 3', 'dropdown_3'),
    _model_dropdown('Select Model 4', 'dropdown_4'),
    html.Div(id='dd-output-container'),
])


@callback(
    Output('dd-output-container', 'children'),
    [
        Input('dropdown_1', 'value'),
        Input('dropdown_2', 'value'),
        Input('dropdown_3', 'value'),
        Input('dropdown_4', 'value'),
    ],
)
def generate_sankey(model_1, model_2, model_3, model_4):
    """Dash callback: render the Sankey diagram for the selected models.

    Dropdowns without a selection deliver ``None`` and are skipped; the remaining
    names are passed to ``build_sankey`` in dropdown order.
    """
    selected_models = []
    for choice in (model_1, model_2, model_3, model_4):
        if choice is not None:
            selected_models.append(choice)

    figure = build_sankey(df, selected_models)
    return dcc.Graph(figure=figure)


# Start the Dash server in debug mode when this code runs as the main module.
if __name__ == '__main__':
    app.run(debug=True)