In [36]:
import joblib
import pandas as pd
import plotly.graph_objects as go
from dash import Dash, dcc, html, Input, Output, State, callback
import dash
import shap
import matplotlib.pyplot as plt
import base64
import io
import xgboost as xgb
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import StandardScaler
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression

# Loading Models

In [37]:
xgboost = joblib.load('./models/best_model_xgboost.pkl')
random_forest = joblib.load('./models/best_model_random_forest.pkl')
gnb = joblib.load('./models/gnb.pkl')


Trying to unpickle estimator DecisionTreeClassifier from version 1.3.0 when using version 1.3.2. This might lead to breaking code or invalid results. Use at your own risk. For more info please refer to:
https://scikit-learn.org/stable/model_persistence.html#security-maintainability-limitations


Trying to unpickle estimator RandomForestClassifier from version 1.3.0 when using version 1.3.2. This might lead to breaking code or invalid results. Use at your own risk. For more info please refer to:
https://scikit-learn.org/stable/model_persistence.html#security-maintainability-limitations



# Sankey Diagramm vorbereiten

## Preprocessing

In [38]:
tat = pd.read_csv("./data/lucas_organic_carbon_training_and_test_data_NEW.csv")
targets = pd.read_csv("./data/lucas_organic_carbon_target.csv")

In [39]:
scaler = StandardScaler()
scaled_data = scaler.fit_transform(tat)
scaled_df = pd.DataFrame(scaled_data, columns=tat.columns)

In [40]:
tat_train, tat_test, targets_train, targets_test = train_test_split(scaled_data, targets, test_size=0.2,
                                                                    random_state=42)

y_pred_xgboost = xgboost.predict(tat_test)
y_pred_randomforest = random_forest.predict(tat_test)
y_pred_gnb = gnb.predict(tat_test)

[Parallel(n_jobs=1)]: Done  40 tasks      | elapsed:    0.0s
[Parallel(n_jobs=1)]: Done 161 tasks      | elapsed:    0.2s
[Parallel(n_jobs=1)]: Done 364 tasks      | elapsed:    0.3s


In [41]:
label_encoder = LabelEncoder()

In [42]:
y_pred_randomforest = label_encoder.fit_transform(y_pred_randomforest)
y_pred_gnb = label_encoder.fit_transform(y_pred_gnb)

## Create ground Truth column

In [43]:
ground_truth = label_encoder.fit_transform(targets_test.to_numpy())


A column-vector y was passed when a 1d array was expected. Please change the shape of y to (n_samples, ), for example using ravel().



## Create single dataframe containing model predictions and ground truth
This is the data basis for the sankey diagram

In [44]:
xg_predictions = pd.DataFrame(y_pred_xgboost, columns=['XGBoost'])
random_forest_predictions = pd.DataFrame(y_pred_randomforest, columns=['Random Forest'])
gnb_predictions = pd.DataFrame(y_pred_gnb, columns=['Gaussian Naive Bias'])
ground_truth = pd.DataFrame(ground_truth, columns=['Ground Truth'])
df = pd.concat([xg_predictions, random_forest_predictions, gnb_predictions, ground_truth], axis=1)
df

Unnamed: 0,XGBoost,Random Forest,Gaussian Naive Bias,Ground Truth
0,4,4,4,4
1,2,2,0,0
2,2,2,2,1
3,4,4,2,1
4,3,3,3,3
...,...,...,...,...
1974,4,4,4,4
1975,4,4,1,4
1976,4,4,3,4
1977,4,4,4,4


## Farben erstellen

In [45]:
ncolor0 = 'rgba(230, 159, 0, 1)'
ncolor1 = 'rgba(0, 158, 115, 1)'
ncolor2 = 'rgba(0, 114, 178, 1)'
ncolor3 = 'rgba(213, 94, 0, 1)'
ncolor4 = 'rgba(204, 121, 167, 1)'

color_node = []

for n in range(4):
    color_node.append(ncolor0)
    color_node.append(ncolor1)
    color_node.append(ncolor2)
    color_node.append(ncolor3)
    color_node.append(ncolor4)

lcolor0 = 'rgba(230, 159, 0, 0.5)'
lcolor1 = 'rgba(0, 158, 115, 0.5)'
lcolor2 = 'rgba(0, 114, 178, 0.5)'
lcolor3 = 'rgba(213, 94, 0, 0.5)'
lcolor4 = 'rgba(204, 121, 167, 0.5)'

color_link = []

for n in range(16):
    color_link.append(lcolor0)
    color_link.append(lcolor1)
    color_link.append(lcolor2)
    color_link.append(lcolor3)
    color_link.append(lcolor4)

## Beschriftungen

In [46]:
def integer_target_to_string_target(value):
    mapping_dict = {
        0: 'very_high',
        1: 'high',
        2: 'moderate',
        3: 'low',
        4: 'very_low'
    }

    return mapping_dict.get(value, 'Unknown')


def model_name_to_key_part_prefix(model_name):
    match model_name:
        case 'XGBoost':
            return 'x'
        case 'Gaussian Naive Bias':
            return 'g'
        case 'Random Forest':
            return 'r'
        case 'Ground Truth':
            return 'gt'
        case _:
            raise ValueError(f'Unknown model name: {model_name}')

## Funktion zum Plotten des Diagramms

In [47]:
def build_sankey(df, model_name_list):
    __NUMBER_OF_TARGETS = 5
    flow_numbers_by_model_indices = {}
    model_names_and_prefixes = list(
        map(lambda model_name: (model_name, model_name_to_key_part_prefix(model_name)), model_name_list)
    )

    # build keys for each model with the next (e.g. if we have 3 models A, B, C we want to see the flow from A to B and from B to C
    for i, current in enumerate(model_names_and_prefixes[:-1]):
        (current_model, current_prefix) = current
        (next_model, next_prefix) = model_names_and_prefixes[i + 1]
        
        # build keys from model a to b for each combination of values 0 to 4
        for a in range(__NUMBER_OF_TARGETS):
            for b in range(__NUMBER_OF_TARGETS):
                key_part_1 = f"{current_prefix}{a}"
                key_part_2 = f"{next_prefix}{b}"
                key = (key_part_1, key_part_2)
                
                flow_numbers_by_model_indices[key] = 0

        # count up amount of flow from the dataframe
        for index, row in df.iterrows():
            key_part_1 = f"{current_prefix}{row[current_model]}"
            key_part_2 = f"{next_prefix}{row[next_model]}"
            key = (key_part_1, key_part_2)

            flow_numbers_by_model_indices[key] = flow_numbers_by_model_indices[key] + 1

    # build labels and flow labels
    suffix_labels = []
    model_name_labels = []
    for (model_name, prefix) in model_names_and_prefixes:
        for a in range(__NUMBER_OF_TARGETS):
            suffix_labels.append(f"{prefix}{a}")
            model_name_labels.append(f"{model_name} {integer_target_to_string_target(a)}")

    sources = []
    targets = []
    flow_values = []

    for key in flow_numbers_by_model_indices:
        (source, target) = key
        source_index = suffix_labels.index(source)
        target_index = suffix_labels.index(target)
        value = flow_numbers_by_model_indices[key]
        sources.append(source_index)
        targets.append(target_index)
        flow_values.append(value)

    fig = go.Figure(data=[go.Sankey(
        node=dict(
            pad=15,
            thickness=20,
            line=dict(color="black", width=0.5),
            label=model_name_labels,
            color=color_node
        ),
        link=dict(
            source=sources,
            target=targets,
            value=flow_values,
            color=color_link
        ))])

    return fig

# Shapley

In [48]:
mapping_dict = {
    'XGBoost': joblib.load("./shapley-values/saved_values/xgboost-shapley_values"),
    'Random Forest': joblib.load("./shapley-values/saved_values/randomforest-shapley_values"),
    'Logistic Regression': joblib.load("./shapley-values/saved_values/logreg_shapley_values")
}

def model_name_to_explanation(value):
    return mapping_dict.get(value)

In [49]:
def string_target_to_integer_target(value):
    mapping_dict = {
        'very_high' :0,
        'high' :1,
        'moderate' :2,
        'low' :3,
        'very_low' :4
    }

    return mapping_dict.get(value)

In [50]:
tat_fnames = tat.columns.to_list()

def find_indexes(fnames, start, end):
    try:
        start_index = fnames.index(start)
        end_index = fnames.index(end)
        return start_index, end_index
    except ValueError as e:
        # Handle the case when either start or end value is not found in the list
        print(f"Error: {e}")
        return None, None

# Retraining

In [51]:
def get_data(tat, start, end):
    numeric_columns = pd.to_numeric(tat.columns, errors='coerce')

    # Filter columns based on the interval
    filtered_columns = tat.columns[(numeric_columns >= start) & (numeric_columns <= end)]

    # Create a new DataFrame with only the filtered columns
    tat = tat[filtered_columns]

    tat_train, tat_test, targets_train, targets_test = train_test_split(tat, targets, test_size=0.33, random_state=42)

    targets_train = targets_train.values.ravel()
    targets_test = targets_test.values.ravel()

    feature_names = tat.columns.to_list()
    return tat_train, tat_test, targets_train, targets_test, feature_names

In [53]:
model_list_2 = [
    ('XGBoost', joblib.load('./models/best_model_xgboost.pkl'), 'xgboost', joblib.load('./shapley-values/saved_values/xgboost-shapley_values')),
    ('Random Forest', joblib.load('./models/best_model_random_forest.pkl'), 'random_forest', joblib.load('./shapley-values/saved_values/randomforest-shapley_values')),
    ('Logistic Regression', joblib.load('./models/logreg.pkl'), 'logic_regression', joblib.load('./shapley-values/saved_values/logreg_shapley_values'))
]


Trying to unpickle estimator DecisionTreeClassifier from version 1.3.0 when using version 1.3.2. This might lead to breaking code or invalid results. Use at your own risk. For more info please refer to:
https://scikit-learn.org/stable/model_persistence.html#security-maintainability-limitations


Trying to unpickle estimator RandomForestClassifier from version 1.3.0 when using version 1.3.2. This might lead to breaking code or invalid results. Use at your own risk. For more info please refer to:
https://scikit-learn.org/stable/model_persistence.html#security-maintainability-limitations



In [54]:
def generate_new_model(model_entry, tat, start, end):
    tat_train, tat_test, targets_train, targets_test, feature_names = get_data(tat, start, end)

    new_model_name = f"{model_entry[0]} {start}-{end}"

    match model_entry[2]:
        case 'logic_regression':
            model = LogisticRegression(random_state=0)
            model.fit(tat_train, targets_train)

            explainer = shap.Explainer(model, tat_train, feature_names=feature_names)
            explanation = explainer(tat_train)
            # file_path = f"../shapley-values/saved_values/{new_model_name}-saved_values"
            # joblib.dump(explanation, file_path)
            # model_list_2.append((new_model_name, model, 'logic_regression'))
            model_list_2.append((new_model_name, model, 'logic_regression', explanation))
            
        case 'xgboost':
            use_gpu = False
            label_encoder = LabelEncoder()
            targets_train = label_encoder.fit_transform(targets_train)
            print(targets_train)

            if use_gpu:
                model = xgb.XGBClassifier(learning_rate=0.02, n_estimators=10, objective='multi:softmax',
                                        num_class=len(pd.unique(targets_train)), tree_method="hist", device="cuda")
            else:
                model = xgb.XGBClassifier(learning_rate=0.02, n_estimators=10, objective='multi:softmax',
                                        num_class=len(pd.unique(targets_train)))

            print(tat_train)
            model.fit(tat_train, targets_train)

            explainer = shap.TreeExplainer(model, feature_names=feature_names)
            explanation = explainer(tat_train)
            # file_path = f"../shapley-values/saved_values/{new_model_name}-saved_value.json"
            # joblib.dump(explanation, file_path)
            # model_list_2.append((new_model_name, model, 'xgboost'))
            model_list_2.append((new_model_name, model, 'xgboost', explanation))

        case 'random_forest':
            model = RandomForestClassifier(n_estimators=100, random_state=42, verbose=2)
            print('test')
            model.fit(tat_train, targets_train)

            print('the shap values will be generated now')
            explainer = shap.TreeExplainer(model, feature_names=feature_names)
            explanation = explainer(tat_train)
            # file_path = f"../shapley-values/saved_values/{new_model_name}-saved_value.json"
            # joblib.dump(explanation, file_path)
            # model_list_2.append((new_model_name, model, 'random_forest'))
            model_list_2.append((new_model_name, model, 'random_forest', explanation))
        case _:
            raise Exception('Model is not found.')

    return

In [55]:
def get_model_by_name(model_list, model_name):
    for model_entry in model_list:
        if model_name.lower() in model_entry[0].lower():
            return model_entry
    return None  # Model not found

# Application

In [56]:
model_list = ['XGBoost', 'Random Forest', 'Logistic Regression']

In [62]:
reload=False #Muss nach oben, passiert wenn Button gedrückt
if reload==False:
    models_name = ['XGBoost', 'Random Forest', 'Gaussian Naive Bias', 'Ground Truth']

if reload==True:
    models_name.append('NewModel')

external_stylesheets = ['https://codepen.io/chriddyp/pen/bWLwgP.css']
app = Dash(__name__, external_stylesheets=external_stylesheets)

app.layout =html.Div([
    html.Div(className="row", children="Dashboard", style={"textAlign":"center", "color":"blue", "fontSize":30}),
    
    #erste Zeile Block (Sankey, Retraining)
    html.Div(className ="row", children = [
        #Sankey Block
        html.Div(className ="ten columns", children = [
            html.Div(className ="two columns", children = [
                #Überschrift Sankey
                html.Div(children="Sankey Modelle", style={"textAlign":"left", "color":"blue", "fontSize":25}),
                #Dropdown Sankey
                dcc.Dropdown(models_name, placeholder='Select Model 1', id='dropdown_1'),
                dcc.Dropdown(models_name, placeholder='Select Model 2', id='dropdown_2'),
                dcc.Dropdown(models_name, placeholder='Select Model 3', id='dropdown_3'),
                dcc.Dropdown(models_name, placeholder='Select Model 4', id='dropdown_4')
            ]),
            #Diagramm Sankey
            html.Div(className ="ten columns", children = [
                html.Div(id='cc-output-container')
            ]) 
        ]),#Sankey Block ENDE

        #TODO Zwischenstrich

        #Retraing Block
        html.Div(className ="two columns", children = [
            #Überschrift Retraining
            html.Div(children="Retraining", style={"textAlign":"left", "color":"blue", "fontSize":25}),
            dcc.Dropdown([t[0] for t in model_list], placeholder='Select Model', id='dropdown'),
            dcc.Input(type='number', placeholder='Enter Start', id='input_start_1'),
            dcc.Input(type='number', placeholder='Enter End', id='input_end_1'),
            html.Br(),
            html.Button('Retrain', id='btn_retrain'),
            html.Div(id='bb-output-container')
        ])#Retraing ENDE    
    ]),#ENDE erste Zeile Block (Sankey, Retraining)

    html.Hr(),

    #zweite Zeile Shapley
    html.Div(className ="row", children = [
        #Shapley Block
        html.Div(className ="four columns", children = [
            #Überschrift Shapley
            html.Div(children="Shapley Values", style={"textAlign":"left", "color":"blue", "fontSize":25}),
            #Dropdown Shapley
            dcc.Dropdown(model_list, placeholder='Select Model', id='dropdown_model'),
            dcc.Dropdown(['very_high', 'high', 'moderate', 'low', 'very_low'], placeholder='Select Target', id='dropdown_target'),
            dcc.Input(type='text', placeholder='Enter Start', id='input_start_2'),
            dcc.Input(type='text', placeholder='Enter End', id='input_end_2'),
            dcc.Input(type='number', placeholder='Enter Steps', id='input_steps'),
            html.Br(),
            html.Button('Generate Plot', id='btn_generate_plot')
        ]),
        #Diagramm Shapley 1
        html.Div(className ="four columns", children = [
            #html.Div(children="Shapley Values", style={"textAlign":"left", "color":"blue", "fontSize":25}),
            html.Div(id='dd-output-container'),
        ]) 
        #Diagramm Shapley 2
        #html.Div(children="Shapley Values", style={"textAlign":"left", "color":"blue", "fontSize":25}),
         #Shapley Block ENDE
        
    ])#ENDE Zweite Zeile
])
 
# SANKEY DASHBOARD
@callback(
    Output('cc-output-container', 'children'),
    [Input('dropdown_1', 'value'), Input('dropdown_2', 'value'), Input('dropdown_3', 'value'),
     Input('dropdown_4', 'value')]
)
def generate_sankey(model_1, model_2, model_3, model_4):
    models = [v for v in [model_1, model_2, model_3, model_4] if v is not None]
    return dcc.Graph(figure=build_sankey(df, models))

# SHAPLEY DASHBOARD
@callback(
    Output('dd-output-container', 'children'),
    State('dropdown_model', 'value'),           # State instead of Input is important for button
    State('dropdown_target', 'value'), 
    State('input_start_2', 'value'),
    State('input_end_2', 'value'),
    State('input_steps', 'value'),
    Input('btn_generate_plot', 'n_clicks'),
)
def generate_shapley_plots(model, target, start, end, steps, n_clicks):
    if n_clicks is None:
        return dash.no_update # Do nothing if the button is not clicked
    
    explanation = model_name_to_explanation(model)
    target = string_target_to_integer_target(target)
    
    if explanation is not None and target is not None:
        fig, ax = plt.subplots()
        ax.cla() # Clear the previous plot

        start_index, end_index = find_indexes(tat_fnames, start, end)
        shap.plots.beeswarm(explanation[:, start_index:end_index:steps, target], show=False)
        
        # Convert Matplotlib figure to base64-encoded string
        img_buf = io.BytesIO()
        plt.savefig(img_buf, format='svg')

        plt.clf() # Clear the entire figure
        plt.close() # Close the figure to release memory

        img_buf.seek(0)
        img_str = "data:image/svg+xml;base64," + base64.b64encode(img_buf.read()).decode('utf-8')

        return html.Img(src=img_str)
    else:
        return
    
# RETRAINING DASHBOARD
@callback(
    Output('bb-output-container', 'children'),
    State('dropdown', 'value'),
    State('input_start_1', 'value'),
    State('input_end_1', 'value'),
    Input('btn_retrain', 'n_clicks')
)
def retrain_model(model_name, start, end, n_clicks):
    if n_clicks is None:
        return dash.no_update  # Do nothing if the button is not clicked
    if model_name is None or start is None or end is None:
        return dash.no_update  # Do nothing if not all values are provided
    
    model_entry = get_model_by_name(model_list_2, model_name)

    generate_new_model(model_entry, tat, start, end)
    print('model has been generated')
    return [t[0] for t in model_list_2] # this is not really nice (from a clean code perspective) but it works \n

if __name__ == '__main__':
    app.run(debug=True)

