In [None]:
import dash
import joblib
import pandas as pd
import shap
import xgboost as xgb
from dash import Dash, dcc, html, Input, Output, State, callback
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder

In [None]:
# Feature table; it is later handed to get_data, which selects columns by
# their numeric labels.
tat = pd.read_csv(
    filepath_or_buffer="../data/lucas_organic_carbon_training_and_test_data_NEW.csv"
)

# Target table; get_data splits it together with the features.
# NOTE(review): presumably row-aligned with `tat` — confirm against the CSV exports.
targets = pd.read_csv(
    filepath_or_buffer="../data/lucas_organic_carbon_target.csv"
)

# TODO: scale data
# scaler = StandardScaler()
# scaled_data = scaler.fit_transform(df)
# scaled_df = pd.DataFrame(scaled_data, columns=df.columns)
# scaled_df

# Actual Retraining

In [None]:
def get_data(input_tat, start, end):
    """Select the feature columns inside ``[start, end]`` and split into train/test sets.

    Column labels are parsed as numbers; labels that cannot be parsed become
    NaN, never compare as inside the interval, and are therefore dropped.
    The labels for the split are read from the module-level ``targets`` frame.

    Args:
        input_tat: DataFrame of features whose column labels are numeric
            (or numeric strings).
        start: Inclusive lower bound for the column label.
        end: Inclusive upper bound for the column label.

    Returns:
        Tuple ``(tat_train, tat_test, targets_train, targets_test, feature_names)``.
        Both target items are flattened 1-D arrays and ``feature_names`` is the
        list of the selected column labels.

    Raises:
        ValueError: If no column label lies inside ``[start, end]`` (this also
            covers ``start > end``). Failing here avoids handing a
            zero-feature frame to the model, which would only fail later
            with a less helpful message.
    """
    numeric_columns = pd.to_numeric(input_tat.columns, errors='coerce')

    # Keep only the columns whose numeric label lies inside the interval.
    in_interval = (numeric_columns >= start) & (numeric_columns <= end)
    filtered_columns = input_tat.columns[in_interval]
    if len(filtered_columns) == 0:
        raise ValueError(
            f"No feature columns found in the interval [{start}, {end}]."
        )

    selected_tat = input_tat[filtered_columns]

    # Fixed seed and test share so repeated calls produce the same split.
    tat_train, tat_test, targets_train, targets_test = train_test_split(
        selected_tat, targets, test_size=0.33, random_state=42
    )

    # Flatten the targets to 1-D arrays.
    targets_train = targets_train.values.ravel()
    targets_test = targets_test.values.ravel()

    feature_names = selected_tat.columns.to_list()
    return tat_train, tat_test, targets_train, targets_test, feature_names


In [None]:
# Disabled draft of `save_explanation`: the whole definition is wrapped in a
# triple-quoted string, so nothing in it is defined or executed.
# Review notes, should the draft be revived:
#   - both `explainer = ...` assignments run unconditionally, so the second
#     overwrites the first; the TODOs ("ist" / "oder" = "is" / "or") ask for a
#     branch on the model type instead.
#   - it writes under "../shapley_values/..." (underscore) whereas the
#     model_list cell loads from "../shapley-values/..." (hyphen).
'''
def save_explanation(model, new_model_name, tat_train, fnames):
    # TODO: if model ist Linear Regression:
    explainer = shap.Explainer(model, tat_train, feature_names=fnames)

    # TODO: if model ist XGBoost oder Random Forest
    explainer = shap.TreeExplainer(model, feature_names=fnames)

    explanation = explainer(tat_train)

    file_path = f"../shapley_values/saved_values/{new_model_name}-saved_values"
    joblib.dump(explanation, file_path)
    return
'''

In [None]:
# Registry of selectable models. Each entry is a 4-tuple:
#   (display name, model loaded from disk, model-type key, object loaded from
#    the matching file under shapley-values/saved_values)
# The model-type key is what generate_new_model dispatches on.
# NOTE(review): joblib.load unpickles its input, so these files must come from
# a trusted source.
model_list = [
    (display_name, joblib.load(model_path), model_kind, joblib.load(shap_path))
    for display_name, model_path, model_kind, shap_path in (
        (
            'XGBoost',
            '../models/best_model_xgboost.pkl',
            'xgboost',
            '../shapley-values/saved_values/xgboost-shapley_values',
        ),
        (
            'Random Forest',
            '../models/best_model_random_forest.pkl',
            'random_forest',
            '../shapley-values/saved_values/randomforest-shapley_values',
        ),
        (
            'Logistic Regression',
            '../models/logreg.pkl',
            'logic_regression',
            '../shapley-values/saved_values/logreg_shapley_values',
        ),
    )
]

In [None]:
def _fit_logistic_regression(tat_train, targets_train, feature_names, use_gpu):
    """Fit a logistic regression and build its SHAP explainer (``use_gpu`` is unused)."""
    model = LogisticRegression(random_state=0)
    model.fit(tat_train, targets_train)

    # The generic explainer receives the training data as its second argument.
    explainer = shap.Explainer(model, tat_train, feature_names=feature_names)
    return model, explainer


def _fit_xgboost(tat_train, targets_train, feature_names, use_gpu):
    """Fit an XGBoost classifier on label-encoded targets and build its SHAP explainer."""
    # NOTE(review): the encoder is not kept, so the model's predictions are
    # encoded class ids that cannot be mapped back to the original labels
    # from here — confirm this is acceptable for consumers of model_list.
    encoded_targets = LabelEncoder().fit_transform(targets_train)

    params = {
        'learning_rate': 0.02,
        'n_estimators': 10,
        'objective': 'multi:softmax',
        'num_class': len(pd.unique(encoded_targets)),
    }
    if use_gpu:
        params.update(tree_method="hist", device="cuda")

    model = xgb.XGBClassifier(**params)
    model.fit(tat_train, encoded_targets)

    explainer = shap.TreeExplainer(model, feature_names=feature_names)
    return model, explainer


def _fit_random_forest(tat_train, targets_train, feature_names, use_gpu):
    """Fit a random forest and build its SHAP explainer (``use_gpu`` is unused)."""
    model = RandomForestClassifier(n_estimators=100, random_state=42, verbose=2, n_jobs=-1)
    print('start training')
    print(tat_train.shape)
    print(targets_train.shape)
    model.fit(tat_train, targets_train)
    print('Finished training')

    print('the shap values will be generated now')
    explainer = shap.TreeExplainer(model, feature_names=feature_names)
    return model, explainer


# Maps the model-type key (third item of a model_list entry) to its trainer.
_MODEL_TRAINERS = {
    'logic_regression': _fit_logistic_regression,
    'xgboost': _fit_xgboost,
    'random_forest': _fit_random_forest,
}


def generate_new_model(model_entry, tat, start, end, use_gpu=False):
    """Train a fresh model of the same type on the columns in ``[start, end]``.

    The new model is fitted on the training split returned by ``get_data``,
    a SHAP explanation is computed on that same training split, and the
    result is stored in the module-level ``model_list`` as
    ``(name, model, model_type, explanation)``.

    The new name is ``"<source name> <start>-<end>"``. If an entry with that
    name already exists it is replaced in place, so retraining the same
    model on the same interval does not create duplicate names.

    Args:
        model_entry: Entry from ``model_list``; item 0 is the display name
            and item 2 is the model-type key.
        tat: Feature DataFrame passed on to ``get_data``.
        start: Inclusive lower bound of the column interval.
        end: Inclusive upper bound of the column interval.
        use_gpu: If true, the XGBoost model is configured with
            ``tree_method="hist"`` and ``device="cuda"``. Other model types
            ignore it.

    Raises:
        ValueError: If ``model_entry`` is ``None`` or its model-type key is
            unknown. This is checked before any data is split or any model
            is trained.
    """
    if model_entry is None:
        raise ValueError('Model is not found.')

    model_type = model_entry[2]
    trainer = _MODEL_TRAINERS.get(model_type)
    if trainer is None:
        raise ValueError('Model is not found.')

    # Only the training split is needed here; the test split is discarded.
    tat_train, _, targets_train, _, feature_names = get_data(tat, start, end)

    new_model_name = f"{model_entry[0]} {start}-{end}"
    print(f"building {new_model_name}")

    model, explainer = trainer(tat_train, targets_train, feature_names, use_gpu)
    explanation = explainer(tat_train)

    new_entry = (new_model_name, model, model_type, explanation)
    for index, existing_entry in enumerate(model_list):
        if existing_entry[0] == new_model_name:
            # Same model retrained on the same interval: refresh the entry.
            model_list[index] = new_entry
            break
    else:
        model_list.append(new_entry)

In [None]:
def get_model_by_name(model_list, model_name):
    """Look up a model entry by its display name, ignoring case.

    An exact name match wins. Only when no entry matches exactly does the
    lookup fall back to the first entry whose name contains ``model_name``.
    Preferring the exact match matters once derived names exist: with
    substring matching alone, "XGBoost 400-500" would resolve to an earlier
    "XGBoost 400-5000" entry.

    Args:
        model_list: Iterable of entries whose first item is the display name.
        model_name: Name (or name fragment) to search for; may be ``None``.

    Returns:
        The matching entry, or ``None`` if ``model_name`` is ``None`` or
        nothing matches.
    """
    if model_name is None:
        return None

    wanted = model_name.lower()
    partial_match = None
    for model_entry in model_list:
        candidate = model_entry[0].lower()
        if candidate == wanted:
            return model_entry
        # Remember only the first substring hit as the fallback.
        if partial_match is None and wanted in candidate:
            partial_match = model_entry
    return partial_match

# Application

In [None]:
# Page layout: a model picker, the two interval bounds, the retrain button,
# and an (initially empty) output container.
app = Dash(__name__)
app.layout = html.Div(
    children=[
        html.H1(children="Retraining"),
        dcc.Dropdown(
            id='dropdown',
            options=[entry[0] for entry in model_list],
            placeholder='Select Model',
        ),
        dcc.Input(id='input_start', type='number', placeholder='Enter Start'),
        dcc.Input(id='input_end', type='number', placeholder='Enter End'),
        html.Br(),
        html.Button(children='Retrain', id='btn_retrain'),
        html.Div(id='dd-output-container'),
    ]
)

@callback(
    Output('dropdown', 'options'),
    State('dropdown', 'value'),
    State('input_start', 'value'),
    State('input_end', 'value'),
    Input('btn_retrain', 'n_clicks')
)
def retrain_model(model_name, start, end, n_clicks):
    """Retrain the selected model on ``[start, end]`` and refresh the dropdown.

    Triggered by the retrain button; the dropdown value and the two number
    inputs are read as state. The new model is registered in ``model_list``
    by ``generate_new_model``.

    Args:
        model_name: Currently selected dropdown value, or ``None``.
        start: Value of the start input, or ``None``.
        end: Value of the end input, or ``None``.
        n_clicks: Click count of the retrain button; ``None`` before the
            first click.

    Returns:
        The list of model names to show in the dropdown, or
        ``dash.no_update`` when nothing was retrained (button not clicked,
        missing input, reversed interval, or unknown model).
    """
    if n_clicks is None:
        return dash.no_update  # Do nothing if the button is not clicked
    if model_name is None or start is None or end is None:
        return dash.no_update  # Do nothing if not all values are provided
    if start > end:
        # A reversed interval selects no columns, so training could not succeed.
        print(f"interval start {start} is greater than end {end}; nothing retrained")
        return dash.no_update

    model_entry = get_model_by_name(model_list, model_name)
    if model_entry is None:
        print(f"model {model_name!r} was not found; nothing retrained")
        return dash.no_update

    generate_new_model(model_entry, tat, start, end)
    print('model has been generated')
    # The options are rebuilt from the module-level registry that
    # generate_new_model has just updated.
    return [entry[0] for entry in model_list]


# Start the Dash server only when this code runs as the main module.
# NOTE(review): debug=True is presumably meant for local development only —
# confirm before exposing the app beyond localhost.
if __name__ == '__main__':
    app.run(debug=True)