In [31]:
# Import necessary libraries and modules
import sys
sys.path.append("ACC_Project")
import pickle
import pandas as pd
import torch as t
import numpy as np
from dataclasses import dataclass
from sklearn.model_selection import train_test_split
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.metrics import accuracy_score, classification_report
from sklearn.metrics import (
    accuracy_score,
    precision_score,
    recall_score,
    f1_score,
    confusion_matrix,
)

# Import custom modules
from llm_classes.transformer import Transformer
from llm_classes.block import Block
from llm_classes.config import Config
from llm_classes.dataprep import DataSet,DataPrep
from functions.value_encoding import DataEncoder
from functions.model_evaluator import (
    evaluation,
)  

# Load the dataset and select relevant columns
data = pd.read_csv("processed_data.csv", low_memory=False)
llm_own_model = pickle.load(open("pickle_files/llm_trained.pkl", "rb"))
dataprep = pickle.load(open("pickle_files/llm_data_prep.pkl", "rb"))


In [32]:
# Split the data into training and validation sets
data_wo_valid, data_valid = train_test_split(
    data, test_size=0.2, random_state=2, stratify=data.label
)
val = dataprep.prep(data_valid, train=False)


In [33]:
# Drop the 'description' column from the dataset and reset the index for gbm model
data.drop(columns=["description"], inplace=True)

data_wo_valid = data_wo_valid.reset_index()
data_wo_valid.drop(columns=["description", "index"], inplace=True)

data_valid = data_valid.reset_index()
data_valid.drop(columns=["description", "index"], inplace=True)


In [34]:
encoder = DataEncoder(data)  # Initialize with the entire dataset
encoded_training = encoder.transform(data_wo_valid)
encoded_validation = encoder.transform(data_valid)

X_train, X_test, y_train, y_test = encoder.split_data(
    encoded_training, test_size=0.2, random_state=2
)
X_valid, y_valid = encoder.split_data(encoded_validation, test_size=0)


In [35]:
# Train a Gradient Boosting Machine (GBM) classifier
gbm = GradientBoostingClassifier(learning_rate=0.01, max_depth=5, n_estimators=300)
gbm.fit(X_train, y_train)


In [36]:
# Get predictions from the LLM model
llm_probs = [llm_own_model(inputs) for inputs, _ in val]


In [37]:
# Create dictionaries to map labels to numbers and vice versa for both models, as gbm
# and llm have different orderings of categories, so for ensembling they need to be matchd
labels = {
    label: number for number, label in enumerate(data.label.unique())
}  # Labels to numbers dictionary
reversed_labels = {v: k for k, v in labels.items()}
gbm_dict = {item: i for i, item in enumerate(gbm.classes_.tolist())}
reversed_gbm_dict = {v: k for k, v in gbm_dict.items()}


In [38]:

# Convert LLM probabilities from tensors to lists and call the ensemble function
llm_probs_list = [prob.detach().numpy().tolist() for prob in llm_probs]


# Function to reorder the LLM probabilities to match the GBM class order
def reorder_lists(llm_probs):
    order_dict = {i: gbm_dict[reversed_labels[i]] for i in range(6)}
    reordered_lists = []
    for lst in llm_probs:
        # Create a list of the same length as the inner list, filled with None
        lst = lst[0]
        new_list = [None] * len(lst)
        for original_index, new_index in order_dict.items():
            new_list[new_index] = lst[original_index]
        reordered_lists.append(new_list)
    return reordered_lists


reordered_llm = reorder_lists(llm_probs_list)


In [39]:

# Function to combine predictions from LLM and GBM models
def ensemble_predictions(llm_probs, gbm, X_valid, y_valid, gbm_weight=1):

    # Combine probabilities from both models
    gbmpreds = gbm.predict(X_valid)
    combined_probs = []
    for llm_prob, gbm_prob in zip(llm_probs, gbm.predict_proba(X_valid)):
        llm_pred = reversed_gbm_dict[np.array(llm_prob).argmax().item()]
        gbm_pred = list(gbm.classes_)[gbm_prob.argmax()]

        if llm_pred != gbm_pred:
            # Apply weighting when there is a disagreement
            combined_prob = [
                (p_llm + gbm_weight * p_gbm) / (1 + gbm_weight)
                for p_llm, p_gbm in zip(llm_prob, gbm_prob)
            ]
        else:
            # No weighting applied when there is agreement
            combined_prob = [p_llm * p_gbm for p_llm, p_gbm in zip(llm_prob, gbm_prob)]

        combined_probs.append(combined_prob)

    # Make predictions based on combined probabilities
    ensemble_preds = [
        list(gbm.classes_)[prob.index(max(prob))] for prob in combined_probs
    ]

    # Print predictions and actual labels
    print("Actual Label | LLM Prediction | GBM Prediction | Ensemble Prediction")
    for i in range(len(y_valid)):
        actual = y_valid.iloc[i]
        llm_pred = reversed_gbm_dict[np.array(llm_probs[i]).argmax().item()]
        gbm_pred = gbmpreds[i]
        if gbm_pred == "Income" and llm_pred == "LSP cash payout":
            ensemble_preds[i] = "Income"
        ensemble_pred = ensemble_preds[i]

        if actual != llm_pred != gbm_pred:
            print("")
            print(i, "actual", actual)
            print("llm_pred", llm_pred, llm_probs[i])
            print("gbm_pred", gbm_pred, gbm.predict_proba(X_valid)[i])
            print("ensemble_preds", ensemble_pred, combined_probs[i])
            print("")

    # Calculate and print metrics
    print("")
    print("Metrics:")
    print(
        "Accuracy of LLM:",
        accuracy_score(
            y_valid,
            [reversed_gbm_dict[np.array(prob).argmax().item()] for prob in llm_probs],
        ),
    )
    print("Accuracy of GBM:", accuracy_score(y_valid, gbmpreds))
    print("Accuracy of Ensemble:", accuracy_score(y_valid, ensemble_preds))
    print("")
    print("Classification Report for Ensemble:")
    print(classification_report(y_valid, ensemble_preds))
    return ensemble_preds


# Call the function
ensemble_preds = ensemble_predictions(reordered_llm, gbm, X_valid, y_valid)


Actual Label | LLM Prediction | GBM Prediction | Ensemble Prediction

23 actual d
llm_pred f [1.3376209153648233e-06, 1.9574458747229073e-06, 2.6164471478296036e-07, 1.129617612605216e-05, 1.876340860462733e-07, 0.9999849796295166]
gbm_pred d [0.00101956 0.0050411  0.00201557 0.98463192 0.00227119 0.00502065]
ensemble_preds f [0.0005104505031871337, 0.002521529996419562, 0.0010079144399117318, 0.492321609779664, 0.0011356892276023637, 0.502502816128832]


30 actual e
llm_pred f [6.384113476087805e-06, 1.2751524991472252e-05, 1.558464759909839e-06, 3.0007795430719852e-05, 1.1840694469356094e-06, 0.9999481439590454]
gbm_pred e [0.05672702 0.03041944 0.03181896 0.06152124 0.7923903  0.02712304]
ensemble_preds f [0.0283667007718486, 0.015216093918079392, 0.015910260307662925, 0.03077562498082321, 0.3961957422446055, 0.5135355927405557]


38 actual a
llm_pred f [6.692498573102057e-07, 1.0652238415786996e-06, 1.2386243497530813e-07, 4.335364337748615e-06, 9.835595449203538e-08, 0.99999368190

  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


In [40]:
#more metrics for evaluating the ensembling 
def evaluation(y_pred, y_test):

    accuracy = accuracy_score(y_test, y_pred)
    precision = precision_score(y_test, y_pred, average="weighted", zero_division=0)
    recall = recall_score(y_test, y_pred, average="weighted", zero_division=0)
    f1 = f1_score(y_test, y_pred, average="weighted", zero_division=0)
    conf_matrix = confusion_matrix(y_test, y_pred)

    print("Accuracy =", accuracy)
    print("Precision =", precision)
    print("Recall =", recall)
    print("F1 Score =", f1)
    print("Confusion Matrix:")
    print(conf_matrix)


evaluation(ensemble_preds, y_valid)


Accuracy = 0.8771929824561403
Precision = 0.7999618611746758
Recall = 0.8771929824561403
F1 Score = 0.8321433376385748
Confusion Matrix:
[[  0   0   0   0   0  17]
 [  0  81   0   0   0   0]
 [  0   0   0   0   0  15]
 [  0   0   0  77   0   6]
 [  0   0   0   0   0   4]
 [  0   0   0   0   0 142]]
