# Unification of Knowledge: Creation of new input

The predictors are concatenated with the LLM responses generated using the various prompts from the refinement process (e.g., Resp_aware_in = preds + "YES" + "NO" + "YES"). These new inputs are then passed to an ML model for classification.


In [1]:
import time
import warnings

warnings.filterwarnings("ignore")

import numpy as np
import pandas as pd
from sklearn.metrics import confusion_matrix, f1_score, matthews_corrcoef, recall_score
from sklearn.model_selection import GridSearchCV, train_test_split
from sklearn.neural_network import MLPClassifier

In [2]:
# Load the cleaned DIPS change dataset; low_memory=False makes pandas read the
# file in one pass instead of in chunks.
data_change = pd.read_csv(
    "../dat/dips/DIPS_Data_cleaned_change.csv",
    sep=",",
    low_memory=False,
)

In [3]:
# LLM predictions, one CSV per model, read from y_pred_LLMs/.
# The remark above each load is the author's note on that model.

y_pred_Gemma_few_shot_prompt = pd.read_csv(
    "y_pred_LLMs/Gemma/y_pred_gemma_few_shot_prompt.csv", sep=","
)

# Grok: higher sensitivity but overall not better
y_pred_Grok_few_shot_prompt = pd.read_csv(
    "y_pred_LLMs/Grok/y_pred_Grok_few_shot_prompt.csv", sep=","
)

# GPT o3 (chain-of-thought prompt): better than alone
y_pred_GPT_o3_cot_prompt = pd.read_csv(
    "y_pred_LLMs/GPT/y_pred_GPT_o3_cot_prompt.csv", sep=","
)

# Gemini: works very well, high MCC but low specificity
y_pred_Gemini_few_shot_prompt = pd.read_csv(
    "y_pred_LLMs/Gemini/y_pred_Gemini_few_shot_prompt.csv", sep=","
)

# Claude: also works very well, high MCC but low specificity
y_pred_Claude_few_shot_prompt = pd.read_csv(
    "y_pred_LLMs/Claude/y_pred_Claude_few_shot_prompt.csv", sep=","
)

# DeepSeek: works well, but not as good as Gemini or Claude
y_pred_DeepSeek_few_shot_prompt = pd.read_csv(
    "y_pred_LLMs/DeepSeek/y_pred_deeps_few_shot_prompt.csv", sep=","
)

## 1 DNN Model

In [15]:
# Target: the "hpi" column
y = data_change["hpi"]

# Predictors: every column except the target
# NOTE(review): the X_test preview in this notebook lists "Unnamed: 0" first in
# its header - confirm that this is the row index and not a saved index column
# that ended up among the predictors.
X = data_change.drop(columns=["hpi"])

# Stratified 80/20 train-test split with a fixed seed
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
    stratify=y,
)

# Report the shape of each part and its share of the full data set
print(
    "DNN \n",
    "X_train shape: ", X_train.shape, round(len(X_train) / len(X), 2), "\n",
    "X_test shape: ", X_test.shape, round(len(X_test) / len(X), 2), "\n",
    "y_train shape: ", y_train.shape, round(len(y_train) / len(y), 2), "\n",
    "y_test shape: ", y_test.shape, round(len(y_test) / len(y), 2), "\n",
)

DNN 
 X_train shape:  (983, 22) 0.8 
 X_test shape:  (246, 22) 0.2 
 y_train shape:  (983,) 0.8 
 y_test shape:  (246,) 0.2 



In [19]:
# Display the held-out predictors produced by the train-test split.
X_test

Unnamed: 0,whi_ges,soz_ges,gke_ges,lzh_ges,ile,woc_gesp,woc_gese,asi_ges,bsq_ges,das_ges,...,bmi_kat,ses_kom,whi_change,soz_change,gke_change,lzh_change,asi_change,bsq_change,das_change,scl_change
372,-0.027917,0.142124,0.364979,0.337289,0.441936,1.731937,0.207830,0.159416,0.286375,0.275069,...,-1,0,-0.752017,0.705710,-0.181980,0.140738,-0.861724,-0.846980,0.484902,-0.825664
484,-0.027917,1.079045,-2.354937,-0.471819,0.041981,0.853278,-0.845628,0.555751,1.241667,-0.225081,...,0,0,-0.752017,-0.460475,1.753215,0.316709,0.518517,1.781472,0.809727,-1.091050
1046,-0.994398,0.811959,2.404917,0.984575,-0.957908,2.698461,-0.764593,-0.369032,0.286375,-1.308738,...,0,1,0.281218,0.204409,0.094477,1.548507,-1.137772,-1.431081,0.430765,0.368574
1031,-0.027917,0.900988,0.364979,1.631861,-0.957908,0.150351,-0.278381,-1.161703,-1.242092,-1.100342,...,0,0,-0.235399,-0.571288,-0.458436,-0.035233,2.864926,2.754973,1.188689,0.191649
541,-0.511158,0.811959,0.591639,1.470039,-0.957908,-0.288978,-0.602522,-1.029591,-0.764446,-1.016984,...,0,0,0.281218,-0.349661,-4.052369,-1.091060,-0.033579,-0.360230,-1.030946,-0.250661
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
27,-0.511158,0.722930,0.818299,0.660932,0.841892,-0.376844,-0.926663,-0.236920,-0.382329,-1.683850,...,0,0,-0.235399,-0.238847,-0.734893,-0.563147,-0.861724,-0.360230,0.863864,0.633960
1134,-0.511158,-0.167357,0.364979,0.337289,-1.157886,-1.343368,-0.440452,-0.104808,-0.286800,0.900255,...,0,0,0.281218,-0.017219,-0.734893,-0.739118,-0.033579,0.223871,-0.110610,-0.065785
377,0.455324,0.277787,-0.541660,-0.309997,-0.957908,1.995534,2.476817,0.687863,-0.955504,-0.641872,...,0,0,-0.235399,-2.787567,-1.287805,-1.443003,-0.861724,0.223871,2.000750,-0.029506
230,-0.994398,-0.434443,0.138320,0.822753,0.041981,-1.255502,-0.035276,0.555751,-0.095742,1.650480,...,0,0,0.797835,0.980106,0.094477,-0.211204,0.380493,-0.457580,-1.897145,-0.206430


In [4]:
def sklearn_dnn(x_test_dnn, x_train_dnn, y_train_dnn, y_test_dnn):
    """Tune an MLP classifier by grid search and evaluate it on the test set.

    Note the argument order: the test predictors come first.

    Parameters
    ----------
    x_test_dnn : predictors used for the final evaluation.
    x_train_dnn : predictors used for the grid search and for fitting.
    y_train_dnn : target values belonging to ``x_train_dnn``.
    y_test_dnn : target values belonging to ``x_test_dnn``.

    Returns
    -------
    tuple
        ``(weights_d, features_d, y_pred_d, y_pred_proba_d, metrics_d)``:

        * ``weights_d`` - ``coefs_`` of the fitted network.
        * ``features_d`` - ``feature_names_in_`` of the fitted network, or
          ``None`` when the training input carried no feature names.
        * ``y_pred_d`` - predicted labels for ``x_test_dnn``.
        * ``y_pred_proba_d`` - predicted class probabilities for ``x_test_dnn``.
        * ``metrics_d`` - dict with the keys ``accuracy``, ``macro_f1``,
          ``micro_f1``, ``mcc``, ``precision``, ``recall`` and
          ``confusion_matrix``.

    Notes
    -----
    ``precision`` and ``recall`` are read from cells ``[1, 1]``, ``[0, 1]`` and
    ``[1, 0]`` of the confusion matrix, so they describe the label at index 1
    and require at least two labels in the test data. They are NaN when their
    denominator is zero.
    """

    # 1: GRID SEARCH
    param_grid = {
        'hidden_layer_sizes': [(50, ), (100, ), (50, 50), (10, 30, 10), (50, 50, 50)], # (50, 100, 50)
        'activation': ['relu', 'tanh'],
        'solver': ['adam', 'sgd'],
        'alpha': [0.0001, 0.001, 0.05],
        'learning_rate': ['constant', 'adaptive'],
        'max_iter': [100, 500, 1000, 2500, 5000]
    }

    grid_search = GridSearchCV(
        estimator = MLPClassifier(random_state = 42),
        param_grid = param_grid,
        cv = 10
    )
    grid_search.fit(x_train_dnn, y_train_dnn)
    print("Best parameters: ", grid_search.best_params_)
    print("Best score: ", grid_search.best_score_)

    # 2: FITTING THE MODEL
    # GridSearchCV refits the best configuration on the full training data
    # (refit=True is the default), using the same random_state as the search
    # estimator. Building and fitting a second, identical MLPClassifier would
    # only repeat that work, so the refitted estimator is used directly.
    model = grid_search.best_estimator_

    # 3: ESTIMATING WEIGHTS
    weights_d = model.coefs_
    # feature_names_in_ only exists when the estimator was fitted on input with
    # string column names (e.g. a DataFrame); fall back to None otherwise.
    features_d = getattr(model, "feature_names_in_", None)

    # 4: COMPUTE TEST SET PREDICTIONS
    y_pred_d = model.predict(x_test_dnn)
    y_pred_proba_d = model.predict_proba(x_test_dnn)

    # 5: COMPUTE METRICS
    accuracy_dnn = model.score(x_test_dnn, y_test_dnn)
    # The F1 entries must come from f1_score: recall_score would store the
    # macro-averaged recall under the "macro_f1" key.
    macro_f1_dnn = f1_score(y_test_dnn, y_pred_d, average = "macro")
    micro_f1_dnn = f1_score(y_test_dnn, y_pred_d, average = "micro")
    mcc_dnn = matthews_corrcoef(y_test_dnn, y_pred_d)

    cm_dnn = confusion_matrix(y_test_dnn, y_pred_d)
    true_pos = cm_dnn[1, 1]
    predicted_pos = true_pos + cm_dnn[0, 1]
    actual_pos = true_pos + cm_dnn[1, 0]
    # Make the undefined case explicit instead of relying on a NumPy
    # division-by-zero warning (which this notebook silences).
    precision_dnn = true_pos / predicted_pos if predicted_pos else float("nan")
    recall_dnn = true_pos / actual_pos if actual_pos else float("nan")

    # store metrics in a dictionary
    metrics_d = {
        "accuracy": accuracy_dnn,
        "macro_f1": macro_f1_dnn,
        "micro_f1": micro_f1_dnn,
        "mcc": mcc_dnn,
        "precision": precision_dnn,
        "recall": recall_dnn,
        "confusion_matrix": cm_dnn
    }

    return weights_d, features_d, y_pred_d, y_pred_proba_d, metrics_d