In [1]:
import pandas as pd
from imblearn.over_sampling import RandomOverSampler
import os
import sys
import numpy as np
from itertools import product
import logging
import tqdm
from sklearn.metrics import average_precision_score, roc_auc_score
sys.path.append(os.path.abspath(os.path.join('..', 'src',)))

from interfaces.classifier import Classifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import RidgeClassifier
from sklearn.neural_network import MLPClassifier

from models.classifiers.hierarchical import Hierarchical
from sklearn.multiclass import OneVsRestClassifier
from sklearn.tree import DecisionTreeClassifier
from models.classifiers.h2o import H2OMultiLabel
from interfaces.embedding import Embedding

from models.classifiers.ridge import CustomRidge

from models.embeddings.openai_embedding import OpenAIEmbedding


  from .autonotebook import tqdm as notebook_tqdm


In [2]:


embedding = OpenAIEmbedding()

In [12]:
outputs_df = pd.read_parquet("../data/processed/outputs_openai_embeddings_v1.parquet")
ppas = outputs_df.PPAs_list.explode().drop_duplicates().sort_values().to_list()
train_df = outputs_df[~outputs_df.test_set]
test_df = outputs_df[outputs_df.test_set]
X_labels = outputs_df.columns[outputs_df.columns.str.contains("openai_embedding_small")]
# y_label = "PPAs_list"
oversampling = True

X_train = train_df[X_labels].to_numpy()
# y_train = train_df[y_label].to_numpy()
y_train = train_df[ppas].astype(int).to_numpy()

X_test = test_df[X_labels].to_numpy()
# y_test = test_df[y_label].to_numpy()
y_test = test_df[ppas].astype("int").to_numpy()

X_full = outputs_df[X_labels].to_numpy()
y_full = outputs_df[ppas].astype("int").to_numpy()

ros = RandomOverSampler(random_state=42)
train_oversamples_df, _ = ros.fit_resample(train_df, y=train_df["primary_ppa"])
X_train_oversampled = train_oversamples_df[X_labels].to_numpy()
# y_train = train_df[y_label].to_numpy()
y_train_oversampled = train_oversamples_df[ppas].astype(int).to_numpy() 

def list_col_to_df(list_col: pd.Series, prefix: str):
    list_df = pd.DataFrame(
        list_col.to_list(),
        index=list_col.index
        ).add_prefix(prefix)
    return list_df

def ppa_hierarchy(ppas_list: pd.Series):
    return ppas_list.apply(lambda ppa_list: [[ppa[:2], ppa] for ppa in ppa_list]).to_numpy()


def experiment(name: str, clf: Classifier, X_train: np.ndarray, y_train: np.ndarray, 
               X_test: np.ndarray, y_test_df: pd.DataFrame, classes: list[str]):
    clf.fit(X_train, y_train)
    y_pred_proba = clf.predict_proba(X_test)
    y_pred_proba_df = pd.DataFrame(y_pred_proba, columns=classes)

    experiment = []
    for ppa in ppas:
        y_test_ppa = y_test_df[ppa]
        experiment.append({
            "name": name,
            "ppa": ppa,
            "roc_auc": roc_auc_score(y_test_ppa, y_pred_proba_df[ppa]),
            "average_precision": average_precision_score(y_test_ppa, y_pred_proba_df[ppa]),
        })
    return experiment

def convert_to_input(df: pd.DataFrame, embedding: Embedding, text_col="text", ppas_col="ppas"):
    X_df = df[text_col].apply(lambda t: pd.Series(embedding.generate(t)))
    # calculate embeddings
    X = X_df.to_numpy()
    # X = pd.DataFrame(
    #     df["embedding"].to_list(),
    #     ).to_numpy()
    
    for ppa in ppas:
        df[ppa] = df[ppas_col].apply(lambda x: ppa in x)
    
    y = df[ppas].to_numpy()
    # calculate embeddings
    return X, y

In [4]:
df = pd.read_excel("../data/raw/WP sugerencia de outputs.xlsx")

df = df.assign(
    ppa_list=df.PPAs.str.split(", ")
)

In [None]:
X1, y1 = convert_to_input(df.head(5), embedding, text_col="Output Statement", ppas_col="ppa_list")

# Data

In [5]:
ppas_description_df = pd.read_excel("../data/processed/ppa.xlsx", sheet_name="ppa")

ppas_description_df = (ppas_description_df
 .set_index("initials")[["name", "outcome", "description",	"key_thematic_components"]]
 .unstack()
 .reset_index()
 .drop(columns="level_0")
 .rename(columns={"initials": "ppa", 0: "text"})
 .assign(
     ppa = lambda _df: _df.ppa.apply(lambda x: [x])
 ))

ppas_description_df


Unnamed: 0,ppa,text
0,[BP1],Innovation for Sustainable Agriculture Production
1,[BP2],Blue Transformation
2,[BP3],One Health
3,[BP4],Small-scale Producers’ Equitable Access to Res...
4,[BP5],Digital Agriculture
...,...,...
75,[BL2],Thethematiccomponentsspecificallytargettheneed...
76,[BL3],The thematic building blocks of the PPA are de...
77,[BL4],The key components of the PPA are grouped acco...
78,[BL5],The thematic building blocks of the Priority P...


In [51]:
ppas_description_df = pd.read_excel("../data/processed/ppa.xlsx", sheet_name="ppa")

ppas_description_df = (ppas_description_df
 .set_index("initials")[["name", "outcome", "description",	"key_thematic_components"]]
 .unstack()
 .reset_index()
 .drop(columns="level_0")
 .rename(columns={"initials": "ppa", 0: "text"})
 .assign(
     ppa = lambda _df: _df.ppa.apply(lambda x: [x])
 ))

X_ppa, y_ppa = convert_to_input(ppas_description_df, embedding, text_col="text", ppas_col="ppa")


In [53]:
X_ppa.shape, y_ppa.shape

((80, 1536), (80, 20))

In [17]:
backup_df = pd.read_excel("/Users/jm/Downloads/backup_predictions_v3_JES.xlsx")
backup_df = backup_df.dropna(subset=["Incluir"]).assign(
    ppa_list=backup_df.PPAs.str.split(", ")
)[["Output Statement", "ppa_list"]]
X_backup, y_backup = convert_to_input(backup_df, embedding, text_col="Output Statement", ppas_col="ppa_list")


In [56]:
X_backup.shape, y_backup.shape

((209, 1536), (209, 20))

In [57]:
# Concatenate the arrays along the row axis (axis=0)
concatenated_array = np.concatenate((X_ppa, X_backup), axis=0)

print(concatenated_array.shape)

(289, 1536)


# Base training

In [69]:
base_clf = CustomRidge()
base_exp = experiment("base", base_clf, X_train, y_train, X_test, test_df[ppas], ppas)

In [85]:
pd.DataFrame(base_exp).drop(columns=["name", "ppa"]).agg(["mean", "std"])

Unnamed: 0,roc_auc,average_precision
mean,0.888587,0.609846
std,0.145322,0.315809


In [101]:
pd.DataFrame(base_exp)["average_precision"].describe()

count    20.000000
mean      0.609846
std       0.315809
min       0.080952
25%       0.288663
50%       0.695833
75%       0.866667
max       1.000000
Name: average_precision, dtype: float64

In [86]:
ppa_clf = CustomRidge()
ppa_exp = experiment("base", ppa_clf, X_ppa, y_ppa, X_test, test_df[ppas], ppas)
pd.DataFrame(ppa_exp).drop(columns=["name", "ppa"]).agg(["mean", "std"])

Unnamed: 0,roc_auc,average_precision
mean,0.5,0.0725
std,0.0,0.025521


In [87]:
backup_clf = CustomRidge()
backup_exp = experiment("base", backup_clf, X_backup, y_backup, X_test, test_df[ppas], ppas)
pd.DataFrame(backup_exp).drop(columns=["name", "ppa"]).agg(["mean", "std"])

Unnamed: 0,roc_auc,average_precision
mean,0.890414,0.561192
std,0.112171,0.303512


# Train with base + backup

In [18]:
# Concatenate the arrays along the row axis (axis=0)
X_train_backup = np.concatenate((X_train, X_backup), axis=0)
y_train_backup = np.concatenate((y_train, y_backup), axis=0)

print(X_train_backup.shape, y_train_backup.shape)
mixed_clf = CustomRidge()
mixed_exp = experiment("base", mixed_clf, X_train_backup, y_train_backup, X_test, test_df[ppas], ppas)
pd.DataFrame(mixed_exp).drop(columns=["name", "ppa"]).agg(["mean", "std"])

(282, 1536) (282, 20)


Unnamed: 0,roc_auc,average_precision
mean,0.912195,0.630024
std,0.104675,0.281352


## V2

In [19]:
mixed_clf_balanced = CustomRidge(class_weight="balanced")
mixed_exp = experiment("base", mixed_clf_balanced, X_train_backup, y_train_backup, X_test, test_df[ppas], ppas)
pd.DataFrame(mixed_exp).drop(columns=["name", "ppa"]).agg(["mean", "std"])

Unnamed: 0,roc_auc,average_precision
mean,0.736366,0.329889
std,0.185657,0.269126


# V3

In [20]:
mixed_clf_balanced = CustomRidge(fit_intercept=False)
mixed_exp = experiment("base", mixed_clf_balanced, X_train_backup, y_train_backup, X_test, test_df[ppas], ppas)
pd.DataFrame(mixed_exp).drop(columns=["name", "ppa"]).agg(["mean", "std"])

Unnamed: 0,roc_auc,average_precision
mean,0.910694,0.613421
std,0.120693,0.273301


# V4

In [28]:
mixed_clf_balanced = CustomRidge(alpha=1, fit_intercept=True, class_weight="balanced")
mixed_exp = experiment("base", mixed_clf_balanced, X_train_backup, y_train_backup, X_test, test_df[ppas], ppas)
pd.DataFrame(mixed_exp).drop(columns=["name", "ppa"]).agg(["mean", "std"])

Unnamed: 0,roc_auc,average_precision
mean,0.736366,0.329889
std,0.185657,0.269126


# Train with all

In [96]:
# Concatenate the arrays along the row axis (axis=0)
X_all = np.concatenate((X_train, X_backup, X_ppa), axis=0)
y_all = np.concatenate((y_train, y_backup, y_ppa), axis=0)

print(X_all.shape, y_all.shape)

(362, 1536) (362, 20)


In [97]:
mixed_clf2 = CustomRidge()
mixed_exp2 = experiment("base", mixed_clf2, X_all, y_all, X_test, test_df[ppas], ppas)
pd.DataFrame(mixed_exp2).drop(columns=["name", "ppa"]).agg(["mean", "std"])

Unnamed: 0,roc_auc,average_precision
mean,0.903022,0.630064
std,0.12708,0.286584


In [99]:
pd.DataFrame(mixed_exp2).sort_values("roc_auc")

Unnamed: 0,name,ppa,roc_auc,average_precision
1,base,BE.2,0.539474,0.076812
12,base,BN.3,0.697368,0.1125
5,base,BL.2,0.731429,0.469921
15,base,BP.1,0.782857,0.270122
18,base,BP.4,0.81982,0.256614
9,base,BL.6,0.847222,0.620238
14,base,BN.5,0.916667,0.590909
11,base,BN.2,0.947368,0.45
17,base,BP.3,0.954955,0.555556
6,base,BL.3,0.963964,0.7


# Train con backup y base completa

In [29]:
# X_backup, y_backup = convert_to_input(backup_df, embedding, text_col="Output Statement", ppas_col="ppa_list")

# Concatenate the arrays along the row axis (axis=0)
X_train_backup = np.concatenate((X_full, X_backup), axis=0)
y_train_backup = np.concatenate((y_full, y_backup), axis=0)

print(X_train_backup.shape, y_train_backup.shape)
clf = CustomRidge()
exp = experiment("base", clf, X_train_backup, y_train_backup, X_test, test_df[ppas], ppas)
pd.DataFrame(exp).drop(columns=["name", "ppa"]).agg(["mean", "std"])

(322, 1536) (322, 20)


Unnamed: 0,roc_auc,average_precision
mean,0.992994,0.931472
std,0.015385,0.151725


In [30]:
import pickle


with open('../data/classifiers/ridge_clf_backup.pkl', 'wb') as f:
    pickle.dump(clf, f)


In [15]:
from imblearn.over_sampling import SMOTE
import numpy as np
def multilabel_smote(X, Y, k_neighbors=5):
    X_res_list = []
    Y_res_list = []
    
    for i in range(Y.shape[1]):
        label_counts = np.bincount(Y[:, i])
        min_class_samples = label_counts.min()
        
        # Adjust k_neighbors if there are not enough samples
        if min_class_samples < k_neighbors:
            smote = SMOTE(k_neighbors=min_class_samples - 1, random_state=42)
        else:
            smote = SMOTE(k_neighbors=k_neighbors, random_state=42)
        
        X_res, y_res = smote.fit_resample(X, Y[:, i])
        X_res_list.append(X_res)
        Y_res_list.append(y_res.reshape(-1, 1))
    
    # Combine the resampled data for all labels
    X_res_combined = np.vstack(X_res_list)
    Y_res_combined = np.hstack(Y_res_list)
    
    return X_res_combined, Y_res_combined

# Example usage:
X_res, Y_res = multilabel_smote(X_train, y_train)
# smote = SMOTE(random_state=42)
# X_res, Y_res = smote.fit_resample(X_train, y_train)



InvalidParameterError: The 'k_neighbors' parameter of SMOTE must be an int in the range [1, inf) or an object implementing 'kneighbors' and 'kneighbors_graph'. Got 0 instead.