In [1]:
# Enable IPython's autoreload extension in mode 2, so imported modules are
# reloaded before each cell executes and edits to them are picked up without
# restarting the kernel.
%load_ext autoreload
%autoreload 2

In [31]:
import pickle
import sys

import numpy as np
import pandas as pd
from sklearn.metrics import average_precision_score, roc_auc_score

# Project packages are imported from ../src, so the path must be extended
# before the local imports below.
sys.path.append("../src")

from application.evaluation import ppa_experiment
from models.classifiers.ridge import CustomRidge
from application.data_transformation import convert_to_input
from models.embeddings.openai_embedding import OpenAIEmbedding



In [3]:
# Load the processed outputs table and derive `primary_ppa` as the first four
# characters of the "Primary PPA" column.
outputs_df = pd.read_parquet(
    "../data/processed/outputs_openai_embeddings_v1.parquet"
).assign(primary_ppa=lambda frame: frame["Primary PPA"].str[:4])


In [26]:

# Sorted list of the distinct values found in the PPAs_list column. The same
# names are used below as column labels for the multi-label targets.
ppas = (
    outputs_df["PPAs_list"]
    .explode()
    .drop_duplicates()
    .sort_values()
    .to_list()
)

# Target column for the single-label ("principal") task.
y_label = "primary_ppa"
# Feature columns: every column whose name contains "openai_embedding_small".
X_labels = outputs_df.columns[
    outputs_df.columns.str.contains("openai_embedding_small")
]

# Split rows on the test_set flag (assumed boolean, since it is negated with ~).
is_test = outputs_df["test_set"]
train_df = outputs_df[~is_test]
test_df = outputs_df[is_test]

# For each split: feature matrix, principal target vector, multi-label matrix.
X_train, y_train_principal, y_train_multi = (
    train_df[X_labels].to_numpy(),
    train_df[y_label].to_numpy(),
    train_df[ppas].to_numpy(),
)

X_test, y_test_principal, y_test_multi = (
    test_df[X_labels].to_numpy(),
    test_df[y_label].to_numpy(),
    test_df[ppas].to_numpy(),
)

X_full, y_full_principal, y_full_multi = (
    outputs_df[X_labels].to_numpy(),
    outputs_df[y_label].to_numpy(),
    outputs_df[ppas].to_numpy(),
)


In [5]:
# Two independent ridge classifiers trained on the same training features:
# one on the principal label vector, one on the multi-label matrix.
principal_ppa_clf, multi_ppa_clf = CustomRidge(), CustomRidge()

principal_ppa_clf.fit(X_train, y_train_principal)
multi_ppa_clf.fit(X_train, y_train_multi)

In [6]:
# Run the project's PPA experiment for the principal-label classifier, then
# display the mean and standard deviation of its two metric columns.
principal_exp = ppa_experiment(
    principal_ppa_clf, X_train, y_train_principal, X_test, test_df, ppas
)
principal_exp.loc[:, ["roc_auc", "average_precision"]].agg(["mean", "std"])

Unnamed: 0,roc_auc,average_precision
mean,0.895924,0.662375
std,0.143985,0.311131


In [7]:
# Run the project's PPA experiment for the multi-label classifier, then
# display the mean and standard deviation of its two metric columns.
multi_exp = ppa_experiment(
    multi_ppa_clf, X_train, y_train_multi, X_test, test_df, ppas
)
multi_exp.loc[:, ["roc_auc", "average_precision"]].agg(["mean", "std"])

Unnamed: 0,roc_auc,average_precision
mean,0.888587,0.609846
std,0.145322,0.315809


In [8]:
# Score the held-out features with both classifiers (principal first, then
# multi); these arrays feed the comparison cells that follow.
y_pred_principal, y_pred_multi = (
    principal_ppa_clf.predict_proba(X_test),
    multi_ppa_clf.predict_proba(X_test),
)

In [20]:
from application.evaluation import get_top_predictions

# Top-3 predictions per test row from each classifier
# (a: principal-label model, b: multi-label model).
a = get_top_predictions(y_pred_principal, ppas, 3)
b = get_top_predictions(y_pred_multi, ppas, 3)

# Side-by-side table plus `inter`: how many labels the two top-3 lists share.
pd.DataFrame(zip(a, b), columns=["a", "b"]).assign(
    inter=lambda _df: [
        len(set(left) & set(right)) for left, right in zip(_df["a"], _df["b"])
    ]
)

Unnamed: 0,a,b,inter
0,"[BE.1, BL.3, BP.1]","[BE.1, BL.3, BL.1]",2
1,"[BE.3, BL.4, BL.3]","[BE.3, BP.1, BL.5]",1
2,"[BN.2, BP.4, BL.6]","[BP.4, BL.2, BL.1]",1
3,"[BN.2, BN.1, BL.3]","[BN.1, BN.2, BL.6]",2
4,"[BL.2, BL.6, BN.5]","[BP.4, BL.2, BL.5]",1
5,"[BL.1, BL.3, BL.2]","[BL.1, BL.2, BN.1]",2
6,"[BN.2, BN.1, BP.4]","[BN.1, BN.2, BP.4]",3
7,"[BL.5, BL.2, BP.4]","[BP.4, BL.2, BL.5]",3
8,"[BP.5, BE.4, BN.5]","[BP.4, BE.4, BP.5]",2
9,"[BP.3, BP.1, BE.3]","[BP.3, BP.1, BE.3]",3


In [23]:
# Per test row: the three column labels with the highest principal-classifier
# scores, ordered from highest to lowest.
x1 = pd.DataFrame(y_pred_principal, columns=ppas).apply(
    lambda scores: scores.sort_values(ascending=False).iloc[:3].index.to_list(),
    axis=1,
)

In [24]:
# Per test row: the column label with the highest multi-label-classifier score.
x2 = pd.DataFrame(y_pred_multi, columns=ppas).idxmax(axis=1)

In [25]:
# Pair each row's top-3 list (x1) with its single best label (x2) and check
# whether the best label appears in the top-3 list.
# NOTE(review): x1 is built from y_pred_principal while x2, stored in the
# "principal" column, is built from y_pred_multi. Confirm that this
# cross-model comparison and the column naming are intended.
x3 = pd.DataFrame(list(zip(x1, x2)), columns=["top3", "principal"])
x3["correct"] = [
    best in top3 for top3, best in zip(x3["top3"], x3["principal"])
]
x3["correct"].value_counts()

correct
True     30
False    10
Name: count, dtype: int64

# Train final classifiers

In [32]:


# Location of the backup-predictions workbook.
# NOTE(review): this is an absolute path on one user's machine, so the cell is
# not reproducible elsewhere. Move the file under the project data directory
# and point this constant at it.
BACKUP_PREDICTIONS_PATH = "/Users/jm/Downloads/backup_predictions_v3_JES.xlsx"

embedding = OpenAIEmbedding()

# Keep only rows with a value in "Incluir", split the ", "-separated PPAs
# string into a list per row, and keep the two columns convert_to_input reads.
# The lambda operates on the already-filtered frame rather than relying on
# index alignment with the unfiltered one.
backup_df = (
    pd.read_excel(BACKUP_PREDICTIONS_PATH)
    .dropna(subset=["Incluir"])
    .assign(ppa_list=lambda frame: frame["PPAs"].str.split(", "))
    .loc[:, ["Output Statement", "ppa_list"]]
)
X_backup, y_backup = convert_to_input(
    backup_df, embedding, ppas, text_col="Output Statement", ppas_col="ppa_list"
)

# Append the backup rows below the original dataset (row-wise concatenation).
X_v1 = np.concatenate((X_full, X_backup), axis=0)
# NOTE(review): the same y_backup is appended to both the principal label
# vector and the multi-label matrix. Confirm that convert_to_input returns
# targets whose shape and dtype are compatible with both.
y_principal_v1 = np.concatenate((y_full_principal, y_backup), axis=0)
y_multi_v1 = np.concatenate((y_full_multi, y_backup), axis=0)



In [34]:

# Re-create both classifiers and fit them on the full dataset (train + test
# rows). This rebinds principal_ppa_clf / multi_ppa_clf, replacing the models
# trained on the training split only.
# NOTE(review): X_v1, y_principal_v1 and y_multi_v1 built in the previous cell
# are not used here. Confirm whether the final models should include the
# backup rows.
principal_ppa_clf, multi_ppa_clf = CustomRidge(), CustomRidge()

principal_ppa_clf.fit(X_full, y_full_principal)
multi_ppa_clf.fit(X_full, y_full_multi)

In [35]:


# Serialise each fitted classifier to its own pickle file (multi-label model
# first, then the principal-label model).
for pkl_path, fitted_clf in (
    ("../models/multi_ppa_clf/ridge_v1.pkl", multi_ppa_clf),
    ("../models/principal_ppa_clf/ridge_v1.pkl", principal_ppa_clf),
):
    with open(pkl_path, "wb") as f:
        pickle.dump(fitted_clf, f)

In [None]:
# Sanity check: shapes of the two prediction arrays, principal first.
tuple(pred.shape for pred in (y_pred_principal, y_pred_multi))

In [None]:
# Class scores for the held-out features. The original referenced `clf`, which
# is never defined in this notebook, so the cell raised NameError.
# NOTE(review): principal_ppa_clf is assumed to be the intended model because
# the evaluation loop below compares against single string labels. At this
# point in the notebook it has been refit on the full dataset, so these are
# in-sample predictions. Confirm, or reuse y_pred_principal instead.
y_pred_test = principal_ppa_clf.predict_proba(X_test)

In [None]:
# Hard label predictions for the held-out features. The original referenced
# `clf`, which is never defined in this notebook, so the cell raised NameError.
# NOTE(review): principal_ppa_clf is assumed to be the intended model. Confirm.
principal_ppa_clf.predict(X_test)

In [None]:
# Manual one-vs-rest evaluation: for each PPA, compare a binary "is this the
# primary label" target with the matching column of y_pred_test.
# Assumes column i of y_pred_test corresponds to ppas[i]. TODO confirm against
# the classifier's class ordering.
# np, roc_auc_score and average_precision_score come from the notebook's
# import cell.
experiment = []
for i, ppa in enumerate(ppas):
    # The original used `y_test`, whose assignment is commented out earlier in
    # the notebook; y_test_principal holds the same held-out primary labels.
    y_test_ppa = (y_test_principal == ppa).astype(int)
    if np.unique(y_test_ppa).size < 2:
        # ROC AUC is undefined when only one class is present in y_true
        # (e.g. a PPA that never occurs in the test split). Record NaN so the
        # row is kept but ignored by the mean/std aggregation below.
        roc_auc = average_precision = float("nan")
    else:
        roc_auc = roc_auc_score(y_test_ppa, y_pred_test[:, i])
        average_precision = average_precision_score(y_test_ppa, y_pred_test[:, i])
    experiment.append({
        "ppa": ppa,
        "roc_auc": roc_auc,
        "average_precision": average_precision,
    })
pd.DataFrame(experiment).sort_values("roc_auc", ascending=False)[["roc_auc", "average_precision"]].agg(["mean", "std"])

In [None]:
# Frequency of each primary label in the held-out split. The original used
# `y_test`, which is undefined because its assignment is commented out earlier
# in the notebook.
pd.Series(y_test_principal).value_counts()

In [None]:
# Print every held-out primary label, one per line. The original iterated
# over the undefined name `y_test`.
for label in y_test_principal:
    print(label)

In [None]:
# Membership check: does the label "BL.3" occur among the held-out primary
# labels? The original tested against the undefined name `y_test`.
"BL.3" in y_test_principal

In [None]:
import numpy as np

# Scratch check: find the position(s) where a value occurs in a 1-D array.
value = 3
arr = np.array([1, 2, 3, 4, 5])

# nonzero() on the boolean mask returns a tuple with one index array per
# dimension, the same result as single-argument np.where.
position = np.nonzero(arr == value)
print(position)