# ensembleを試したい
* embeddingデータをとるのを忘れてしまった。。。ので、とりあえずはdistances, indicesから最適アンサンブルの閾値を探す。。。

In [1]:
import pandas as pd
import numpy as np
from sklearn.neighbors import NearestNeighbors
import glob
import tqdm

In [2]:
df = pd.read_csv("../input/shopee-product-matching/train_fold.csv")
df = df[df["fold"] == 0]
tmp = df.groupby('label_group').posting_id.agg('unique').to_dict()
df['target'] = df.label_group.map(tmp)


In [3]:
posting_id = df["posting_id"].values

In [4]:
def get_cv(df, similarity_matrix, threshold, posting_id, indices=None, pred_name="pred", min_n=2):
    preds = []
    for k in range(len(df)):
        IDX = np.where(similarity_matrix[k, ] < threshold)[0]
        if indices is None:
            pred = posting_id[IDX]
        else:
            if len(IDX) < min_n:
                IDX = np.argsort(similarity_matrix[k, ])[:min_n]
            idx = indices[k, IDX]
            pred = posting_id[idx]
        preds.append(pred)
    
    df[pred_name] = preds
    f1score, precision, recall = calc_cv(df, col_name=pred_name)
    return f1score, precision, recall

In [5]:
def calc_cv(df, col_name):
    tmp = df.groupby('label_group').posting_id.agg('unique').to_dict()
    df['target'] = df.label_group.map(tmp)
    df['f1'] = df.apply(get_f1(col_name),axis=1)
    df["precision"] = df.apply(get_precision(col_name), axis=1)
    df["recall"] = df.apply(get_recall(col_name), axis=1)
    return df["f1"].mean(), df["precision"].mean(), df["recall"].mean()

def get_f1(col):
    def f1score(row):
        n = len( np.intersect1d(row.target,row[col]) )
        return 2*n / (len(row.target)+len(row[col]))
    return f1score

def get_precision(col):
    def precision_score(row):
        n = len( np.intersect1d(row.target,row[col]) )
        if len(row[col]) == 0:
            return 0
        else:
            return n / len(row[col])
    return precision_score

def get_recall(col):
    def recall_score(row):
        n = len( np.intersect1d(row.target,row[col]) )
        return n / len(row.target)
    return recall_score

In [12]:
def get_best_epochs(model_path, early_stopping_round=3):
    file_len = len(glob.glob(f"{model_path}/indices_epoch*.npy"))
    return file_len - early_stopping_round - 1 # epoch は0はじまりなのでさらに1ひく


model_dict = {
    "xlm_roberta_base": "../output/exp020/20210414065420", # CV: 0.847
    "bert_indonesian": "../output/exp033/20210418043810", # CV: 0.847
    "distilbert_base": "../output/exp033/20210418023429", # CV: 0.852
    "bert_base": "../output/exp033/20210417225343" # CV: 0.852
}

model_dist_dict = {}
    
for k, path in model_dict.items():
    best_epochs = get_best_epochs(path)
    
    model_dist_dict[k] = {
        "distances": np.load(f"{path}/distances_epoch{best_epochs}.npy"),
        "indices": np.load(f"{path}/indices_epoch{best_epochs}.npy")
    }

In [17]:
def combine_predictions_any(row):
    x = np.concatenate([row['xlm_roberta_base'], row['distilbert_base'], row["bert_base"], row["bert_indonesian"]])
    x = np.unique(x)
    return x

In [18]:
def combine_predictions_major(row):
    x = np.concatenate([row['xlm_roberta_base'], row['distilbert_base'], row["bert_base"], row["bert_indonesian"]])
    x, counts = np.unique(x, return_counts=True)
    
    ret_idx = counts > 1
    return x[ret_idx]

In [19]:
def combine_predictions_all(row):
    x = np.concatenate([row['xlm_roberta_base'], row['distilbert_base'], row["bert_base"], row["bert_indonesian"]])
    x, counts = np.unique(x, return_counts=True)
    
    ret_idx = counts == 3
    return x[ret_idx]

## get_cv(1件→2件むりやり)

In [20]:
df_result = []
for th in np.arange(15, 22, 0.5):
    print("=========================================================")
    print(f"THRESHOLD: {th}")
    for model_name in model_dict.keys():
        f1score, precision, recall = get_cv(df, 
                                            model_dist_dict[model_name]["distances"],
                                            th,
                                            posting_id,
                                            model_dist_dict[model_name]["indices"],
                                            pred_name=model_name,
                                            min_n=2)
        print(f"model={model_name} [f1] {round(f1score, 4)}, [precision] {round(precision, 4)}, [recall] {round(recall, 4)}")

    df["pred"] = df.apply(combine_predictions_any, axis=1)
    f1score, precision, recall = calc_cv(df, col_name="pred")
    print(f"<<<ensemble_ANY>>>: [f1] {round(f1score, 4)}, [precision] {round(precision, 4)}, [recall] {round(recall, 4)}")
    
    df["pred"] = df.apply(combine_predictions_major, axis=1)
    f1score, precision, recall = calc_cv(df, col_name="pred")
    print(f"<<<ensemble_MAJOR>>>: [f1] {round(f1score, 4)}, [precision] {round(precision, 4)}, [recall] {round(recall, 4)}")
    
    df["pred"] = df.apply(combine_predictions_all, axis=1)
    f1score, precision, recall = calc_cv(df, col_name="pred")
    print(f"<<<ensemble_ALL>>>: [f1] {round(f1score, 4)}, [precision] {round(precision, 4)}, [recall] {round(recall, 4)}")

THRESHOLD: 15.0
model=xlm_roberta_base [f1] 0.8278, [precision] 0.9366, [recall] 0.7889
model=bert_indonesian [f1] 0.8206, [precision] 0.9401, [recall] 0.7772
model=distilbert_base [f1] 0.8163, [precision] 0.9428, [recall] 0.7693
model=bert_base [f1] 0.8211, [precision] 0.9448, [recall] 0.7763
<<<ensemble_ANY>>>: [f1] 0.8486, [precision] 0.9126, [recall] 0.8423
<<<ensemble_MAJOR>>>: [f1] 0.8387, [precision] 0.9555, [recall] 0.7975
<<<ensemble_ALL>>>: [f1] 0.0692, [precision] 0.1936, [recall] 0.0457
THRESHOLD: 15.5
model=xlm_roberta_base [f1] 0.836, [precision] 0.9343, [recall] 0.8023
model=bert_indonesian [f1] 0.8268, [precision] 0.9371, [recall] 0.7882
model=distilbert_base [f1] 0.8255, [precision] 0.94, [recall] 0.7838
model=bert_base [f1] 0.8293, [precision] 0.9424, [recall] 0.7892
<<<ensemble_ANY>>>: [f1] 0.8553, [precision] 0.9091, [recall] 0.8553
<<<ensemble_MAJOR>>>: [f1] 0.847, [precision] 0.9528, [recall] 0.8112
<<<ensemble_ALL>>>: [f1] 0.0715, [precision] 0.1988, [recall] 0.0