# ensembleを試したい
* embeddingデータを保存し忘れてしまったので、ひとまず distances / indices から最適なアンサンブルの閾値を探す。

In [25]:
import pandas as pd
import numpy as np
from sklearn.neighbors import NearestNeighbors
import glob
import tqdm

In [26]:
# Load the fold-annotated training table and keep only the rows of fold 0.
df = pd.read_csv("../input/shopee-product-matching/train_fold.csv")
# .copy() turns the filtered subset into an independent frame, so the column
# assignments below (and the later ones done on df) do not write to a slice of
# the full table and do not trigger SettingWithCopyWarning.
df = df[df["fold"] == 0].copy()
# Map every label_group to the unique posting_ids it contains; the resulting
# per-row array is what the predictions are scored against.
tmp = df.groupby('label_group').posting_id.agg('unique').to_dict()
df['target'] = df.label_group.map(tmp)


In [27]:
# posting_id values in df row order; get_cv indexes into this array by position.
posting_id = df.posting_id.values

In [110]:
def get_cv(df, similarity_matrix, threshold, posting_id, indices=None, pred_name="pred", min_n=2):
    """Turn thresholded neighbour distances into predictions and score them.

    For every row position ``k`` of ``df`` the entries of
    ``similarity_matrix[k]`` that are strictly below ``threshold`` are
    selected (smaller values are treated as closer matches).

    * ``indices is None``: the selected column positions index ``posting_id``
      directly; ``min_n`` is not applied in this mode.
    * ``indices`` given: the selected columns are translated through
      ``indices[k]`` into positions of ``posting_id``. When fewer than
      ``min_n`` columns pass the threshold, the ``min_n`` smallest entries of
      the row are used instead.

    Args:
        df: frame to score; one row per row of ``similarity_matrix``.
            NOTE(review): presumably in the same row order as ``posting_id``
            -- confirm against the caller.
        similarity_matrix: 2-D array of distances, indexed ``[row, column]``.
        threshold: exclusive upper bound for a column to count as a match.
        posting_id: array of posting ids addressed by position.
        indices: optional 2-D array mapping ``[row, column]`` to a position
            in ``posting_id``.
        pred_name: name of the column that receives the predictions.
        min_n: minimum number of predictions per row (only with ``indices``).

    Returns:
        The ``(f1, precision, recall)`` means reported by ``calc_cv``.

    Side effects:
        Writes ``df[pred_name]``; ``calc_cv`` additionally (re)writes the
        ``target``, ``f1``, ``precision`` and ``recall`` columns.
    """
    def predict_row(row_no):
        row_distances = similarity_matrix[row_no, ]
        matched = np.where(row_distances < threshold)[0]
        if indices is None:
            return posting_id[matched]
        if len(matched) < min_n:
            # Too few matches under the threshold: fall back to the min_n
            # closest columns regardless of their distance.
            matched = np.argsort(row_distances)[:min_n]
        return posting_id[indices[row_no, matched]]

    df[pred_name] = [predict_row(row_no) for row_no in range(len(df))]
    f1score, precision, recall = calc_cv(df, col_name=pred_name)
    return f1score, precision, recall

In [57]:
def calc_cv(df, col_name):
    """Score the predictions stored in ``df[col_name]`` row by row.

    The ground truth of a row is the array of unique ``posting_id`` values
    sharing its ``label_group``. The frame is modified in place: the
    ``target``, ``f1``, ``precision`` and ``recall`` columns are (re)written.

    Args:
        df: frame with ``label_group``, ``posting_id`` and ``col_name`` columns.
        col_name: column holding the predicted posting ids of each row.

    Returns:
        Tuple ``(mean f1, mean precision, mean recall)`` over all rows.
    """
    members_by_group = df.groupby('label_group').posting_id.agg('unique').to_dict()
    df['target'] = df.label_group.map(members_by_group)

    # Same order as before: f1 first, then precision, then recall.
    scorer_factories = (
        ("f1", get_f1),
        ("precision", get_precision),
        ("recall", get_recall),
    )
    for metric_column, make_scorer in scorer_factories:
        df[metric_column] = df.apply(make_scorer(col_name), axis=1)

    return df["f1"].mean(), df["precision"].mean(), df["recall"].mean()

def get_f1(col):
    """Build a row-wise F1 scorer for the prediction column ``col``.

    Args:
        col: name of the row entry that holds the predicted ids.

    Returns:
        A function ``f1score(row)`` computing
        ``2 * |target ∩ pred| / (len(target) + len(pred))`` where ``target``
        is ``row.target`` and ``pred`` is ``row[col]``. When both are empty
        the score is ``0.0`` instead of raising ``ZeroDivisionError``,
        mirroring how ``get_precision`` treats an empty prediction.
    """
    def f1score(row):
        total = len(row.target) + len(row[col])
        if total == 0:
            # Nothing expected and nothing predicted: the ratio is undefined,
            # so report 0 rather than dividing by zero.
            return 0.0
        n = len(np.intersect1d(row.target, row[col]))
        return 2 * n / total
    return f1score

def get_precision(col):
    """Build a row-wise precision scorer for the prediction column ``col``.

    Args:
        col: name of the row entry that holds the predicted ids.

    Returns:
        A function ``precision_score(row)`` giving the share of ``row[col]``
        that also appears in ``row.target``; ``0`` when ``row[col]`` is empty.
    """
    def precision_score(row):
        predicted = row[col]
        n_hit = len(np.intersect1d(row.target, predicted))
        # An empty prediction has no defined precision; score it as 0.
        return n_hit / len(predicted) if len(predicted) != 0 else 0
    return precision_score

def get_recall(col):
    """Build a row-wise recall scorer for the prediction column ``col``.

    Args:
        col: name of the row entry that holds the predicted ids.

    Returns:
        A function ``recall_score(row)`` giving the share of ``row.target``
        that also appears in ``row[col]``. An empty ``row.target`` scores
        ``0.0`` instead of raising ``ZeroDivisionError``, mirroring how
        ``get_precision`` treats an empty prediction.
    """
    def recall_score(row):
        n_target = len(row.target)
        if n_target == 0:
            # No ground truth to retrieve: avoid the division by zero.
            return 0.0
        n = len(np.intersect1d(row.target, row[col]))
        return n / n_target
    return recall_score

In [58]:
def get_best_epochs(model_path, early_stopping_round=3):
    """Derive an epoch number from the files saved under ``model_path``.

    Counts the files matching ``indices_epoch*.npy`` in ``model_path`` and
    returns ``count - early_stopping_round - 1``.
    NOTE(review): presumably one file is written per epoch and training ran
    ``early_stopping_round`` epochs past the best one -- confirm against the
    training script.

    Args:
        model_path: directory that contains the ``indices_epoch*.npy`` files.
        early_stopping_round: number of epochs to step back from the last one.

    Returns:
        The computed epoch number (negative when too few files exist).
    """
    saved_index_files = glob.glob(f"{model_path}/indices_epoch*.npy")
    # Epoch numbering starts at 0, hence the additional -1.
    last_epoch = len(saved_index_files) - 1
    return last_epoch - early_stopping_round


# Output directory of each trained model, keyed by model name.
model_dict = {
    "xlm_roberta_base": "../output/exp020/20210414065420", # CV: 0.847
    "distilbert_base": "../output/exp020/20210414091557", # CV: 0.856
    "bert_base": "../output/exp020/20210414113059" # CV: 0.854
}

# For every model, load the neighbour distances and indices saved at the
# epoch chosen by get_best_epochs.
model_dist_dict = {}
for k, path in model_dict.items():
    best_epochs = get_best_epochs(path)
    model_dist_dict[k] = dict(
        distances=np.load(f"{path}/distances_epoch{best_epochs}.npy"),
        indices=np.load(f"{path}/indices_epoch{best_epochs}.npy"),
    )

In [75]:
def combine_predictions_any(row):
    """Union ensemble: every id predicted by at least one of the three models.

    Args:
        row: mapping with the ``xlm_roberta_base``, ``distilbert_base`` and
            ``bert_base`` prediction arrays.

    Returns:
        Sorted array of the distinct ids found in any of the three arrays.
    """
    model_columns = ('xlm_roberta_base', 'distilbert_base', "bert_base")
    pooled = np.concatenate([row[name] for name in model_columns])
    return np.unique(pooled)

In [76]:
def combine_predictions_major(row):
    """Majority ensemble: ids occurring more than once across the three models.

    The three prediction arrays are pooled and every id that appears at least
    twice in the pool is kept. NOTE(review): this equals "predicted by two or
    more models" only if each model's array has no repeated ids -- confirm.

    Args:
        row: mapping with the ``xlm_roberta_base``, ``distilbert_base`` and
            ``bert_base`` prediction arrays.

    Returns:
        Sorted array of the ids whose pooled count is greater than 1.
    """
    model_columns = ('xlm_roberta_base', 'distilbert_base', "bert_base")
    pooled = np.concatenate([row[name] for name in model_columns])
    candidates, votes = np.unique(pooled, return_counts=True)
    return candidates[votes > 1]

In [77]:
def combine_predictions_all(row):
    """Unanimous ensemble: ids occurring exactly three times across the models.

    The three prediction arrays are pooled and only ids with a pooled count
    of exactly 3 are kept. NOTE(review): this equals "predicted by all three
    models" only if each model's array has no repeated ids -- confirm.

    Args:
        row: mapping with the ``xlm_roberta_base``, ``distilbert_base`` and
            ``bert_base`` prediction arrays.

    Returns:
        Sorted array of the ids whose pooled count equals 3.
    """
    model_columns = ('xlm_roberta_base', 'distilbert_base', "bert_base")
    pooled = np.concatenate([row[name] for name in model_columns])
    candidates, votes = np.unique(pooled, return_counts=True)
    return candidates[votes == 3]

## get_cv (normal)

In [111]:
# Threshold sweep with min_n=1: score every single model and the three
# ensemble rules for each distance threshold.
# Every score is printed as before and also appended to df_result, so the sweep
# can be inspected afterwards (e.g. pd.DataFrame(df_result)) instead of being
# read off the printed log.
df_result = []
ensemble_rules = (
    ("ANY", combine_predictions_any),
    ("MAJOR", combine_predictions_major),
    ("ALL", combine_predictions_all),
)
for th in np.arange(15, 22, 0.5):
    print("=========================================================")
    print(f"THRESHOLD: {th}")
    for model_name in model_dict:
        # get_cv stores this model's predictions in df[model_name]; the
        # combine_predictions_* functions read those columns below.
        f1score, precision, recall = get_cv(df,
                                            model_dist_dict[model_name]["distances"],
                                            th,
                                            posting_id,
                                            model_dist_dict[model_name]["indices"],
                                            pred_name=model_name,
                                            min_n=1)
        print(f"model={model_name} [f1] {round(f1score, 4)}, [precision] {round(precision, 4)}, [recall] {round(recall, 4)}")
        df_result.append({"min_n": 1, "threshold": float(th), "name": model_name,
                          "f1": f1score, "precision": precision, "recall": recall})

    for rule_name, combine in ensemble_rules:
        df["pred"] = df.apply(combine, axis=1)
        f1score, precision, recall = calc_cv(df, col_name="pred")
        print(f"<<<ensemble_{rule_name}>>>: [f1] {round(f1score, 4)}, [precision] {round(precision, 4)}, [recall] {round(recall, 4)}")
        df_result.append({"min_n": 1, "threshold": float(th), "name": f"ensemble_{rule_name}",
                          "f1": f1score, "precision": precision, "recall": recall})

THRESHOLD: 15.0
model=xlm_roberta_base [f1] 0.7771, [precision] 0.975, [recall] 0.7084
model=distilbert_base [f1] 0.7689, [precision] 0.9785, [recall] 0.6955
model=bert_base [f1] 0.7657, [precision] 0.9787, [recall] 0.6888
<<<ensemble_ANY>>>: [f1] 0.8081, [precision] 0.9698, [recall] 0.7537
<<<ensemble_MAJOR>>>: [f1] 0.7718, [precision] 0.9789, [recall] 0.6991
<<<ensemble_ALL>>>: [f1] 0.7306, [precision] 0.9848, [recall] 0.6399
THRESHOLD: 15.5
model=xlm_roberta_base [f1] 0.792, [precision] 0.9713, [recall] 0.7308
model=distilbert_base [f1] 0.7855, [precision] 0.9741, [recall] 0.7204
model=bert_base [f1] 0.7848, [precision] 0.9752, [recall] 0.7174
<<<ensemble_ANY>>>: [f1] 0.8225, [precision] 0.9642, [recall] 0.7775
<<<ensemble_MAJOR>>>: [f1] 0.7889, [precision] 0.9759, [recall] 0.7243
<<<ensemble_ALL>>>: [f1] 0.7496, [precision] 0.982, [recall] 0.6668
THRESHOLD: 16.0
model=xlm_roberta_base [f1] 0.8036, [precision] 0.9655, [recall] 0.7508
model=distilbert_base [f1] 0.7989, [precision] 0.

## get_cv(1件→2件むりやり)

In [113]:
# Threshold sweep with min_n=2: a row with fewer than two neighbours under the
# threshold is forced to its two closest ones (see get_cv).
# Every score is printed as before and also appended to df_result, so the sweep
# can be inspected afterwards (e.g. pd.DataFrame(df_result)) instead of being
# read off the printed log.
df_result = []
ensemble_rules = (
    ("ANY", combine_predictions_any),
    ("MAJOR", combine_predictions_major),
    ("ALL", combine_predictions_all),
)
for th in np.arange(15, 22, 0.5):
    print("=========================================================")
    print(f"THRESHOLD: {th}")
    for model_name in model_dict:
        # get_cv stores this model's predictions in df[model_name]; the
        # combine_predictions_* functions read those columns below.
        f1score, precision, recall = get_cv(df,
                                            model_dist_dict[model_name]["distances"],
                                            th,
                                            posting_id,
                                            model_dist_dict[model_name]["indices"],
                                            pred_name=model_name,
                                            min_n=2)
        print(f"model={model_name} [f1] {round(f1score, 4)}, [precision] {round(precision, 4)}, [recall] {round(recall, 4)}")
        df_result.append({"min_n": 2, "threshold": float(th), "name": model_name,
                          "f1": f1score, "precision": precision, "recall": recall})

    for rule_name, combine in ensemble_rules:
        df["pred"] = df.apply(combine, axis=1)
        f1score, precision, recall = calc_cv(df, col_name="pred")
        print(f"<<<ensemble_{rule_name}>>>: [f1] {round(f1score, 4)}, [precision] {round(precision, 4)}, [recall] {round(recall, 4)}")
        df_result.append({"min_n": 2, "threshold": float(th), "name": f"ensemble_{rule_name}",
                          "f1": f1score, "precision": precision, "recall": recall})

THRESHOLD: 15.0
model=xlm_roberta_base [f1] 0.8278, [precision] 0.9366, [recall] 0.7889
model=distilbert_base [f1] 0.8258, [precision] 0.9448, [recall] 0.783
model=bert_base [f1] 0.8235, [precision] 0.9429, [recall] 0.778
<<<ensemble_ANY>>>: [f1] 0.8497, [precision] 0.9204, [recall] 0.8363
<<<ensemble_MAJOR>>>: [f1] 0.8313, [precision] 0.9622, [recall] 0.7844
<<<ensemble_ALL>>>: [f1] 0.794, [precision] 0.979, [recall] 0.7292
THRESHOLD: 15.5
model=xlm_roberta_base [f1] 0.836, [precision] 0.9343, [recall] 0.8023
model=distilbert_base [f1] 0.8343, [precision] 0.9419, [recall] 0.7967
model=bert_base [f1] 0.8327, [precision] 0.9405, [recall] 0.7931
<<<ensemble_ANY>>>: [f1] 0.8564, [precision] 0.9169, [recall] 0.8492
<<<ensemble_MAJOR>>>: [f1] 0.8408, [precision] 0.9598, [recall] 0.7993
<<<ensemble_ALL>>>: [f1] 0.8038, [precision] 0.9769, [recall] 0.7437
THRESHOLD: 16.0
model=xlm_roberta_base [f1] 0.8428, [precision] 0.9303, [recall] 0.8154
model=distilbert_base [f1] 0.841, [precision] 0.937

## get_cv(1件→3件むりやり)

In [114]:
# Threshold sweep with min_n=3: a row with fewer than three neighbours under
# the threshold is forced to its three closest ones (see get_cv).
# Every score is printed as before and also appended to df_result, so the sweep
# can be inspected afterwards (e.g. pd.DataFrame(df_result)) instead of being
# read off the printed log.
df_result = []
ensemble_rules = (
    ("ANY", combine_predictions_any),
    ("MAJOR", combine_predictions_major),
    ("ALL", combine_predictions_all),
)
for th in np.arange(15, 22, 0.5):
    print("=========================================================")
    print(f"THRESHOLD: {th}")
    for model_name in model_dict:
        # get_cv stores this model's predictions in df[model_name]; the
        # combine_predictions_* functions read those columns below.
        f1score, precision, recall = get_cv(df,
                                            model_dist_dict[model_name]["distances"],
                                            th,
                                            posting_id,
                                            model_dist_dict[model_name]["indices"],
                                            pred_name=model_name,
                                            min_n=3)
        print(f"model={model_name} [f1] {round(f1score, 4)}, [precision] {round(precision, 4)}, [recall] {round(recall, 4)}")
        df_result.append({"min_n": 3, "threshold": float(th), "name": model_name,
                          "f1": f1score, "precision": precision, "recall": recall})

    for rule_name, combine in ensemble_rules:
        df["pred"] = df.apply(combine, axis=1)
        f1score, precision, recall = calc_cv(df, col_name="pred")
        print(f"<<<ensemble_{rule_name}>>>: [f1] {round(f1score, 4)}, [precision] {round(precision, 4)}, [recall] {round(recall, 4)}")
        df_result.append({"min_n": 3, "threshold": float(th), "name": f"ensemble_{rule_name}",
                          "f1": f1score, "precision": precision, "recall": recall})

THRESHOLD: 15.0
model=xlm_roberta_base [f1] 0.78, [precision] 0.8078, [recall] 0.8277
model=distilbert_base [f1] 0.7796, [precision] 0.815, [recall] 0.8243
model=bert_base [f1] 0.7781, [precision] 0.8127, [recall] 0.821
<<<ensemble_ANY>>>: [f1] 0.732, [precision] 0.7177, [recall] 0.8726
<<<ensemble_MAJOR>>>: [f1] 0.8348, [precision] 0.9131, [recall] 0.8267
<<<ensemble_ALL>>>: [f1] 0.819, [precision] 0.9611, [recall] 0.7737
THRESHOLD: 15.5
model=xlm_roberta_base [f1] 0.7859, [precision] 0.8065, [recall] 0.8372
model=distilbert_base [f1] 0.7853, [precision] 0.813, [recall] 0.8338
model=bert_base [f1] 0.7848, [precision] 0.8111, [recall] 0.8321
<<<ensemble_ANY>>>: [f1] 0.7369, [precision] 0.7156, [recall] 0.8816
<<<ensemble_MAJOR>>>: [f1] 0.8413, [precision] 0.9115, [recall] 0.8373
<<<ensemble_ALL>>>: [f1] 0.8261, [precision] 0.9596, [recall] 0.7842
THRESHOLD: 16.0
model=xlm_roberta_base [f1] 0.7904, [precision] 0.8041, [recall] 0.8456
model=distilbert_base [f1] 0.7903, [precision] 0.8104

In [115]:
# Threshold sweep with min_n=4: a row with fewer than four neighbours under
# the threshold is forced to its four closest ones (see get_cv).
# Every score is printed as before and also appended to df_result, so the sweep
# can be inspected afterwards (e.g. pd.DataFrame(df_result)) instead of being
# read off the printed log.
df_result = []
ensemble_rules = (
    ("ANY", combine_predictions_any),
    ("MAJOR", combine_predictions_major),
    ("ALL", combine_predictions_all),
)
for th in np.arange(15, 22, 0.5):
    print("=========================================================")
    print(f"THRESHOLD: {th}")
    for model_name in model_dict:
        # get_cv stores this model's predictions in df[model_name]; the
        # combine_predictions_* functions read those columns below.
        f1score, precision, recall = get_cv(df,
                                            model_dist_dict[model_name]["distances"],
                                            th,
                                            posting_id,
                                            model_dist_dict[model_name]["indices"],
                                            pred_name=model_name,
                                            min_n=4)
        print(f"model={model_name} [f1] {round(f1score, 4)}, [precision] {round(precision, 4)}, [recall] {round(recall, 4)}")
        df_result.append({"min_n": 4, "threshold": float(th), "name": model_name,
                          "f1": f1score, "precision": precision, "recall": recall})

    for rule_name, combine in ensemble_rules:
        df["pred"] = df.apply(combine, axis=1)
        f1score, precision, recall = calc_cv(df, col_name="pred")
        print(f"<<<ensemble_{rule_name}>>>: [f1] {round(f1score, 4)}, [precision] {round(precision, 4)}, [recall] {round(recall, 4)}")
        df_result.append({"min_n": 4, "threshold": float(th), "name": f"ensemble_{rule_name}",
                          "f1": f1score, "precision": precision, "recall": recall})

THRESHOLD: 15.0
model=xlm_roberta_base [f1] 0.7262, [precision] 0.7074, [recall] 0.8531
model=distilbert_base [f1] 0.7279, [precision] 0.7148, [recall] 0.8529
model=bert_base [f1] 0.7261, [precision] 0.712, [recall] 0.8496
<<<ensemble_ANY>>>: [f1] 0.644, [precision] 0.5973, [recall] 0.8946
<<<ensemble_MAJOR>>>: [f1] 0.8211, [precision] 0.8592, [recall] 0.8555
<<<ensemble_ALL>>>: [f1] 0.8277, [precision] 0.9351, [recall] 0.8056
THRESHOLD: 15.5
model=xlm_roberta_base [f1] 0.7306, [precision] 0.7066, [recall] 0.8599
model=distilbert_base [f1] 0.732, [precision] 0.7134, [recall] 0.8597
model=bert_base [f1] 0.7311, [precision] 0.7109, [recall] 0.8578
<<<ensemble_ANY>>>: [f1] 0.6477, [precision] 0.5961, [recall] 0.9012
<<<ensemble_MAJOR>>>: [f1] 0.8257, [precision] 0.858, [recall] 0.863
<<<ensemble_ALL>>>: [f1] 0.8331, [precision] 0.9341, [recall] 0.8132
THRESHOLD: 16.0
model=xlm_roberta_base [f1] 0.7341, [precision] 0.7052, [recall] 0.8661
model=distilbert_base [f1] 0.7357, [precision] 0.71