# ensemble

In [1]:
import pandas as pd
import numpy as np
from sklearn.neighbors import NearestNeighbors
from sklearn.preprocessing import normalize
import glob
import tqdm

In [2]:
df = pd.read_csv("../input/shopee-product-matching/train_fold.csv")
df = df[df["fold"] == 0]
tmp = df.groupby('label_group').posting_id.agg('unique').to_dict()
df['target'] = df.label_group.map(tmp)


In [3]:
posting_id = df["posting_id"].values

In [4]:
def get_cosine_similarity(embeddings):
    normed_emb = normalize(embeddings).astype(np.float16)
    distances = np.matmul(normed_emb, normed_emb.T).T
    
    return distances

In [5]:
def get_cv(df, similarity_matrix, threshold, posting_id, indices=None, pred_name="pred", min_n=2, mode="min"):
    preds = []
    for k in range(len(df)):
        if mode == "min": # euclid distance etc
            IDX = np.where(similarity_matrix[k, ] < threshold)[0]
            if len(IDX) < min_n:                
                IDX = np.argsort(similarity_matrix[k, ])[:min_n]
        if mode == "max": # cosine similarlity
            IDX = np.where(similarity_matrix[k, ] > threshold)[0]
            if len(IDX) < min_n:                
                IDX = np.argsort(similarity_matrix[k, ])[-min_n:]
            
        pred = posting_id[IDX]
        preds.append(pred)
    
    df[pred_name] = preds
    f1score, precision, recall = calc_cv(df, col_name=pred_name)
    return f1score, precision, recall

In [6]:
def calc_cv(df, col_name):
    tmp = df.groupby('label_group').posting_id.agg('unique').to_dict()
    df['target'] = df.label_group.map(tmp)
    df['f1'] = df.apply(get_f1(col_name),axis=1)
    df["precision"] = df.apply(get_precision(col_name), axis=1)
    df["recall"] = df.apply(get_recall(col_name), axis=1)
    return df["f1"].mean(), df["precision"].mean(), df["recall"].mean()

def get_f1(col):
    def f1score(row):
        n = len( np.intersect1d(row.target,row[col]) )
        return 2*n / (len(row.target)+len(row[col]))
    return f1score

def get_precision(col):
    def precision_score(row):
        n = len( np.intersect1d(row.target,row[col]) )
        if len(row[col]) == 0:
            return 0
        else:
            return n / len(row[col])
    return precision_score

def get_recall(col):
    def recall_score(row):
        n = len( np.intersect1d(row.target,row[col]) )
        return n / len(row.target)
    return recall_score

In [28]:
def get_best_epochs(model_path, early_stopping_round=3):
    file_len = len(glob.glob(f"{model_path}/indices_epoch*.npy"))
    return file_len - early_stopping_round - 1 # epoch は0はじまりなのでさらに1ひく


model_dict = {
    "bert": "../output/exp044/20210420161425", # CV: 0.862
    "bert_indonesian": "../output/exp057_2/20210422215407", # CV: 0.847
    "xlm_roberta": "../output/exp057/20210422123000", # CV: 0.855
    "distilbert": "../output/exp057_3/20210423001437", # CV: 0.857
    "bert_transformer": "../output/exp058_2/20210423035344" # CV: 0.854,    
}

model_dist_dict = {}
    
for k, path in model_dict.items():
    best_epochs = get_best_epochs(path)
    
    model_dist_dict[k] = {
        "distances": np.load(f"{path}/distances_epoch{best_epochs}.npy"),
        "indices": np.load(f"{path}/indices_epoch{best_epochs}.npy"),
        "embeddings":  np.load(f"{path}/embeddings_epoch{best_epochs}.npy")
    }
    
    model_dist_dict[k]["cosine_similarity"] = get_cosine_similarity(model_dist_dict[k]["embeddings"]).astype(np.float16)

In [17]:
def combine_predictions_any(row):
    x = np.concatenate(row[model_dict.keys()].values)
    x = np.unique(x)
    return x

In [18]:
def combine_predictions_major(row):
    x = np.concatenate(row[model_dict.keys()].values)
    x, counts = np.unique(x, return_counts=True)
    
    ret_idx = counts > 1
    return x[ret_idx]

In [19]:
def combine_predictions_all(row):
    x = np.concatenate(row[model_dict.keys()].values)

    x, counts = np.unique(x, return_counts=True)
    
    ret_idx = counts == 3
    return x[ret_idx]

In [20]:
def combine_predictions_major2(row):
    x = np.concatenate(row[model_dict.keys()].values)
    x, counts = np.unique(x, return_counts=True)
    
    ret_idx = counts > 2
    return x[ret_idx]

## get_cv(1件→2件むりやり)

In [21]:
def aggregate_distance(model_dist_dict, mode):
    ary = []
    
    for k, v in model_dist_dict.items():
        ary.append(model_dist_dict[k]["cosine_similarity"])
    
    ary = np.array(ary)
    
    if mode == "min":
        ary = ary.min(axis=0)
    if mode == "mean":
        ary = ary.mean(axis=0)
    if mode == "max":
        ary = ary.max(axis=0)

    return ary

In [22]:
df_result = []
for th in np.arange(0.47, 0.55, 0.01):
    print("=========================================================")
    print(f"THRESHOLD: {th}")
    for model_name in model_dict.keys():
        f1score, precision, recall = get_cv(df, 
                                            model_dist_dict[model_name]["cosine_similarity"],
                                            th,
                                            posting_id,
                                            pred_name=model_name,
                                            min_n=2,
                                            mode="max")
        print(f"model={model_name} [f1] {round(f1score, 4)}, [precision] {round(precision, 4)}, [recall] {round(recall, 4)}")

    df["pred"] = df.apply(combine_predictions_any, axis=1)
    f1score, precision, recall = calc_cv(df, col_name="pred")
    print(f"<<<ensemble_ANY>>>: [f1] {round(f1score, 4)}, [precision] {round(precision, 4)}, [recall] {round(recall, 4)}")
    
    df["pred"] = df.apply(combine_predictions_major, axis=1)
    f1score, precision, recall = calc_cv(df, col_name="pred")
    print(f"<<<ensemble_MAJOR>>>: [f1] {round(f1score, 4)}, [precision] {round(precision, 4)}, [recall] {round(recall, 4)}")
    
    df["pred"] = df.apply(combine_predictions_major2, axis=1)
    f1score, precision, recall = calc_cv(df, col_name="pred")
    print(f"<<<ensemble_MAJOR>>>: [f1] {round(f1score, 4)}, [precision] {round(precision, 4)}, [recall] {round(recall, 4)}")
    
    df["pred"] = df.apply(combine_predictions_all, axis=1)
    f1score, precision, recall = calc_cv(df, col_name="pred")
    print(f"<<<ensemble_ALL>>>: [f1] {round(f1score, 4)}, [precision] {round(precision, 4)}, [recall] {round(recall, 4)}")
    
    for mode in ["mean", "min", "max"]:
        f1score, precision, recall = get_cv(df,
                                            aggregate_distance(model_dist_dict, mode=mode),
                                            th,
                                            posting_id,
                                            pred_name=model_name,
                                            min_n=2,
                                            mode="max")
        print(f"<<aggregate_{mode}>> [f1] {round(f1score, 4)}, [precision] {round(precision, 4)}, [recall] {round(recall, 4)}")
        

THRESHOLD: 0.47
model=bert [f1] 0.8711, [precision] 0.8688, [recall] 0.9218
model=bert_indonesian [f1] 0.8696, [precision] 0.8675, [recall] 0.9202
model=xlm_roberta [f1] 0.8627, [precision] 0.8594, [recall] 0.9174
model=distilbert [f1] 0.8731, [precision] 0.871, [recall] 0.9239
model=bert_transformer [f1] 0.8724, [precision] 0.8872, [recall] 0.9057
<<<ensemble_ANY>>>: [f1] 0.8415, [precision] 0.7962, [recall] 0.9587
<<<ensemble_MAJOR>>>: [f1] 0.8762, [precision] 0.8644, [recall] 0.9393
<<<ensemble_MAJOR>>>: [f1] 0.8849, [precision] 0.8963, [recall] 0.9227
<<<ensemble_ALL>>>: [f1] 0.0321, [precision] 0.1005, [recall] 0.0213
<<aggregate_mean>> [f1] 0.886, [precision] 0.8916, [recall] 0.9248
<<aggregate_min>> [f1] 0.8778, [precision] 0.9204, [recall] 0.8833
<<aggregate_max>> [f1] 0.8472, [precision] 0.809, [recall] 0.9535
THRESHOLD: 0.48
model=bert [f1] 0.8737, [precision] 0.8765, [recall] 0.9179
model=bert_indonesian [f1] 0.8719, [precision] 0.8748, [recall] 0.916
model=xlm_roberta [f1] 

In [None]:
def get_cv(df, similarity_matrix, threshold, posting_id, indices=None, pred_name="pred", min_n=2):
    preds = []
    for k in range(len(df)):
        IDX = np.where(similarity_matrix[k, ] < threshold)[0]
        if indices is None:
            pred = posting_id[IDX]
        else:
            if len(IDX) < min_n:
                IDX = np.argsort(similarity_matrix[k, ])[:min_n]
            idx = indices[k, IDX]
            pred = posting_id[idx]
        preds.append(pred)
    
    df[pred_name] = preds
    f1score, precision, recall = calc_cv(df, col_name=pred_name)
    return f1score, precision, recall

In [57]:
def combine_predictions_major2(row):
    cossim = [f"{x}_cos" for x in model_dict.keys()]
    knn = [f"{x}_knn" for x in model_dict.keys()]
    x = np.concatenate(row[cossim + knn].values)
    x, counts = np.unique(x, return_counts=True)
    
    ret_idx = counts > 2
    return x[ret_idx]

In [58]:
def combine_predictions_major3(row):
    cossim = [f"{x}_cos" for x in model_dict.keys()]
    knn = [f"{x}_knn" for x in model_dict.keys()]
    x = np.concatenate(row[cossim + knn].values)
    x, counts = np.unique(x, return_counts=True)
    
    ret_idx = counts > 3
    return x[ret_idx]

In [62]:
def combine_predictions_major4(row):
    cossim = [f"{x}_cos" for x in model_dict.keys()]
    knn = [f"{x}_knn" for x in model_dict.keys()]
    x = np.concatenate(row[cossim + knn].values)
    x, counts = np.unique(x, return_counts=True)
    
    ret_idx = counts > 4
    return x[ret_idx]

In [63]:
def combine_predictions_major5(row):
    cossim = [f"{x}_cos" for x in model_dict.keys()]
    knn = [f"{x}_knn" for x in model_dict.keys()]
    x = np.concatenate(row[cossim + knn].values)
    x, counts = np.unique(x, return_counts=True)
    
    ret_idx = counts > 5
    return x[ret_idx]

In [82]:
df_result = []
for model_name in model_dict.keys():
    f1score, precision, recall = get_cv(df, 
                                        model_dist_dict[model_name]["cosine_similarity"],
                                        0.54,
                                        posting_id,
                                        pred_name=f"{model_name}_cos",
                                        min_n=2,
                                        mode="max")
    print(f"model={model_name} [f1] {round(f1score, 4)}, [precision] {round(precision, 4)}, [recall] {round(recall, 4)}")
    
    if model_name == "bert_transformer":
        th = 22
    else:
        th = 18.5
    f1score, precision, recall = get_cv(df, 
                                        model_dist_dict[model_name]["distances"],
                                        th,
                                        posting_id,
                                        model_dist_dict[model_name]["indices"],
                                        pred_name=f"{model_name}_knn",
                                        min_n=2,
                                        mode="min")
    print(f"model={model_name} [f1] {round(f1score, 4)}, [precision] {round(precision, 4)}, [recall] {round(recall, 4)}")
    

df["pred"] = df.apply(combine_predictions_major2, axis=1)
f1score, precision, recall = calc_cv(df, col_name="pred")
print(f"<<<ensemble_MAJOR>>>: [f1] {round(f1score, 4)}, [precision] {round(precision, 4)}, [recall] {round(recall, 4)}")

df["pred"] = df.apply(combine_predictions_major3, axis=1)
f1score, precision, recall = calc_cv(df, col_name="pred")
print(f"<<<ensemble_MAJOR>>>: [f1] {round(f1score, 4)}, [precision] {round(precision, 4)}, [recall] {round(recall, 4)}")

df["pred"] = df.apply(combine_predictions_major4, axis=1)
f1score, precision, recall = calc_cv(df, col_name="pred")
print(f"<<<ensemble_MAJOR>>>: [f1] {round(f1score, 4)}, [precision] {round(precision, 4)}, [recall] {round(recall, 4)}")

df["pred"] = df.apply(combine_predictions_major5, axis=1)
f1score, precision, recall = calc_cv(df, col_name="pred")
print(f"<<<ensemble_MAJOR>>>: [f1] {round(f1score, 4)}, [precision] {round(precision, 4)}, [recall] {round(recall, 4)}")


model=bert [f1] 0.8774, [precision] 0.9087, [recall] 0.8914
model=bert [f1] 0.8721, [precision] 0.9121, [recall] 0.8785
model=bert_indonesian [f1] 0.8759, [precision] 0.9072, [recall] 0.8895
model=bert_indonesian [f1] 0.8645, [precision] 0.9096, [recall] 0.8683
model=xlm_roberta [f1] 0.8748, [precision] 0.9063, [recall] 0.8885
model=xlm_roberta [f1] 0.8635, [precision] 0.9209, [recall] 0.8551
model=distilbert [f1] 0.8786, [precision] 0.9093, [recall] 0.8934
model=distilbert [f1] 0.8679, [precision] 0.8982, [recall] 0.8854
model=bert_transformer [f1] 0.8668, [precision] 0.9172, [recall] 0.868
model=bert_transformer [f1] 0.8622, [precision] 0.8953, [recall] 0.8795
<<<ensemble_MAJOR>>>: [f1] 0.8883, [precision] 0.9028, [recall] 0.9182
<<<ensemble_MAJOR>>>: [f1] 0.8884, [precision] 0.9177, [recall] 0.9053
<<<ensemble_MAJOR>>>: [f1] 0.8853, [precision] 0.9288, [recall] 0.8918
<<<ensemble_MAJOR>>>: [f1] 0.8811, [precision] 0.938, [recall] 0.8788


In [83]:
df[["bert_knn", "bert_cos", "target"]]

Unnamed: 0,bert_knn,bert_cos,target
0,"[train_2865605743, train_1382500866]","[train_1382500866, train_2865605743]","[train_2865605743, train_1382500866]"
1,"[train_1382500866, train_3251720961]","[train_2865605743, train_1382500866]","[train_2865605743, train_1382500866]"
10,"[train_256668053, train_859155235]","[train_256668053, train_859155235]","[train_256668053, train_859155235]"
11,"[train_859155235, train_256668053]","[train_256668053, train_859155235]","[train_256668053, train_859155235]"
20,"[train_913614970, train_4292360632]","[train_4292360632, train_913614970]","[train_913614970, train_4292360632]"
...,...,...,...
34092,"[train_4199111972, train_3454652975, train_515...","[train_3497907844, train_4175229751, train_293...","[train_3497907844, train_4175229751, train_293..."
34093,"[train_2114123891, train_1699906038, train_395...","[train_3497907844, train_4175229751, train_515...","[train_3497907844, train_4175229751, train_293..."
34094,"[train_4178955354, train_112182868, train_2401...","[train_3497907844, train_515008716, train_3480...","[train_3497907844, train_4175229751, train_293..."
34095,"[train_112182868, train_4178955354, train_2401...","[train_3497907844, train_515008716, train_1354...","[train_3497907844, train_4175229751, train_293..."


In [75]:
def weight_distance(model_dist_dict, weight_ary):
    ary = []
    
    for k, v in model_dist_dict.items():
        ary.append(model_dist_dict[k]["cosine_similarity"])
    
    ary = np.array(ary)
    ary = ary * np.array(weight_ary).reshape(-1, 1, 1) * np.array(weight_ary).sum()
    return ary.sum(axis=0)

In [76]:
for mode in ["mean"]:
    f1score, precision, recall = get_cv(df,
                                        weight_distance(model_dist_dict, weight_ary=[1, 1, 1, 0.8, 0.8]),
                                        0.54,
                                        posting_id,
                                        pred_name=model_name,
                                        min_n=2,
                                        mode="max")
    print(f"<<aggregate_{mode}>> [f1] {round(f1score, 4)}, [precision] {round(precision, 4)}, [recall] {round(recall, 4)}")


<<aggregate_mean>> [f1] 0.0049, [precision] 0.0024, [recall] 0.9994
