# ensemble

In [9]:
import pandas as pd
import numpy as np
from sklearn.neighbors import NearestNeighbors
from sklearn.preprocessing import normalize
import glob
import torch
import tqdm

In [2]:
# Load the fold assignments and keep only the rows whose `fold` value is 0.
df = pd.read_csv("../input/shopee-product-matching/train_fold.csv")
# .copy() detaches the filtered rows from the full frame, so the column
# assignments made below and in later cells write to an independent frame
# instead of a slice (avoids pandas' SettingWithCopyWarning).
df = df[df["fold"] == 0].copy()
# Ground truth per row: every posting_id that shares the row's label_group
# among the rows kept above.
tmp = df.groupby('label_group').posting_id.agg('unique').to_dict()
df['target'] = df.label_group.map(tmp)


In [3]:
# Values of the posting_id column, in the row order of df.
posting_id = df.posting_id.values

In [10]:
def get_cosine_similarity(embeddings, chunk_size=1024 * 2, device=None):
    """Return the full pairwise cosine-similarity matrix of ``embeddings``.

    Rows are normalised with ``sklearn.preprocessing.normalize`` (default
    arguments), then the similarity matrix is computed ``chunk_size`` rows at a
    time with torch. Each chunk is moved to host memory as soon as it has been
    computed.

    Args:
        embeddings: 2-D array-like with one embedding per row.
        chunk_size: Number of result rows computed per matrix multiplication.
            Defaults to 2048, the value that used to be hard-coded.
        device: Torch device (string or ``torch.device``) used for the matrix
            multiplications. ``None`` selects CUDA when it is available and
            falls back to the CPU otherwise, so the function no longer fails
            on machines without a GPU.

    Returns:
        numpy.ndarray of shape ``(len(embeddings), len(embeddings))`` where
        entry ``[i, j]`` is the cosine similarity of rows ``i`` and ``j``.

    Raises:
        ValueError: If ``chunk_size`` is smaller than 1.
    """
    if chunk_size < 1:
        raise ValueError(f"chunk_size must be >= 1, got {chunk_size}")
    if device is None:
        device = "cuda" if torch.cuda.is_available() else "cpu"

    similarity_chunks = []

    with torch.no_grad():
        unit_rows = torch.tensor(normalize(embeddings)).to(device)
        n_rows = len(unit_rows)

        for start in range(0, n_rows, chunk_size):
            stop = min(start + chunk_size, n_rows)
            # Rows start..stop of the similarity matrix: dot products of the
            # unit-length rows in this chunk with every unit-length row.
            chunk = torch.matmul(unit_rows, unit_rows[start:stop].T).T
            similarity_chunks.append(chunk.detach().cpu().numpy())

    return np.concatenate(similarity_chunks)

In [11]:
def get_best_epochs(model_path, early_stopping_round=3):
    """Return the epoch number whose saved arrays should be loaded.

    Counts the ``indices_epoch*.npy`` files directly under ``model_path`` and
    steps back ``early_stopping_round`` epochs from the last one.

    Args:
        model_path: Directory that holds the per-epoch ``.npy`` files.
        early_stopping_round: Number of epochs to step back from the last
            saved epoch.

    Returns:
        int: ``number_of_files - early_stopping_round - 1``.
    """
    saved_index_files = glob.glob(f"{model_path}/indices_epoch*.npy")
    # Epoch numbers start at 0, so the last saved epoch is (file count - 1).
    last_epoch = len(saved_index_files) - 1
    return last_epoch - early_stopping_round


# Output directory of each model in the ensemble; the trailing comment on each
# entry is the CV score noted for that run.
model_dict = {
    "bert": "../output/exp044/20210420161425",  # CV: 0.862
    "bert_indonesian": "../output/exp057_2/20210422215407",  # CV: 0.862
    "xlm_roberta": "../output/exp057/20210422123000",  # CV: 0.855
    "distilbert": "../output/exp057_3/20210423001437",  # CV: 0.855
    "bert_transformer": "../output/exp058_2/20210423035344",  # CV: 0.854
}

# model name -> arrays saved at the selected epoch, plus the pairwise
# cosine-similarity matrix computed from the saved embeddings.
model_dist_dict = {}

for k, path in model_dict.items():
    best_epochs = get_best_epochs(path)

    loaded = {}
    loaded["distances"] = np.load(f"{path}/distances_epoch{best_epochs}.npy")
    loaded["indices"] = np.load(f"{path}/indices_epoch{best_epochs}.npy")
    loaded["embeddings"] = np.load(f"{path}/embeddings_epoch{best_epochs}.npy")
    model_dist_dict[k] = loaded

    # Stored as float16 (presumably to keep the N x N matrix small in memory).
    loaded["cosine_similarity"] = get_cosine_similarity(loaded["embeddings"]).astype(np.float16)

In [29]:
def get_cv(df, similarity_matrix, threshold, indices=None, pred_name="pred", min_n=2, mode="min"):
    """Turn a score matrix into per-row predictions and score them.

    For row ``k`` of ``df`` the candidates whose score passes ``threshold`` are
    selected; if fewer than ``min_n`` pass, the ``min_n`` best-scoring
    candidates are used instead.

    Args:
        df: Frame with a ``posting_id`` column. It is modified in place: the
            predictions are written to ``df[pred_name]`` and ``calc_cv`` adds
            its own columns.
        similarity_matrix: 2-D array indexed ``[row, candidate]``.
        threshold: Cut-off compared against every score of a row.
        indices: Optional 2-D integer array. When given, ``indices[k, j]`` is
            the position in ``df`` of candidate ``j`` of row ``k``; when
            ``None`` the candidate column number itself is the position.
        pred_name: Name of the column that receives the predictions.
        min_n: Minimum number of predictions kept per row.
        mode: ``"min"`` keeps scores below the threshold (distances),
            ``"max"`` keeps scores above it (similarities).

    Returns:
        tuple: ``(f1, precision, recall)`` as returned by ``calc_cv``.

    Raises:
        ValueError: If ``mode`` is neither ``"min"`` nor ``"max"`` (previously
            this surfaced as an unbound-variable error inside the loop).
    """
    if mode not in ("min", "max"):
        raise ValueError(f'mode must be "min" or "max", got {mode!r}')

    posting_id = df["posting_id"].values
    preds = []
    for k in range(len(df)):
        scores = similarity_matrix[k, ]
        if mode == "min":  # smaller is closer, e.g. euclidean distance
            IDX = np.where(scores < threshold)[0]
            if len(IDX) < min_n:
                IDX = np.argsort(scores)[:min_n]
        else:  # mode == "max": larger is closer, e.g. cosine similarity
            IDX = np.where(scores > threshold)[0]
            if len(IDX) < min_n:
                IDX = np.argsort(scores)[-min_n:]

        # The min_n fallback has already been applied above for both modes, so
        # the candidate columns only need translating to df positions here.
        if indices is not None:
            IDX = indices[k, IDX]
        preds.append(posting_id[IDX])

    df[pred_name] = preds
    f1score, precision, recall = calc_cv(df, col_name=pred_name)
    return f1score, precision, recall

In [13]:
def calc_cv(df, col_name):
    """Score the predictions stored in ``df[col_name]`` against the label groups.

    The frame is modified in place: ``target`` (all posting_ids sharing the
    row's label_group), ``f1``, ``precision`` and ``recall`` columns are
    (re)written.

    Args:
        df: Frame with ``label_group``, ``posting_id`` and ``col_name`` columns.
        col_name: Column holding the predicted posting_ids of each row.

    Returns:
        tuple: Mean of the row-wise ``(f1, precision, recall)``.
    """
    group_members = df.groupby('label_group')['posting_id'].agg('unique').to_dict()
    df['target'] = df['label_group'].map(group_members)

    # Column name -> factory that builds the row-wise scorer for col_name.
    scorer_factories = (
        ('f1', get_f1),
        ("precision", get_precision),
        ("recall", get_recall),
    )
    for metric, make_scorer in scorer_factories:
        df[metric] = df.apply(make_scorer(col_name), axis=1)

    return df["f1"].mean(), df["precision"].mean(), df["recall"].mean()

def get_f1(col):
    """Build a row-wise F1 scorer for the prediction column ``col``.

    The returned function expects a row exposing ``row.target`` and
    ``row[col]`` and returns ``2 * |target ∩ pred| / (|target| + |pred|)``.
    """
    def f1score(row):
        truth, predicted = row.target, row[col]
        n_common = len(np.intersect1d(truth, predicted))
        return (2 * n_common) / (len(truth) + len(predicted))
    return f1score

def get_precision(col):
    """Build a row-wise precision scorer for the prediction column ``col``.

    The returned function expects a row exposing ``row.target`` and
    ``row[col]`` and returns ``|target ∩ pred| / |pred|``, or 0 when the
    prediction is empty.
    """
    def precision_score(row):
        predicted = row[col]
        n_common = len(np.intersect1d(row.target, predicted))
        # Guard clause: an empty prediction scores 0 instead of dividing by zero.
        if len(predicted) == 0:
            return 0
        return n_common / len(predicted)
    return precision_score

def get_recall(col):
    """Build a row-wise recall scorer for the prediction column ``col``.

    The returned function expects a row exposing ``row.target`` and
    ``row[col]`` and returns ``|target ∩ pred| / |target|``.
    """
    def recall_score(row):
        truth = row.target
        n_common = len(np.intersect1d(truth, row[col]))
        return n_common / len(truth)
    return recall_score

In [14]:
def combine_predictions_any(row):
    """Union of the per-model predictions of one row.

    Reads one array of predictions per key of the module-level ``model_dict``
    from ``row`` and returns their sorted unique values.
    """
    per_model = row[model_dict.keys()].values
    return np.unique(np.concatenate(per_model))

In [15]:
def combine_predictions_major(row):
    """Keep the ids that occur at least twice across the per-model predictions.

    Reads one array of predictions per key of the module-level ``model_dict``
    from ``row``.
    """
    pooled = np.concatenate(row[model_dict.keys()].values)
    ids, votes = np.unique(pooled, return_counts=True)
    # Counts are integers, so "more than one" is the same as "at least two".
    return ids[votes >= 2]

In [16]:
def combine_predictions_all(row):
    """Keep only the ids predicted by every model for one row.

    Reads one array of predictions per key of the module-level ``model_dict``
    from ``row``. An id is kept when it occurs exactly once per model, i.e. as
    many times as there are models. The required count used to be hard-coded
    to 3, which no longer meant "all" once ``model_dict`` held a different
    number of models.

    Assumes each model's prediction array has no duplicate ids — TODO confirm
    for callers that build predictions differently.
    """
    model_names = list(model_dict)
    x = np.concatenate(row[model_names].values)

    x, counts = np.unique(x, return_counts=True)

    ret_idx = counts == len(model_names)
    return x[ret_idx]

In [17]:
def combine_predictions_major2(row):
    """Keep the ids that occur at least three times across the per-model predictions.

    Reads one array of predictions per key of the module-level ``model_dict``
    from ``row``.
    """
    pooled = np.concatenate(row[model_dict.keys()].values)
    ids, votes = np.unique(pooled, return_counts=True)
    # Counts are integers, so "more than two" is the same as "at least three".
    return ids[votes >= 3]

## get_cv: force single-item predictions up to two items (1件→2件むりやり)

In [18]:
def aggregate_distance(model_dist_dict, mode):
    """Combine the per-model cosine-similarity matrices element-wise.

    Args:
        model_dist_dict: Mapping whose values each hold a
            ``"cosine_similarity"`` array; all arrays are stacked along a new
            leading axis.
        mode: ``"min"``, ``"mean"`` or ``"max"`` — the reduction applied across
            models.

    Returns:
        numpy.ndarray: The reduced matrix. For any other ``mode`` the stacked,
        un-reduced array is returned unchanged.
    """
    stacked = np.array([entry["cosine_similarity"] for entry in model_dist_dict.values()])

    if mode == "min":
        return stacked.min(axis=0)
    if mode == "mean":
        return stacked.mean(axis=0)
    if mode == "max":
        return stacked.max(axis=0)
    return stacked

In [19]:
def f(x):
    """Linear threshold schedule: ``-0.0045 * x + 0.53``."""
    slope, intercept = -0.0045, 0.53
    return slope * x + intercept

# Candidate thresholds: 0 and 0.4, followed by f(0) ... f(47).
threshold_ary = [0, 0.4]
threshold_ary.extend(f(np.arange(48)).tolist())

In [48]:
# Per-model predictions from the cosine-similarity matrix: keep candidates
# scoring above 0.45 (at least 2 per row) and store them in "<model>_cossim".
for model_name in model_dict:
    f1score, precision, recall = get_cv(
        df,
        model_dist_dict[model_name]["cosine_similarity"],
        0.45,
        pred_name=f"{model_name}_cossim",
        min_n=2,
        mode="max",
    )
    print(f"model={model_name} [f1] {round(f1score, 4)}, [precision] {round(precision, 4)}, [recall] {round(recall, 4)}")


# Per-model predictions from the saved distances/indices arrays: keep
# candidates closer than the threshold (at least 2 per row) and store them in
# "<model>_euclid". Models whose name contains "transformer" use 19, others 16.
for model_name in model_dict:
    th = 19 if "transformer" in model_name else 16
    f1score, precision, recall = get_cv(
        df,
        model_dist_dict[model_name]["distances"],
        th,
        indices=model_dist_dict[model_name]["indices"],
        pred_name=f"{model_name}_euclid",
        min_n=2,
        mode="min",
    )
    print(f"model={model_name} [f1] {round(f1score, 4)}, [precision] {round(precision, 4)}, [recall] {round(recall, 4)}")




model=bert [f1] 0.8645, [precision] 0.852, [recall] 0.9296
model=bert_indonesian [f1] 0.8627, [precision] 0.8512, [recall] 0.9272
model=xlm_roberta [f1] 0.8539, [precision] 0.8395, [recall] 0.9253
model=distilbert [f1] 0.8655, [precision] 0.8534, [recall] 0.9309
model=bert_transformer [f1] 0.8702, [precision] 0.8747, [recall] 0.9151
model=bert [f1] 0.8439, [precision] 0.9418, [recall] 0.8112
model=bert_indonesian [f1] 0.8352, [precision] 0.9376, [recall] 0.8006
model=xlm_roberta [f1] 0.8318, [precision] 0.9427, [recall] 0.7912
model=distilbert [f1] 0.8471, [precision] 0.937, [recall] 0.8177
model=bert_transformer [f1] 0.8325, [precision] 0.9361, [recall] 0.7987


In [52]:
def voting(x, n):
    """Return the ids that occur at least ``n`` times across the arrays in ``x``.

    Args:
        x: Sequence of 1-D arrays of predicted ids (one array per voter).
        n: Minimum number of occurrences required to keep an id.

    Returns:
        numpy.ndarray: Sorted unique ids with at least ``n`` occurrences.
    """
    candidates, votes = np.unique(np.concatenate(x), return_counts=True)
    return candidates[votes >= n]

In [53]:
# Prediction columns that take part in the vote: every df column whose name
# contains "cossim" or "euclid".
target_cols = [
    column
    for column in df.columns
    if ("cossim" in column) or ("euclid" in column)
]

In [54]:
# Sweep the number of agreeing prediction columns required to keep an id
# (2 through 9) and report the resulting scores.
for n in range(2, 10):
    df["pred"] = [voting(x, n) for x in df[target_cols].values]
    f1score, precision, recall = calc_cv(df, col_name="pred")
    print(f"<<<ensemble_vote_{n}>>>: [f1] {round(f1score, 4)}, [precision] {round(precision, 4)}, [recall] {round(recall, 4)}")


<<<ensemble_vote_2>>>: [f1] 0.8638, [precision] 0.8357, [recall] 0.949
<<<ensemble_vote_3>>>: [f1] 0.8809, [precision] 0.8741, [recall] 0.9365
<<<ensemble_vote_4>>>: [f1] 0.8864, [precision] 0.8983, [recall] 0.9209
<<<ensemble_vote_5>>>: [f1] 0.8854, [precision] 0.92, [recall] 0.8998
<<<ensemble_vote_6>>>: [f1] 0.8731, [precision] 0.946, [recall] 0.8576
<<<ensemble_vote_7>>>: [f1] 0.8592, [precision] 0.9573, [recall] 0.829
<<<ensemble_vote_8>>>: [f1] 0.8429, [precision] 0.9653, [recall] 0.8012
<<<ensemble_vote_9>>>: [f1] 0.8212, [precision] 0.9731, [recall] 0.7685
