# ensemble

In [1]:
import pandas as pd
import numpy as np
from sklearn.neighbors import NearestNeighbors
from sklearn.preprocessing import normalize
import glob
import tqdm

In [2]:
# Load the fold-annotated training table and keep only the rows of fold 0
# (presumably the held-out validation fold — TODO confirm).
df = pd.read_csv("../input/shopee-product-matching/train_fold.csv")
# .copy() detaches the filtered frame from the full table, so adding columns
# below is a plain assignment and does not raise SettingWithCopyWarning.
df = df[df["fold"] == 0].copy()
# Ground truth for each row: every posting_id that shares its label_group.
tmp = df.groupby('label_group').posting_id.agg('unique').to_dict()
df['target'] = df.label_group.map(tmp)


In [3]:
# posting_id values in df row order; get_cv indexes this array with the
# positions it selects from each similarity-matrix row.
posting_id = df["posting_id"].values

In [4]:
def get_cosine_similarity(embeddings):
    """Return the pairwise similarity matrix of the rows of `embeddings`.

    The rows are passed through sklearn's `normalize` (default arguments),
    cast to float16, and multiplied against their own transpose. The product
    is transposed once more before being returned.

    Parameters
    ----------
    embeddings : 2-D array-like, one embedding per row.

    Returns
    -------
    numpy.ndarray of dtype float16 with one row and one column per input row.
    """
    unit_rows = normalize(embeddings).astype(np.float16)
    pairwise = unit_rows @ unit_rows.T
    return pairwise.T

In [5]:
def get_cv(df, similarity_matrix, threshold, posting_id, indices=None, pred_name="pred", min_n=2, mode="min"):
    """Threshold a score matrix into per-row predictions and score them.

    For every row k of `df`, the columns of `similarity_matrix[k]` that pass
    `threshold` are selected. If fewer than `min_n` pass, the `min_n`
    best-ranked columns are used instead, so every row gets at least `min_n`
    predictions (as long as the row has that many columns).

    Parameters
    ----------
    df : pandas.DataFrame
        Frame being evaluated. It is modified in place: the predictions are
        stored in `df[pred_name]`, and `calc_cv` adds its own columns.
    similarity_matrix : 2-D numpy array
        Row k holds the scores for `df` row k.
    threshold : float
        Cut-off applied to each score.
    posting_id : numpy array
        Identifiers looked up with the selected positions.
    indices : 2-D integer array, optional
        When given, the selected column positions are first translated
        through `indices[k, ...]` before indexing `posting_id`
        (presumably nearest-neighbour indices — confirm against the caller).
    pred_name : str
        Name of the column that receives the predictions.
    min_n : int
        Minimum number of predictions per row.
    mode : {"min", "max"}
        "min": smaller is better (e.g. a distance), keep scores < threshold.
        "max": larger is better (e.g. cosine similarity), keep scores > threshold.

    Returns
    -------
    tuple
        (f1, precision, recall) as returned by `calc_cv`.

    Raises
    ------
    ValueError
        If `mode` is neither "min" nor "max". Previously this fell through
        with the selection variable unbound or left over from another row.
    """
    if mode not in ("min", "max"):
        raise ValueError(f"mode must be 'min' or 'max', got {mode!r}")

    preds = []
    for k in range(len(df)):
        scores = similarity_matrix[k]
        if mode == "min":
            IDX = np.where(scores < threshold)[0]
            if len(IDX) < min_n:
                # Too few hits: fall back to the min_n smallest scores.
                IDX = np.argsort(scores)[:min_n]
        else:
            IDX = np.where(scores > threshold)[0]
            if len(IDX) < min_n:
                # Too few hits: fall back to the min_n largest scores.
                IDX = np.argsort(scores)[-min_n:]

        # The min_n fallback has already been applied above in the direction
        # matching `mode`, so no further fallback is needed here.
        if indices is None:
            pred = posting_id[IDX]
        else:
            pred = posting_id[indices[k, IDX]]
        preds.append(pred)

    df[pred_name] = preds
    return calc_cv(df, col_name=pred_name)

In [6]:
def calc_cv(df, col_name):
    """Score the predictions stored in `df[col_name]` against the label groups.

    `df` is modified in place: the `target` column is rebuilt as the unique
    posting_ids of each row's label_group, and the row-wise `f1`,
    `precision` and `recall` columns are (re)written.

    Returns
    -------
    tuple
        Mean f1, mean precision and mean recall over all rows.
    """
    members_by_group = df.groupby('label_group').posting_id.agg('unique').to_dict()
    df['target'] = df.label_group.map(members_by_group)

    # Column name -> factory that builds the row-wise scorer for col_name.
    scorer_factories = (("f1", get_f1), ("precision", get_precision), ("recall", get_recall))
    for metric, make_scorer in scorer_factories:
        df[metric] = df.apply(make_scorer(col_name), axis=1)

    return tuple(df[metric].mean() for metric, _ in scorer_factories)

def get_f1(col):
    """Build a row-wise F1 scorer for the prediction column `col`.

    The returned function takes a row exposing `row.target` and `row[col]`
    and returns 2 * |target ∩ pred| / (|target| + |pred|), where the
    intersection is taken with `np.intersect1d`.
    """
    def f1score(row):
        truth = row.target
        predicted = row[col]
        overlap = np.intersect1d(truth, predicted)
        total = len(truth) + len(predicted)
        return 2 * len(overlap) / total
    return f1score

def get_precision(col):
    """Build a row-wise precision scorer for the prediction column `col`.

    The returned function takes a row exposing `row.target` and `row[col]`
    and returns |target ∩ pred| / |pred|, or 0 when the prediction is empty.
    """
    def precision_score(row):
        predicted = row[col]
        hits = len(np.intersect1d(row.target, predicted))
        # An empty prediction would divide by zero; score it as 0 instead.
        return 0 if len(predicted) == 0 else hits / len(predicted)
    return precision_score

def get_recall(col):
    """Build a row-wise recall scorer for the prediction column `col`.

    The returned function takes a row exposing `row.target` and `row[col]`
    and returns |target ∩ pred| / |target|.
    """
    def recall_score(row):
        truth = row.target
        found = np.intersect1d(truth, row[col])
        return len(found) / len(truth)
    return recall_score

In [7]:
def get_best_epochs(model_path, early_stopping_round=3):
    """Derive the epoch number to load from the files saved under `model_path`.

    Counts the files matching `indices_epoch*.npy` in `model_path` and
    subtracts `early_stopping_round` plus one.

    Parameters
    ----------
    model_path : str
        Directory containing the per-epoch `.npy` files.
    early_stopping_round : int
        Number of trailing epochs to step back over (presumably the
        early-stopping patience used during training — TODO confirm).

    Returns
    -------
    int
        The selected epoch number.
    """
    saved_epoch_files = glob.glob(f"{model_path}/indices_epoch*.npy")
    # Epoch numbering starts at 0, so one more is subtracted.
    return len(saved_epoch_files) - (early_stopping_round + 1)


# Output directory of each trained model, keyed by model name.
model_dict = {
    "bert_indonesian": "../output/exp033/20210418043810", # CV: 0.847
    "distilbert_base": "../output/exp033/20210418023429", # CV: 0.852
    "bert_base": "../output/exp033/20210417225343" # CV: 0.852
}

# Per-model arrays, loaded from the epoch selected by get_best_epochs.
model_dist_dict = {}

for k, path in model_dict.items():
    best_epochs = get_best_epochs(path)

    model_dist_dict[k] = {
        "distances": np.load(f"{path}/distances_epoch{best_epochs}.npy"),
        "indices": np.load(f"{path}/indices_epoch{best_epochs}.npy"),
        "embeddings":  np.load(f"{path}/embeddings_epoch{best_epochs}.npy")
    }

    # get_cosine_similarity casts to float16 before its matmul, so a plain
    # astype would duplicate the whole pairwise matrix. copy=False returns
    # the array as-is when the dtype already matches.
    model_dist_dict[k]["cosine_similarity"] = get_cosine_similarity(
        model_dist_dict[k]["embeddings"]
    ).astype(np.float16, copy=False)

In [8]:
def combine_predictions_any(row):
    """Union ensemble: keep every id predicted by at least one model.

    Reads the `bert_indonesian`, `distilbert_base` and `bert_base` entries
    of `row` and returns their sorted, de-duplicated concatenation.
    """
    per_model = [row[name] for name in ('bert_indonesian', 'distilbert_base', "bert_base")]
    return np.unique(np.concatenate(per_model))

In [9]:
def combine_predictions_major(row):
    """Majority ensemble: keep ids that occur more than once across models.

    Reads the `bert_indonesian`, `distilbert_base` and `bert_base` entries
    of `row`, concatenates them, and returns the sorted unique ids whose
    occurrence count exceeds 1.
    """
    per_model = [row[name] for name in ('bert_indonesian', 'distilbert_base', "bert_base")]
    candidates, votes = np.unique(np.concatenate(per_model), return_counts=True)
    return candidates[votes > 1]

In [10]:
def combine_predictions_all(row):
    """Unanimous ensemble: keep ids that occur exactly three times.

    Reads the `bert_indonesian`, `distilbert_base` and `bert_base` entries
    of `row`, concatenates them, and returns the sorted unique ids whose
    occurrence count equals 3 (i.e. predicted by all three models, provided
    no single model lists an id twice).
    """
    per_model = [row[name] for name in ('bert_indonesian', 'distilbert_base', "bert_base")]
    candidates, votes = np.unique(np.concatenate(per_model), return_counts=True)
    return candidates[votes == 3]

In [11]:
def combine_predictions_major2(row):
    """Stricter majority ensemble: keep ids that occur more than twice.

    Reads the `bert_indonesian`, `distilbert_base` and `bert_base` entries
    of `row`, concatenates them, and returns the sorted unique ids whose
    occurrence count exceeds 2. With three models whose prediction lists
    contain no duplicates, this selects the same ids as a count of exactly 3.
    """
    per_model = [row[name] for name in ('bert_indonesian', 'distilbert_base', "bert_base")]
    candidates, votes = np.unique(np.concatenate(per_model), return_counts=True)
    return candidates[votes > 2]

## get_cv with min_n=2 (rows with only 1 match are forced up to 2 predictions)

In [12]:
def aggregate_distance(model_dist_dict, mode):
    """Combine the models' cosine-similarity matrices element-wise.

    Parameters
    ----------
    model_dist_dict : dict
        Maps a model name to a dict holding a "cosine_similarity" array.
        All arrays must have the same shape.
    mode : {"min", "mean", "max"}
        Element-wise reduction applied across the models.

    Returns
    -------
    numpy.ndarray
        Array with the shape of a single model's matrix.

    Raises
    ------
    ValueError
        If `mode` is not one of the supported reductions. Previously an
        unknown mode silently returned the un-reduced stack with an extra
        leading model axis.
    """
    if mode not in ("min", "mean", "max"):
        raise ValueError(f"mode must be 'min', 'mean' or 'max', got {mode!r}")

    # Stack along a new leading axis: (n_models, ...).
    stacked = np.array([entry["cosine_similarity"] for entry in model_dist_dict.values()])
    # mode has been validated, so it names the ndarray reduction to apply.
    return getattr(stacked, mode)(axis=0)

In [13]:
def _report(results, threshold, method, prefix, scores):
    """Print one score line (rounded to 4 decimals) and append it to `results`."""
    f1score, precision, recall = scores
    print(f"{prefix} [f1] {round(f1score, 4)}, [precision] {round(precision, 4)}, [recall] {round(recall, 4)}")
    results.append({
        "threshold": float(threshold),
        "method": method,
        "f1": f1score,
        "precision": precision,
        "recall": recall,
    })


# Row-wise ensemble rules, each evaluated on the per-model prediction columns.
# MAJOR2 gets its own label so its line is distinguishable from MAJOR.
ensemble_strategies = [
    ("ANY", combine_predictions_any),
    ("MAJOR", combine_predictions_major),
    ("MAJOR2", combine_predictions_major2),
    ("ALL", combine_predictions_all),
]

# The aggregated matrices do not depend on the threshold, so build them once
# instead of once per threshold.
aggregate_modes = ["mean", "min", "max"]
aggregated_similarity = {
    mode: aggregate_distance(model_dist_dict, mode=mode) for mode in aggregate_modes
}

# One record per (threshold, method); convert with pd.DataFrame(df_result).
df_result = []
# NOTE(review): with a float step, np.arange may or may not include the end
# value and yields values such as 0.41000000000000003 — confirm the intended
# threshold grid.
for th in np.arange(0.4, 0.55, 0.01):
    print("=========================================================")
    print(f"THRESHOLD: {th}")

    # Single models. Predictions are stored in a column named after the model,
    # which is where the combine_predictions_* functions read them from.
    for model_name in model_dict.keys():
        scores = get_cv(df,
                        model_dist_dict[model_name]["cosine_similarity"],
                        th,
                        posting_id,
                        pred_name=model_name,
                        min_n=2,
                        mode="max")
        _report(df_result, th, f"model_{model_name}", f"model={model_name}", scores)

    # Ensembles over the per-model prediction columns.
    for label, combine in ensemble_strategies:
        df["pred"] = df.apply(combine, axis=1)
        scores = calc_cv(df, col_name="pred")
        _report(df_result, th, f"ensemble_{label}", f"<<<ensemble_{label}>>>:", scores)

    # Ensembles over the similarity matrices. Each mode writes to its own
    # column, so the per-model prediction columns are not overwritten.
    for mode in aggregate_modes:
        scores = get_cv(df,
                        aggregated_similarity[mode],
                        th,
                        posting_id,
                        pred_name=f"aggregate_{mode}",
                        min_n=2,
                        mode="max")
        _report(df_result, th, f"aggregate_{mode}", f"<<aggregate_{mode}>>", scores)
        

THRESHOLD: 0.4
model=bert_indonesian [f1] 0.8392, [precision] 0.8051, [recall] 0.9428
model=distilbert_base [f1] 0.8451, [precision] 0.8127, [recall] 0.944
model=bert_base [f1] 0.8375, [precision] 0.7968, [recall] 0.9495
<<<ensemble_ANY>>>: [f1] 0.7954, [precision] 0.7275, [recall] 0.9663
<<<ensemble_MAJOR>>>: [f1] 0.8592, [precision] 0.834, [recall] 0.9481
<<<ensemble_MAJOR>>>: [f1] 0.8759, [precision] 0.8857, [recall] 0.9219
<<<ensemble_ALL>>>: [f1] 0.8759, [precision] 0.8857, [recall] 0.9219
<<aggregate_mean>> [f1] 0.8634, [precision] 0.8356, [recall] 0.9505
<<aggregate_min>> [f1] 0.8775, [precision] 0.8741, [recall] 0.9284
<<aggregate_max>> [f1] 0.798, [precision] 0.7332, [recall] 0.9646
THRESHOLD: 0.41000000000000003
model=bert_indonesian [f1] 0.8468, [precision] 0.8185, [recall] 0.9398
model=distilbert_base [f1] 0.8515, [precision] 0.8252, [recall] 0.9398
model=bert_base [f1] 0.847, [precision] 0.8134, [recall] 0.9459
<<<ensemble_ANY>>>: [f1] 0.8102, [precision] 0.7494, [recall] 

<<<ensemble_ALL>>>: [f1] 0.8733, [precision] 0.9329, [recall] 0.8729
<<aggregate_mean>> [f1] 0.8856, [precision] 0.9049, [recall] 0.9098
<<aggregate_min>> [f1] 0.8777, [precision] 0.9171, [recall] 0.8839
<<aggregate_max>> [f1] 0.8777, [precision] 0.8736, [recall] 0.9299
THRESHOLD: 0.5100000000000001
model=bert_indonesian [f1] 0.8757, [precision] 0.8975, [recall] 0.8988
model=distilbert_base [f1] 0.8766, [precision] 0.9003, [recall] 0.8976
model=bert_base [f1] 0.8802, [precision] 0.8985, [recall] 0.9059
<<<ensemble_ANY>>>: [f1] 0.8767, [precision] 0.8689, [recall] 0.9319
<<<ensemble_MAJOR>>>: [f1] 0.8842, [precision] 0.9138, [recall] 0.9028
<<<ensemble_MAJOR>>>: [f1] 0.872, [precision] 0.9367, [recall] 0.8675
<<<ensemble_ALL>>>: [f1] 0.872, [precision] 0.9367, [recall] 0.8675
<<aggregate_mean>> [f1] 0.8849, [precision] 0.9081, [recall] 0.9055
<<aggregate_min>> [f1] 0.8767, [precision] 0.9207, [recall] 0.8789
<<aggregate_max>> [f1] 0.8797, [precision] 0.88, [recall] 0.9261
THRESHOLD: 0.5