# ensemble

In [6]:
import pandas as pd
import numpy as np
from sklearn.neighbors import NearestNeighbors
from sklearn.preprocessing import normalize
import glob
import tqdm

In [7]:
df = pd.read_csv("../input/shopee-product-matching/train_fold.csv")
df = df[df["fold"] == 0]
tmp = df.groupby('label_group').posting_id.agg('unique').to_dict()
df['target'] = df.label_group.map(tmp)


In [8]:
posting_id = df["posting_id"].values

In [16]:
def get_best_epochs(model_path, early_stopping_round=3):
    file_len = len(glob.glob(f"{model_path}/indices_epoch*.npy"))
    return file_len - early_stopping_round - 1 # epoch は0はじまりなのでさらに1ひく


model_dict = {
    "bert": "../output/exp044/20210420161425", # CV: 0.862
    # "bert_indonesian": "../output/exp057_2/20210422215407", # CV: 0.862
    # "xlm_roberta": "../output/exp057/20210422123000", # CV: 0.855
    # "xlm_roberta": "../output/exp057_3/20210423001437", # CV: 0.855
    # "bert_transformer": "../output/exp058_2/20210423035344" # CV: 0.854,    
}

model_dist_dict = {}
    
for k, path in model_dict.items():
    best_epochs = get_best_epochs(path)
    
    model_dist_dict[k] = {
        # "distances": np.load(f"{path}/distances_epoch{best_epochs}.npy"),
        # "indices": np.load(f"{path}/indices_epoch{best_epochs}.npy"),
        "embeddings":  np.load(f"{path}/embeddings_epoch{best_epochs}.npy")
    }
    
    model_dist_dict[k]["cosine_similarity"] = get_cosine_similarity(model_dist_dict[k]["embeddings"]).astype(np.float16)

In [9]:
def get_cosine_similarity(embeddings):
    normed_emb = normalize(embeddings).astype(np.float16)
    distances = np.matmul(normed_emb, normed_emb.T).T
    
    return distances

In [22]:
def get_cv1(df, similarity_matrix, threshold, pred_name="pred", min_n=2):
    posting_id = df["posting_id"].values
    preds = []
    for k in range(len(df)):
        IDX = np.where(similarity_matrix[k, ] > threshold)[0]
        if len(IDX) < min_n:                
            IDX = np.argsort(similarity_matrix[k, ])[-min_n:]
            
        pred = posting_id[IDX]
        preds.append(pred)
    
    df[pred_name] = preds
    f1score, precision, recall = calc_cv(df, col_name=pred_name)
    return f1score, precision, recall

In [47]:
def get_cv2(df, similarity_matrix, threshold_ary, pred_name="pred"):
    posting_id = df["posting_id"].values
    preds = []
    for k in range(len(df)):
        IDX = np.argsort(similarity_matrix[k, ])[-50:][::-1]
        idx = similarity_matrix[k, ][IDX] > threshold_ary
        
        IDX = IDX[idx]
        pred = posting_id[IDX]
        preds.append(pred)
    df[pred_name] = preds
    f1score, precision, recall = calc_cv(df, col_name=pred_name)
    return f1score, precision, recall

In [66]:
def get_cv3(df, similarity_matrix, threshold, pred_name="pred", th_2nd=0.4):
    posting_id = df["posting_id"].values
    preds = []
    for k in range(len(df)):
        IDX = np.where(similarity_matrix[k, ] > threshold)[0]
        if len(IDX) < 2:                
            IDX = np.argsort(similarity_matrix[k, ])[-2:]
            if similarity_matrix[k, ][0] < th_2nd:
                IDX = IDX[-1]
            
        pred = posting_id[IDX]
        preds.append(pred)
    
    df[pred_name] = preds
    f1score, precision, recall = calc_cv(df, col_name=pred_name)
    return f1score, precision, recall

In [34]:
def calc_cv(df, col_name):
    tmp = df.groupby('label_group').posting_id.agg('unique').to_dict()
    df['target'] = df.label_group.map(tmp)
    df['f1'] = df.apply(get_f1(col_name),axis=1)
    df["precision"] = df.apply(get_precision(col_name), axis=1)
    df["recall"] = df.apply(get_recall(col_name), axis=1)
    return df["f1"].mean(), df["precision"].mean(), df["recall"].mean()

def get_f1(col):
    def f1score(row):
        n = len( np.intersect1d(row.target,row[col]) )
        return 2*n / (len(row.target)+len(row[col]))
    return f1score

def get_precision(col):
    def precision_score(row):
        n = len( np.intersect1d(row.target,row[col]) )
        if len(row[col]) == 0:
            return 0
        else:
            return n / len(row[col])
    return precision_score

def get_recall(col):
    def recall_score(row):
        n = len( np.intersect1d(row.target,row[col]) )
        return n / len(row.target)
    return recall_score

In [35]:
def combine_predictions_any(row):
    x = np.concatenate(row[model_dict.keys()].values)
    x = np.unique(x)
    return x

In [36]:
def combine_predictions_major(row):
    x = np.concatenate(row[model_dict.keys()].values)
    x, counts = np.unique(x, return_counts=True)
    
    ret_idx = counts > 1
    return x[ret_idx]

In [37]:
def combine_predictions_all(row):
    x = np.concatenate(row[model_dict.keys()].values)

    x, counts = np.unique(x, return_counts=True)
    
    ret_idx = counts == 3
    return x[ret_idx]

In [38]:
def combine_predictions_major2(row):
    x = np.concatenate(row[model_dict.keys()].values)
    x, counts = np.unique(x, return_counts=True)
    
    ret_idx = counts > 2
    return x[ret_idx]

## get_cv(1件→2件むりやり)

In [12]:
def aggregate_distance(model_dist_dict, mode):
    ary = []
    
    for k, v in model_dist_dict.items():
        ary.append(model_dist_dict[k]["cosine_similarity"])
    
    ary = np.array(ary)
    
    if mode == "min":
        ary = ary.min(axis=0)
    if mode == "mean":
        ary = ary.mean(axis=0)
    if mode == "max":
        ary = ary.max(axis=0)

    return ary

In [53]:
def f(x):
    return -0.0045*x + 0.53
    
threshold_ary = [0, 0.4] + f(np.arange(48)).tolist()

In [55]:
for th in np.arange(0.4, 0.55, 0.01):
    for model_name in model_dict.keys():
        f1score, precision, recall = get_cv1(df, 
                                             model_dist_dict[model_name]["cosine_similarity"],
                                             th,
                                             pred_name=model_name,
                                             min_n=1)
        print(f"model={model_name} [f1] {round(f1score, 4)}, [precision] {round(precision, 4)}, [recall] {round(recall, 4)}")
        


model=bert [f1] 0.8279, [precision] 0.7938, [recall] 0.9432
model=bert [f1] 0.8378, [precision] 0.8115, [recall] 0.9393
model=bert [f1] 0.8452, [precision] 0.827, [recall] 0.9346
model=bert [f1] 0.8521, [precision] 0.8407, [recall] 0.9304
model=bert [f1] 0.8575, [precision] 0.8524, [recall] 0.9261
model=bert [f1] 0.8614, [precision] 0.8637, [recall] 0.921
model=bert [f1] 0.8629, [precision] 0.8728, [recall] 0.9138
model=bert [f1] 0.8656, [precision] 0.882, [recall] 0.9092
model=bert [f1] 0.8677, [precision] 0.891, [recall] 0.9041
model=bert [f1] 0.8686, [precision] 0.8989, [recall] 0.8984
model=bert [f1] 0.8687, [precision] 0.905, [recall] 0.8931
model=bert [f1] 0.8689, [precision] 0.9113, [recall] 0.8877
model=bert [f1] 0.868, [precision] 0.9172, [recall] 0.8813
model=bert [f1] 0.8665, [precision] 0.9226, [recall] 0.8743
model=bert [f1] 0.8634, [precision] 0.9279, [recall] 0.8655
model=bert [f1] 0.8598, [precision] 0.931, [recall] 0.8575


In [50]:
for th in np.arange(0.4, 0.55, 0.01):
    for model_name in model_dict.keys():
        f1score, precision, recall = get_cv1(df, 
                                             model_dist_dict[model_name]["cosine_similarity"],
                                             th,
                                             pred_name=model_name,
                                             min_n=2)
        print(f"model={model_name} [f1] {round(f1score, 4)}, [precision] {round(precision, 4)}, [recall] {round(recall, 4)}")
        


model=bert [f1] 0.8285, [precision] 0.7864, [recall] 0.9471
model=bert [f1] 0.8387, [precision] 0.8031, [recall] 0.9439
model=bert [f1] 0.8465, [precision] 0.8175, [recall] 0.9401
model=bert [f1] 0.854, [precision] 0.8306, [recall] 0.937
model=bert [f1] 0.8594, [precision] 0.8413, [recall] 0.9333
model=bert [f1] 0.8645, [precision] 0.852, [recall] 0.9296
model=bert [f1] 0.8676, [precision] 0.8602, [recall] 0.9252
model=bert [f1] 0.8711, [precision] 0.8688, [recall] 0.9218
model=bert [f1] 0.8737, [precision] 0.8765, [recall] 0.9179
model=bert [f1] 0.8757, [precision] 0.8832, [recall] 0.914
model=bert [f1] 0.8766, [precision] 0.8885, [recall] 0.9099
model=bert [f1] 0.8777, [precision] 0.8942, [recall] 0.9059
model=bert [f1] 0.8778, [precision] 0.8993, [recall] 0.9011
model=bert [f1] 0.878, [precision] 0.9039, [recall] 0.8967
model=bert [f1] 0.8774, [precision] 0.9087, [recall] 0.8914
model=bert [f1] 0.8758, [precision] 0.9114, [recall] 0.8863


In [86]:
def get_cv3(df, similarity_matrix, threshold, pred_name="pred", th_2nd=0.4):
    posting_id = df["posting_id"].values
    preds = []
    for k in range(len(df)):
        IDX = np.where(similarity_matrix[k, ] > threshold)[0]
        if len(IDX) < 2:                
            IDX = np.argsort(similarity_matrix[k, ])[::-1][:2]
            if similarity_matrix[k, IDX[1]] < th_2nd:
                IDX = IDX[0]
            
        pred = posting_id[IDX]
        preds.append(pred)
    
    df[pred_name] = preds
    f1score, precision, recall = calc_cv(df, col_name=pred_name)
    return f1score, precision, recall

In [87]:
f1score, precision, recall = get_cv3(df, 
                                     model_dist_dict[model_name]["cosine_similarity"],
                                     threshold=0.52,
                                     pred_name=model_name,
                                     th_2nd=0)
print(f"model={model_name} [f1] {round(f1score, 4)}, [precision] {round(precision, 4)}, [recall] {round(recall, 4)}")

for th in np.arange(0.3, 0.4, 0.01):
    for model_name in model_dict.keys():
        f1score, precision, recall = get_cv3(df, 
                                             model_dist_dict[model_name]["cosine_similarity"],
                                             0.52,
                                             pred_name=model_name,
                                             th_2nd=th)
        print(f"model={model_name} [f1] {round(f1score, 4)}, [precision] {round(precision, 4)}, [recall] {round(recall, 4)}")
        


model=bert [f1] 0.8778, [precision] 0.8993, [recall] 0.9011
model=bert [f1] 0.8771, [precision] 0.8984, [recall] 0.9011
model=bert [f1] 0.8769, [precision] 0.8982, [recall] 0.9011


KeyboardInterrupt: 

In [54]:
f1score, precision, recall = get_cv2(df, 
                                     model_dist_dict[model_name]["cosine_similarity"],
                                     threshold_ary,
                                     pred_name=model_name,
                                     min_n=2)
print(f"model={model_name} [f1] {round(f1score, 4)}, [precision] {round(precision, 4)}, [recall] {round(recall, 4)}")


model=bert [f1] 0.8801, [precision] 0.9026, [recall] 0.9062


In [13]:
df_result = []
for th in np.arange(0.4, 0.55, 0.01):
    print("=========================================================")
    print(f"THRESHOLD: {th}")
    for model_name in model_dict.keys():
        f1score, precision, recall = get_cv(df, 
                                            model_dist_dict[model_name]["cosine_similarity"],
                                            th,
                                            posting_id,
                                            pred_name=model_name,
                                            min_n=2,
                                            mode="max")
        print(f"model={model_name} [f1] {round(f1score, 4)}, [precision] {round(precision, 4)}, [recall] {round(recall, 4)}")

    df["pred"] = df.apply(combine_predictions_any, axis=1)
    f1score, precision, recall = calc_cv(df, col_name="pred")
    print(f"<<<ensemble_ANY>>>: [f1] {round(f1score, 4)}, [precision] {round(precision, 4)}, [recall] {round(recall, 4)}")
    
    df["pred"] = df.apply(combine_predictions_major, axis=1)
    f1score, precision, recall = calc_cv(df, col_name="pred")
    print(f"<<<ensemble_MAJOR>>>: [f1] {round(f1score, 4)}, [precision] {round(precision, 4)}, [recall] {round(recall, 4)}")
    
    df["pred"] = df.apply(combine_predictions_major2, axis=1)
    f1score, precision, recall = calc_cv(df, col_name="pred")
    print(f"<<<ensemble_MAJOR>>>: [f1] {round(f1score, 4)}, [precision] {round(precision, 4)}, [recall] {round(recall, 4)}")
    
    df["pred"] = df.apply(combine_predictions_all, axis=1)
    f1score, precision, recall = calc_cv(df, col_name="pred")
    print(f"<<<ensemble_ALL>>>: [f1] {round(f1score, 4)}, [precision] {round(precision, 4)}, [recall] {round(recall, 4)}")
    
    for mode in ["mean", "min", "max"]:
        f1score, precision, recall = get_cv(df,
                                            aggregate_distance(model_dist_dict, mode=mode),
                                            th,
                                            posting_id,
                                            pred_name=model_name,
                                            min_n=2,
                                            mode="max")
        print(f"<<aggregate_{mode}>> [f1] {round(f1score, 4)}, [precision] {round(precision, 4)}, [recall] {round(recall, 4)}")
        

THRESHOLD: 0.4
model=bert [f1] 0.8285, [precision] 0.7864, [recall] 0.9471
model=bert_indonesian [f1] 0.8304, [precision] 0.7898, [recall] 0.9447
model=xlm_roberta [f1] 0.8339, [precision] 0.7936, [recall] 0.9484
model=bert_transformer [f1] 0.852, [precision] 0.8295, [recall] 0.935
<<<ensemble_ANY>>>: [f1] 0.7711, [precision] 0.6931, [recall] 0.9693
<<<ensemble_MAJOR>>>: [f1] 0.8437, [precision] 0.8046, [recall] 0.9558
<<<ensemble_MAJOR>>>: [f1] 0.8682, [precision] 0.8571, [recall] 0.9389
<<<ensemble_ALL>>>: [f1] 0.0395, [precision] 0.1057, [recall] 0.0276
<<aggregate_mean>> [f1] 0.8633, [precision] 0.8364, [recall] 0.95
<<aggregate_min>> [f1] 0.8813, [precision] 0.8874, [recall] 0.9207
<<aggregate_max>> [f1] 0.7745, [precision] 0.6997, [recall] 0.9671
THRESHOLD: 0.41000000000000003
model=bert [f1] 0.8387, [precision] 0.8031, [recall] 0.9439
model=bert_indonesian [f1] 0.8396, [precision] 0.8063, [recall] 0.9408
model=xlm_roberta [f1] 0.8426, [precision] 0.8086, [recall] 0.9451
model=be

<<aggregate_min>> [f1] 0.876, [precision] 0.9227, [recall] 0.8778
<<aggregate_max>> [f1] 0.8684, [precision] 0.8486, [recall] 0.9422
THRESHOLD: 0.5000000000000001
model=bert [f1] 0.8766, [precision] 0.8885, [recall] 0.9099
model=bert_indonesian [f1] 0.876, [precision] 0.8884, [recall] 0.9083
model=xlm_roberta [f1] 0.8778, [precision] 0.8904, [recall] 0.9108
model=bert_transformer [f1] 0.8722, [precision] 0.902, [recall] 0.8904
<<<ensemble_ANY>>>: [f1] 0.8681, [precision] 0.8455, [recall] 0.9443
<<<ensemble_MAJOR>>>: [f1] 0.8865, [precision] 0.897, [recall] 0.922
<<<ensemble_MAJOR>>>: [f1] 0.8831, [precision] 0.9236, [recall] 0.8956
<<<ensemble_ALL>>>: [f1] 0.0567, [precision] 0.1572, [recall] 0.038
<<aggregate_mean>> [f1] 0.888, [precision] 0.9082, [recall] 0.9114
<<aggregate_min>> [f1] 0.8739, [precision] 0.9254, [recall] 0.8723
<<aggregate_max>> [f1] 0.8726, [precision] 0.858, [recall] 0.9384
THRESHOLD: 0.5100000000000001
model=bert [f1] 0.8777, [precision] 0.8942, [recall] 0.9059
mo