# ensemble

In [1]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import normalize, StandardScaler
import glob
import tqdm
import torch

In [2]:
df = pd.read_csv("../input/shopee-product-matching/train_fold.csv")
df = df[df["fold"] == 0]
tmp = df.groupby('label_group').posting_id.agg('unique').to_dict()
df['target'] = df.label_group.map(tmp)


In [3]:
posting_id = df["posting_id"].values

In [4]:
def get_cosine_similarity(embeddings):

    CHUNK = 1024 * 2
    CTS = len(embeddings) // CHUNK
    if (len(embeddings)%CHUNK) != 0:
        CTS += 1
    distances = []
    
    with torch.no_grad():
        embeddings = normalize(embeddings)
        embeddings = torch.tensor(embeddings).cuda()

        for j in range( CTS ):
            a = j * CHUNK
            b = (j+1) * CHUNK
            b = min(b, len(embeddings))
            # print('chunk', a, 'to', b)

            # COSINE SIMILARITY DISTANCE
            cts = torch.matmul(embeddings, embeddings[a:b].T).T
            distances.append(cts.detach().cpu().numpy())
    
    return np.concatenate(distances)

In [5]:
def get_cv(df, similarity_matrix, threshold, posting_id, indices=None, pred_name="pred", min_n=2, mode="min"):
    preds = []
    for k in range(len(df)):
        if mode == "min": # euclid distance etc
            IDX = np.where(similarity_matrix[k, ] < threshold)[0]
            if len(IDX) < min_n:                
                IDX = np.argsort(similarity_matrix[k, ])[:min_n]
        if mode == "max": # cosine similarlity
            IDX = np.where(similarity_matrix[k, ] > threshold)[0]
            if len(IDX) < min_n:                
                IDX = np.argsort(similarity_matrix[k, ])[-min_n:]
            
        pred = posting_id[IDX]
        preds.append(pred)
    
    df[pred_name] = preds
    f1score, precision, recall = calc_cv(df, col_name=pred_name)
    return f1score, precision, recall

In [6]:
def calc_cv(df, col_name):
    tmp = df.groupby('label_group').posting_id.agg('unique').to_dict()
    df['target'] = df.label_group.map(tmp)
    df[f'f1_{col_name}'] = df.apply(get_f1(col_name),axis=1)
    df[f"precision_{col_name}"] = df.apply(get_precision(col_name), axis=1)
    df[f"recall_{col_name}"] = df.apply(get_recall(col_name), axis=1)
    return df[f"f1_{col_name}"].mean(), df[f"precision_{col_name}"].mean(), df[f"recall_{col_name}"].mean()

def get_f1(col):
    def f1score(row):
        n = len( np.intersect1d(row.target,row[col]) )
        return 2*n / (len(row.target)+len(row[col]))
    return f1score

def get_precision(col):
    def precision_score(row):
        n = len( np.intersect1d(row.target,row[col]) )
        if len(row[col]) == 0:
            return 0
        else:
            return n / len(row[col])
    return precision_score

def get_recall(col):
    def recall_score(row):
        n = len( np.intersect1d(row.target,row[col]) )
        return n / len(row.target)
    return recall_score

In [7]:
df_kiccho = pd.read_csv("kiccho-san/valid_df_0.csv")
df_kiccho = pd.merge(df, df_kiccho.reset_index()[["posting_id", "index"]])
kiccho_index = df_kiccho["index"].values

In [8]:
model_dist_dict = {}
model_dicts = [
    # {"model_dir": "exp091/20210430175943", "best_epoch": 11, "name": "swin_base_patch4_window7_224_cv0.871"},
    {"model_dir": "exp090/20210430010824", "best_epoch": 16, "name": "vit_base_patch16_384_cv0.875"},
    {"model_dir": "exp088/20210429111544", "best_epoch": 11, "name": "swin_base_patch4_window12_384_cv0.873"},
    # {"model_dir": "exp091/20210430200744", "best_epoch": 10, "name": "swin_large_patch4_window7_224_cv0.871"}
]

for model_dict in model_dicts:
    name = model_dict["name"]
    epoch = model_dict["best_epoch"]
    base_dir = f"../output/{model_dict['model_dir']}"
    model_dist_dict[f"{name}_all"] = {
        "embeddings": np.load(f"{base_dir}/embeddings_all_epoch{epoch}.npy")
    }
    model_dist_dict[f"{name}_img"] = {
        "embeddings": np.load(f"{base_dir}/embeddings_images_epoch{epoch}.npy")
    }
    model_dist_dict[f"{name}_text"] = {
        "embeddings": np.load(f"{base_dir}/embeddings_texts_epoch{epoch}.npy")
    }
kiccho_models = ["exp383", "exp385"]
for kiccho_model in kiccho_models:
    path_all = glob.glob(f"kiccho-san/{kiccho_model}/features*")[0]
    path_img = glob.glob(f"kiccho-san/{kiccho_model}/img_features*")[0]
    path_txt = glob.glob(f"kiccho-san/{kiccho_model}/txt_features*")[0]
    
    model_dist_dict[f"{kiccho_model}_all"] = {
        "embeddings": np.load(path_all)[kiccho_index]
    }
    model_dist_dict[f"{kiccho_model}_img"] = {
        "embeddings": np.load(path_img)
    }
    model_dist_dict[f"{kiccho_model}_text"] = {
        "embeddings": np.load(path_txt)
    }

IndexError: list index out of range

In [11]:
model_dist_dict["vit_base_patch16_384_cv0.875_all"]["embeddings"].shape

(6839, 2048)

In [14]:
model_dist_dict["vit_base_patch16_384_cv0.875_text"]["embeddings"].shape

(6839, 11014)

In [75]:
for model_name in model_dist_dict.keys():
    model_dist_dict[model_name]["cosine_similarity"] = get_cosine_similarity(model_dist_dict[model_name]["embeddings"])    
    torch.cuda.empty_cache()

In [76]:
def combine_predictions_any(row):
    x = np.concatenate(row[model_dist_dict.keys()].values)
    x = np.unique(x)
    return x

In [77]:
def combine_predictions_major(row):
    x = np.concatenate(row[model_dist_dict.keys()].values)
    x, counts = np.unique(x, return_counts=True)
    
    ret_idx = counts > 1
    return x[ret_idx]

In [78]:
def combine_predictions_all(row):
    x = np.concatenate(row[model_dist_dict.keys()].values)

    x, counts = np.unique(x, return_counts=True)
    
    ret_idx = counts == 3
    return x[ret_idx]

In [79]:
def combine_predictions_major2(row):
    x = np.concatenate(row[model_dist_dict.keys()].values)
    x, counts = np.unique(x, return_counts=True)
    
    ret_idx = counts > 2
    return x[ret_idx]

In [80]:
def combine_predictions_major3(row):
    x = np.concatenate(row[model_dist_dict.keys()].values)
    x, counts = np.unique(x, return_counts=True)
    
    ret_idx = counts > 3
    return x[ret_idx]

In [81]:
def combine_predictions_major4(row):
    x = np.concatenate(row[model_dist_dict.keys()].values)
    x, counts = np.unique(x, return_counts=True)
    
    ret_idx = counts > 4
    return x[ret_idx]

In [82]:
def combine_predictions_major5(row):
    x = np.concatenate(row[model_dist_dict.keys()].values)
    x, counts = np.unique(x, return_counts=True)
    
    ret_idx = counts > 5
    return x[ret_idx]

## get_cv(1件→2件むりやり)

In [83]:
def aggregate_distance(model_dist_dict, mode):
    ary = []
    
    for k, v in model_dist_dict.items():
        ary.append(model_dist_dict[k]["cosine_similarity"])
    
    ary = np.array(ary)
    
    if mode == "min":
        ary = ary.min(axis=0)
    if mode == "mean":
        ary = ary.mean(axis=0)
    if mode == "median":
        ary = np.median(ary, axis=0)
    if mode == "max":
        ary = ary.max(axis=0)

    return ary

In [70]:
df_result = []
for th in np.arange(0.45, 0.55, 0.01):
    print("=========================================================")
    print(f"THRESHOLD: {th}")
    for model_name in model_dist_dict.keys():
        if "_all" in model_name:
            f1score, precision, recall = get_cv(df, 
                                                model_dist_dict[model_name]["cosine_similarity"],
                                                th,
                                                posting_id,
                                                pred_name=model_name,
                                                min_n=2,
                                                mode="max")
            print(f"model={model_name} [f1] {round(f1score, 4)}, [precision] {round(precision, 4)}, [recall] {round(recall, 4)}")
    df["pred"] = df.apply(combine_predictions_major, axis=1)
    f1score, precision, recall = calc_cv(df, col_name="pred")
    print(f"<<<ensemble_MAJOR>>>: [f1] {round(f1score, 4)}, [precision] {round(precision, 4)}, [recall] {round(recall, 4)}")

THRESHOLD: 0.45
model=vit_base_patch16_384_cv0.875_all [f1] 0.8616, [precision] 0.8341, [recall] 0.9485
model=swin_base_patch4_window12_384_cv0.873_all [f1] 0.8516, [precision] 0.8155, [recall] 0.9556
model=exp383_all [f1] 0.8824, [precision] 0.9137, [recall] 0.8952
model=exp385_all [f1] 0.8798, [precision] 0.9148, [recall] 0.89
<<<ensemble_MAJOR>>>: [f1] 0.8774, [precision] 0.8597, [recall] 0.9487
THRESHOLD: 0.46
model=vit_base_patch16_384_cv0.875_all [f1] 0.8669, [precision] 0.8444, [recall] 0.9451
model=swin_base_patch4_window12_384_cv0.873_all [f1] 0.8577, [precision] 0.8263, [recall] 0.9522
model=exp383_all [f1] 0.8807, [precision] 0.9165, [recall] 0.8895
model=exp385_all [f1] 0.8788, [precision] 0.9181, [recall] 0.8848
<<<ensemble_MAJOR>>>: [f1] 0.8803, [precision] 0.8667, [recall] 0.9454
THRESHOLD: 0.47000000000000003
model=vit_base_patch16_384_cv0.875_all [f1] 0.871, [precision] 0.853, [recall] 0.9417
model=swin_base_patch4_window12_384_cv0.873_all [f1] 0.8625, [precision] 0.83

In [126]:
all_th = 0.6
img_th = 0.65
text_th = 0.7
for model_name in model_dist_dict.keys():
    if "all" in model_name:
        th = all_th
    if "img" == model_name:
        th = img_th
    if "text" == model_name:
        th = text_th

    f1score, precision, recall = get_cv(df, 
                                        model_dist_dict[model_name]["cosine_similarity"],
                                        th,
                                        posting_id,
                                        pred_name=model_name,
                                        min_n=2,
                                        mode="max")
    print(f"{model_name}: th={th} f1score={f1score}")

swin_base_patch4_window7_224_cv0.871_all: th=0.6 f1score=0.8784494521837478
swin_base_patch4_window7_224_cv0.871_img: th=0.6 f1score=0.8064437581415261
swin_base_patch4_window7_224_cv0.871_text: th=0.6 f1score=0.811954811669544
vit_base_patch16_384_cv0.875_all: th=0.6 f1score=0.8746093699061965
vit_base_patch16_384_cv0.875_img: th=0.6 f1score=0.8067150444988691
vit_base_patch16_384_cv0.875_text: th=0.6 f1score=0.8104693355659606
swin_base_patch4_window12_384_cv0.873_all: th=0.6 f1score=0.8824417972670031
swin_base_patch4_window12_384_cv0.873_img: th=0.6 f1score=0.8198312011820124
swin_base_patch4_window12_384_cv0.873_text: th=0.6 f1score=0.8124039526575115
swin_large_patch4_window7_224_cv0.871_all: th=0.6 f1score=0.8762742172346852
swin_large_patch4_window7_224_cv0.871_img: th=0.6 f1score=0.8175571614729272
swin_large_patch4_window7_224_cv0.871_text: th=0.6 f1score=0.8108493821705074


In [87]:
df_result = []
for all_th in [0.5, 0.55, 0.6]:
    for model_name in model_dist_dict.keys():
        if "all" in model_name:
            f1score, precision, recall = get_cv(df, 
                                                model_dist_dict[model_name]["cosine_similarity"],
                                                all_th,
                                                posting_id,
                                                pred_name=model_name,
                                                min_n=2,
                                                mode="max")
    
    for img_th in np.arange(all_th, 0.8, 0.05):
        for model_name in model_dist_dict.keys():
            if "img" in model_name:
                f1score, precision, recall = get_cv(df, 
                                                    model_dist_dict[model_name]["cosine_similarity"],
                                                    img_th,
                                                    posting_id,
                                                    pred_name=model_name,
                                                    min_n=2,
                                                    mode="max")

        for text_th in np.arange(all_th, 0.8, 0.05):
            print(f"{all_th}_{img_th}_{text_th}")

            for model_name in model_dist_dict.keys():
                if "text" in model_name:
                    f1score, precision, recall = get_cv(df, 
                                                        model_dist_dict[model_name]["cosine_similarity"],
                                                        text_th,
                                                        posting_id,
                                                        pred_name=model_name,
                                                        min_n=2,
                                                        mode="max")
            df["pred"] = df.apply(combine_predictions_major, axis=1)
            f1score, precision, recall = calc_cv(df, col_name="pred")
            df_result.append({"name": "vote2",
                              "all_th": all_th,
                              "img_th": img_th,
                              "text_th": text_th,
                              "f1score": f1score,
                              "precision": precision,
                              "recall": recall})

            df["pred"] = df.apply(combine_predictions_major2, axis=1)
            f1score, precision, recall = calc_cv(df, col_name="pred")
            df_result.append({"name": "vote3",
                              "all_th": all_th,
                              "img_th": img_th,
                              "text_th": text_th,
                              "f1score": f1score,
                              "precision": precision,
                              "recall": recall})

            df["pred"] = df.apply(combine_predictions_major3, axis=1)
            f1score, precision, recall = calc_cv(df, col_name="pred")
            df_result.append({"name": "vote4",
                              "all_th": all_th,
                              "img_th": img_th,
                              "text_th": text_th,
                              "f1score": f1score,
                              "precision": precision,
                              "recall": recall})

            df["pred"] = df.apply(combine_predictions_major4, axis=1)
            f1score, precision, recall = calc_cv(df, col_name="pred")
            df_result.append({"name": "vote5",
                              "all_th": all_th,
                              "img_th": img_th,
                              "text_th": text_th,
                              "f1score": f1score,
                              "precision": precision,
                              "recall": recall})

            df["pred"] = df.apply(combine_predictions_major5, axis=1)
            f1score, precision, recall = calc_cv(df, col_name="pred")
            df_result.append({"name": "vote6",
                              "all_th": all_th,
                              "img_th": img_th,
                              "text_th": text_th,
                              "f1score": f1score,
                              "precision": precision,
                              "recall": recall})    
            print(df_result[-1])

0.5_0.5_0.5
{'name': 'vote6', 'all_th': 0.5, 'img_th': 0.5, 'text_th': 0.5, 'f1score': 0.8743186577212235, 'precision': 0.9323202764754972, 'recall': 0.8744227040781476}
0.5_0.5_0.55
{'name': 'vote6', 'all_th': 0.5, 'img_th': 0.5, 'text_th': 0.55, 'f1score': 0.8736232061964598, 'precision': 0.9368199637760729, 'recall': 0.8696625849790075}
0.5_0.5_0.6000000000000001
{'name': 'vote6', 'all_th': 0.5, 'img_th': 0.5, 'text_th': 0.6000000000000001, 'f1score': 0.8722936911753599, 'precision': 0.9419596908224699, 'recall': 0.8636767038153563}
0.5_0.5_0.6500000000000001
{'name': 'vote6', 'all_th': 0.5, 'img_th': 0.5, 'text_th': 0.6500000000000001, 'f1score': 0.8711911265931476, 'precision': 0.9457942414475257, 'recall': 0.8593198357962566}
0.5_0.5_0.7000000000000002
{'name': 'vote6', 'all_th': 0.5, 'img_th': 0.5, 'text_th': 0.7000000000000002, 'f1score': 0.8698846154862218, 'precision': 0.9489614343261158, 'recall': 0.854706099737437}
0.5_0.5_0.7500000000000002
{'name': 'vote6', 'all_th': 0.5,

{'name': 'vote6', 'all_th': 0.5, 'img_th': 0.7500000000000002, 'text_th': 0.7000000000000002, 'f1score': 0.8568666997399106, 'precision': 0.9612377529965993, 'recall': 0.8264245457991055}
0.5_0.7500000000000002_0.7500000000000002
{'name': 'vote6', 'all_th': 0.5, 'img_th': 0.7500000000000002, 'text_th': 0.7500000000000002, 'f1score': 0.8531323932338816, 'precision': 0.9639770375587823, 'recall': 0.8188853405929868}
0.5_0.7500000000000002_0.8000000000000003
{'name': 'vote6', 'all_th': 0.5, 'img_th': 0.7500000000000002, 'text_th': 0.8000000000000003, 'f1score': 0.8487883378398946, 'precision': 0.9674714788437815, 'recall': 0.8103535102269644}
0.5_0.8000000000000003_0.5
{'name': 'vote6', 'all_th': 0.5, 'img_th': 0.8000000000000003, 'text_th': 0.5, 'f1score': 0.8633097283445131, 'precision': 0.9531679453323837, 'recall': 0.8408471034921926}
0.5_0.8000000000000003_0.55
{'name': 'vote6', 'all_th': 0.5, 'img_th': 0.8000000000000003, 'text_th': 0.55, 'f1score': 0.861707056145525, 'precision': 0

KeyboardInterrupt: 

In [88]:
pd.DataFrame(df_result).sort_values("f1score", ascending=False).to_csv("exp022_result.csv")