In [None]:
import re
import pickle
import numpy as np
import pandas as pd
from scipy.special import softmax
from sklearn.metrics import accuracy_score
from scipy.optimize import minimize

In [None]:
# Read the training table, the test table and the sample-submission template
# (in that order) from the shared input directory.
train_df, test_df, sub_df = map(
    pd.read_csv,
    (
        "../input/train.csv",
        "../input/test.csv",
        "../input/sample_submission.csv",
    ),
)

In [None]:
def load_preds(exp_name):
    """Load the pickled validation and test predictions of one experiment.

    Reads ``oof_val_preds.pkl`` and ``oof_test_preds.pkl`` from
    ``../output/<exp_name>/`` (relative to the current working directory).

    Args:
        exp_name: Name of the experiment sub-directory under ``../output``.

    Returns:
        A ``(val_preds, test_preds)`` tuple holding whatever objects were
        pickled in the two files.

    Note:
        ``pickle.load`` can execute arbitrary code; only point this at
        prediction files you produced yourself.
    """
    val_path = f"../output/{exp_name}/oof_val_preds.pkl"
    test_path = f"../output/{exp_name}/oof_test_preds.pkl"

    loaded = []
    for path in (val_path, test_path):
        with open(path, "rb") as handle:
            loaded.append(pickle.load(handle))

    return tuple(loaded)

# NOTE: despite its name, `weights` holds the experiment names whose
# predictions are stacked below, not numeric blend weights.
weights = ["exp005", "exp009", "exp018"]

# One column per experiment; rows follow train_df (validation predictions)
# and sub_df (test predictions) respectively.
all_val_preds = np.zeros((len(train_df), len(weights)), dtype=np.float32)
all_test_preds = np.zeros((len(sub_df), len(weights)), dtype=np.float32)
for i, exp_name in enumerate(weights):
    val_preds, test_preds = load_preds(exp_name)
    # Softmax across axis 1, then keep only column 1 as the per-row score.
    val_preds = softmax(val_preds, axis=1)[:, 1]
    test_preds = softmax(test_preds, axis=1)[:, 1]
    all_val_preds[:, i] = val_preds
    all_test_preds[:, i] = test_preds

    # Accuracy of this single experiment at a fixed 0.5 threshold.
    score = accuracy_score(train_df["isFake"], (val_preds > 0.5).astype(int))
    # `.4f` = four decimal places; the former `4f` only set a minimum width
    # of 4 and still printed the default six decimals.
    print(f"{exp_name}: score: {score:.4f}")

In [None]:
def func(weights):
    """Objective for the weight search: negative accuracy of the blend.

    Args:
        weights: One weight per column of the module-level ``all_val_preds``.

    Returns:
        The negated accuracy (against ``train_df["isFake"]``) of the weighted
        row-wise average of ``all_val_preds`` thresholded at 0.5, so that a
        minimiser maximises accuracy.
    """
    blended = np.average(all_val_preds, axis=1, weights=weights)
    hard_labels = (blended > 0.5).astype(int)
    return -accuracy_score(train_df["isFake"].values, hard_labels)

# Number of stacked experiments (columns of the validation prediction matrix).
n_models = all_val_preds.shape[1]

# Hand-picked starting point for the simplex search, one entry per experiment.
initial_weights = np.array([0.45, 0.1, 0.45])

# Derivative-free search over the blend weights; `func` returns negative accuracy.
res = minimize(fun=func, x0=initial_weights, method="Nelder-Mead")

# Best accuracy found (sign flipped back) and the weights that achieved it.
print(-res.fun)
print(res.x)

In [None]:
# Collapse the per-experiment columns into one blended score per row using
# the optimised weights.
# NOTE(review): this rebinds the 2-D matrices to 1-D vectors, so this cell
# (and `func`, which reads `all_val_preds`) cannot be re-run without first
# re-running the cell that builds the matrices.
all_val_preds, all_test_preds = (
    np.average(stacked, axis=1, weights=res["x"])
    for stacked in (all_val_preds, all_test_preds)
)

In [None]:
# Turn the blended scores into hard 0/1 labels with a fixed 0.5 cut-off.
train_df["pred"] = np.greater(all_val_preds, 0.5).astype(int)
test_df["pred"] = np.greater(all_test_preds, 0.5).astype(int)

In [None]:
# Sweep the decision threshold over [0, 1] in steps of 0.005 and keep the best
# validation accuracy seen (diagnostic only; the threshold itself is not kept).
best_score = 0.
for threshold in np.arange(0, 1.005, 0.005):
    score = accuracy_score(
        train_df["isFake"],
        (all_val_preds > threshold).astype(int),
    )
    best_score = max(best_score, score)
best_score

# Postprocess: rule-based overrides of the test predictions

In [None]:
# Validation accuracy of the blended hard labels before any rule is applied.
accuracy_score(y_true=train_df["isFake"], y_pred=train_df["pred"])

In [None]:
# Inspect the training rows where the blended label disagrees with the target.
train_df.loc[train_df["pred"].ne(train_df["isFake"])]

In [None]:
# Training rows whose text contains "47によると".
train_df.loc[train_df["text"].str.contains("47によると")]

In [None]:
# Test rows whose text contains "47によると".
test_df.loc[test_df["text"].str.contains("47によると")]

In [None]:
# Force the prediction to 1 for every test row whose text contains "47によると".
rule_hits = test_df["text"].str.contains("47によると")
test_df.loc[rule_hits, "pred"] = 1

In [None]:
# Training rows whose text contains a literal "(" directly followed by "、以下同".
# Raw string: "\(" in a normal string literal is an invalid escape sequence.
train_df[train_df["text"].str.contains(r"\(、以下同")]

In [None]:
# Test rows whose text contains a literal "(" directly followed by "、以下同".
# Raw string: "\(" in a normal string literal is an invalid escape sequence.
test_df[test_df["text"].str.contains(r"\(、以下同")]

In [None]:
# Force the prediction to 1 for test rows whose text contains a literal "("
# directly followed by "、以下同".
# Raw string: "\(" in a normal string literal is an invalid escape sequence.
test_df.loc[test_df["text"].str.contains(r"\(、以下同"), "pred"] = 1

In [None]:
# Training rows whose text starts with at most one arbitrary character
# followed by "Cによると". `.str.match` anchors at the start of each string
# like `re.match`, without a per-row Python lambda.
train_df[train_df["text"].str.match(r"^.?Cによると")]

In [None]:
# Test rows whose text starts with at most one arbitrary character followed
# by "Cによると". `.str.match` anchors at the start of each string like
# `re.match`, without a per-row Python lambda.
test_df[test_df["text"].str.match(r"^.?Cによると")]

In [None]:
# Force the prediction to 1 for test rows whose text starts with at most one
# arbitrary character followed by "Cによると". `.str.match` anchors at the
# start of each string like `re.match`, without a per-row Python lambda.
test_df.loc[test_df["text"].str.match(r"^.?Cによると"), "pred"] = 1

In [None]:
# Training rows whose text starts with at most one arbitrary character, then
# "C、", then at least one character, then "によると". `.str.match` anchors at
# the start of each string like `re.match`, without a per-row Python lambda.
train_df[train_df["text"].str.match(r"^.?C、.+によると")]

In [None]:
# Test rows whose text starts with at most one arbitrary character, then
# "C、", then at least one character, then "によると". `.str.match` anchors at
# the start of each string like `re.match`, without a per-row Python lambda.
test_df[test_df["text"].str.match(r"^.?C、.+によると")]

In [None]:
# Force the prediction to 1 for test rows whose text starts with at most one
# arbitrary character, then "C、", then at least one character, then
# "によると". `.str.match` anchors at the start of each string like
# `re.match`, without a per-row Python lambda.
test_df.loc[test_df["text"].str.match(r"^.?C、.+によると"), "pred"] = 1

In [None]:
# Copy the post-processed test labels into the submission frame.
# Column assignment aligns on the row index — assumes sub_df and test_df
# share the same index and row order; TODO confirm.
sub_df["isFake"] = test_df["pred"]

In [None]:
# Write the submission file; the DataFrame index is not written.
sub_df.to_csv(
    path_or_buf="../output/ensamble_exp005_exp009_exp018_pp.csv",
    index=False,
)