In [1]:
import os

import pandas as pd
import numpy as np
import lightgbm as lgb

from sklearn.metrics import precision_recall_curve

In [2]:
val_df = pd.read_parquet('../Data/val_seed42.parquet')

### fv4_223

In [4]:
model_file_list = []

file_dir = '../feat_v4/fv4_223'
fv4_223_list = []

for file in os.listdir(file_dir):
    elment = file.split('_')
    if (float(elment[2]) > 0.93) and (float(elment[3]) > 0.7):
        model_file_list.append(file)
model_file_list

['fv4_223_0.954_0.707_133.txt',
 'fv4_223_0.939_0.731_139.txt',
 'fv4_223_0.949_0.701_139.txt',
 'fv4_223_0.938_0.714_138.txt',
 'fv4_223_0.963_0.707_131.txt',
 'fv4_223_0.950_0.718_135.txt',
 'fv4_223_0.950_0.718_134.txt',
 'fv4_223_0.938_0.724_139.txt',
 'fv4_223_0.933_0.707_139.txt',
 'fv4_223_0.945_0.704_135.txt']

In [19]:
val_pred_df = pd.DataFrame({
    'ID': val_df['ID'],
})

for m in model_file_list:
    element = m.split('_')
    col = m.replace('.txt', '')
    folder = f"{element[0]}_{element[1]}"
    path = f"../feat_v4/{folder}/{m}"
    model = lgb.Booster(model_file=path)

    selected_features = model.feature_name()

    val_prob = model.predict(val_df[selected_features])
    val_label = val_df.iloc[:,-1]
    precision, recall, thresholds = precision_recall_curve(val_label, val_prob)
    f1s = 2 * precision * recall / (precision + recall + 1e-10)
    best_index = np.argmax(f1s)
    best_thres = thresholds[best_index]
    print(f"Best threshold: {best_thres:g}, F1: {f1s[best_index]:g}, Precision: {precision[best_index]:g}, Recall: {recall[best_index]:g}")

    val_pred_df[col] = (val_prob > best_thres).astype(int)
val_pred_df['sum'] = val_pred_df.iloc[:, 1:].mean(axis=1)

Best threshold: 0.035063, F1: 0.8125, Precision: 0.954128, Recall: 0.707483
Best threshold: 0.0179061, F1: 0.82218, Precision: 0.938865, Recall: 0.731293
Best threshold: 0.21857, F1: 0.806262, Precision: 0.949309, Recall: 0.70068
Best threshold: 0.250939, F1: 0.810811, Precision: 0.9375, Recall: 0.714286
Best threshold: 0.0300036, F1: 0.815686, Precision: 0.962963, Recall: 0.707483
Best threshold: 0.0358074, F1: 0.817829, Precision: 0.95045, Recall: 0.717687
Best threshold: 0.029005, F1: 0.817829, Precision: 0.95045, Recall: 0.717687
Best threshold: 0.0158442, F1: 0.817658, Precision: 0.938326, Recall: 0.72449
Best threshold: 0.222118, F1: 0.804642, Precision: 0.932735, Recall: 0.707483
Best threshold: 0.165085, F1: 0.807018, Precision: 0.945205, Recall: 0.704082


In [None]:
val_pred_df.sort_values(by=['sum'], ascending=False).reset_index(drop=True)

Unnamed: 0,ID,fv4_223_0.954_0.707_133,fv4_223_0.939_0.731_139,fv4_223_0.949_0.701_139,fv4_223_0.938_0.714_138,fv4_223_0.963_0.707_131,fv4_223_0.950_0.718_135,fv4_223_0.950_0.718_134,fv4_223_0.938_0.724_139,fv4_223_0.933_0.707_139,fv4_223_0.945_0.704_135,sum
0,TR-135894,1,1,1,1,1,1,1,1,1,1,1.0
1,TR-50866,1,1,1,1,1,1,1,1,1,1,1.0
2,TR-161297,1,1,1,1,1,1,1,1,1,1,1.0
3,TR-34709,1,1,1,1,1,1,1,1,1,1,1.0
4,TR-110884,1,1,1,1,1,1,1,1,1,1,1.0
...,...,...,...,...,...,...,...,...,...,...,...,...
40168,TR-150560,0,0,0,0,0,0,0,0,0,0,0.0
40169,TR-82892,0,0,0,0,0,0,0,0,0,0,0.0
40170,TR-48376,0,0,0,0,0,0,0,0,0,0,0.0
40171,TR-19409,0,0,0,0,0,0,0,0,0,0,0.0


In [33]:
precision, recall, thresholds = precision_recall_curve(val_label, val_pred_df['sum'])
f1s = 2 * precision * recall / (precision + recall + 1e-10)
best_index = np.argmax(f1s)
best_thres = thresholds[best_index]

print(f"Best threshold: {best_thres:g}, F1: {f1s[best_index]:g}, Precision: {precision[best_index]:g}, Recall: {recall[best_index]:g}")

Best threshold: 0.3, F1: 0.823308, Precision: 0.920168, Recall: 0.744898


In [35]:
from sklearn.metrics import f1_score, precision_score, recall_score

tmp = (val_pred_df['sum'] >= best_thres).astype(int)
print(f"F1: {f1_score(val_label, tmp):g}, Precision: {precision_score(val_label, tmp):g}, Recall: {recall_score(val_label, tmp):g}")

F1: 0.823308, Precision: 0.920168, Recall: 0.744898
