In [1]:
import os

import pandas as pd
import numpy as np
import lightgbm as lgb

from sklearn.metrics import precision_recall_curve

In [2]:
# Load the validation split, the test feature matrix and the submission template.
val_df = pd.read_parquet('../Data/val_seed42.parquet')
test_df = pd.read_parquet('../testing/X_all.parquet')
tmplate = pd.read_csv('../testing/submission_template_public_and_private.csv')

# The last column of the validation frame is used as the label vector.
val_label = val_df.iloc[:, -1]

# Show both shapes side by side so the row counts can be compared at a glance.
print(f"{test_df.shape} {tmplate.shape}")

(50216, 10213) (50216, 2)


In [9]:
# Load a saved LightGBM booster from its text dump and read back the feature
# names stored in it; they are used below to select columns for prediction.
model_path = "../feat_v4/fv4_105/fv4_105_0.903_0.762_148.txt"
model = lgb.Booster(model_file=model_path)
selected_features = model.feature_name()

In [11]:
# Score the validation split with the loaded booster.
val_prob = model.predict(val_df[selected_features])

# precision_recall_curve returns one more precision/recall point than thresholds
# (a final point with no threshold attached). That trailing point is excluded
# from the argmax so every candidate index is valid for `thresholds`.
precision, recall, thresholds = precision_recall_curve(val_label, val_prob)
# The small epsilon keeps the denominator non-zero when precision and recall are both 0.
f1s = 2 * (precision * recall) / (precision + recall + 1e-10)
best_index = np.argmax(f1s[:-1])
best_threshold = thresholds[best_index]
print(f"best_threshold: {best_threshold:.5f}, f1: {f1s[best_index]:.5f}, precision: {precision[best_index]:.5f}, recall: {recall[best_index]:.5f}")

# Apply the validation-tuned cut-off to the test set. `>=` matches how
# precision_recall_curve counts a sample as positive at a given threshold.
test_prob = model.predict(test_df[selected_features])
test_pred = (test_prob >= best_threshold).astype(int)

# Rows before this position are reported as the public split, the rest as private.
n_public = 25108
print(f"public test: {test_pred[:n_public].sum()}, private test: {test_pred[n_public:].sum()}")
print(f"test_pred: {test_pred.sum()}")


best_threshold: 0.00878, f1: 0.82657, precision: 0.90323, recall: 0.76190
public test: 148, private test: 157
test_pred: 305


In [12]:
# Store the thresholded 0/1 test predictions in the submission template's label column.
tmplate['飆股'] = test_pred

In [13]:
# Export the first 25108 template rows (the rows this notebook reports as the
# public split) as the submission file.
smt_name = './smt_folder/fv4_105_0.903_0.762_148.csv'
n_public_rows = 25108

# to_csv does not create missing parent directories, so make sure the folder exists.
os.makedirs(os.path.dirname(smt_name), exist_ok=True)
tmplate.iloc[:n_public_rows].to_csv(smt_name, index=False)

In [None]:
# Disabled alternative export: would write the whole template (no row slice) under ./test_smt_folder.
# smt_name = './test_smt_folder/fv4_105_0.884_0.779_159.csv'
# tmplate.to_csv(smt_name, index=False)

In [14]:
# Reload the file that was just written and show how many rows carry each label.
tmp = pd.read_csv(smt_name)
label_counts = tmp['飆股'].value_counts()
label_counts

飆股
0    24960
1      148
Name: count, dtype: int64

---

In [None]:
# Collect the model files in one feature-set folder whose file name encodes
# scores above both cut-offs.
# File names are split on '_' and fields 2 and 3 are parsed as floats. Judging by
# the model loaded earlier in this notebook they appear to hold validation
# precision and recall -- TODO confirm the naming scheme.
file_dir = '../feat_v4/fv4_223'
min_precision = 0.93
min_recall = 0.7

model_file_list = []
# os.listdir returns entries in arbitrary order; sorting makes reruns deterministic.
for file in sorted(os.listdir(file_dir)):
    element = file.split('_')
    try:
        file_precision = float(element[2])
        file_recall = float(element[3])
    except (IndexError, ValueError):
        # Entry does not follow the naming scheme (e.g. hidden or system files): skip it.
        continue
    if file_precision > min_precision and file_recall > min_recall:
        model_file_list.append(file)
print(f'model_file_list: {model_file_list}')

In [None]:
# For every selected model: tune the decision threshold on the validation split,
# label the test set with it, and export the filled-in template as a CSV.
for m in model_file_list:
    # The first two '_'-separated fields of the file name give the folder holding the model.
    element = m.split('_')
    folder = f"{element[0]}_{element[1]}"
    path = f"../feat_v4/{folder}/{m}"
    model = lgb.Booster(model_file=path)

    selected_features = model.feature_name()

    val_prob = model.predict(val_df[selected_features])
    precision, recall, thresholds = precision_recall_curve(val_label, val_prob)
    f1s = 2 * precision * recall / (precision + recall + 1e-10)
    # precision/recall carry one trailing point with no matching threshold;
    # exclude it so the chosen index is always valid for `thresholds`.
    best_index = np.argmax(f1s[:-1])
    best_thres = thresholds[best_index]
    print(f"Best threshold: {best_thres}")

    test_prob = model.predict(test_df[selected_features])
    # `>=`, not `>`: precision_recall_curve counts score >= threshold as positive,
    # so a strict comparison would drop the samples sitting exactly on the tuned
    # threshold and no longer match the F1 that selected it.
    test_pred = (test_prob >= best_thres).astype(int)

    tmplate['飆股'] = test_pred
    csv_name = f'./csv_file/{folder}/{m.split(".txt")[0]}.csv'
    # to_csv does not create missing parent directories.
    os.makedirs(os.path.dirname(csv_name), exist_ok=True)
    tmplate.to_csv(csv_name, index=False)
    print(f"csv_name: {csv_name}")

    # Read the file back and print the label counts as a sanity check.
    tmp = pd.read_csv(csv_name)
    print(tmp['飆股'].value_counts())
    print()