In [1]:
from pathlib import Path

import lightgbm as lgb
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
from sklearn.metrics import f1_score, precision_recall_curve, precision_score, recall_score
from sklearn.model_selection import train_test_split


In [2]:
# Input locations, kept as named constants so they are easy to find and change.
TRAIN_FEATURES_PATH = "./Data/training_feature_v1.parquet"
PUBLIC_TEST_PATH = "testing/public_x.parquet"

print("Loading data...")
# Training feature table (features plus label column) and the public test set.
df = pd.read_parquet(TRAIN_FEATURES_PATH)
test_df = pd.read_parquet(PUBLIC_TEST_PATH)

Loading data...


In [None]:
# Load the trained LightGBM booster from its saved text model file.
model_name = 'feat_v4/retrain_pt/model/pt_0.905_0.806_143157.txt'
model = lgb.Booster(model_file=model_name)

# Base name of the model file without directory or extension; it is reused
# below to name the submission CSV. Path.stem strips only the final suffix,
# so the dots inside the name ("0.905", "0.806") are preserved, and it does
# not depend on the path using "/" as separator.
file_name = Path(model_name).stem
print("file_name:", file_name)

# Use exactly the feature columns the booster reports, in the booster's order.
selected_features = model.feature_name()
X = df[selected_features]
# NOTE(review): the label is taken positionally as the LAST column of df —
# confirm the parquet schema guarantees this, or select it by name.
y = df.iloc[:, -1]
print("X shape:", X.shape)

# Stratified 80/20 hold-out with a fixed seed so the split is reproducible.
# NOTE(review): the validation part is used below to tune the decision
# threshold — confirm it was not part of the data this model was trained on.
X_train, X_valid, y_train, y_valid = train_test_split(X, y, test_size=0.2, stratify=y, random_state=42)

In [None]:
# Score the validation split with the loaded booster, limiting prediction to
# the iteration count stored on the booster as `best_iteration`.
best_iter = model.best_iteration
y_pred_prob = model.predict(X_valid, num_iteration=best_iter)

In [None]:
# Choose the decision threshold that maximizes F1 on the validation split.
#   y_valid:     ground-truth labels
#   y_pred_prob: predicted probability of the positive class (class = 1)

# 1. Precision-recall curve over every candidate threshold.
prec_lst, rec_lst, thresholds = precision_recall_curve(y_valid, y_pred_prob)

# 2. Align lengths: precision/recall carry one more entry than `thresholds`
#    (the final curve point has no associated threshold), so drop that last
#    entry. `thresholds` itself needs no change.
prec_lst = prec_lst[:-1]
rec_lst = rec_lst[:-1]

# 3. F1 for each threshold; the small epsilon avoids 0/0 when precision and
#    recall are both zero.
f1s = 2 * prec_lst * rec_lst / (prec_lst + rec_lst + 1e-10)

# 4. Locate the threshold with the highest F1 (first one on ties, per argmax).
best_idx = np.argmax(f1s)
best_f1 = f1s[best_idx]
best_prec = prec_lst[best_idx]
best_rec = rec_lst[best_idx]

best_threshold = thresholds[best_idx]
print(f"Best threshold: {best_threshold}")
print(f"F1 Score: {best_f1:.4f}, Precision: {best_prec:.4f}, Recall: {best_rec:.4f}")

In [None]:
# Plot precision, recall and F1 against the decision threshold and mark the
# F1-optimal threshold chosen above.
fig, ax = plt.subplots(figsize=(7, 5))

ax.plot(thresholds, prec_lst, label='Precision')
ax.plot(thresholds, rec_lst, label='Recall')
ax.plot(thresholds, f1s, label='F1 Score', linestyle='--')

# Vertical marker plus a dot on the F1 curve at the selected threshold.
ax.axvline(best_threshold, color='red', linestyle=':', label=f'Best Threshold = {best_threshold:.4f}')
ax.scatter(best_threshold, best_f1, color='red', zorder=5)

ax.set_xlabel('Threshold')
ax.set_ylabel('Score')
ax.set_title('Precision, Recall, F1 vs Threshold')
ax.legend()
ax.grid(True)
# plt.show()  # enable when a GUI backend is available


In [None]:
# Keep only the columns the booster was trained on, in the booster's order.
test_df = test_df[selected_features]

# Predicted scores for the public test set.
y_test_prob = model.predict(
    data=test_df,
    num_iteration=model.best_iteration,
)

# Apply the validation-tuned threshold (inclusive) to get 0/1 labels.
is_positive = y_test_prob >= best_threshold
y_pred_label = is_positive.astype(int)

# Displayed as the cell output: the distinct labels and how often each occurs.
np.unique(y_pred_label, return_counts=True)

In [None]:
# Write the thresholded predictions into the submission template and save it
# under a file name derived from the model file name.
smt_name = f'smt/csv_file/{file_name}.csv'

# DataFrame.to_csv does not create missing directories; make sure the output
# folder exists so the cell also works on a fresh checkout.
Path(smt_name).parent.mkdir(parents=True, exist_ok=True)

tmplate = pd.read_csv("testing/submission_template_public.csv")
# y_pred_label is a NumPy array, so it is assigned by position; pandas raises
# ValueError if its length differs from the number of template rows.
tmplate['飆股'] = y_pred_label
tmplate.to_csv(smt_name, index=False)

# Read the file back as a sanity check on what was actually written.
tmp = pd.read_csv(smt_name)
print(tmp['飆股'].value_counts())
print(smt_name)

In [1]:
import pandas as pd

# Row offset at which the file is split for inspection.
# NOTE(review): 25108 presumably is the row count of the public test set —
# confirm; the recorded output shows no rows at or beyond this offset.
SPLIT_ROW = 25108

tmp = pd.read_csv('testing/g4.csv')
g4_labels = tmp['飆股']

# Label distribution before and after the split row.
print(g4_labels[:SPLIT_ROW].value_counts())
print(g4_labels[SPLIT_ROW:].value_counts())


飆股
0    25104
1        4
Name: count, dtype: int64
Series([], Name: count, dtype: int64)
