In [None]:
import lightgbm as lgb
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.preprocessing import PowerTransformer, StandardScaler
from sklearn.metrics import precision_recall_curve, precision_score, recall_score, f1_score, confusion_matrix, 
from sklearn.model_selection import train_test_split, StratifiedKFold

def eval_all(preds, dtrain):
    """Custom LightGBM evaluation function reporting F1, precision and recall.

    The precision-recall curve is computed over the predictions, and the three
    metrics are reported at the curve point where F1 is highest.

    Args:
        preds: Predicted scores for the evaluated dataset.
        dtrain: Object exposing ``get_label()`` that returns the true labels
            (LightGBM passes the dataset being evaluated).

    Returns:
        A list of ``(metric_name, value, is_higher_better)`` tuples for
        'f1', 'precision' and 'recall', each flagged as higher-is-better.
    """
    precisions, recalls, _ = precision_recall_curve(dtrain.get_label(), preds)

    # The small epsilon keeps the denominator non-zero when both terms are 0.
    f1_scores = 2 * (precisions * recalls) / (precisions + recalls + 1e-10)
    f1_scores = np.where(np.isnan(f1_scores), 0, f1_scores)

    top = np.argmax(f1_scores)
    metric_names = ('f1', 'precision', 'recall')
    metric_curves = (f1_scores, precisions, recalls)
    return [(name, curve[top], True) for name, curve in zip(metric_names, metric_curves)]

In [12]:
# Parquet inputs: the training feature table and the test feature table.
TRAIN_FEATURES_PATH = "../Data/training_feature_v1.parquet"
TEST_FEATURES_PATH = "../testing/X_all.parquet"

train_df = pd.read_parquet(TRAIN_FEATURES_PATH)
test_df = pd.read_parquet(TEST_FEATURES_PATH)

# Alternative inputs kept for reference (currently disabled):
# sudo0_df = pd.read_csv("../smt/prc9/sudo_0.csv") #180
# sudo1_df = pd.read_csv("../smt/prc9/prc9_smt_127.csv")
# sudo1_df = pd.read_csv("../testing/true127.csv")
# sudo1_df= pd.read_csv('../testing/guess.csv')
# tmpl = pd.read_csv('../testing/submission_template_public_and_private.csv')

In [16]:
NUM = 1000  # how many of the largest-gain features to take from each column of gain.csv
CUM = 15    # a feature is kept if it lands in that top-NUM set in at least CUM columns
# ========== Load feature importances ==========
# gain = pd.read_csv(filepath_or_buffer=f'./pos_shap_abs_mean.csv', index_col=0)

# method 1
gain = pd.read_csv('../feat_v4/gain.csv', index_col=0)
num_trails = gain.shape[1]

# For every column (presumably one trial/run per column - confirm against the
# file), count how often each feature appears among the NUM largest values.
res = {}
for _, column_gain in gain.items():
    for item in column_gain.nlargest(n=NUM).index:
        res[item] = res.get(item, 0) + 1

rank_cnt = pd.DataFrame(list(res.items()), columns=['feature', 'rank'])
selected_features = rank_cnt.loc[rank_cnt['rank'] >= CUM, 'feature']
print(f"Selected {len(selected_features)} features")

# The importance table is no longer needed once the selection is made.
del gain

Selected 153 features


In [17]:
# Target column of the training table.
y_train = train_df['飆股']

# Yeo-Johnson power transform with standardization: fitted on the selected
# training features, then the fitted transform is applied to the test features.
scaler = PowerTransformer(method='yeo-johnson', standardize=True)
X_train = scaler.fit_transform(train_df[selected_features])
X_test = scaler.transform(test_df[selected_features])

In [27]:
# Draw the per-repetition seeds from a fixed global seed so runs are repeatable.
np.random.seed(1111)
seeds = np.random.randint(0, 10000, size=10)
num_splits = 10

n_rows = X_train.shape[0]
n_seeds = len(seeds)

# Per-row bookkeeping: misclassification counter, plus one column per seed for
# the out-of-fold probabilities and thresholded predictions.
wrong_cnt = np.zeros(n_rows)
prob_metrics = np.zeros((n_rows, n_seeds))
pred_metrics = np.zeros((n_rows, n_seeds))

# Per-(seed, fold) records: chosen threshold and the metrics at that threshold.
thres_rec, f1_rec, prec_rec, recall_rec = (
    np.zeros((n_seeds, num_splits)) for _ in range(4)
)

In [None]:
# Repeated stratified K-fold training.
# For every seed the data is re-split into `num_splits` folds; a LightGBM model
# is trained per fold, the F1-maximizing threshold is picked on the validation
# fold, and out-of-fold probabilities / predictions / metrics are recorded.

# Hyper-parameters shared by every fold; only 'seed' changes per repetition.
base_params = {
    'objective': 'binary',
    'boosting_type': 'gbdt',
    # 'data_sample_strategy': 'goss',
    # 'recall' / 'f1' / 'precision' are not built-in LightGBM metric names; the
    # metrics come from the custom `eval_all` feval instead. 'None' (a string)
    # registers no built-in metric, so the first feval metric ('f1') is the one
    # early stopping tracks with first_metric_only=True.
    'metric': 'None',
    # 'metric': 'binary_logloss',
    'is_unbalance': True,
    'verbosity': -1,

    'learning_rate': 0.03,
    'num_leaves': 64,
    'max_depth': -1,
    # 'min_data_in_leaf': 5,

    'feature_fraction': 0.7,
    # NOTE(review): per the LightGBM docs, bagging_fraction only takes effect
    # when bagging_freq > 0. bagging_freq is not set here, so row subsampling
    # is not active. Left unchanged so results stay comparable - confirm intent.
    'bagging_fraction': 0.75,
    # 'top_rate': 0.25,
    # 'other_rate': 0.5,

    'n_jobs': 50,
}

for idx, seed in enumerate(seeds):
    stf = StratifiedKFold(n_splits=num_splits, shuffle=True, random_state=seed)
    params = {**base_params, 'seed': seed}

    for fold, (train_idx, val_idx) in enumerate(stf.split(X_train, y_train)):
        print(f"Iteration {idx+1}, fold {fold+1}")

        # X_train is indexed positionally (array); y_train is a pandas object.
        X_train_fold, X_val_fold = X_train[train_idx], X_train[val_idx]
        y_train_fold, y_val_fold = y_train.iloc[train_idx], y_train.iloc[val_idx]

        # LightGBM
        lgb_train_set = lgb.Dataset(X_train_fold, label=y_train_fold)
        lgb_valid_set = lgb.Dataset(X_val_fold, label=y_val_fold, reference=lgb_train_set)

        model = lgb.train(
            params,
            train_set=lgb_train_set,
            valid_sets=[lgb_valid_set],
            # One name per validation set (only one set is passed).
            valid_names=['valid'],
            num_boost_round=10000,
            feval=[eval_all],
            callbacks=[
                lgb.early_stopping(stopping_rounds=3000, first_metric_only=True, verbose=True),
                lgb.log_evaluation(period=1000),
                # lgb.reset_parameter(learning_rate=dynamic_lr),
            ],
        )

        # Pick the decision threshold that maximizes F1 on the validation fold.
        val_prob = model.predict(X_val_fold, num_iteration=model.best_iteration)
        prec, recall, thresholds = precision_recall_curve(y_val_fold, val_prob)
        f1s = 2 * (prec * recall) / (prec + recall + 1e-10)
        f1s[np.isnan(f1s)] = 0
        best_idx = np.argmax(f1s)
        best_threshold = thresholds[best_idx]

        val_pred = (val_prob >= best_threshold).astype(int)
        wrong_idx = np.where(val_pred != y_val_fold.values.reshape(-1))[0]
        # Map fold-local positions back to row positions of the full training
        # set and index wrong_cnt exactly once. The previous chained form
        # `wrong_cnt[val_idx][wrong_idx] += 1` incremented a temporary copy
        # (fancy indexing copies), so wrong_cnt was never updated.
        wrong_cnt[val_idx[wrong_idx]] += 1

        # Fix the label order so the matrix is always 2x2, even if a fold's
        # labels/predictions contain a single class.
        cm = confusion_matrix(y_val_fold, val_pred, labels=[0, 1])
        print(f"Number of False Positives: {cm[0][1]}, Number of False Negatives: {cm[1][0]}")

        # Out-of-fold outputs for this seed.
        prob_metrics[val_idx, idx] = val_prob
        pred_metrics[val_idx, idx] = val_pred

        thres_rec[idx, fold] = best_threshold
        f1_rec[idx, fold] = f1s[best_idx]
        prec_rec[idx, fold] = prec[best_idx]
        recall_rec[idx, fold] = recall[best_idx]
        print(f"Threshold: {best_threshold:.4f}, f1: {f1s[best_idx]:.4f}, precision: {prec[best_idx]:.4f}, recall: {recall[best_idx]:.4f}")
        print()
    # print(f"==========================")

In [51]:
def Cal_F1(precison, recall):
    """Return the F1 score (harmonic mean) of a precision/recall pair.

    A 1e-10 term is added to the denominator so that a pair of zeros yields 0
    instead of raising a division-by-zero error.

    Args:
        precison: Precision value.
        recall: Recall value.

    Returns:
        ``2 * precison * recall / (precison + recall + 1e-10)``.
    """
    numerator = 2 * (precison * recall)
    denominator = precison + recall + 1e-10
    return numerator / denominator

Cal_F1(0.9161, 0.7655)

0.8340563153642925