In [1]:
import lightgbm as lgb
import numpy as np
import pandas as pd
from tqdm import tqdm
from sklearn.preprocessing import PowerTransformer, StandardScaler
from sklearn.metrics import precision_recall_curve, precision_score, recall_score, f1_score, confusion_matrix
from sklearn.model_selection import train_test_split

def eval_all(preds, dtrain):
    y_true = dtrain.get_label()
    prec, recall, thresholds = precision_recall_curve(y_true, preds)
    f1s = 2 * (prec * recall) / (prec + recall + 1e-10)
    f1s[np.isnan(f1s)] = 0
    best_idx = np.argmax(f1s)
    return [
            ('f1', f1s[best_idx], True),
            ('precision', prec[best_idx], True),
            ('recall', recall[best_idx], True),
            ]

In [2]:
train_df = pd.read_parquet("../Data/training_feature_v1.parquet")
test_df = pd.read_parquet("../testing/X_all.parquet")
# sudo0_df = pd.read_csv("../smt/prc9/sudo_0.csv") #180
# sudo1_df = pd.read_csv("../smt/prc9/prc9_smt_127.csv")

# sudo1_df = pd.read_csv("../testing/true127.csv")
# sudo1_df= pd.read_csv('../testing/guess136.csv')
tmpl = pd.read_csv('../testing/submission_template_public_and_private.csv')

In [3]:
guess_df = pd.read_csv('../testing/guess142-2.csv')
guess0_df = pd.read_csv('../testing/guess0_61.csv')

In [28]:
NUM = 1000
CUM = 8
# ========== 讀取特徵重要性 ==========
# gain = pd.read_csv(filepath_or_buffer=f'./pos_shap_abs_mean.csv', index_col=0)

# method 1
gain = pd.read_csv(filepath_or_buffer=f'./gain.csv', index_col=0)
num_trails = gain.shape[1]

res = {}
for i in range(num_trails):
    rank = gain.iloc[:, i].nlargest(n=NUM).index
    for item in rank:
        if item not in res:
            res[item] = 0
        res[item] += 1
        
rank_cnt = pd.DataFrame(res.items(), columns=['feature', 'rank'])
selected_features = rank_cnt.query(f'rank >= {CUM}')['feature']
print(f"Selected {len(selected_features)} features")
del gain

Selected 677 features


In [37]:
selected_features = pd.read_csv('../Data/common_features4.csv', header=0)['feature'].tolist()

In [39]:
X_train_all = train_df[selected_features]
X_test = test_df[selected_features]
y_train_all = train_df['飆股']
scaler = PowerTransformer(method='yeo-johnson', standardize=True)
# scaler = StandardScaler()

X_train_test = scaler.fit_transform(np.concatenate([X_train_all, X_test], axis=0)) 
X_train_pt = X_train_test[:len(X_train_all)]
X_test_pt = X_train_test[len(X_train_all):]

In [40]:
guess_df['飆股'] = guess_df['飆股'].astype(float)
f_guess = guess_df.copy()
zero_list = np.where(guess0_df[:25108]['飆股']==1)[0] + 1
one_list = np.where(f_guess[:25108]['飆股']==1)[0] + 1
guess_mer_df = pd.merge(guess_df, guess0_df, how='left', on='ID', suffixes=('_1', '_0'))

zero_list_imp = [
    20029, 14120, 13181, 7555, 780, 
    6722, 14467, 6833, 22570, 12519, 
    8806, 4160, 7613
]
one_list_imp = [15012, 12155, 23668, 19506]



zero_np = np.array(zero_list) - 1
one_np = np.array(one_list) - 1
# known_idx = np.concatenate((zero_np, one_np))
known_idx = zero_np
# known_idx = one_np
print(f"Length of zero index: {len(zero_np)}")
print(f"Length of one index: {len(one_np)}")
print(f"Length of test index: {len(known_idx)}")

Length of zero index: 67
Length of one index: 143
Length of test index: 67


In [41]:
X_train, X_val, y_train, y_val = train_test_split(X_train_pt, y_train_all, test_size=0.12,
                                                   random_state=42, stratify=y_train_all)

X_test_known = X_test_pt[known_idx]
y_test_known = np.concatenate([np.zeros(len(zero_np)), np.ones(len(one_np))], axis=0)
y_test_known = np.zeros(len(zero_np))
# y_test_known = np.ones(len(one_np))

X_train_aug = np.concatenate([X_train, X_test_known], axis=0)
y_train_aug = np.concatenate([y_train, y_test_known], axis=0)

rate = (y_train_all == 0).sum() / (y_train_all == 1).sum() 
sample_weight = np.where(y_train == 1, rate, 1)
known_test_weight = np.where(y_test_known == 1, rate*0.5, rate)
# known_test_weight = np.array([1] * len(y_test_known))
sample_weight = np.concatenate([sample_weight, known_test_weight], axis=0)

In [45]:
# lgb_train_set = lgb.Dataset(X_train, y_train)
lgb_train_set = lgb.Dataset(X_train_aug, y_train_aug)
# lgb_train_set = lgb.Dataset(X_train_aug, y_train_aug, weight=sample_weight)
# lgb_train_set = lgb.Dataset(X_train_aug, y_train_aug)
lgb_valid_set = lgb.Dataset(X_val, y_val, reference=lgb_train_set)


params = {
    'objective': 'binary',
    'boosting_type': 'gbdt',
    # 'data_sample_strategy': 'goss',
    'metric': ['recall', 'f1', 'precision', ],
    # 'metric': 'binary_logloss',
    'is_unbalance': True,
    # 'scale_pos_weight': 20,
    'verbosity': -1,

    'learning_rate': 0.03,
    'num_leaves': 16,
    'max_depth': -1,
    'min_data_in_leaf': 5,
    # 'feature_pre_filter': False,

    'feature_fraction': 0.5,
    # 'bagging_fraction': 0.75,
    # 'top_rate': 0.25,
    # 'other_rate': 0.5,
    'pos_bagging_fraction': 1,
    'neg_bagging_fraction': 0.1,
    'bagging_freq': 100,

    # 'lambda_l2': 0.05, 

    'n_jobs': 30,
}

model = lgb.train(
    params,
    train_set=lgb_train_set,
    valid_sets=[lgb_valid_set],
    valid_names=['valid'],
    num_boost_round=20000,
    feval=[eval_all,],
    callbacks=[
        # lgb.early_stopping(stopping_rounds=3000, first_metric_only=True, verbose=True),
        lgb.log_evaluation(period=500),
        ],
)

best_iter = model.current_iteration()
print(f"Best iteration: {best_iter}")
print(model.params)

  3%|▋                           | 523/20000 [00:03<02:17, 141.90it/s]

[500]	valid's f1: 0.584906	valid's precision: 0.65493	valid's recall: 0.528409


  5%|█▎                         | 1018/20000 [00:07<02:13, 142.14it/s]

[1000]	valid's f1: 0.651899	valid's precision: 0.735714	valid's recall: 0.585227


  8%|██                         | 1528/20000 [00:10<02:10, 141.25it/s]

[1500]	valid's f1: 0.658824	valid's precision: 0.682927	valid's recall: 0.636364


 10%|██▋                        | 2027/20000 [00:14<02:07, 140.58it/s]

[2000]	valid's f1: 0.697819	valid's precision: 0.772414	valid's recall: 0.636364


 13%|███▍                       | 2528/20000 [00:17<02:03, 141.90it/s]

[2500]	valid's f1: 0.722581	valid's precision: 0.835821	valid's recall: 0.636364


 15%|████                       | 3025/20000 [00:21<02:02, 139.12it/s]

[3000]	valid's f1: 0.726073	valid's precision: 0.866142	valid's recall: 0.625


 18%|████▊                      | 3528/20000 [00:25<01:57, 140.34it/s]

[3500]	valid's f1: 0.729642	valid's precision: 0.854962	valid's recall: 0.636364


 20%|█████▍                     | 4016/20000 [00:28<01:55, 138.24it/s]

[4000]	valid's f1: 0.738854	valid's precision: 0.84058	valid's recall: 0.659091


 23%|██████                     | 4527/20000 [00:32<01:54, 134.97it/s]

[4500]	valid's f1: 0.745875	valid's precision: 0.889764	valid's recall: 0.642045


 25%|██████▊                    | 5019/20000 [00:36<01:49, 136.62it/s]

[5000]	valid's f1: 0.755556	valid's precision: 0.856115	valid's recall: 0.676136


 28%|███████▍                   | 5524/20000 [00:39<01:45, 136.86it/s]

[5500]	valid's f1: 0.753894	valid's precision: 0.834483	valid's recall: 0.6875


 30%|████████                   | 6014/20000 [00:43<01:43, 135.57it/s]

[6000]	valid's f1: 0.756757	valid's precision: 0.933333	valid's recall: 0.636364


 33%|████████▊                  | 6518/20000 [00:47<01:39, 136.06it/s]

[6500]	valid's f1: 0.767213	valid's precision: 0.906977	valid's recall: 0.664773


 35%|█████████▍                 | 7022/20000 [00:50<01:36, 134.73it/s]

[7000]	valid's f1: 0.771704	valid's precision: 0.888889	valid's recall: 0.681818


 38%|██████████▏                | 7526/20000 [00:54<01:33, 133.97it/s]

[7500]	valid's f1: 0.770227	valid's precision: 0.894737	valid's recall: 0.676136


 40%|██████████▊                | 8016/20000 [00:58<01:28, 135.35it/s]

[8000]	valid's f1: 0.782895	valid's precision: 0.929688	valid's recall: 0.676136


 43%|███████████▌               | 8520/20000 [01:01<01:25, 134.55it/s]

[8500]	valid's f1: 0.778878	valid's precision: 0.929134	valid's recall: 0.670455


 45%|████████████▏              | 9024/20000 [01:05<01:21, 134.07it/s]

[9000]	valid's f1: 0.771704	valid's precision: 0.888889	valid's recall: 0.681818


 48%|████████████▊              | 9523/20000 [01:09<01:15, 139.44it/s]

[9500]	valid's f1: 0.776316	valid's precision: 0.921875	valid's recall: 0.670455


 50%|█████████████             | 10023/20000 [01:12<01:06, 150.73it/s]

[10000]	valid's f1: 0.774603	valid's precision: 0.877698	valid's recall: 0.693182


 53%|█████████████▋            | 10519/20000 [01:15<01:01, 153.11it/s]

[10500]	valid's f1: 0.772152	valid's precision: 0.871429	valid's recall: 0.693182


 55%|██████████████▎           | 11018/20000 [01:19<00:55, 161.80it/s]

[11000]	valid's f1: 0.77116	valid's precision: 0.86014	valid's recall: 0.698864


 58%|██████████████▉           | 11528/20000 [01:22<00:50, 167.97it/s]

[11500]	valid's f1: 0.775244	valid's precision: 0.908397	valid's recall: 0.676136


 60%|███████████████▋          | 12027/20000 [01:25<00:47, 167.86it/s]

[12000]	valid's f1: 0.779221	valid's precision: 0.909091	valid's recall: 0.681818


 63%|████████████████▎         | 12530/20000 [01:28<00:43, 173.51it/s]

[12500]	valid's f1: 0.778135	valid's precision: 0.896296	valid's recall: 0.6875


 65%|████████████████▉         | 13033/20000 [01:31<00:40, 171.96it/s]

[13000]	valid's f1: 0.771704	valid's precision: 0.888889	valid's recall: 0.681818


 68%|█████████████████▌        | 13524/20000 [01:33<00:35, 179.89it/s]

[13500]	valid's f1: 0.772152	valid's precision: 0.871429	valid's recall: 0.693182


 70%|██████████████████▏       | 14018/20000 [01:36<00:33, 178.94it/s]

[14000]	valid's f1: 0.772277	valid's precision: 0.92126	valid's recall: 0.664773


 73%|██████████████████▉       | 14532/20000 [01:39<00:30, 179.69it/s]

[14500]	valid's f1: 0.77707	valid's precision: 0.884058	valid's recall: 0.693182


 75%|███████████████████▌      | 15029/20000 [01:42<00:27, 180.06it/s]

[15000]	valid's f1: 0.781759	valid's precision: 0.916031	valid's recall: 0.681818


 78%|████████████████████▏     | 15521/20000 [01:44<00:24, 183.88it/s]

[15500]	valid's f1: 0.780328	valid's precision: 0.922481	valid's recall: 0.676136


 80%|████████████████████▊     | 16034/20000 [01:47<00:21, 184.03it/s]

[16000]	valid's f1: 0.776316	valid's precision: 0.921875	valid's recall: 0.670455


 83%|█████████████████████▍    | 16528/20000 [01:50<00:18, 183.90it/s]

[16500]	valid's f1: 0.774194	valid's precision: 0.895522	valid's recall: 0.681818


 85%|██████████████████████▏   | 17022/20000 [01:53<00:16, 179.87it/s]

[17000]	valid's f1: 0.775244	valid's precision: 0.908397	valid's recall: 0.676136


 88%|██████████████████████▊   | 17536/20000 [01:55<00:13, 187.41it/s]

[17500]	valid's f1: 0.776699	valid's precision: 0.902256	valid's recall: 0.681818


 90%|███████████████████████▍  | 18033/20000 [01:58<00:10, 185.14it/s]

[18000]	valid's f1: 0.772727	valid's precision: 0.901515	valid's recall: 0.676136


 93%|████████████████████████  | 18528/20000 [02:01<00:07, 185.85it/s]

[18500]	valid's f1: 0.772727	valid's precision: 0.901515	valid's recall: 0.676136


 95%|████████████████████████▋ | 19028/20000 [02:03<00:05, 187.52it/s]

[19000]	valid's f1: 0.771704	valid's precision: 0.888889	valid's recall: 0.681818


 98%|█████████████████████████▍| 19522/20000 [02:06<00:02, 187.55it/s]

[19500]	valid's f1: 0.770227	valid's precision: 0.894737	valid's recall: 0.676136


                                                                      

[20000]	valid's f1: 0.769231	valid's precision: 0.882353	valid's recall: 0.681818
Best iteration: 20000
{'objective': 'binary', 'boosting_type': 'gbdt', 'metric': ['recall', 'f1', 'precision'], 'is_unbalance': True, 'verbosity': -1, 'learning_rate': 0.03, 'num_leaves': 16, 'max_depth': -1, 'min_data_in_leaf': 5, 'feature_fraction': 0.5, 'pos_bagging_fraction': 1, 'neg_bagging_fraction': 0.1, 'bagging_freq': 100, 'n_jobs': 30, 'num_iterations': 20000}


In [43]:
val_prob = model.predict(data=X_val, n_jobs=30)

prec, recall, thresholds = precision_recall_curve(y_val, val_prob)
prec_lst = prec[:-1]
rec_lst = recall[:-1]
f1s = 2 * prec_lst * rec_lst / (prec_lst + rec_lst + 1e-10)
best_idx = np.argmax(f1s)
best_threshold_be = thresholds[best_idx]
print(f"Best threshold: {best_threshold_be:.4f}, F1: {f1s[best_idx]:.4f}, Precision: {prec[best_idx]:.4f}, Recall: {recall[best_idx]:.4f}")
val_pred_label = (val_prob >= best_threshold_be).astype(int)
val_num_pos = np.unique(val_pred_label, return_counts=True)[1][1]
print(f"Validation num pos: {val_num_pos}")


test_prob = model.predict(data=X_test_pt)
y_pred_label = (test_prob >= best_threshold_be).astype(int)
public_test_num_pos = np.unique(y_pred_label[:25108], return_counts=True)[1][1]
private_test_num_pos = np.unique(y_pred_label[25108:], return_counts=True)[1][1]
print(f"Before retain,  Public test num pos: {public_test_num_pos}, Private test num pos: {private_test_num_pos}")

Best threshold: 0.0920, F1: 0.7862, Precision: 0.8803, Recall: 0.7102
Validation num pos: 142
Before retain,  Public test num pos: 124, Private test num pos: 140


In [None]:
kk = pd.DataFrame({
    'ID': tmpl['ID'],
    'pred': y_pred_label,
    'prob': test_prob,
})[:25108]
kk.merge(guess_mer_df, how='left', on='ID').sort_values(by=['prob','飆股_1', '飆股_0'], ascending=False).reset_index(drop=True)

In [44]:
true_pos = sum((y_pred_label[:25108] == 1) & (guess_mer_df[:25108]['飆股_1'] == 1))
false_pos = sum((y_pred_label[:25108] == 1) & (guess_mer_df[:25108]['飆股_0'] == 1))
false_neg = sum((y_pred_label[:25108] == 0) & (guess_mer_df[:25108]['飆股_1'] == 1))
new_pred = sum((y_pred_label[:25108] == 1) & (guess_mer_df[:25108]['飆股_1'] == 0) & (guess_mer_df[:25108]['飆股_0'] == 0))
print(f"True pos: {true_pos}, False pos: {false_pos}, False neg: {false_neg}, New pred: {new_pred}")

True pos: 119, False pos: 0, False neg: 24, New pred: 5


In [None]:
pd.DataFrame({
    'ID': kk['ID'],
    '飆股': ((kk[:25108]['pred']==1) & ((guess_mer_df[:25108]['飆股_1']==0) & (guess_mer_df[:25108]['飆股_0']==0))).astype(int),
}).sort_values(by=['飆股'], ascending=False)

## Retrain

In [None]:
re_weight = np.where(y_val == 1, rate, 1)
re_sample_weight = np.concatenate([sample_weight, re_weight], axis=0)

retrain_set = np.concat([X_train_aug, X_val], axis=0)
retrain_label = np.concat([y_train_aug, y_val], axis=0)

retrain_set = lgb.Dataset(retrain_set, label=retrain_label, weight=re_sample_weight)
# retrain_set = lgb.Dataset(retrain_set, label=retrain_label, weight=retrain_sample_weight)


final_model = lgb.train(
    params,
    train_set=retrain_set,
    num_boost_round=best_iter,
)

In [None]:
test_prob = final_model.predict(data=X_test_pt)
y_pred_label = (test_prob >= best_threshold_be).astype(int)
public_test_num_pos = np.unique(y_pred_label[:25108], return_counts=True)[1][1]
private_test_num_pos = np.unique(y_pred_label[25108:], return_counts=True)[1][1]
print(f"After retain,  Public test num pos: {public_test_num_pos}, Private test num pos: {private_test_num_pos}")

In [None]:
pd.DataFrame({
    'ID': tmpl['ID'],
    'pred': y_pred_label,
    'prob': test_prob
})[:25108].merge(f_guess, how='left', on='ID').sort_values(by=['pred','prob'], ascending=False).reset_index(drop=True)
#.sort_values(by=['飆股','pred'], ascending=False).reset_index(drop=True)

In [None]:
pd.DataFrame({
    'ID': tmpl['ID'],
    '飆股': ((y_pred_label==1) & (f_guess['飆股']!=1)).astype(int),
})[:25108].to_csv('guess_146.csv', index=False)

In [None]:
def f1(prec, recall):
    if prec == 0 and recall == 0:
        return 0
    else:
        return (2 * prec * recall) / (prec + recall)
f1(0.93377, 0.80113)