In [3]:
import pandas as pd
import numpy as np
import lightgbm as lgb
from sklearn.metrics import f1_score
from tqdm import tqdm

# --- Part 1: Feature Engineering (Final Corrected Version) ---
print("--- Part 1: Feature Engineering (Final Corrected Version) ---")

# 1.1 Load Data
print("Loading data...")
try:
    df_trans = pd.read_csv('acct_transaction.csv')
    df_alert = pd.read_csv('acct_alert.csv')
except FileNotFoundError:
    print("Error: CSV files not found.")
    exit()

# 1.2 Positive Sample Feature Engineering
print("Generating features for positive samples...")
# (This part is correct and remains unchanged)
alert_accts = df_alert['acct'].unique()
acct_to_event_date = df_alert.set_index('acct')['event_date'].to_dict()
positive_features_list = []
for acct, event_date in tqdm(acct_to_event_date.items(), desc="Processing alerted accounts"):
    acct_txns = df_trans[((df_trans['from_acct'] == acct) | (df_trans['to_acct'] == acct)) & (df_trans['txn_date'] < event_date)].copy()
    if acct_txns.empty: continue
    features = {'acct': acct, 'label': 1, 'ref_date': event_date}
    time_windows = [1, 3, 7, 30]
    for window in time_windows:
        window_txns = acct_txns[acct_txns['txn_date'] >= event_date - window]
        from_txns = window_txns[window_txns['from_acct'] == acct]
        features[f'from_count_{window}d'] = len(from_txns); features[f'from_amt_sum_{window}d'] = from_txns['txn_amt'].sum(); features[f'from_amt_mean_{window}d'] = from_txns['txn_amt'].mean()
        to_txns = window_txns[window_txns['to_acct'] == acct]
        features[f'to_count_{window}d'] = len(to_txns); features[f'to_amt_sum_{window}d'] = to_txns['txn_amt'].sum()
    epsilon = 1e-6
    features['from_amt_ratio_3d_30d'] = features['from_amt_sum_3d'] / (features['from_amt_sum_3d'] + epsilon)
    features['from_count_ratio_3d_30d'] = features['from_count_3d'] / (features['from_count_3d'] + epsilon)
    positive_features_list.append(features)
positive_features_df = pd.DataFrame(positive_features_list).fillna(0)

# 1.3 Negative Sample Feature Engineering (Corrected for Time-Split)
print("\nGenerating features for negative samples (Corrected for Time-Split)...")
all_accts = np.union1d(df_trans['from_acct'].unique(), df_trans['to_acct'].unique())
non_alert_accts = np.setdiff1d(all_accts, alert_accts)
neg_pos_ratio = 3
num_negative_samples = len(positive_features_df) * neg_pos_ratio
num_negative_samples = min(num_negative_samples, len(non_alert_accts))
np.random.seed(42)
selected_neg_accts = np.random.choice(non_alert_accts, size=num_negative_samples, replace=False)

negative_features_list = []
for acct in tqdm(selected_neg_accts, desc="Processing non-alerted accounts"):
    acct_all_txns = df_trans[(df_trans['from_acct'] == acct) | (df_trans['to_acct'] == acct)]
    # **KEY CORRECTION**: Sample pseudo_event_date from the account's ENTIRE history
    acct_txn_dates = acct_all_txns['txn_date'].unique()
    if len(acct_txn_dates) < 2: continue
    pseudo_event_date = np.random.choice(acct_txn_dates[acct_txn_dates > np.min(acct_txn_dates)])

    acct_txns = acct_all_txns[acct_all_txns['txn_date'] < pseudo_event_date]
    if acct_txns.empty: continue
    features = {'acct': acct, 'label': 0, 'ref_date': pseudo_event_date}
    time_windows = [1, 3, 7, 30]
    for window in time_windows:
        window_txns = acct_txns[acct_txns['txn_date'] >= pseudo_event_date - window]
        from_txns = window_txns[window_txns['from_acct'] == acct]
        features[f'from_count_{window}d'] = len(from_txns); features[f'from_amt_sum_{window}d'] = from_txns['txn_amt'].sum(); features[f'from_amt_mean_{window}d'] = from_txns['txn_amt'].mean()
        to_txns = window_txns[window_txns['to_acct'] == acct]
        features[f'to_count_{window}d'] = len(to_txns); features[f'to_amt_sum_{window}d'] = to_txns['txn_amt'].sum()
    epsilon = 1e-6
    features['from_amt_ratio_3d_30d'] = features['from_amt_sum_3d'] / (features['from_amt_sum_3d'] + epsilon)
    features['from_count_ratio_3d_30d'] = features['from_count_3d'] / (features['from_count_3d'] + epsilon)
    negative_features_list.append(features)
negative_features_df = pd.DataFrame(negative_features_list).fillna(0)

# 1.4 Combine
full_features_df = pd.concat([positive_features_df, negative_features_df], ignore_index=True)
print(f"\nFull feature dataset created. Total samples: {len(full_features_df)}")

# --- Part 2: Time-Split Validation ---
print("\n--- Part 2: Time-Split Validation ---")
split_date = 90
train_df = full_features_df[full_features_df['ref_date'] <= split_date]
val_df = full_features_df[full_features_df['ref_date'] > split_date]

print(f"Training set size: {len(train_df)} (ref_date <= {split_date})")
print(f"Validation set size: {len(val_df)} (ref_date > {split_date})")

# Error handling for empty splits
if len(train_df) == 0 or len(val_df) == 0 or 0 not in train_df['label'].value_counts() or 1 not in train_df['label'].value_counts() or 1 not in val_df['label'].value_counts():
    print("\nError: The data split resulted in an invalid training/validation set.")
    exit()

print(f"Training set label distribution:\n{train_df['label'].value_counts()}")
print(f"Validation set label distribution:\n{val_df['label'].value_counts()}")

features_to_drop = ['acct', 'label', 'ref_date']
X_train = train_df.drop(columns=features_to_drop)
y_train = train_df['label']
X_val = val_df.drop(columns=features_to_drop)
y_val = val_df['label']
X_val = X_val[X_train.columns]

scale_pos_weight = y_train.value_counts()[0] / y_train.value_counts()[1]
print(f"Calculated scale_pos_weight: {scale_pos_weight:.2f}")

lgb_params = {
    'objective': 'binary', 'boosting_type': 'gbdt', 'n_estimators': 1000,
    'learning_rate': 0.05, 'num_leaves': 31, 'max_depth': -1,
    'seed': 42, 'n_jobs': -1, 'verbose': -1, 'scale_pos_weight': scale_pos_weight
}
model = lgb.LGBMClassifier(**lgb_params)
model.fit(X_train, y_train, eval_set=[(X_val, y_val)], eval_metric='logloss', callbacks=[lgb.early_stopping(100, verbose=False)])

val_preds_proba = model.predict_proba(X_val)[:, 1]
best_f1, best_thresh = 0, 0.5
for thresh in np.arange(0.1, 0.9, 0.01):
    f1 = f1_score(y_val, (val_preds_proba > thresh).astype(int))
    if f1 > best_f1:
        best_f1 = f1
        best_thresh = thresh

print("\n--- Time-Consistent Validation Result ---")
print(f"F1-Score on the time-split validation set: {best_f1:.4f}")
print(f"Found best threshold: {best_thresh:.2f}")
print("This F1-Score is our new reliable baseline.")

--- Part 1: Feature Engineering (Final Corrected Version) ---
Loading data...
Generating features for positive samples...


Processing alerted accounts: 100%|██████████| 1004/1004 [06:07<00:00,  2.73it/s]



Generating features for negative samples (Corrected for Time-Split)...


Processing non-alerted accounts: 100%|██████████| 2838/2838 [17:13<00:00,  2.75it/s]



Full feature dataset created. Total samples: 1949

--- Part 2: Time-Split Validation ---
Training set size: 1286 (ref_date <= 90)
Validation set size: 663 (ref_date > 90)
Training set label distribution:
label
0    653
1    633
Name: count, dtype: int64
Validation set label distribution:
label
0    350
1    313
Name: count, dtype: int64
Calculated scale_pos_weight: 1.03

--- Time-Consistent Validation Result ---
F1-Score on the time-split validation set: 0.8477
Found best threshold: 0.32
This F1-Score is our new reliable baseline.


In [4]:
import pandas as pd
import numpy as np
import lightgbm as lgb
from sklearn.metrics import f1_score
from tqdm import tqdm

# --- 假设 full_features_df, df_trans, X_train.columns (來自上一段程式碼) 已经存在 ---

# --- 阶段一: 使用全量数据训练最终模型 ---
print("--- Phase 1: Training Final Model on ALL Data ---")

# 准备全量训练数据
X_full_train = full_features_df.drop(columns=['acct', 'label', 'ref_date'])
y_full_train = full_features_df['label']
X_full_train = X_full_train[X_train.columns] # 确保欄位順序与之前验证时一致

# 计算全量数据的 scale_pos_weight
scale_pos_weight = y_full_train.value_counts()[0] / y_full_train.value_counts()[1]
print(f"Training with {len(full_features_df)} samples. Scale_pos_weight: {scale_pos_weight:.2f}")

# 使用我们验证过的 lgb_params 参数
lgb_params = {
    'objective': 'binary', 'boosting_type': 'gbdt', 'n_estimators': 500, # 使用固定的迭代次数
    'learning_rate': 0.05, 'num_leaves': 31, 'max_depth': -1,
    'seed': 42, 'n_jobs': -1, 'verbose': -1, 'scale_pos_weight': scale_pos_weight
}

# 训练最终模型
final_model = lgb.LGBMClassifier(**lgb_params)
final_model.fit(X_full_train, y_full_train)
print("Final model training complete.")


# --- 阶段二: 为测试集生成特徵 (使用之前优化过的极速版) ---
print("\n--- Phase 2: Generating Features for Test Set (Vectorized) ---")
df_predict = pd.read_csv('acct_predict.csv')
test_accts = df_predict['acct'].unique()
latest_date = df_trans['txn_date'].max()

test_features_df = pd.DataFrame(index=test_accts)
test_features_df.index.name = 'acct'
time_windows = [1, 3, 7, 30]

for window in tqdm(time_windows, desc="Processing time windows"):
    window_trans = df_trans[df_trans['txn_date'] >= latest_date - window]
    from_feats = window_trans.groupby('from_acct')['txn_amt'].agg(['count', 'sum', 'mean'])
    from_feats.columns = [f'from_count_{window}d', f'from_amt_sum_{window}d', f'from_amt_mean_{window}d']
    to_feats = window_trans.groupby('to_acct')['txn_amt'].agg(['count', 'sum'])
    to_feats.columns = [f'to_count_{window}d', f'to_amt_sum_{window}d']
    test_features_df = test_features_df.join(from_feats, how='left').join(to_feats, how='left')

epsilon = 1e-6
test_features_df['from_amt_ratio_3d_30d'] = test_features_df['from_amt_sum_3d'] / (test_features_df['from_amt_sum_30d'] + epsilon)
test_features_df['from_count_ratio_3d_30d'] = test_features_df['from_count_3d'] / (test_features_df['from_count_30d'] + epsilon)
test_features_df.fillna(0, inplace=True)
test_features_df = test_features_df[X_train.columns] # 使用验证过的欄位顺序
print(f"Successfully generated features for {len(test_features_df)} test accounts.")


# --- 阶段三: 預測并使用可靠的阈值生成提交文件 ---
print("\n--- Phase 3: Predicting and Generating Final Submission File ---")

# 预测
test_predictions_proba = final_model.predict_proba(test_features_df)[:, 1]

# **关键**: 使用我们从可靠的验证集中找到的最佳阈值
best_thresh_from_validation = 0.32
print(f"Using the reliable threshold found from our time-split validation: {best_thresh_from_validation}")

# 生成最终预测
test_predictions_final = (test_predictions_proba > best_thresh_from_validation).astype(int)

# 创建提交文件
submission_df = pd.DataFrame({'acct': test_accts, 'label': test_predictions_final})
submission_df.to_csv('submission_final.csv', index=False)

print("\n--- Submission File 'submission_final.csv' Generated ---")
print(f"Prediction distribution:\n{submission_df['label'].value_counts()}")
print("Preview of submission file:")
print(submission_df.head())

--- Phase 1: Training Final Model on ALL Data ---
Training with 1949 samples. Scale_pos_weight: 1.06
Final model training complete.

--- Phase 2: Generating Features for Test Set (Vectorized) ---


Processing time windows: 100%|██████████| 4/4 [00:02<00:00,  1.63it/s]

Successfully generated features for 4780 test accounts.

--- Phase 3: Predicting and Generating Final Submission File ---
Using the reliable threshold found from our time-split validation: 0.32

--- Submission File 'submission_final.csv' Generated ---
Prediction distribution:
label
1    3452
0    1328
Name: count, dtype: int64
Preview of submission file:
                                                acct  label
0  fcf31c5113d3dbd9cb5056045c6a0f213bd8a4fc1bc834...      1
1  e21dfa45e990364194468e501fbfe52ec02a4b71a2e2e8...      0
2  2552e943aaf9caa33183758cd40128ef20a6e6ff16c232...      1
3  71700e7b7c3d40abdfdbcc7afc0752fa8d9bd28b408651...      1
4  c70349fc718ffb88f03f31b5a7fcf65b33dd71dce6fee0...      1



