In [1]:
"""
這份檔案用來執行模型訓練的部分，主要利用 CatBoost 來訓練模型，可透過參數利用 GPU 進行運算

input：特徵處理完的training dataset(processed_data.parquet)、validation dataset(val_data.parquet)

output：最終final_prediction和訓練後model模型檔案model.cbm 


"""

'\n這份檔案要用來執行訓練行的部分，主要利用catboost來訓練模型，可透過參數利用GPU進行運算\n\ninput：特徵處理完的training dataset(processed_data.parquet)、validation dataset(val_data.parquet)\n\noutput：最終final_prediction  \n\n\n'

In [2]:
import pandas as pd
import numpy as np
import datetime
import dask.dataframe as dd
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import StratifiedKFold, train_test_split
from sklearn.metrics import auc
from sklearn import metrics
from sklearn.metrics import precision_score, recall_score, f1_score
from collections import Counter
from catboost import CatBoostClassifier, Pool, EShapCalcType, EFeaturesSelectionAlgorithm
from scipy.stats import entropy

# 訓練模型

In [5]:
# Read the two feature-engineered parquet files: the labelled training set and
# the validation set that will be scored at the end of the notebook.
new_train_data, new_val_data = (
    pd.read_parquet(parquet_path)
    for parquet_path in ('processed_data.parquet', 'val_data.parquet')
)

# Sample submission file; its 'txkey' column later defines which rows (and in
# which order) are written to the final prediction CSV.
example = pd.read_csv(filepath_or_buffer='dataset_1st/31_範例繳交檔案.csv')


In [6]:
# Columns handed to CatBoost as categorical features.
cat_features = [
    'contp', 'etymd', 'mcc', 'ecfg',
    'stocn', 'scity', 'insfg', 'mchno',
    'acqic', 'stscd', 'hcefg', 'csmcu',
    'flg_3dsmk', 'hour', 'city_change', 'country_change',
]

# Columns removed before training: the target itself plus columns that are
# not used as model inputs.
columns_to_drop = [
    'label', 'txkey', 'chid', 'cano',
    'bnsfg', 'flbmk', 'ovrlt', 'iterm',
]

# Target vector first, then the feature matrix with the listed columns removed.
y = new_train_data['label']
X = new_train_data.drop(columns_to_drop, axis='columns')


In [7]:
# Cast every categorical column to str in one pass before building the Pools.
X = X.astype(dict.fromkeys(cat_features, str))

# Stratified 65/35 split so both parts keep the same label ratio; the seed is
# fixed to make the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.35, random_state=40, stratify=y
)

# Wrap both splits as CatBoost Pools, declaring which columns are categorical.
train_pool = Pool(data=X_train, label=y_train, cat_features=cat_features)
test_pool = Pool(data=X_test, label=y_test, cat_features=cat_features)


In [23]:
"""
註記：若電腦為 Mac 系列，則無法使用 task_type='GPU'；請將該參數註解掉，並取消註解下方的 colsample_bylevel 與 subsample 兩個參數後再進行訓練，其餘超參數皆無須調整

"""
# Gradient-boosted classifier for the binary 'label' target.
catboost_model = CatBoostClassifier(
    iterations=5000,                # upper bound; early stopping usually ends sooner
    learning_rate=0.0325,
    depth=7,
    loss_function='Logloss',
    eval_metric='F1',               # metric tracked on eval_set for best-iteration selection
    early_stopping_rounds=800,
    random_seed=42,
    verbose=100,                    # log every 100 iterations
    l2_leaf_reg=3,
    leaf_estimation_iterations=10,
    # CPU-only options; see the note at the top of this cell for when to
    # enable them (and to disable task_type='GPU').
#     colsample_bylevel=0.8,
#     subsample=0.85,
    max_ctr_complexity=10,
    task_type='GPU',
    scale_pos_weight=9.484,         # original note: 9.45 (presumably an earlier value tried)
    random_strength=3,
    grow_policy='Lossguide'
)

# Fit on the training split. test_pool acts as the evaluation set for early
# stopping, and use_best_model=True keeps the iteration with the best eval
# metric, so scores later computed on test_pool are optimistic.
catboost_model.fit(
    train_pool,
    eval_set=test_pool,
    use_best_model=True
)

# The notebook header lists the trained model file model.cbm as an output,
# but nothing wrote it; persist the fitted model here.
catboost_model.save_model('model.cbm')

0:	learn: 0.8703902	test: 0.8745276	best: 0.8745276 (0)	total: 90.6ms	remaining: 7m 32s
100:	learn: 0.9055057	test: 0.9062178	best: 0.9062178 (100)	total: 9.48s	remaining: 7m 39s
200:	learn: 0.9237512	test: 0.9225561	best: 0.9225561 (200)	total: 19.8s	remaining: 7m 53s
300:	learn: 0.9322794	test: 0.9297908	best: 0.9297908 (299)	total: 30s	remaining: 7m 47s
400:	learn: 0.9404068	test: 0.9357573	best: 0.9357573 (400)	total: 39.7s	remaining: 7m 35s
500:	learn: 0.9463741	test: 0.9401230	best: 0.9401923 (496)	total: 49.3s	remaining: 7m 22s
600:	learn: 0.9512660	test: 0.9434974	best: 0.9434974 (600)	total: 58.5s	remaining: 7m 8s
700:	learn: 0.9545332	test: 0.9454646	best: 0.9454967 (699)	total: 1m 7s	remaining: 6m 55s
800:	learn: 0.9587598	test: 0.9474868	best: 0.9474868 (799)	total: 1m 17s	remaining: 6m 43s
900:	learn: 0.9624912	test: 0.9503524	best: 0.9503822 (897)	total: 1m 26s	remaining: 6m 32s
1000:	learn: 0.9657967	test: 0.9517409	best: 0.9517409 (1000)	total: 1m 35s	remaining: 6m 21s


<catboost.core.CatBoostClassifier at 0x19d0dc6b340>

In [24]:
# Hard class predictions for the held-out split.
# NOTE(review): test_pool was also the eval_set used for early stopping and
# best-iteration selection, so this score is likely optimistic.
y_pred = catboost_model.predict(test_pool)

# Precision for the positive class (label 1).
precision = precision_score(
    y_true=y_test,
    y_pred=y_pred,
    pos_label=1,
    average='binary',
)

print(f"Precision: {precision}")


Precision: 0.8923089159309632


In [25]:
# Recall for the positive class (label 1), computed from the y_pred produced
# by the precision cell on the same held-out split.
recall = recall_score(
    y_true=y_test,
    y_pred=y_pred,
    pos_label=1,
    average='binary',
)

print(f"Recall: {recall}")



Recall: 0.9368684759916492


In [27]:
# Inspect feature importances.

# Take the feature names from the fitted model instead of a hand-maintained
# list: the names then always match the columns (and their order) the model
# was trained on, so importances cannot be mislabelled and the two arrays
# cannot differ in length when the feature set changes.
feature_names = catboost_model.feature_names_

feature_importances = catboost_model.feature_importances_

feature_importance_df = pd.DataFrame({
    'Feature': feature_names,
    'Importance': feature_importances
})

# Most important features first; the original positional index is kept so a
# row can be traced back to its column position in the training matrix.
feature_importance_df = feature_importance_df.sort_values(by='Importance', ascending=False)

# Display the feature importance
print(feature_importance_df)


                                  Feature  Importance
29                           avg_interval    9.300616
18                 card_transaction_count    6.533378
30                           std_interval    6.405744
19            customer_total_transactions    6.311303
22                           ratio_change    5.900899
38                 transactions_per_mchno    5.224883
6                                     mcc    4.857460
5                                   acqic    4.427706
4                                   mchno    3.782711
39                     mchno_total_amount    3.619851
31                 transactions_per_mcc_x    3.449154
11                                  stocn    3.225175
37         mad_transaction_amount_per_mcc    3.122533
34                 transactions_per_mcc_y    2.866306
0                                   locdt    2.852094
24                        max_daily_trans    2.142222
32                     mcc_total_amount_x    2.020789
12                          

# 預測和輸出資料

In [28]:
# Index the validation frame by transaction key so 'txkey' is not treated as a
# feature. The guard makes the cell safe to re-run: calling set_index('txkey')
# again when 'txkey' is already the index would raise KeyError.
if new_val_data.index.name != 'txkey':
    new_val_data = new_val_data.set_index('txkey')

In [30]:
# Columns of the validation frame that are not model inputs.
columns_to_drop = ['chid', 'cano', 'bnsfg', 'flbmk', 'ovrlt', 'iterm']

# Same categorical columns as used for training.
cat_features = ['contp', 'etymd', 'mcc', 'ecfg', 'stocn', 'scity', 'insfg', 'mchno', 'acqic',
                'stscd', 'hcefg', 'csmcu', 'flg_3dsmk', 'hour','city_change', 'country_change']

X = new_val_data.drop(columns=columns_to_drop)

# Keep exactly the features the model was trained on, in training order.
# This removes leftover columns (e.g. the 'pred' column the prediction cell
# adds to new_val_data, which would otherwise become a feature on re-run) and
# raises KeyError naming any feature missing from the validation data.
X = X[catboost_model.feature_names_]

# Cast the categorical columns to str, as was done for the training matrix.
X = X.astype({feature: str for feature in cat_features})

# NOTE: this rebinds test_pool, which previously held the labelled hold-out
# split; from here on it is the unlabelled pool to be scored.
test_pool = Pool(X, cat_features=cat_features)


In [35]:
# Predicted class label for every row of the scoring pool.
y_pred = catboost_model.predict(test_pool).astype(int)

# Attach the predictions and expose 'txkey' as a regular column. The index is
# only reset while 'txkey' is still the index, so re-running this cell does
# not add a stray 'index' column to new_val_data.
new_val_data['pred'] = y_pred
if new_val_data.index.name == 'txkey':
    new_val_data = new_val_data.reset_index()

output_df = new_val_data[['txkey', 'pred']].set_index('txkey')
example = example.drop_duplicates(subset='txkey')

# Left-join onto the sample submission so the output has exactly its txkeys,
# in its order.
df2_sorted = example[['txkey']].merge(output_df, on='txkey', how='left')

# A txkey without a prediction becomes NaN, which would silently turn the
# whole column into floats ("1.0") in the CSV. Report such rows, default them
# to 0, and restore the integer dtype.
n_missing = int(df2_sorted['pred'].isna().sum())
if n_missing:
    print(f"Warning: {n_missing} txkey(s) from the example file have no prediction; filled with 0")
df2_sorted['pred'] = df2_sorted['pred'].fillna(0).astype(int)

df2_sorted = df2_sorted.set_index('txkey')

In [36]:
# Write the final predictions; 'txkey' is the index and is written as the
# first CSV column.
output_filename = 'dataset_2nd/predictions_secondround.csv'
# `index` expects a boolean; the previous string 'True' only worked because
# any non-empty string is truthy.
df2_sorted.to_csv(output_filename, index=True)