In [None]:
"""
這份檔案用來執行模型訓練的部分，主要利用 CatBoost 來訓練模型，可透過參數設定使用 GPU 進行運算

input：特徵處理完的 training dataset（data2/processed_data.csv）、validation dataset（data2/val_data.csv）

output：最終 final_prediction


"""

In [5]:
import pandas as pd
import numpy as np
import datetime
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import StratifiedKFold, train_test_split
from sklearn.metrics import auc
from sklearn import metrics
from sklearn.metrics import precision_score, recall_score, f1_score, precision_recall_curve
from collections import Counter
from catboost import CatBoostClassifier, Pool, EShapCalcType, EFeaturesSelectionAlgorithm
from scipy.stats import entropy


In [9]:
# Locations of the three input tables, relative to the notebook's working directory.
train_csv, val_csv, template_csv = (
    'data2/processed_data.csv',
    'data2/val_data.csv',
    'data2/private_2_template_v2.csv',
)

# Training table, validation table to be scored, and the submission template.
new_train_data = pd.read_csv(train_csv)
new_val_data = pd.read_csv(val_csv)
example = pd.read_csv(template_csv)


FileNotFoundError: [Errno 2] No such file or directory: 'data2/processed_data.parquet'

In [None]:
# Columns handed to CatBoost as categorical features.
cat_features = [
    'contp', 'etymd', 'mcc', 'ecfg', 'stocn', 'scity', 'insfg',
    'mchno', 'acqic', 'stscd', 'hcefg', 'csmcu', 'flg_3dsmk', 'hour',
]

# Columns removed from the model input: the target itself plus columns that
# are not used as features.
columns_to_drop = [
    'label', 'txkey', 'chid', 'cano',
    'bnsfg', 'flbmk', 'ovrlt', 'iterm',
    'first_use_date', 'last_use_date', 'days_active',
]

# Target vector first, then the feature matrix without the excluded columns.
y = new_train_data['label']
X = new_train_data.drop(columns_to_drop, axis=1)


In [None]:
# Cast every categorical column to string in a single pass so that the Pools
# below receive string-valued categorical features.
X = X.astype({feature: str for feature in cat_features})

# Stratified hold-out split: 35% of the rows go to the evaluation side and the
# label distribution is preserved on both sides.
split_options = dict(test_size=0.35, random_state=40, stratify=y)
X_train, X_test, y_train, y_test = train_test_split(X, y, **split_options)

# CatBoost containers for the fitting data and the evaluation data.
train_pool = Pool(data=X_train, label=y_train, cat_features=cat_features)
test_pool = Pool(data=X_test, label=y_test, cat_features=cat_features)

In [None]:
"""
註記：若電腦為 Mac 系列，則無法使用 task_type='GPU'，請將其註解並開啟 subsample，搭配另外兩個參數再進行訓練，超參數皆無須調整

"""
# Hyperparameters are collected in one mapping so they can be inspected or
# logged before the classifier is built.
model_params = {
    # Boosting schedule
    'iterations': 7000,
    'learning_rate': 0.0365,
    'early_stopping_rounds': 800,
    # Tree shape
    'depth': 7,
    'grow_policy': 'Lossguide',
    # Objective and the metric reported on the eval set
    'loss_function': 'Logloss',
    'eval_metric': 'F1',
    # Regularisation and sampling
    'l2_leaf_reg': 3,
    'leaf_estimation_iterations': 10,
    'colsample_bylevel': 0.8,
    'subsample': 0.85,
    'random_strength': 3,
    'max_ctr_complexity': 10,
    # Weight applied to the positive class (9.45 was an earlier value).
    'scale_pos_weight': 9.484,
    # Reproducibility and logging
    'random_seed': 42,
    'verbose': 100,
    # 'task_type': 'GPU',  # optional; left disabled here
}

catboost_model = CatBoostClassifier(**model_params)

# Fit on the training pool; the hold-out pool drives early stopping and
# use_best_model keeps the iteration that scored best on it.
catboost_model.fit(train_pool, eval_set=test_pool, use_best_model=True)

In [None]:
# Predicted probability of the positive class on the hold-out split.
y_pred_proba = catboost_model.predict_proba(X_test)[:, 1]

# Precision/recall pairs for every candidate threshold. Both arrays carry one
# more entry than `thresholds`: the final (precision=1, recall=0) point has no
# threshold attached to it.
precision, recall, thresholds = precision_recall_curve(y_test, y_pred_proba)

# F1 at every curve point. Where precision + recall == 0 the score is defined
# as 0 instead of NaN, because np.argmax would otherwise return the position of
# the first NaN rather than the real maximum.
pr_sum = precision + recall
f1_scores = np.divide(
    2 * recall * precision,
    pr_sum,
    out=np.zeros_like(pr_sum, dtype=float),
    where=pr_sum > 0,
)

# Search only the points that have a matching threshold, so the index can
# never run past the end of `thresholds`.
max_f1_index = int(np.argmax(f1_scores[:-1]))
best_threshold = thresholds[max_f1_index]

# precision_recall_curve counts a sample as positive when score >= threshold,
# so the same inclusive comparison is applied here to land on the chosen point.
y_pred_custom = (y_pred_proba >= best_threshold).astype(int)

# Metrics actually obtained with the chosen threshold.
adjusted_precision = precision_score(y_test, y_pred_custom)
adjusted_recall = recall_score(y_test, y_pred_custom)
adjusted_f1 = f1_score(y_test, y_pred_custom)

print(f"Best Threshold: {best_threshold}")
print(f"Adjusted Precision: {adjusted_precision}")
print(f"Adjusted Recall: {adjusted_recall}")
print(f"Adjusted F1: {adjusted_f1}")

In [None]:
import matplotlib.pyplot as plt
import seaborn as sns

# Global seaborn theme for the figure below.
sns.set(style="whitegrid")

# Expects precision, recall, adjusted_recall and adjusted_precision to be
# defined already by the threshold-selection step.
fig, ax = plt.subplots(figsize=(10, 8))
sns.lineplot(x=recall, y=precision, label='PR Curve', linewidth=2, ax=ax)

# Highlight the operating point obtained with the selected threshold.
ax.scatter(adjusted_recall, adjusted_precision,
           color='red', s=100, edgecolor='black', zorder=5)
point_label = f'  (Recall: {adjusted_recall:.2f}, Precision: {adjusted_precision:.2f})'
ax.text(adjusted_recall, adjusted_precision, point_label,
        verticalalignment='bottom', horizontalalignment='right', fontsize=12)

# Title, axis labels and tick label sizes.
ax.set_xlabel('Recall', fontsize=14)
ax.set_ylabel('Precision', fontsize=14)
ax.set_title('Precision-Recall Curve', fontsize=16)
ax.tick_params(axis='both', labelsize=12)

# Legend, then render.
ax.legend(fontsize=14, loc='lower left')
plt.show()


In [None]:
# Index the validation frame by transaction key. The guard keeps this cell
# re-runnable: on a second run 'txkey' is already the index and a plain
# set_index('txkey') would raise KeyError.
if 'txkey' in new_val_data.columns:
    new_val_data = new_val_data.set_index('txkey')

# Take the feature list, its order and the categorical subset from the fitted
# model instead of hand-maintaining a second copy. The hand-written lists here
# had drifted from the training cell: 'first_use_date', 'last_use_date' and
# 'days_active' were not dropped, and 'city_change' / 'country_change' were
# declared categorical although the model was not trained with them as such.
feature_names = list(catboost_model.feature_names_)
cat_features = [feature_names[i] for i in catboost_model.get_cat_feature_indices()]

# Fail early with a readable message if the validation file lacks a feature.
missing_features = [name for name in feature_names if name not in new_val_data.columns]
if missing_features:
    raise KeyError(f"Validation data is missing model features: {missing_features}")

# Everything the model does not consume (ids, unused columns, and a 'pred'
# column left over from an earlier run) is excluded from the model input.
columns_to_drop = [col for col in new_val_data.columns if col not in feature_names]

# Model input: exactly the training columns, in training order, with the
# categorical ones cast to string as was done for the training data.
X = new_val_data[feature_names].astype({feature: str for feature in cat_features})

test_pool = Pool(X, cat_features=cat_features)


In [None]:
# Positive-class probability for every validation transaction.
y_pred_proba = catboost_model.predict_proba(test_pool)[:, 1]

# Apply the threshold inclusively: precision_recall_curve, which produced
# best_threshold, treats a sample as positive when score >= threshold.
y_pred_custom = (y_pred_proba >= best_threshold).astype(int)

# Attach the predictions and turn the 'txkey' index back into a column.
# NOTE(review): assumes new_val_data is indexed by 'txkey' at this point and
# has the same row order as test_pool - confirm against the preceding cell.
new_val_data['pred'] = y_pred_custom
new_val_data = new_val_data.reset_index()
output_df = new_val_data[['txkey', 'pred']].set_index('txkey')

# One row per transaction key in the submission template.
example = example.drop_duplicates(subset='txkey')

# Left join keeps the template's row order. Template keys without a prediction
# end up with NaN in 'pred' (which also turns the column into floats).
df2_sorted = example[['txkey']].merge(output_df, on='txkey', how='left')
df2_sorted = df2_sorted.set_index('txkey')