# Library

In [None]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error
from sklearn.preprocessing import LabelEncoder
import datetime
from sklearn.cluster import KMeans
import seaborn as sns
import matplotlib.pyplot as plt
import japanize_matplotlib
from sklearn.model_selection import train_test_split
from sklearn.metrics import f1_score
import lightgbm as lgb
import joblib
from sklearn.model_selection import StratifiedKFold
import time
from sklearn.metrics import precision_recall_curve
import optuna
from sklearn.metrics import roc_auc_score

# Data

In [None]:
# Locations of the four input CSV files.
train_path = "D:\\MUFG Data Science Champion Ship 2023\\train.csv"
card_path = "D:\\MUFG Data Science Champion Ship 2023\\card.csv"
user_path = "D:\\MUFG Data Science Champion Ship 2023\\user.csv"
test_path = "D:\\MUFG Data Science Champion Ship 2023\\test.csv"

# Read each file into its own DataFrame (same order as before:
# train, card, user, test).
df_train, df_card, df_user, df_test = (
    pd.read_csv(csv_path)
    for csv_path in (train_path, card_path, user_path, test_path)
)

## Merge

In [None]:
# Tag every row with its origin so the combined frame can be split back
# into train and test rows later.
df_train['is_test'] = 0
df_test['is_test'] = 1

# Stack train and test rows into one frame with a fresh 0..n-1 index.
merged_df1 = pd.concat([df_train, df_test], axis=0, ignore_index=True)

# Attach the user attributes. validate='many_to_one' makes pandas raise a
# MergeError if df_user contains duplicate user_id values; without it a
# duplicate key would silently multiply rows and the predictions would no
# longer line up with df_test when the submission is built.
merged_df2 = merged_df1.merge(
    df_user, on='user_id', how='left', validate='many_to_one'
)

# Attach the card attributes, keyed by (user_id, card_id), with the same
# row-count guarantee.
final_data = merged_df2.merge(
    df_card, on=['user_id', 'card_id'], how='left', validate='many_to_one'
)

In [None]:
# Print the shape of every input frame and of each merge stage, one per line.
print(
    df_train.shape,
    df_test.shape,
    df_user.shape,
    df_card.shape,
    merged_df1.shape,
    merged_df2.shape,
    final_data.shape,
    sep='\n',
)

In [None]:
# Persist the merged frame (without the pandas row index) for later inspection.
final_data.to_csv(
    path_or_buf='D:\\MUFG Data Science Champion Ship 2023\\output\\output2.csv',
    index=False,
)

In [None]:
# Show every column and do not wrap wide frames across several lines.
pd.options.display.max_columns = None
pd.options.display.expand_frame_repr = False

final_data.head()

## 前処理

In [None]:
# Columns that are stripped of '$' and converted to float below.
money_columns = [
    'amount',
    'credit_limit',
    'per_capita_income_zipcode',
    'yearly_income_person',
    'total_debt',
]

for money_col in money_columns:
    # regex=False removes the literal '$'. Without it the result depends on
    # the pandas version's default for `regex` ('$' is a regex anchor) and
    # older versions emit a FutureWarning.
    final_data[money_col] = (
        final_data[money_col].str.replace('$', '', regex=False).astype(float)
    )

# Remove the identifier columns from the feature frame.
final_data.drop(columns=['index', 'user_id'], inplace=True)
# final_data.drop('merchant_city', axis=1, inplace=True)

# Check the first few rows to confirm the change
final_data.head()

# Drop

In [None]:
# Columns to remove before training.
# The list is empty by default, so this cell is a no-op; previously the name
# was only defined inside a comment and the drop below raised NameError.
# Candidate columns for feature-reduction experiments:
#     'card_brand', 'state', 'has_chip', 'gender',
#     'cards_issued', 'card_type', 'num_credit_cards',
#     'birth_year', 'yearly_income_person', 'retirement_age', 'index', 'user_id', 'address', 'birth_month'
columns_to_drop = []

# Drop the columns. errors='ignore' skips names that no longer exist
# ('index' and 'user_id' are already removed in the preprocessing cell, and
# re-running this cell would otherwise raise KeyError).
final_data = final_data.drop(columns=columns_to_drop, errors='ignore')

# Check the first few rows to confirm the change
final_data.head()

In [None]:
# Report the LightGBM version and the current shape of the feature frame.
print(lgb.__version__, final_data.shape, sep='\n')

# 学習

In [None]:
# Select the categorical columns. Columns that are already 'category' dtype
# are included as well, so the list does not silently become empty when the
# conversion below has already been applied (e.g. the cell is re-run).
categorical_features = final_data.select_dtypes(include=['object', 'category']).columns.tolist()

# Convert the categorical columns to pandas 'category' dtype.
for col in categorical_features:
    final_data[col] = final_data[col].astype('category')

# Split back into training rows and test rows using the `is_test` flag.
final_data_train = final_data[final_data['is_test'] == 0]
final_data_test = final_data[final_data['is_test'] == 1]

X = final_data_train.drop(columns=['is_fraud?', 'is_test'])
y = final_data_train['is_fraud?']

# Hyperparameters
params = {
    'objective': 'binary',
    'metric': 'binary_logloss',
    'boosting_type': 'gbdt',
    'num_leaves': 31,
    'learning_rate': 0.05,
    'feature_fraction': 0.9
}

# 4-fold stratified cross-validation
folds = StratifiedKFold(n_splits=4, shuffle=True, random_state=42)
f1_scores = []
optimal_thresholds = []  # best decision threshold found in each fold

for fold_n, (train_index, valid_index) in enumerate(folds.split(X, y)):
    print(f'Fold {fold_n + 1} started at {time.ctime()}')

    X_train, X_valid = X.iloc[train_index], X.iloc[valid_index]
    y_train, y_valid = y.iloc[train_index], y.iloc[valid_index]

    train_data = lgb.Dataset(X_train, label=y_train, categorical_feature=categorical_features)
    valid_data = lgb.Dataset(X_valid, label=y_valid, categorical_feature=categorical_features, reference=train_data)

    num_round = 1000
    bst = lgb.train(params, train_data, num_round, valid_sets=[valid_data])

    # Predicted probabilities on the validation fold
    y_pred_probs = bst.predict(X_valid, num_iteration=bst.best_iteration)
    # Precision / recall for every candidate threshold
    precision, recall, thresholds = precision_recall_curve(y_valid, y_pred_probs)
    # F1 per threshold. Where precision + recall == 0 the F1 is defined as 0;
    # the plain division would give NaN there, and np.argmax returns the
    # first NaN position instead of the real maximum.
    pr_sum = precision + recall
    f1_scores_thresholds = np.divide(
        2 * precision * recall, pr_sum, out=np.zeros_like(pr_sum), where=pr_sum > 0
    )
    # The last curve point (precision=1, recall=0) has no matching entry in
    # `thresholds`, so it is excluded from the search.
    optimal_threshold = thresholds[np.argmax(f1_scores_thresholds[:-1])]
    optimal_thresholds.append(optimal_threshold)
    # precision_recall_curve counts a sample as positive when
    # score >= threshold, so the same comparison must be used here.
    y_pred_binary = (y_pred_probs >= optimal_threshold).astype(int)

    f1 = f1_score(y_valid, y_pred_binary)
    f1_scores.append(f1)

    # Save the fold model
    bst.save_model(f'D:\\MUFG Data Science Champion Ship 2023\\model\\model_fold{fold_n + 1}.txt')

print('CV mean score: {0:.4f}, std: {1:.4f}.'.format(np.mean(f1_scores), np.std(f1_scores)))
#58.2s
#CV:0.6348

# test

In [None]:
# Predict on the test rows with every fold model and average the results.
predictions = []

# Feature matrix of the test rows (same columns as the training features)
final_data_test_features = final_data_test.drop(columns=['is_fraud?', 'is_test'])

# Predict with each saved fold model
for fold_n in range(4):
    # Load from the directory the training cell saved to; the previous
    # relative path only worked when the working directory happened to be
    # the model directory.
    bst = lgb.Booster(model_file=f'D:\\MUFG Data Science Champion Ship 2023\\model\\model_fold{fold_n + 1}.txt')
    preds = bst.predict(final_data_test_features, num_iteration=bst.best_iteration)
    predictions.append(preds)

# Average the fold predictions
mean_preds = np.mean(predictions, axis=0)

# Average of the per-fold optimal thresholds
mean_optimal_threshold = np.mean(optimal_thresholds)

# Binarize the averaged predictions
binary_predictions = (mean_preds > mean_optimal_threshold).astype(int)

# Build the submission frame
submission_df = pd.DataFrame({
    'ID': df_test['index'].values,
    'is_fraud?': binary_predictions
})

# Save as CSV without header and without the pandas index
submission_df.to_csv('submission.csv', index=False, header=False)

In [None]:
# Keep a copy of the submission (with header row) in the predictions folder.
submission_df.to_csv(
    path_or_buf='D:\\MUFG Data Science Champion Ship 2023\\predictions\\submit_baseline1.csv',
    index=False,
)

# 特徴量重要度

In [None]:
import matplotlib.pyplot as plt
import shap
import seaborn as sns

# Importances of the booster currently bound to `bst`, using LightGBM's
# default importance type (pass importance_type='gain' for gain-based values).
feature_importances = bst.feature_importance()

# One row per feature, most important first.
features_df = pd.DataFrame(
    {'Feature': X.columns, 'Importance': feature_importances}
)
features_df = features_df.sort_values(by='Importance', ascending=False)

# Horizontal bar chart of all features.
plt.figure(figsize=(12, 10))
sns.barplot(data=features_df, x='Importance', y='Feature')
plt.title('Feature Importances')
plt.show()



In [None]:
# Importances of the booster currently bound to `bst_lgb`, using LightGBM's
# default importance type (pass importance_type='gain' for gain-based values).
feature_importances = bst_lgb.feature_importance()

# One row per feature, least important first.
features_df = pd.DataFrame(
    {'Feature': X.columns, 'Importance': feature_importances}
)
features_df = features_df.sort_values(by='Importance', ascending=True)

# Show the ten least important features.
print(features_df.head(10))

# Re-sort so the chart lists the most important features at the top.
features_df = features_df.sort_values(by='Importance', ascending=False)

# Horizontal bar chart of all features.
plt.figure(figsize=(12, 10))
sns.barplot(data=features_df, x='Importance', y='Feature')
plt.title('Feature Importances')
plt.show()


In [None]:
# Load the first fold's LightGBM model (a single fold is used as an example)
bst = lgb.Booster(model_file='D:\\MUFG Data Science Champion Ship 2023\\model\\lgb_model_fold1.txt')

# Build a SHAP TreeExplainer and compute SHAP values for the current
# validation fold (`X_valid` is whatever the last executed CV loop left behind)
explainer = shap.TreeExplainer(bst)
shap_values = explainer.shap_values(X_valid)

# Show the summary plot
shap.summary_plot(shap_values, X_valid)

# Show a force plot for the first observation as an example
# NOTE(review): the [1] indexing assumes shap returns one entry per class for
# this model; that depends on the installed shap version - confirm.
shap.initjs()  # initialise JavaScript (needed for display in Jupyter Notebook)
shap.force_plot(explainer.expected_value[1], shap_values[1][0,:], X_valid.iloc[0,:]) #6m14.7s


In [None]:
# Switch seaborn to the white-grid theme before drawing.
sns.set_style(style="whitegrid")

# Redraw the SHAP summary plot for the validation fold.
shap.summary_plot(shap_values, X_valid)

# アンサンブル学習

In [None]:
import lightgbm as lgb
import xgboost as xgb
from sklearn.metrics import f1_score, precision_recall_curve
from tqdm import tqdm
from sklearn.model_selection import StratifiedKFold
import time

# Select the categorical columns. 'category' is included because an earlier
# cell may already have converted the string columns; selecting only
# 'object' would then yield an empty list.
categorical_features = final_data.select_dtypes(include=['object', 'category']).columns.tolist()

# Convert categorical variables to categorical data type
for col in categorical_features:
    final_data[col] = final_data[col].astype('category')

# Split data into training and test sets using the 'is_test' column
final_data_train = final_data[final_data['is_test'] == 0]
final_data_test = final_data[final_data['is_test'] == 1]

# Features and target of the training rows
X = final_data_train.drop(columns=['is_fraud?', 'is_test'])
y = final_data_train['is_fraud?']

# Hyperparameter settings
lgb_params = {
    'objective': 'binary',
    'metric': 'binary_logloss',
    'boosting_type': 'gbdt',
    'num_leaves': 31,
    'learning_rate': 0.05,
    'feature_fraction': 0.9
}

xgb_params = {
    'objective': 'binary:logistic',
    'eval_metric': 'logloss',
    'eta': 0.05,
    'max_depth': 5
}

f1_scores = []
optimal_thresholds = []

folds = StratifiedKFold(n_splits=4, shuffle=True, random_state=42)

# total= lets tqdm show real progress; enumerate() alone has no length.
for fold_n, (train_index, valid_index) in tqdm(enumerate(folds.split(X, y)), total=folds.get_n_splits()):
    print(f'Fold {fold_n + 1} started at {time.ctime()}')
    X_train, X_valid = X.iloc[train_index], X.iloc[valid_index]
    y_train, y_valid = y.iloc[train_index], y.iloc[valid_index]

    # LightGBM with early stopping on the validation fold
    train_data = lgb.Dataset(X_train, label=y_train, categorical_feature=categorical_features)
    valid_data = lgb.Dataset(X_valid, label=y_valid, categorical_feature=categorical_features, reference=train_data)
    num_round = 10000
    bst_lgb = lgb.train(
        lgb_params,
        train_data,
        num_round,
        valid_sets=[valid_data],
        callbacks=[
            lgb.early_stopping(stopping_rounds=500, verbose=True),
            lgb.log_evaluation(1),
        ],
    )

    # XGBoost with early stopping on the validation fold
    xgb_train = xgb.DMatrix(X_train, label=y_train, enable_categorical=True)
    xgb_valid = xgb.DMatrix(X_valid, label=y_valid, enable_categorical=True)
    bst_xgb = xgb.train(xgb_params, xgb_train, num_boost_round=10000, evals=[(xgb_valid, 'eval')], early_stopping_rounds=500, verbose_eval=100)

    # Ensemble predictions from LightGBM and XGBoost.
    # iteration_range pins XGBoost to its best iteration; older XGBoost
    # releases otherwise predict with every tree, including the rounds
    # trained after the best score.
    y_pred_probs_lgb = bst_lgb.predict(X_valid)
    y_pred_probs_xgb = bst_xgb.predict(xgb_valid, iteration_range=(0, bst_xgb.best_iteration + 1))
    y_pred_probs_avg = (y_pred_probs_lgb + y_pred_probs_xgb) / 2

    # Optimal threshold based on ensemble predictions.
    # Where precision + recall == 0 the F1 is defined as 0; the plain
    # division would give NaN there and np.argmax would pick that position.
    precision, recall, thresholds = precision_recall_curve(y_valid, y_pred_probs_avg)
    pr_sum = precision + recall
    f1_scores_thresholds = np.divide(
        2 * precision * recall, pr_sum, out=np.zeros_like(pr_sum), where=pr_sum > 0
    )
    # The last curve point has no matching threshold, so it is excluded.
    optimal_threshold = thresholds[np.argmax(f1_scores_thresholds[:-1])]
    optimal_thresholds.append(optimal_threshold)
    # precision_recall_curve treats score >= threshold as positive.
    y_pred_binary = (y_pred_probs_avg >= optimal_threshold).astype(int)

    f1 = f1_score(y_valid, y_pred_binary)
    f1_scores.append(f1)

    # Save models
    bst_lgb.save_model(f'D:\\MUFG Data Science Champion Ship 2023\\model\\lgb_model_fold{fold_n + 1}.txt')
    bst_xgb.save_model(f'D:\\MUFG Data Science Champion Ship 2023\\model\\xgb_model_fold{fold_n + 1}.txt')

print('CV mean score: {0:.4f}, std: {1:.4f}.'.format(np.mean(f1_scores), np.std(f1_scores)))
#24m13.8s

#after Drop
#17m21.0s
#CV mean score: 0.6444, std: 0.0045.

#after Drop2
# index, user_id, address
#24m35.2s
#CV mean score: 0.6442, std: 0.0038.

#500
#35m27.3s
#CV mean score: 0.6472, std: 0.0014

#1000
#38m
#CV mean score: 0.6472, std: 0.0014


## Separate models

In [None]:
import lightgbm as lgb
import xgboost as xgb
from sklearn.metrics import f1_score, precision_recall_curve
from tqdm import tqdm
from sklearn.model_selection import StratifiedKFold
import time
import numpy as np

def train_model_for_transaction_type(data, transaction_type):
    """Cross-validate a LightGBM + XGBoost ensemble for one `use_chip` value.

    Rows of `data` whose 'use_chip' column equals `transaction_type` are
    selected. The rows with is_test == 0 are split into 4 stratified folds;
    for every fold a LightGBM and an XGBoost model are trained with early
    stopping, their validation predictions are averaged, the F1-optimal
    threshold is determined and both models are saved to disk.

    Args:
        data: DataFrame with at least the columns 'use_chip', 'is_test' and
            'is_fraud?'. It is not modified.
        transaction_type: Value of 'use_chip' to train on. It is also part
            of the saved model file names.

    Returns:
        Tuple ``(f1_scores, optimal_thresholds)`` with one entry per fold.
        (Earlier versions returned None; callers that ignore the return
        value are unaffected.)
    """
    # Work on a copy so the dtype conversion below neither mutates `data`
    # nor triggers pandas' SettingWithCopyWarning.
    filtered_data = data[data['use_chip'] == transaction_type].copy()

    # Categorical columns. 'category' is included because `data` may already
    # have been converted by an earlier cell; selecting only 'object' would
    # then yield an empty list.
    categorical_features = filtered_data.select_dtypes(include=['object', 'category']).columns.tolist()

    # 'use_chip' is constant after filtering and is dropped from the features
    if 'use_chip' in categorical_features:
        categorical_features.remove('use_chip')

    for col in categorical_features:
        filtered_data[col] = filtered_data[col].astype('category')

    # Only the training rows are needed here
    filtered_data_train = filtered_data[filtered_data['is_test'] == 0]

    X = filtered_data_train.drop(columns=['is_fraud?', 'is_test', 'use_chip'])
    y = filtered_data_train['is_fraud?']

    # Hyperparameter settings
    lgb_params = {
        'objective': 'binary',
        'metric': 'binary_logloss',
        'boosting_type': 'gbdt',
        'num_leaves': 31,
        'learning_rate': 0.05,
        'feature_fraction': 0.9
    }

    xgb_params = {
        'objective': 'binary:logistic',
        'eval_metric': 'logloss',
        'eta': 0.05,
        'max_depth': 5
    }

    f1_scores = []
    optimal_thresholds = []

    folds = StratifiedKFold(n_splits=4, shuffle=True, random_state=42)

    for fold_n, (train_index, valid_index) in enumerate(folds.split(X, y)):
        print(f'{transaction_type}Fold {fold_n + 1} started at {time.ctime()}')
        X_train, X_valid = X.iloc[train_index], X.iloc[valid_index]
        y_train, y_valid = y.iloc[train_index], y.iloc[valid_index]

        # LightGBM
        train_data = lgb.Dataset(X_train, label=y_train, categorical_feature=categorical_features)
        valid_data = lgb.Dataset(X_valid, label=y_valid, categorical_feature=categorical_features, reference=train_data)
        num_round = 10000
        bst_lgb = lgb.train(lgb_params, train_data, num_round, valid_sets=[valid_data], callbacks=[lgb.early_stopping(stopping_rounds=500, verbose=True)])

        # XGBoost
        xgb_train = xgb.DMatrix(X_train, label=y_train, enable_categorical=True)
        xgb_valid = xgb.DMatrix(X_valid, label=y_valid, enable_categorical=True)
        bst_xgb = xgb.train(xgb_params, xgb_train, num_boost_round=10000, evals=[(xgb_valid, 'eval')], early_stopping_rounds=500, verbose_eval=100)

        # Ensemble predictions. iteration_range pins XGBoost to its best
        # iteration; older releases otherwise predict with every tree.
        y_pred_probs_lgb = bst_lgb.predict(X_valid)
        y_pred_probs_xgb = bst_xgb.predict(xgb_valid, iteration_range=(0, bst_xgb.best_iteration + 1))
        y_pred_probs_avg = (y_pred_probs_lgb + y_pred_probs_xgb) / 2

        # F1 per threshold; 0/0 is defined as 0 so that NaN cannot win argmax
        precision, recall, thresholds = precision_recall_curve(y_valid, y_pred_probs_avg)
        pr_sum = precision + recall
        f1_scores_thresholds = np.divide(
            2 * precision * recall, pr_sum, out=np.zeros_like(pr_sum), where=pr_sum > 0
        )
        # The last curve point has no matching threshold, so it is excluded.
        optimal_threshold = thresholds[np.argmax(f1_scores_thresholds[:-1])]
        optimal_thresholds.append(optimal_threshold)
        # precision_recall_curve treats score >= threshold as positive.
        y_pred_binary = (y_pred_probs_avg >= optimal_threshold).astype(int)

        f1 = f1_score(y_valid, y_pred_binary)
        f1_scores.append(f1)

        # Save models. The transaction type is part of the file name so the
        # models of different types do not overwrite each other.
        bst_lgb.save_model(f'D:\\MUFG Data Science Champion Ship 2023\\model\\separate models\\lgb_model_{transaction_type}_fold{fold_n + 1}.txt')
        bst_xgb.save_model(f'D:\\MUFG Data Science Champion Ship 2023\\model\\separate models\\xgb_model_{transaction_type}_fold{fold_n + 1}.txt')

    print(f'{transaction_type} - CV mean score: {np.mean(f1_scores):.4f}, std: {np.std(f1_scores):.4f}.')

    return f1_scores, optimal_thresholds

# Run the per-type training once for every `use_chip` value listed here.
transaction_types = [
    'Swipe Transaction',
    'Chip Transaction',
    'Online Transaction',
]
for transaction_type in transaction_types:
    train_model_for_transaction_type(data=final_data, transaction_type=transaction_type)


### Feature importance

In [None]:
# Translate XGBoost's positional feature names back to real column names
def map_xgb_feature_names(xgb_importance, original_features):
    """Re-key an XGBoost importance dict by the original feature names.

    Args:
        xgb_importance: Mapping from positional names such as 'f0', 'f1'
            to an importance value.
        original_features: Indexable sequence of feature names; the number
            after the leading character of each key is used as the index.

    Returns:
        A new dict mapping ``original_features[index]`` to the same value.
    """
    return {
        original_features[int(xgb_name[1:])]: score
        for xgb_name, score in xgb_importance.items()
    }

# Plot the top-10 gain importances of a LightGBM / XGBoost model pair
def plot_feature_importance_seaborn_final(lgb_model, xgb_model, transaction_type):
    """Draw side-by-side top-10 gain importance charts for two boosters.

    Args:
        lgb_model: LightGBM Booster; `feature_importance` and `feature_name`
            are read from it.
        xgb_model: XGBoost Booster; `get_score` and `feature_names` are read
            from it.
        transaction_type: Label used in both chart titles.

    Returns:
        None. The figure is shown with `plt.show()`.
    """
    # LightGBM feature importance, sorted from most to least important
    lgb_importance = lgb_model.feature_importance(importance_type='gain')
    lgb_features = lgb_model.feature_name()
    lgb_sorted_idx = np.argsort(lgb_importance)[::-1]

    # XGBoost feature importance
    xgb_importance = xgb_model.get_score(importance_type='gain')

    # If the XGBoost model carries no feature names, its keys are positional
    # ('f0', 'f1', ...). Resolve them through the LightGBM model's feature
    # list rather than the notebook-global X, whose columns can differ from
    # the ones the models were trained on (the per-transaction-type models
    # drop 'use_chip').
    # Assumes both boosters were trained on the same column order - confirm
    # for models coming from other sources.
    if not xgb_model.feature_names:
        xgb_importance = map_xgb_feature_names(xgb_importance, lgb_features)
    xgb_sorted_idx = sorted(xgb_importance, key=xgb_importance.get, reverse=True)

    # Create the DataFrames for seaborn plotting (top 10 of each model)
    lgb_df = pd.DataFrame({
        'Feature': np.array(lgb_features)[lgb_sorted_idx][:10],
        'Importance': lgb_importance[lgb_sorted_idx][:10]
    })
    xgb_df = pd.DataFrame({
        'Feature': xgb_sorted_idx[:10],
        'Importance': [xgb_importance[i] for i in xgb_sorted_idx][:10]
    })

    # Plot feature importance using seaborn
    fig, ax = plt.subplots(1, 2, figsize=(14, 10))

    # LightGBM
    sns.barplot(data=lgb_df, y='Feature', x='Importance', ax=ax[0], palette="viridis")
    ax[0].set_title(f'LightGBM - Top 10 Feature Importance for {transaction_type}')
    ax[0].set_xlabel('Importance')
    ax[0].set_ylabel('Features')

    # XGBoost
    sns.barplot(data=xgb_df, y='Feature', x='Importance', ax=ax[1], palette="viridis")
    ax[1].set_title(f'XGBoost - Top 10 Feature Importance for {transaction_type}')
    ax[1].set_xlabel('Importance')
    ax[1].set_ylabel('Features')

    plt.tight_layout()
    plt.show()

# Load the fold-3 boosters of the 'Online Transaction' and 'Other' model
# sets from disk.
# NOTE(review): the training loop visible in this notebook uses
# 'Swipe Transaction', 'Chip Transaction' and 'Online Transaction'; confirm
# where the 'Other' model files come from.
lgb_online_model = lgb.Booster(
    model_file='D:\\MUFG Data Science Champion Ship 2023\\model\\separate models\\lgb_model_Online Transaction_fold3.txt'
)
xgb_online_model = xgb.Booster(
    model_file='D:\\MUFG Data Science Champion Ship 2023\\model\\separate models\\xgb_model_Online Transaction_fold3.txt'
)

lgb_other_model = lgb.Booster(
    model_file='D:\\MUFG Data Science Champion Ship 2023\\model\\separate models\\lgb_model_Other_fold3.txt'
)
xgb_other_model = xgb.Booster(
    model_file='D:\\MUFG Data Science Champion Ship 2023\\model\\separate models\\xgb_model_Other_fold3.txt'
)

# Draw the importance charts for each model pair.
plot_feature_importance_seaborn_final(
    lgb_model=lgb_online_model,
    xgb_model=xgb_online_model,
    transaction_type='Online Transaction',
)
plot_feature_importance_seaborn_final(
    lgb_model=lgb_other_model,
    xgb_model=xgb_other_model,
    transaction_type='Other',
)



In [None]:
# Updated function to compute and plot SHAP values for LightGBM and XGBoost models,
# while accepting models directly as arguments and without needing the dataset explicitly
def plot_shap_values_final_updated(lgb_model, xgb_model, transaction_type):
    """Plot bar-type SHAP summaries for a LightGBM / XGBoost model pair.

    A 1000-row sample (random_state=42) is taken from the data behind
    `lgb_model.train_set` and used for both explainers.

    Args:
        lgb_model: LightGBM Booster; `train_set` and `feature_name()` are
            read from it.
        xgb_model: XGBoost Booster explained on the same sample.
        transaction_type: Label used in both subplot titles.

    Returns:
        None. The figure is shown with `plt.show()`.
    """

    # To get SHAP values, we still need some data. We'll use a sample from the model's training data.
    # This won't give exact insights for all data but is a common approach for global interpretability.
    # NOTE(review): this needs `lgb_model.train_set` to be populated with raw
    # data; a Booster loaded from a model file may not carry it - confirm.
    lgb_data_sample = lgb.Dataset.get_data(lgb_model.train_set).sample(1000, random_state=42)
    # The same sample is wrapped in a DMatrix, labelled with LightGBM's feature names.
    xgb_data_sample = xgb.DMatrix(lgb_data_sample, feature_names=lgb_model.feature_name())

    # LightGBM SHAP values
    explainer_lgb = shap.Explainer(lgb_model)
    shap_values_lgb = explainer_lgb(lgb_data_sample)

    # XGBoost SHAP values
    explainer_xgb = shap.Explainer(xgb_model)
    shap_values_xgb = explainer_xgb(xgb_data_sample)

    # Plot
    fig, ax = plt.subplots(1, 2, figsize=(15, 5))

    # LightGBM
    # NOTE(review): confirm that the installed shap version's summary_plot
    # accepts an `ax` argument.
    shap.summary_plot(shap_values_lgb, lgb_data_sample, plot_type="bar", show=False, ax=ax[0])
    ax[0].set_title(f'LightGBM SHAP Values for {transaction_type}')

    # XGBoost
    shap.summary_plot(shap_values_xgb, lgb_data_sample, plot_type="bar", show=False, ax=ax[1])
    ax[1].set_title(f'XGBoost SHAP Values for {transaction_type}')

    plt.tight_layout()
    plt.show()

# Note: The below code will not work in this environment due to the mentioned reasons
plot_shap_values_final_updated(
    lgb_model=lgb_online_model,
    xgb_model=xgb_online_model,
    transaction_type='Online Transaction',
)
plot_shap_values_final_updated(
    lgb_model=lgb_other_model,
    xgb_model=xgb_other_model,
    transaction_type='Other',
)


## F1スコアで最適化

In [None]:
import lightgbm as lgb
import xgboost as xgb
from sklearn.metrics import f1_score, precision_recall_curve
from tqdm import tqdm
from sklearn.model_selection import StratifiedKFold
import time
import numpy as np

# ... [data preparation and preprocessing] ...
# Select the categorical columns. 'category' is included because an earlier
# cell may already have converted the string columns; selecting only
# 'object' would then yield an empty list.
categorical_features = final_data.select_dtypes(include=['object', 'category']).columns.tolist()

# Convert categorical variables to categorical data type
for col in categorical_features:
    final_data[col] = final_data[col].astype('category')

# Split data into training and test sets using the 'is_test' column
final_data_train = final_data[final_data['is_test'] == 0]
final_data_test = final_data[final_data['is_test'] == 1]

# Features and target of the training rows
X = final_data_train.drop(columns=['is_fraud?', 'is_test'])
y = final_data_train['is_fraud?']

# Hyperparameter settings
lgb_params = {
    'objective': 'binary',
    'metric': 'binary_logloss',
    'boosting_type': 'gbdt',
    'num_leaves': 31,
    'learning_rate': 0.05,
    'feature_fraction': 0.9
}

xgb_params = {
    'objective': 'binary:logistic',
    'eval_metric': 'logloss',
    'eta': 0.05,
    'max_depth': 5
}


# F1-score evaluation function (for LightGBM)
def lgb_f1_score(y_pred, data):
    """Custom LightGBM metric: F1 at the best threshold for `data`.

    Args:
        y_pred: Predicted scores handed in by LightGBM.
        data: Dataset being evaluated; labels are read via `get_label()`.

    Returns:
        Tuple ``('f1', value, True)``: metric name, the F1 score obtained
        with the F1-maximising threshold, and the higher-is-better flag.
    """
    y_true = data.get_label()
    precision, recall, thresholds = precision_recall_curve(y_true, y_pred)
    # Where precision + recall == 0 the F1 is defined as 0; the plain
    # division would give NaN there and np.argmax would return that position.
    pr_sum = precision + recall
    f1_scores_thresholds = np.divide(
        2 * precision * recall, pr_sum, out=np.zeros_like(pr_sum), where=pr_sum > 0
    )
    # The last curve point (precision=1, recall=0) has no matching entry in
    # `thresholds`, so it is excluded from the search.
    best_threshold = thresholds[np.argmax(f1_scores_thresholds[:-1])]
    # precision_recall_curve treats score >= threshold as positive.
    y_pred_binary = (y_pred >= best_threshold).astype(int)
    return 'f1', f1_score(y_true, y_pred_binary), True

# F1-score evaluation function (for XGBoost)
def xgb_f1_score(y_pred, data):
    """Custom XGBoost metric: F1 at the best threshold for `data`.

    Args:
        y_pred: Predicted scores handed in by XGBoost.
        data: DMatrix being evaluated; labels are read via `get_label()`.

    Returns:
        Tuple ``('f1', value)``: metric name and the F1 score obtained with
        the F1-maximising threshold. Higher is better, so training calls
        that early-stop on this metric need ``maximize=True``.
    """
    y_true = data.get_label()
    precision, recall, thresholds = precision_recall_curve(y_true, y_pred)
    # Where precision + recall == 0 the F1 is defined as 0; the plain
    # division would give NaN there and np.argmax would return that position.
    pr_sum = precision + recall
    f1_scores_thresholds = np.divide(
        2 * precision * recall, pr_sum, out=np.zeros_like(pr_sum), where=pr_sum > 0
    )
    # The last curve point (precision=1, recall=0) has no matching entry in
    # `thresholds`, so it is excluded from the search.
    best_threshold = thresholds[np.argmax(f1_scores_thresholds[:-1])]
    # precision_recall_curve treats score >= threshold as positive.
    y_pred_binary = (y_pred >= best_threshold).astype(int)
    return 'f1', f1_score(y_true, y_pred_binary)

f1_scores = []
optimal_thresholds = []

folds = StratifiedKFold(n_splits=4, shuffle=True, random_state=42)

# total= lets tqdm show real progress; enumerate() alone has no length.
for fold_n, (train_index, valid_index) in tqdm(enumerate(folds.split(X, y)), total=folds.get_n_splits()):
    print(f'Fold {fold_n + 1} started at {time.ctime()}')
    X_train, X_valid = X.iloc[train_index], X.iloc[valid_index]
    y_train, y_valid = y.iloc[train_index], y.iloc[valid_index]

    # LightGBM
    # NOTE(review): lgb_params also requests 'binary_logloss', so early
    # stopping presumably watches logloss as well as the custom F1 - confirm
    # whether only F1 should drive it.
    train_data = lgb.Dataset(X_train, label=y_train, categorical_feature=categorical_features)
    valid_data = lgb.Dataset(X_valid, label=y_valid, categorical_feature=categorical_features, reference=train_data)
    num_round = 10000
    bst_lgb = lgb.train(lgb_params,
                        train_data,
                        num_round,
                        valid_sets=[valid_data],
                        feval=lgb_f1_score,  # custom evaluation function
                        callbacks=[lgb.early_stopping(stopping_rounds=1000,
                                                      verbose=True),  # early-stopping callback
                                   lgb.log_evaluation(1)]
                        )

    # XGBoost
    xgb_train = xgb.DMatrix(X_train, label=y_train, enable_categorical=True)
    xgb_valid = xgb.DMatrix(X_valid, label=y_valid, enable_categorical=True)
    bst_xgb = xgb.train(xgb_params,
                        xgb_train,
                        num_boost_round=10000,
                        evals=[(xgb_valid, 'eval')],
                        feval=xgb_f1_score,  # custom evaluation function
                        # F1 is higher-is-better. XGBoost cannot infer that
                        # from the custom metric name 'f1' and would
                        # early-stop on the lowest F1 without this flag.
                        maximize=True,
                        early_stopping_rounds=1000,
                        verbose_eval=100)

    # Ensemble predictions. iteration_range pins XGBoost to its best
    # iteration; older releases otherwise predict with every tree.
    y_pred_probs_lgb = bst_lgb.predict(X_valid)
    y_pred_probs_xgb = bst_xgb.predict(xgb_valid, iteration_range=(0, bst_xgb.best_iteration + 1))
    y_pred_probs_avg = (y_pred_probs_lgb + y_pred_probs_xgb) / 2

    # Optimal threshold based on ensemble predictions.
    # 0/0 is defined as 0 so that NaN cannot win np.argmax.
    precision, recall, thresholds = precision_recall_curve(y_valid, y_pred_probs_avg)
    pr_sum = precision + recall
    f1_scores_thresholds = np.divide(
        2 * precision * recall, pr_sum, out=np.zeros_like(pr_sum), where=pr_sum > 0
    )
    # The last curve point has no matching threshold, so it is excluded.
    optimal_threshold = thresholds[np.argmax(f1_scores_thresholds[:-1])]
    optimal_thresholds.append(optimal_threshold)
    # precision_recall_curve treats score >= threshold as positive.
    y_pred_binary = (y_pred_probs_avg >= optimal_threshold).astype(int)

    f1 = f1_score(y_valid, y_pred_binary)
    f1_scores.append(f1)

    # Save models
    bst_lgb.save_model(f'D:\\MUFG Data Science Champion Ship 2023\\model\\lgb_model_fold{fold_n + 1}.txt')
    bst_xgb.save_model(f'D:\\MUFG Data Science Champion Ship 2023\\model\\xgb_model_fold{fold_n + 1}.txt')

print('CV mean score: {0:.4f}, std: {1:.4f}.'.format(np.mean(f1_scores), np.std(f1_scores)))

#50
#3m56.6
#CV mean score: 0.6204, std: 0.0026.

#100
#11m34.5s
#CV mean score: 0.6245, std: 0.0018.

#200
#15m16.5s
#CV mean score: 0.6300, std: 0.0029.

#500
#20m4.4s
#CV mean score: 0.6344, std: 0.0033.

#1000
#27m33.0s
#CV mean score: 0.6387, std: 0.0025.


### F1score optuna

In [None]:
import optuna

def objective(trial):
    """Optuna objective: negated mean CV F1 of the LightGBM + XGBoost blend.

    Uses the notebook globals `folds`, `X`, `y`, `categorical_features`,
    `lgb_f1_score` and `xgb_f1_score`.

    Args:
        trial: Optuna trial used to sample the hyperparameters.

    Returns:
        The negative mean F1 over all folds, so that a minimising study
        maximises F1.
    """
    # Hyperparameters to be optimized with Optuna
    lgb_params = {
        'objective': 'binary',
        'metric': 'binary_logloss',
        'boosting_type': 'gbdt',
        'num_leaves': trial.suggest_int('num_leaves', 10, 500),
        'learning_rate': trial.suggest_float('learning_rate', 0.001, 0.1),
        'feature_fraction': trial.suggest_float('feature_fraction', 0.1, 1.0)
    }

    xgb_params = {
        'objective': 'binary:logistic',
        'eval_metric': 'logloss',
        'eta': trial.suggest_float('eta', 0.001, 0.1),
        'max_depth': trial.suggest_int('max_depth', 1, 10)
    }

    f1_scores = []

    for fold_n, (train_index, valid_index) in enumerate(folds.split(X, y)):
        X_train, X_valid = X.iloc[train_index], X.iloc[valid_index]
        y_train, y_valid = y.iloc[train_index], y.iloc[valid_index]

        # LightGBM
        train_data = lgb.Dataset(X_train, label=y_train, categorical_feature=categorical_features)
        valid_data = lgb.Dataset(X_valid, label=y_valid, categorical_feature=categorical_features, reference=train_data)
        num_round = 10000
        bst_lgb = lgb.train(lgb_params, train_data, num_round, valid_sets=[valid_data], feval=lgb_f1_score, callbacks=[lgb.early_stopping(stopping_rounds=1000, verbose=True), lgb.log_evaluation(1)])

        # XGBoost. maximize=True is required because F1 is higher-is-better;
        # XGBoost cannot infer that from the custom metric name 'f1' and
        # would early-stop on the lowest F1 otherwise.
        xgb_train = xgb.DMatrix(X_train, label=y_train, enable_categorical=True)
        xgb_valid = xgb.DMatrix(X_valid, label=y_valid, enable_categorical=True)
        bst_xgb = xgb.train(xgb_params, xgb_train, num_boost_round=10000, evals=[(xgb_valid, 'eval')], feval=xgb_f1_score, maximize=True, early_stopping_rounds=1000, verbose_eval=100)

        # Ensemble predictions. iteration_range pins XGBoost to its best
        # iteration; older releases otherwise predict with every tree.
        y_pred_probs_lgb = bst_lgb.predict(X_valid)
        y_pred_probs_xgb = bst_xgb.predict(xgb_valid, iteration_range=(0, bst_xgb.best_iteration + 1))
        y_pred_probs_avg = (y_pred_probs_lgb + y_pred_probs_xgb) / 2

        # Optimal threshold; 0/0 is defined as 0 so NaN cannot win argmax
        precision, recall, thresholds = precision_recall_curve(y_valid, y_pred_probs_avg)
        pr_sum = precision + recall
        f1_scores_thresholds = np.divide(
            2 * precision * recall, pr_sum, out=np.zeros_like(pr_sum), where=pr_sum > 0
        )
        # The last curve point has no matching threshold, so it is excluded.
        optimal_threshold = thresholds[np.argmax(f1_scores_thresholds[:-1])]
        # precision_recall_curve treats score >= threshold as positive.
        y_pred_binary = (y_pred_probs_avg >= optimal_threshold).astype(int)

        f1 = f1_score(y_valid, y_pred_binary)
        f1_scores.append(f1)

    return -np.mean(f1_scores)  # We use negative because Optuna tries to minimize the objective

# Build a study with Optuna's default settings and run 100 trials of
# `objective` (adjust n_trials as needed).
study = optuna.create_study()
study.optimize(objective, n_trials=100)

# Fetch the best trial, then report the trial count, its value and its
# hyperparameters.
trial = study.best_trial

print("Number of finished trials: ", len(study.trials))
print("Best trial:")
print("  Value: ", trial.value)
print("  Params: ")
for key, value in trial.params.items():
    print(f"    {key}: {value}")


## Optuna適用

In [None]:
# import lightgbm as lgb
# import xgboost as xgb
# from sklearn.metrics import f1_score, precision_recall_curve
# from tqdm import tqdm
# from sklearn.model_selection import StratifiedKFold
# import time

# # Specify categorical variables
# categorical_features = final_data.select_dtypes(include=['object']).columns.tolist()

# # Convert categorical variables to categorical data type
# for col in categorical_features:
#     final_data[col] = final_data[col].astype('category')

# # Split data into training and test sets using the 'is_test' column
# final_data_train = final_data[final_data['is_test'] == 0]
# final_data_test = final_data[final_data['is_test'] == 1]

# # Data split
# X = final_data_train.drop(columns=['is_fraud?', 'is_test'])
# y = final_data_train['is_fraud?']

# # Hyperparameter settings
# lgb_params = {
#     'objective': 'binary',
#     'metric': 'binary_logloss',
#     'boosting_type': 'gbdt',
#     'num_leaves': 31,
#     'learning_rate': 0.05,
#     'feature_fraction': 0.9
# }

# xgb_params = {
#     'objective': 'binary:logistic',
#     'eval_metric': 'logloss',
#     'eta': 0.05,
#     'max_depth': 5
# }

# f1_scores = []
# optimal_thresholds = []

# folds = StratifiedKFold(n_splits=4, shuffle=True, random_state=42)

# def objective(trial):
#     # LightGBM hyperparameters
#     lgb_params = {
#         'objective': 'binary',
#         'metric': 'binary_logloss',
#         'boosting_type': 'gbdt',
#         'num_leaves': trial.suggest_int('num_leaves', 2, 256),
#         'learning_rate': trial.suggest_float('learning_rate', 1e-4, 1e-1, log=True),
#         'feature_fraction': trial.suggest_float('feature_fraction', 0.4, 1.0),
#     }

#     # XGBoost hyperparameters
#     xgb_params = {
#         'objective': 'binary:logistic',
#         'eval_metric': 'logloss',
#         'eta': trial.suggest_float('eta', 1e-4, 1e-1, log=True),
#         'max_depth': trial.suggest_int('max_depth', 1, 9),
#     }

#     for fold_n, (train_index, valid_index) in tqdm(enumerate(folds.split(X, y))):
#         print(f'Fold {fold_n + 1} started at {time.ctime()}')
#         X_train, X_valid = X.iloc[train_index], X.iloc[valid_index]
#         y_train, y_valid = y.iloc[train_index], y.iloc[valid_index]

#         # LightGBM
#         train_data = lgb.Dataset(X_train, label=y_train, categorical_feature=categorical_features)
#         valid_data = lgb.Dataset(X_valid, label=y_valid, categorical_feature=categorical_features, reference=train_data)
#         num_round = 1000
#         bst_lgb = lgb.train(lgb_params, train_data, num_round, valid_sets=[valid_data])

#         # XGBoost
#         xgb_train = xgb.DMatrix(X.iloc[train_index], label=y.iloc[train_index], enable_categorical=True)
#         xgb_valid = xgb.DMatrix(X.iloc[valid_index], label=y.iloc[valid_index], enable_categorical=True)
#         bst_xgb = xgb.train(xgb_params, xgb_train, num_boost_round=1000, evals=[(xgb_valid, 'eval')])

#         # Ensemble predictions from LightGBM and XGBoost
#         y_pred_probs_lgb = bst_lgb.predict(X_valid)
#         y_pred_probs_xgb = bst_xgb.predict(xgb_valid)
#         y_pred_probs_avg = (y_pred_probs_lgb + y_pred_probs_xgb) / 2

#         # Optimal Threshold based on ensemble predictions
#         precision, recall, thresholds = precision_recall_curve(y_valid, y_pred_probs_avg)
#         f1_scores_thresholds = 2 * (precision * recall) / (precision + recall)
#         optimal_threshold = thresholds[np.argmax(f1_scores_thresholds)]
#         optimal_thresholds.append(optimal_threshold)
#         y_pred_binary = (y_pred_probs_avg > optimal_threshold).astype(int)

#         f1 = f1_score(y_valid, y_pred_binary)
#         f1_scores.append(f1)

#         # Save models
#         bst_lgb.save_model(f'D:\\MUFG Data Science Champion Ship 2023\\model\\lgb_model_fold{fold_n + 1}.txt')
#         bst_xgb.save_model(f'D:\\MUFG Data Science Champion Ship 2023\\model\\xgb_model_fold{fold_n + 1}.txt')

#         return np.mean(f1_scores)

# study = optuna.create_study(direction='maximize')
# study.optimize(objective, n_trials=100)

# # 最適なハイパーパラメータを表示
# print('Number of finished trials:', len(study.trials))
# print('Best trial:')
# trial = study.best_trial

# print('  Value: {:.4f}'.format(trial.value))
# print('  Params: ')
# for key, value in trial.params.items():
#     print('    {}: {}'.format(key, value))

# print('CV mean score: {0:.4f}, std: {1:.4f}.'.format(np.mean(f1_scores), np.std(f1_scores))) #174m4.9s kakatta


In [None]:
import optuna

def objective(trial):
    """Optuna objective: negated mean out-of-fold F1 of a LightGBM + XGBoost blend.

    For every split produced by the module-level ``folds`` splitter over ``X``
    and ``y``, a LightGBM and an XGBoost model are trained with the sampled
    hyperparameters (both with early stopping on the validation fold), their
    predicted probabilities are averaged, and the F1 score at the
    F1-maximizing threshold of that fold is recorded.

    Args:
        trial: The Optuna trial used to sample hyperparameters.

    Returns:
        The negated mean F1 over all folds (negated so that a study using
        Optuna's default "minimize" direction maximizes F1).
    """
    # Hyperparameters to be optimized with Optuna
    lgb_params = {
        'objective': 'binary',
        'metric': 'binary_logloss',
        'boosting_type': 'gbdt',
        'num_leaves': trial.suggest_int('num_leaves', 10, 500),
        'learning_rate': trial.suggest_float('learning_rate', 0.001, 0.1),
        'feature_fraction': trial.suggest_float('feature_fraction', 0.1, 1.0)
    }

    xgb_params = {
        'objective': 'binary:logistic',
        'eval_metric': 'logloss',
        'eta': trial.suggest_float('eta', 0.001, 0.1),
        'max_depth': trial.suggest_int('max_depth', 1, 10)
    }

    f1_scores = []

    for train_index, valid_index in folds.split(X, y):
        X_train, X_valid = X.iloc[train_index], X.iloc[valid_index]
        y_train, y_valid = y.iloc[train_index], y.iloc[valid_index]

        # LightGBM
        train_data = lgb.Dataset(X_train, label=y_train, categorical_feature=categorical_features)
        valid_data = lgb.Dataset(X_valid, label=y_valid, categorical_feature=categorical_features, reference=train_data)
        num_round = 10000
        bst_lgb = lgb.train(
            lgb_params,
            train_data,
            num_round,
            valid_sets=[valid_data],
            callbacks=[
                lgb.early_stopping(stopping_rounds=500, verbose=True),
                lgb.log_evaluation(1),
            ],
        )

        # XGBoost
        xgb_train = xgb.DMatrix(X_train, label=y_train, enable_categorical=True)
        xgb_valid = xgb.DMatrix(X_valid, label=y_valid, enable_categorical=True)
        bst_xgb = xgb.train(xgb_params, xgb_train, num_boost_round=10000, evals=[(xgb_valid, 'eval')], early_stopping_rounds=500, verbose_eval=100)

        # Ensemble predictions from LightGBM and XGBoost
        y_pred_probs_lgb = bst_lgb.predict(X_valid)
        y_pred_probs_xgb = bst_xgb.predict(xgb_valid)
        y_pred_probs_avg = (y_pred_probs_lgb + y_pred_probs_xgb) / 2

        # Optimal threshold based on the ensemble predictions.
        # precision/recall carry one more entry than `thresholds` (the final
        # point has no threshold), so that entry is excluded before argmax.
        # Points where precision + recall == 0 would yield NaN, which
        # np.argmax would pick as the "maximum"; they are scored 0 instead.
        precision, recall, thresholds = precision_recall_curve(y_valid, y_pred_probs_avg)
        f1_denominator = precision[:-1] + recall[:-1]
        f1_scores_thresholds = np.divide(
            2 * precision[:-1] * recall[:-1],
            f1_denominator,
            out=np.zeros_like(f1_denominator),
            where=f1_denominator > 0,
        )
        optimal_threshold = thresholds[np.argmax(f1_scores_thresholds)]
        # precision_recall_curve evaluates each threshold with `score >= threshold`.
        y_pred_binary = (y_pred_probs_avg >= optimal_threshold).astype(int)

        f1 = f1_score(y_valid, y_pred_binary)
        f1_scores.append(f1)

    return -np.mean(f1_scores)

# `objective` returns the negated mean F1, so the study keeps Optuna's default
# "minimize" direction.
study = optuna.create_study()
study.optimize(objective, n_trials=200)  # You can adjust the number of trials

print("Number of finished trials: ", len(study.trials))
print("Best trial:")
trial = study.best_trial
print("  Value: ", trial.value)
print("  Params: ")
for key, value in trial.params.items():
    print(f"    {key}: {value}")

# The per-fold `f1_scores` list is local to `objective` and is not visible
# here, so the CV score is recovered from the study: the best objective value
# is the negated mean F1 of the best trial.
print('CV mean score: {0:.4f}.'.format(-trial.value))


## Test

In [None]:
# Fold-averaged test-set inference with the saved LightGBM and XGBoost models.
predictions_lgb, predictions_xgb = [], []

# Test feature frame (label and train/test flag removed), plus the DMatrix
# form consumed by the XGBoost boosters.
final_data_test_features = final_data_test.drop(columns=['is_fraud?', 'is_test'])
xgb_test = xgb.DMatrix(final_data_test_features, enable_categorical=True)

# One saved model per fold and per library; files are numbered from 1.
for fold_n in range(4):
    lgb_model_path = f'D:\\MUFG Data Science Champion Ship 2023\\model\\lgb_model_fold{fold_n + 1}.txt'
    xgb_model_path = f'D:\\MUFG Data Science Champion Ship 2023\\model\\xgb_model_fold{fold_n + 1}.txt'

    # LightGBM
    bst_lgb = lgb.Booster(model_file=lgb_model_path)
    preds_lgb = bst_lgb.predict(
        final_data_test_features,
        num_iteration=bst_lgb.best_iteration,
    )
    predictions_lgb.append(preds_lgb)

    # XGBoost
    bst_xgb = xgb.Booster(model_file=xgb_model_path)
    preds_xgb = bst_xgb.predict(xgb_test)
    predictions_xgb.append(preds_xgb)

# Average over folds for each library, then blend the two libraries equally.
mean_preds_lgb = np.mean(predictions_lgb, axis=0)
mean_preds_xgb = np.mean(predictions_xgb, axis=0)
mean_preds_ensemble = (mean_preds_lgb + mean_preds_xgb) / 2

# Decision threshold: mean of `optimal_thresholds`, which is defined outside
# this cell.
mean_optimal_threshold = np.mean(optimal_thresholds)
binary_predictions = (mean_preds_ensemble > mean_optimal_threshold).astype(int)

# Submission frame: test row id plus predicted label.
submission_df = pd.DataFrame(
    {
        'ID': df_test['index'].values,
        'is_fraud?': binary_predictions,
    }
)

# index=False omits the row index; the header row is still written.
submission_df.to_csv('D:\\MUFG Data Science Champion Ship 2023\\predictions\\submit10.csv', index=False)


In [None]:
# Rebuild the LightGBM / XGBoost parameter dicts from the hyperparameters
# selected by the Optuna study.
best_params_lgb = {
    'objective': 'binary',
    'metric': 'binary_logloss',
    'boosting_type': 'gbdt',
    'num_leaves': study.best_params['num_leaves'],
    'learning_rate': study.best_params['learning_rate'],
    'feature_fraction': study.best_params['feature_fraction']
}

best_params_xgb = {
    'objective': 'binary:logistic',
    'eval_metric': 'logloss',
    'eta': study.best_params['eta'],
    'max_depth': study.best_params['max_depth']
}

# Per-fold test predictions are collected in these lists.
predictions_lgb = []
predictions_xgb = []


# Test feature frame (label and train/test flag removed) and its DMatrix form.
final_data_test_features = final_data_test.drop(columns=['is_fraud?', 'is_test'])
xgb_test = xgb.DMatrix(final_data_test_features, enable_categorical=True)

# Retrain both models on every CV fold and predict the test set with each.
for fold_n, (train_index, valid_index) in enumerate(folds.split(X, y)):
    X_train, X_valid = X.iloc[train_index], X.iloc[valid_index]
    y_train, y_valid = y.iloc[train_index], y.iloc[valid_index]

    # LightGBM
    train_data = lgb.Dataset(X_train, label=y_train, categorical_feature=categorical_features)
    valid_data = lgb.Dataset(X_valid, label=y_valid, categorical_feature=categorical_features, reference=train_data)
    num_round = 10000
    bst_lgb = lgb.train(best_params_lgb, train_data, num_round, valid_sets=[valid_data], callbacks=[lgb.early_stopping(stopping_rounds=500, verbose=True)])
    preds_lgb = bst_lgb.predict(final_data_test_features)
    predictions_lgb.append(preds_lgb)

    # XGBoost
    xgb_train = xgb.DMatrix(X_train, label=y_train, enable_categorical=True)
    # The evaluation matrix must be built from THIS fold's validation rows.
    # Without this line `xgb_valid` is either undefined or a leftover from
    # another cell, so early stopping would monitor rows unrelated to the
    # current split (possibly rows that are part of this fold's training data).
    xgb_valid = xgb.DMatrix(X_valid, label=y_valid, enable_categorical=True)
    bst_xgb = xgb.train(best_params_xgb, xgb_train, num_boost_round=10000, evals=[(xgb_valid, 'eval')], early_stopping_rounds=500)
    preds_xgb = bst_xgb.predict(xgb_test)
    predictions_xgb.append(preds_xgb)

# Average over folds per library, then blend the two libraries equally.
mean_preds_lgb = np.mean(predictions_lgb, axis=0)
mean_preds_xgb = np.mean(predictions_xgb, axis=0)
mean_preds_ensemble = (mean_preds_lgb + mean_preds_xgb) / 2

# Mean of the per-fold optimal thresholds.
# NOTE(review): `optimal_thresholds` is not produced in this cell; presumably
# it comes from an earlier CV run with different models - confirm it is still
# appropriate for the models retrained above.
mean_optimal_threshold = np.mean(optimal_thresholds)

# Convert the blended probabilities to binary labels.
binary_predictions = (mean_preds_ensemble > mean_optimal_threshold).astype(int)

# Build the submission frame.
submission_df = pd.DataFrame({
    'ID': df_test['index'].values,
    'is_fraud?': binary_predictions
})

# Save to CSV; index=False omits the row index (the header row is written).
submission_df.to_csv('D:\\MUFG Data Science Champion Ship 2023\\predictions\\submit0912v2.csv', index=False)


In [None]:
# For every categorical column, report the values that occur on only one side
# of the train / validation split.
for col in categorical_features:
    train_unique_values = set(X_train[col].unique())
    valid_unique_values = set(X_valid[col].unique())

    # Values seen only in the training rows.
    only_train_values = train_unique_values.difference(valid_unique_values)
    if len(only_train_values) > 0:
        print(f"Only in training data for {col}: {only_train_values}")

    # Values seen only in the validation rows.
    only_valid_values = valid_unique_values.difference(train_unique_values)
    if len(only_valid_values) > 0:
        print(f"Only in validation data for {col}: {only_valid_values}")


In [None]:
# Sanity check: report whether the test feature frame has a 'merchant_city' column.
print(
    "'merchant_city' exists in final_data_test_features."
    if 'merchant_city' in final_data_test_features.columns
    else "'merchant_city' does not exist in final_data_test_features."
)


## Separate models test

In [None]:
def get_predictions_for_transaction_type(data, df_test, transaction_type, threshold=None):
    """Predict binary fraud labels for the test rows of one transaction type.

    The test rows of ``data`` whose ``use_chip`` equals ``transaction_type``
    are scored with the four per-fold LightGBM and XGBoost models saved for
    that transaction type; the fold predictions are averaged per library,
    the two libraries are blended equally, and the blend is thresholded.

    Args:
        data: Combined frame holding at least the ``use_chip``, ``is_test``
            and ``is_fraud?`` columns plus the model features.
        df_test: Test frame providing the ``index`` ids and ``use_chip``.
        transaction_type: Value of ``use_chip`` to select; also part of the
            model file names.
        threshold: Decision threshold applied to the blended probability.
            When ``None`` (default) the module-level
            ``mean_optimal_threshold`` is used, as before.

    Returns:
        A tuple ``(ids, binary_predictions)``: the ``index`` values of the
        ``df_test`` rows with this transaction type, and the 0/1 predictions.
        NOTE(review): the two arrays are paired by position, which assumes the
        selected rows of ``data`` are in the same order as those of
        ``df_test`` - confirm against how ``data`` was built.
    """
    # Filter test data for the given transaction type. `.copy()` makes the
    # selection an independent frame, so the dtype conversion below does not
    # assign into a slice of the caller's DataFrame.
    is_selected = (data['use_chip'] == transaction_type) & (data['is_test'] == 1)
    filtered_data_test = data[is_selected].copy()

    # Specify categorical variables
    categorical_features = filtered_data_test.select_dtypes(include=['object']).columns.tolist()

    # 'use_chip' is dropped from the features below, so it is not converted.
    if 'use_chip' in categorical_features:
        categorical_features.remove('use_chip')

    # Convert categorical variables to categorical data type
    for col in categorical_features:
        filtered_data_test[col] = filtered_data_test[col].astype('category')

    # Get features for the filtered test data
    filtered_data_test_features = filtered_data_test.drop(columns=['is_fraud?', 'is_test', 'use_chip'])
    xgb_test = xgb.DMatrix(filtered_data_test_features, enable_categorical=True)

    predictions_lgb = []
    predictions_xgb = []

    # Get predictions for each per-fold model (files are numbered from 1)
    for fold_n in range(4):
        # LightGBM
        bst_lgb = lgb.Booster(model_file=f'D:\\MUFG Data Science Champion Ship 2023\\model\\separate models\\lgb_model_{transaction_type}_fold{fold_n + 1}.txt')
        preds_lgb = bst_lgb.predict(filtered_data_test_features, num_iteration=bst_lgb.best_iteration)
        predictions_lgb.append(preds_lgb)

        # XGBoost
        bst_xgb = xgb.Booster(model_file=f'D:\\MUFG Data Science Champion Ship 2023\\model\\separate models\\xgb_model_{transaction_type}_fold{fold_n + 1}.txt')
        preds_xgb = bst_xgb.predict(xgb_test)
        predictions_xgb.append(preds_xgb)

    # Average over folds per library, then blend the two libraries equally
    mean_preds_lgb = np.mean(predictions_lgb, axis=0)
    mean_preds_xgb = np.mean(predictions_xgb, axis=0)
    mean_preds_ensemble = (mean_preds_lgb + mean_preds_xgb) / 2

    # Fall back to the notebook-level threshold when none is passed in
    if threshold is None:
        threshold = mean_optimal_threshold

    # Convert the predictions to binary
    binary_predictions = (mean_preds_ensemble > threshold).astype(int)

    return df_test[df_test['use_chip'] == transaction_type]['index'].values, binary_predictions

# Score each transaction type with its own models and pool the results.
transaction_types = ['Swipe Transaction', 'Chip Transaction', 'Online Transaction']
all_ids, all_predictions = [], []

for transaction_type in transaction_types:
    ids, preds = get_predictions_for_transaction_type(final_data, df_test, transaction_type)
    all_ids += list(ids)
    all_predictions += list(preds)

# Assemble the submission and order it by ID, since the rows were collected
# one transaction type at a time.
submission_df = pd.DataFrame(
    {
        'ID': all_ids,
        'is_fraud?': all_predictions,
    }
).sort_values(by='ID')

# index=False omits the row index; the header row is still written.
submission_df.to_csv('D:\\MUFG Data Science Champion Ship 2023\\predictions\\submit11_Separatemodels.csv', index=False)



# Adversarial Validation

In [None]:
# Adversarial validation: train a classifier to tell training rows from test
# rows. An AUC near 0.5 means the two sets look alike; a high AUC means the
# features can distinguish them.

# 1. Select the categorical (object) columns and convert them to 'category'.
categorical_features = final_data.select_dtypes(include=['object']).columns.tolist()
for col in categorical_features:
    final_data[col] = final_data[col].astype('category')

# 2. Split into training and test rows using the 'is_test' flag.
final_data_train = final_data[final_data['is_test'] == 0]
final_data_test = final_data[final_data['is_test'] == 1]

# 3. Stack the two parts again; 'is_test' becomes the target.
combined_data = pd.concat([final_data_train, final_data_test])

# Target and features.
# Besides the target itself, two columns are excluded because they would make
# the task trivial instead of measuring feature drift:
#   - 'is_fraud?': the fraud label; presumably missing for every test row, in
#     which case it alone would identify train vs. test.
#   - 'index': presumably a row identifier - TODO confirm.
# errors='ignore' keeps this working if a column was already dropped earlier.
y = combined_data['is_test']
X = combined_data.drop(columns=['is_test', 'is_fraud?', 'index'], errors='ignore')

# Only pass categorical columns that are still present in X.
adv_categorical_features = [col for col in categorical_features if col in X.columns]

# Hold out 20% of the rows for validation.
X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.2, random_state=42)

# Convert to LightGBM datasets.
train_data = lgb.Dataset(X_train, label=y_train, categorical_feature=adv_categorical_features, free_raw_data=False)
val_data = lgb.Dataset(X_val, label=y_val, categorical_feature=adv_categorical_features, free_raw_data=False)

# Model parameters.
params = {
    'objective': 'binary',
    'metric': 'auc',
    'boosting_type': 'gbdt',
    'num_leaves': 31,
    'learning_rate': 0.05,
    'feature_fraction': 0.9
}

# Train the model.
num_round = 1000
bst = lgb.train(params, train_data, num_round, valid_sets=[val_data])

# Predict on the validation rows.
y_pred_val = bst.predict(X_val, num_iteration=bst.best_iteration)

# Compute the AUC (last expression, so the notebook displays it).
auc_score = roc_auc_score(y_val, y_pred_val)
auc_score


In [None]:
import matplotlib.pyplot as plt
from sklearn.metrics import roc_curve, auc

# ROC curve of the validation predictions (`y_pred_val`) against `y_val`,
# with the area under it.
fpr, tpr, thresholds = roc_curve(y_val, y_pred_val)
auc_score = auc(fpr, tpr)

plt.figure(figsize=(10, 8))

# Model curve, labelled with its AUC.
plt.plot(
    fpr,
    tpr,
    color='darkorange',
    linewidth=2,
    label='ROC curve (area = %0.2f)' % auc_score,
)
# Diagonal reference line.
plt.plot([0, 1], [0, 1], color='navy', linewidth=2, linestyle='--')

# Axis ranges, labels and legend.
plt.xlim(0.0, 1.0)
plt.ylim(0.0, 1.05)
plt.xlabel('False Positive Rate')
plt.ylabel('True Positive Rate')
plt.title('Receiver Operating Characteristic (ROC) Curve')
plt.legend(loc="lower right")
plt.show()


In [None]:
# Plot the feature importance of the trained booster `bst`, titled directly
# through plot_importance.
lgb.plot_importance(bst, figsize=(10, 15), title="Feature Importance")
plt.show()

In [None]:
import shap

# Build a tree explainer for the trained booster `bst` and compute SHAP values
# for the validation features `X_val`.
explainer = shap.TreeExplainer(bst)
shap_values = explainer.shap_values(X_val)

# Visualize the SHAP values of `X_val` as a summary plot
shap.summary_plot(shap_values, X_val)


# CatBoost

In [None]:
# import pandas as pd
# from sklearn.model_selection import train_test_split
# from catboost import CatBoostClassifier


# # 金額のカラムを浮動小数点数に変換
# df_train['amount'] = df_train['amount'].str.replace('$', '').astype(float)

# # indexカラムの削除
# df_train = df_train.drop(columns=['index'])

# # カテゴリ変数のリスト
# categorical_features = ['user_id', 'card_id', 'errors?', 'merchant_id', 'merchant_city', 'merchant_state', 'zip', 'mcc', 'use_chip']

# # 欠損値処理: カテゴリ変数は 'Unknown' で埋める
# df_train[categorical_features] = df_train[categorical_features].fillna('Unknown')
# df_train['zip'] = df_train['zip'].astype(str)

# # データセットを学習用と検証用に分割
# X = df_train.drop(columns=['is_fraud?'])
# y = df_train['is_fraud?']
# X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.2, random_state=42)

# # CatBoostの学習
# cat_model = CatBoostClassifier(iterations=1000, depth=6, learning_rate=0.1, loss_function='Logloss', verbose=200, cat_features=categorical_features)
# cat_model.fit(X_train, y_train, eval_set=(X_val, y_val), early_stopping_rounds=50)

# # モデルの保存
# # model_path = "path_to_save_model.cbm"  # モデルを保存するパスを設定してください
# # cat_model.save_model(model_path)


In [None]:
from catboost import CatBoostClassifier
from sklearn.model_selection import StratifiedKFold

# Preprocessing. The first two steps are guarded so that re-running this cell
# does not fail on a column that is already converted or already dropped.
if not pd.api.types.is_numeric_dtype(df_train['amount']):
    # regex=False: '$' must be removed literally, not treated as a regex anchor.
    df_train['amount'] = df_train['amount'].str.replace('$', '', regex=False).astype(float)
df_train = df_train.drop(columns=['index'], errors='ignore')
categorical_features_cat = ['card_id', 'errors?', 'merchant_id', 'merchant_city', 'merchant_state', 'zip', 'mcc', 'use_chip']
# Missing categorical values become the explicit level 'Unknown'.
df_train[categorical_features_cat] = df_train[categorical_features_cat].fillna('Unknown')
df_train['zip'] = df_train['zip'].astype(str)

X_cat = df_train.drop(columns=['is_fraud?'])
y_cat = df_train['is_fraud?']

f1_scores_cat = []

folds_cat = StratifiedKFold(n_splits=4, shuffle=True, random_state=42)

# tqdm wraps the split iterator (not enumerate) and is given the fold count so
# the progress bar knows its total.
for fold_n, (train_index_cat, valid_index_cat) in enumerate(tqdm(folds_cat.split(X_cat, y_cat), total=folds_cat.get_n_splits())):
    print(f'Fold {fold_n + 1} started at {time.ctime()}')
    X_train_cat, X_val_cat = X_cat.iloc[train_index_cat], X_cat.iloc[valid_index_cat]
    y_train_cat, y_val_cat = y_cat.iloc[train_index_cat], y_cat.iloc[valid_index_cat]

    cat_model = CatBoostClassifier(iterations=10000, depth=6, learning_rate=0.1, loss_function='Logloss', verbose=200, cat_features=categorical_features_cat)
    cat_model.fit(X_train_cat, y_train_cat, eval_set=(X_val_cat, y_val_cat), early_stopping_rounds=50)

    y_pred_probs_cat = cat_model.predict_proba(X_val_cat)[:, 1]

    # Pick the F1-maximizing threshold for this fold.
    # precision/recall carry one more entry than `thresholds` (the final point
    # has no threshold), so that entry is excluded before argmax. Points where
    # precision + recall == 0 would yield NaN, which np.argmax would pick as
    # the "maximum"; they are scored 0 instead.
    precision, recall, thresholds = precision_recall_curve(y_val_cat, y_pred_probs_cat)
    f1_denominator = precision[:-1] + recall[:-1]
    f1_scores_thresholds = np.divide(
        2 * precision[:-1] * recall[:-1],
        f1_denominator,
        out=np.zeros_like(f1_denominator),
        where=f1_denominator > 0,
    )
    optimal_threshold = thresholds[np.argmax(f1_scores_thresholds)]
    # precision_recall_curve evaluates each threshold with `score >= threshold`.
    y_pred_binary = (y_pred_probs_cat >= optimal_threshold).astype(int)

    f1 = f1_score(y_val_cat, y_pred_binary)
    f1_scores_cat.append(f1)

    # Save the fold model.
    cat_model.save_model(f'D:\\MUFG Data Science Champion Ship 2023\\model\\cat_model_fold{fold_n + 1}.cbm')

print('CatBoost CV mean score: {0:.4f}, std: {1:.4f}.'.format(np.mean(f1_scores_cat), np.std(f1_scores_cat)))  # took about 47m51s


In [None]:
# LightGBM and XGBoost predictions
# Encode the categorical variables using the full data set.
for col in categorical_features:
    final_data[col] = final_data[col].astype('category')

# Same as the earlier cells, except that free_raw_data is passed explicitly
# when the lgb.Dataset objects are created.
train_data = lgb.Dataset(X_train, label=y_train, categorical_feature=categorical_features, free_raw_data=False)
valid_data = lgb.Dataset(X_valid, label=y_valid, categorical_feature=categorical_features, reference=train_data, free_raw_data=False)


# NOTE(review): `bst_lgb` / `bst_xgb` are whatever boosters an earlier cell
# left behind, and they are applied to `X_val_cat` (the CatBoost validation
# frame). Confirm those boosters were trained on the same columns.
y_pred_probs_lgb = bst_lgb.predict(X_val_cat)  # X_val_cat is used here
y_pred_probs_xgb = bst_xgb.predict(xgb.DMatrix(X_val_cat, enable_categorical=True))

# CatBoost predictions
y_pred_probs_cat = cat_model.predict_proba(X_val_cat)[:, 1]

# Average the predicted probabilities of the three models.
y_pred_probs_avg = (y_pred_probs_lgb + y_pred_probs_xgb + y_pred_probs_cat) / 3

# Pick the F1-maximizing threshold and classify.
# precision/recall carry one more entry than `thresholds` (the final point has
# no threshold), so that entry is excluded before argmax. Points where
# precision + recall == 0 would yield NaN, which np.argmax would pick as the
# "maximum"; they are scored 0 instead.
precision, recall, thresholds = precision_recall_curve(y_val_cat, y_pred_probs_avg)  # y_val_cat is used here
f1_denominator = precision[:-1] + recall[:-1]
f1_scores_thresholds = np.divide(
    2 * precision[:-1] * recall[:-1],
    f1_denominator,
    out=np.zeros_like(f1_denominator),
    where=f1_denominator > 0,
)
optimal_threshold = thresholds[np.argmax(f1_scores_thresholds)]
# precision_recall_curve evaluates each threshold with `score >= threshold`.
y_pred_binary = (y_pred_probs_avg >= optimal_threshold).astype(int)

# F1 score of the blend.
f1 = f1_score(y_val_cat, y_pred_binary)  # y_val_cat is used here
print(f'F1 score for ensemble: {f1:.4f}')


In [None]:
import numpy as np

# Fold-averaged test-set inference with the saved LightGBM, XGBoost and
# CatBoost models.
predictions_lgb, predictions_xgb, predictions_cat = [], [], []

# Test feature frame (label and train/test flag removed), plus the DMatrix
# form consumed by the XGBoost boosters.
final_data_test_features = final_data_test.drop(columns=['is_fraud?', 'is_test'])
xgb_test = xgb.DMatrix(final_data_test_features, enable_categorical=True)

# One saved model per fold and per library; files are numbered from 1.
for fold_n in range(4):
    lgb_model_path = f'D:\\MUFG Data Science Champion Ship 2023\\model\\lgb_model_fold{fold_n + 1}.txt'
    xgb_model_path = f'D:\\MUFG Data Science Champion Ship 2023\\model\\xgb_model_fold{fold_n + 1}.txt'
    cat_model_path = f'D:\\MUFG Data Science Champion Ship 2023\\model\\cat_model_fold{fold_n + 1}.cbm'

    # LightGBM
    bst_lgb = lgb.Booster(model_file=lgb_model_path)
    preds_lgb = bst_lgb.predict(
        final_data_test_features,
        num_iteration=bst_lgb.best_iteration,
    )
    predictions_lgb.append(preds_lgb)

    # XGBoost
    bst_xgb = xgb.Booster(model_file=xgb_model_path)
    preds_xgb = bst_xgb.predict(xgb_test)
    predictions_xgb.append(preds_xgb)

    # CatBoost (probability of the positive class)
    # NOTE(review): the CatBoost fold models in this file appear to be fitted
    # on columns derived from `df_train`, while prediction here uses
    # `final_data_test_features` - confirm the feature sets match.
    cat_model = CatBoostClassifier()
    cat_model.load_model(cat_model_path)
    preds_cat = cat_model.predict_proba(final_data_test_features)[:, 1]
    predictions_cat.append(preds_cat)

# Average over folds for each library, then blend the three libraries equally.
mean_preds_lgb = np.mean(predictions_lgb, axis=0)
mean_preds_xgb = np.mean(predictions_xgb, axis=0)
mean_preds_cat = np.mean(predictions_cat, axis=0)
mean_preds_ensemble = (mean_preds_lgb + mean_preds_xgb + mean_preds_cat) / 3

# Decision threshold: mean of `optimal_thresholds`, which is defined outside
# this cell (presumably tuned on the LightGBM + XGBoost blend - verify).
mean_optimal_threshold = np.mean(optimal_thresholds)
binary_predictions = (mean_preds_ensemble > mean_optimal_threshold).astype(int)

# Submission frame: test row id plus predicted label.
submission_df = pd.DataFrame(
    {
        'ID': df_test['index'].values,
        'is_fraud?': binary_predictions,
    }
)

# index=False omits the row index; the header row is still written.
submission_df.to_csv('D:\\MUFG Data Science Champion Ship 2023\\predictions\\submit_baseline6.csv', index=False)


# Lasso