# 导入第三方包

In [None]:
import numpy as np
import pandas as pd
from collections import defaultdict, Counter
import xgboost as xgb
import lightgbm as lgb
from catboost import CatBoostClassifier as cat
from catboost import CatBoostClassifier, CatBoostRegressor
from sklearn.model_selection import StratifiedKFold, KFold, GroupKFold
from sklearn.metrics import accuracy_score, f1_score, roc_auc_score, log_loss, mean_squared_log_error, precision_recall_curve, precision_score, recall_score
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from sklearn.decomposition import TruncatedSVD, PCA
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import MinMaxScaler, StandardScaler
import pickle
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import LabelEncoder

from tqdm import tqdm
from datetime import datetime
import sys
import os
import gc
import argparse
import warnings
warnings.filterwarnings('ignore')

from sklearn.manifold import TSNE # 导入tsne包
from sklearn.decomposition import PCA, KernelPCA # PCA
from sklearn.manifold import Isomap # Isomap
from sklearn.impute import KNNImputer
import matplotlib.pyplot as plt

# 数据读取与基本处理

## 数据读取

In [None]:
# Preliminary-round training data: read the four input CSV files, in order,
# into their respective DataFrames.
credit_card_trans, debit_card_trans, installment_info, mcc = (
    pd.read_csv(csv_path)
    for csv_path in (
        '../input/赛题B_初赛/credit_card_trans.csv',
        '../input/赛题B_初赛/debit_card_trans.csv',
        '../input/赛题B_初赛/installment_info.csv',
        '../input/赛题B_初赛/mcc.csv',
    )
)

## 数据处理

In [None]:
# Remove features that take a single value: columns with fewer than two
# distinct values (nunique() does not count NaN by default, so an all-NaN
# column is removed as well).
# The column names are collected first and dropped in one call, so the
# DataFrame is copied once rather than once per removed column.
constant_columns = [
    column
    for column in credit_card_trans.columns
    if credit_card_trans[column].nunique() < 2
]
credit_card_trans = credit_card_trans.drop(columns=constant_columns)

# 可视化数据及客户画像

## credit_card_trans

### 交易金额分布图

In [None]:
# Histogram (20 bins) of column 'b45', labelled as the credit-card
# transaction amount.
amount_axes = plt.gca()
amount_axes.hist(credit_card_trans['b45'], bins=20)
amount_axes.set_xlabel('Credit Card Transaction Amount')
amount_axes.set_ylabel('Frequency')
amount_axes.set_title('Credit Card Transaction Amount Distribution', fontsize=14, fontweight="bold")
plt.show()

### 交易时间分布图

In [None]:
import seaborn as sns

# Parse column 'b43' as a timestamp and derive the hour of day from it.
credit_card_trans['Credit Card Transaction Time'] = pd.to_datetime(credit_card_trans['b43'])
credit_card_trans['Credit Card Transaction Hour'] = credit_card_trans['Credit Card Transaction Time'].dt.hour

# Count transactions per hour and reindex onto the full 0-23 range so hours
# with no transactions become zero-height bars.  seaborn draws categorical
# bars at positions 0..k-1; without the reindex, the fixed 24 ticks set below
# would be attached to the wrong bars whenever an hour is missing from the data.
hourly_counts = (
    credit_card_trans['Credit Card Transaction Hour']
    .value_counts()
    .reindex(range(24), fill_value=0)
)

sns.set(style="whitegrid")

plt.figure(figsize=(12, 6))
sns.barplot(x=hourly_counts.index, y=hourly_counts.values, color='skyblue')
plt.xlabel('Transaction Hour', fontsize=14)
plt.ylabel('Transaction Count', fontsize=14)
plt.title('Credit Card Transaction Time Distribution', fontsize=16, fontweight="bold")
# One tick per hour; positions now coincide with the hour values.
plt.xticks(range(24), fontsize=12)
plt.yticks(fontsize=12)
plt.tight_layout()
plt.show()

### 商户类别分布图

In [None]:
# Number of transactions for each distinct value of column 'b46'
# (plotted as the merchant category).
category_counts = credit_card_trans['b46'].value_counts()

# Chart style
sns.set(style="whitegrid")

# Bar chart of the merchant-category distribution
plt.figure(figsize=(10, 6))
category_axes = sns.barplot(x=category_counts.index, y=category_counts.values, color='skyblue')
category_axes.set_xlabel('Merchant Category', fontsize=12)
category_axes.set_ylabel('Transaction Count', fontsize=12)
category_axes.set_title('Credit Card Merchant Category Distribution', fontsize=14, fontweight="bold")
# Rotate the category labels so they do not overlap.
plt.xticks(rotation=90, fontsize=10)
plt.yticks(fontsize=10)
plt.tight_layout()
plt.show()

## debit_card_trans

### 根据出入账，统计金额

In [None]:
# Total transaction amount (column 'b49') for each in/out flag (column 'b48').
transaction_amount = debit_card_trans.groupby('b48')['b49'].sum()

sns.set_theme(style="whitegrid")
sns.set_palette("pastel")

plt.figure(figsize=(8, 6))

sns.barplot(x=transaction_amount.index, y=transaction_amount.values)

plt.title('Debit Card Transaction Amount Statistics', fontsize=14, fontweight="bold")
plt.xlabel('Debit Card Flag', fontsize=12)
plt.ylabel('Transaction Amount', fontsize=12)

# Build the tick labels from the grouped index instead of hard-coding
# ['Credit', 'Debit']: the hard-coded list is only correct while the groupby
# happens to sort "入账" before "出账" and both flags are present.  Flags
# without a translation fall back to their raw value.
flag_names = {'入账': 'Credit', '出账': 'Debit'}
flag_tick_labels = [flag_names.get(flag, str(flag)) for flag in transaction_amount.index]
plt.xticks(ticks=list(range(len(flag_tick_labels))), labels=flag_tick_labels, fontsize=10)
plt.yticks(fontsize=10)

# Write each total just above its bar.
for index, value in enumerate(transaction_amount.values):
    plt.text(index, value, f"{value:,.2f}", ha='center', va='bottom', fontsize=10)

# Show the chart
plt.show()

In [None]:
# Split the debit-card transactions on the flag in column 'b48'.
debit_data = debit_card_trans.loc[debit_card_trans['b48'].eq("出账")]  # rows flagged "出账" (outgoing)
credit_data = debit_card_trans.loc[debit_card_trans['b48'].eq("入账")]  # rows flagged "入账" (incoming)

In [None]:
# Sum and mean of column 'b49', plus the row count, for each of the two
# subsets (outgoing = debit_data, incoming = credit_data).
debit_total_amount, credit_total_amount = debit_data['b49'].sum(), credit_data['b49'].sum()
debit_avg_amount, credit_avg_amount = debit_data['b49'].mean(), credit_data['b49'].mean()
debit_count, credit_count = len(debit_data), len(credit_data)

In [None]:
# Print the outgoing/incoming totals, means and transaction counts,
# one "caption value" line per statistic.
for caption, figure in (
    ("出账金额总计：", debit_total_amount),
    ("入账金额总计：", credit_total_amount),
    ("出账金额平均值：", debit_avg_amount),
    ("入账金额平均值：", credit_avg_amount),
    ("出账次数：", debit_count),
    ("入账次数：", credit_count),
):
    print(caption, figure)

### 每个时间段出账与入账的客户数量对比

In [None]:
# Extract the hour of day from the transaction timestamp (column 'b47').
debit_card_trans['Hour'] = pd.to_datetime(debit_card_trans['b47']).dt.hour

# Bucket the hours into six 4-hour periods.  right=False makes every bin
# left-closed / right-open ([0, 4), [4, 8), ..., [20, 24)) so each label
# covers exactly four hours; with the default right-closed bins hour 4 was
# counted in '0-4', hour 8 in '4-8', and so on.
time_intervals = [0, 4, 8, 12, 16, 20, 24]
labels = ['0-4', '4-8', '8-12', '12-16', '16-20', '20-24']
debit_card_trans['Time_Period'] = pd.cut(debit_card_trans['Hour'], bins=time_intervals, labels=labels, right=False)

# Count the distinct 'id' values (plotted as customers) per time period and
# in/out flag.  observed=False is stated explicitly so that every period is
# kept as a row even when it has no transactions; the six fixed tick labels
# set below rely on that.
customer_count = (
    debit_card_trans
    .groupby(['Time_Period', 'b48'], observed=False)['id']
    .nunique()
    .unstack()
    .fillna(0)
)

# Rename the legend entries
customer_count.rename(columns={'出账': 'Debit', '入账': 'Credit'}, inplace=True)

sns.set_theme(style="whitegrid")
sns.set_palette("pastel")

# DataFrame.plot opens its own figure, so the size is passed here; a separate
# plt.figure(...) call beforehand would only leave an empty figure behind.
customer_count.plot(kind='bar', width=0.4, figsize=(10, 6))

plt.title('Number of Customers with Debit and Credit Transactions by Time Period', fontsize=14, fontweight="bold")
plt.xlabel('Time Period', fontsize=12)
plt.ylabel('Number of Customers', fontsize=12)

plt.xticks(range(len(labels)), labels, fontsize=10)
plt.yticks(fontsize=10)

plt.legend(title='Debit Card Flag', loc='upper right')

plt.show()


## installment_info

### 次月分期类型

In [None]:
# Count plot of column 'b51' (plotted as the next-month installment type).
plt.figure(figsize=(8, 6))
sns.set(style="darkgrid")
ax = sns.countplot(x='b51', data=installment_info, palette='Set2')
plt.title('Distribution of Next Month Installment Types', fontsize=14, fontweight="bold")

# Replace the x tick labels with display names.
# NOTE(review): this assumes 'b51' has exactly three categories, in this
# order - confirm against the data.
x_labels = ['TypeA', 'TypeB', 'TypeC']  # new label names to display
ax.set_xticklabels(x_labels)

plt.xlabel('Installment Type', fontsize=12)
plt.ylabel('Number of Customers', fontsize=12)

# Add value labels to each bar.  Heights are counts, so format them as whole
# numbers instead of letting a float height render as e.g. "123.0".
for p in ax.patches:
    ax.annotate(f"{p.get_height():.0f}", (p.get_x() + p.get_width() / 2., p.get_height()), ha='center', va='center',
                xytext=(0, 5), textcoords='offset points', fontsize=10)

plt.show()

### 历史分期次数与未结清分期数之间是否存在关联

In [None]:
# Scatter plot with a fitted regression line of column 'b54' (x) against
# column 'b55' (y) from installment_info.
plt.figure(figsize=(8, 6))
sns.set(style="darkgrid")

history_axes = sns.regplot(x='b54', y='b55', data=installment_info)
history_axes.set_title('Relationship between Historical Installment Count and Outstanding Installment Count', fontsize=14, fontweight="bold")
history_axes.set_xlabel('Historical Installment Count', fontsize=12)
history_axes.set_ylabel('Outstanding Installment Count', fontsize=12)

plt.show()

### 分期本金与分期期数

In [None]:
# Scatter plot with a fitted regression line of column 'b52' (x) against
# column 'b53' (y) from installment_info.
plt.figure(figsize=(8, 6))
sns.set(style="darkgrid")

principal_axes = sns.regplot(x='b52', y='b53', data=installment_info)
principal_axes.set_title('Relationship between Installment Principal and Installment Term', fontsize=14, fontweight="bold")
principal_axes.set_xlabel('Installment Principal', fontsize=12)
principal_axes.set_ylabel('Installment Term', fontsize=12)

plt.show()