In [None]:
import pandas as pd
import numpy as np
import gc
import matplotlib.pyplot as plt
import seaborn as sns

In [None]:
# 1. Define the memory-optimization function (an engineering skill often tested in risk-control interviews)
def reduce_mem_usage(df, use_float16=True):
    """Downcast the numeric columns of ``df`` to the smallest dtype that fits.

    The frame is modified in place (columns are reassigned on ``df``) and the
    same object is returned, so both ``df = reduce_mem_usage(df)`` and a bare
    ``reduce_mem_usage(df)`` work. The memory footprint before and after is
    printed; it comes from ``DataFrame.memory_usage()`` with its default
    arguments.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame whose columns should be downcast.
    use_float16 : bool, default True
        When True, float columns whose finite range lies strictly inside the
        float16 range are stored as float16. float16 keeps only about three
        significant decimal digits, so pass False to stop at float32 when
        values such as monetary amounts must stay precise.

    Returns
    -------
    pandas.DataFrame
        The same ``df`` object, with downcast columns.

    Notes
    -----
    Only plain NumPy signed/unsigned integer and float columns are touched.
    Boolean, object/string, datetime, categorical and pandas extension dtypes
    are left unchanged. A column is never converted to a wider dtype than the
    one it already has.
    """
    start_mem = df.memory_usage().sum() / 1024**2
    print(f'初始内存占用: {start_mem:.2f} MB')

    signed_ladder = (np.int8, np.int16, np.int32)
    unsigned_ladder = (np.uint8, np.uint16, np.uint32)
    float16_info = np.finfo(np.float16)

    for col in df.columns:
        col_type = df[col].dtype

        # Skip everything that is not a plain NumPy int/uint/float column.
        # This keeps bools from being turned into floats and avoids min/max
        # comparisons that are meaningless (or raise) for datetimes,
        # categoricals and extension dtypes.
        if not isinstance(col_type, np.dtype) or col_type.kind not in 'iuf':
            continue

        c_min = df[col].min()
        c_max = df[col].max()

        if col_type.kind in 'iu':
            ladder = signed_ladder if col_type.kind == 'i' else unsigned_ladder
            for candidate in ladder:
                # Candidates are ordered by size: once one is not narrower
                # than the current dtype there is nothing left to gain.
                if np.dtype(candidate).itemsize >= col_type.itemsize:
                    break
                limits = np.iinfo(candidate)
                # Inclusive bounds: a value equal to the type's min/max fits.
                # For an empty column min/max are NaN, both comparisons are
                # False and the column is left as is.
                if limits.min <= c_min and c_max <= limits.max:
                    df[col] = df[col].astype(candidate)
                    break
        else:
            # min()/max() skip NaN; for an all-NaN column they return NaN,
            # the comparisons are False and float32 is chosen.
            fits_float16 = float16_info.min < c_min and c_max < float16_info.max
            target = np.float16 if (use_float16 and fits_float16) else np.float32
            # Only ever shrink: never widen float16 -> float32.
            if np.dtype(target).itemsize < col_type.itemsize:
                df[col] = df[col].astype(target)

    end_mem = df.memory_usage().sum() / 1024**2
    # Guard the percentage against a zero starting size.
    reduction = 100 * (start_mem - end_mem) / start_mem if start_mem else 0.0
    print(f'优化后内存占用: {end_mem:.2f} MB (减少了 {reduction:.1f}%)')
    return df

# 2. Read and merge the data
print("正在读取数据 (可能需要1分钟)...")
try:
    # Path note: ".." is the parent directory, because this notebook lives in the notebooks folder
    train_trans = pd.read_csv('../data/train_transaction.csv')
    train_id = pd.read_csv('../data/train_identity.csv')

    print("正在执行内存优化...")
    train_trans = reduce_mem_usage(train_trans)
    train_id = reduce_mem_usage(train_id)

    print("正在合并表...")
    # Left join: every transaction row is kept; identity columns are NaN
    # for transactions that have no matching TransactionID in train_id.
    train = train_trans.merge(train_id, on='TransactionID', how='left')

    # Drop the now-redundant source frames and ask the GC to reclaim them
    del train_trans, train_id
    gc.collect()

    print(f"✅ 数据加载成功！最终形状: {train.shape}")

except FileNotFoundError:
    print("❌ 数据文件未找到，请确保 data 文件夹中包含 train_transaction.csv 和 train_identity.csv")
    # Re-raise after printing the hint: without the data `train` is never
    # defined, so swallowing the error only defers the failure to a
    # confusing NameError in a later cell.
    raise

In [None]:
# --- Fix Chinese text rendering in matplotlib ---
# 'font.sans-serif' is a priority list: matplotlib uses the first family that
# is installed. SimHei (the original, Windows-only choice) stays first, so
# behaviour on Windows is unchanged; the remaining entries are common CJK
# fonts on other platforms, with DejaVu Sans as the last resort.
plt.rcParams['font.sans-serif'] = [
    'SimHei',               # Windows
    'Microsoft YaHei',      # Windows
    'PingFang SC',          # macOS
    'Heiti TC',             # macOS
    'Arial Unicode MS',     # macOS / MS Office installs
    'WenQuanYi Micro Hei',  # Linux
    'Noto Sans CJK SC',     # Linux
    'DejaVu Sans',          # matplotlib's bundled default (no CJK glyphs)
]
# Render the minus sign as ASCII '-' so it does not show up as an empty box
# with fonts that lack the Unicode minus glyph.
plt.rcParams['axes.unicode_minus'] = False

In [None]:
# Compute the fraud rate (as a percentage) and the per-class counts
fraud_rate = train['isFraud'].mean() * 100
fraud_count = train['isFraud'].value_counts()
# .get(label, 0) instead of fraud_count[label]: value_counts() omits classes
# that do not occur, so direct indexing raises KeyError when a class is absent.
normal_n = fraud_count.get(0, 0)
fraud_n = fraud_count.get(1, 0)
print(f"正常交易：{normal_n}，欺诈交易：{fraud_n}，欺诈比例：{fraud_rate:.4f}%")

# Visualize the class distribution of isFraud
fig, ax = plt.subplots(figsize=(6, 4))
sns.countplot(x='isFraud', data=train, ax=ax)
ax.set_title('欺诈交易分布')
ax.set_xlabel('是否欺诈 (0=正常, 1=欺诈)')
ax.set_ylabel('交易数量')
plt.show()