In [None]:
import pandas as pd
import numpy as np
import gc
def _smallest_numeric_dtype(kind, c_min, c_max, use_float16):
    """Return the narrowest NumPy dtype of `kind` whose range holds [c_min, c_max], or None."""
    if kind == 'f':
        candidates = (np.float16, np.float32) if use_float16 else (np.float32,)
        limits = np.finfo
    elif kind == 'u':
        candidates = (np.uint8, np.uint16, np.uint32)
        limits = np.iinfo
    else:
        candidates = (np.int8, np.int16, np.int32)
        limits = np.iinfo

    for candidate in candidates:
        info = limits(candidate)
        if c_min >= info.min and c_max <= info.max:
            return np.dtype(candidate)
    return None


def reduce_mem_usage(df, use_float16=False):
    """
    Downcast the columns of a dataframe to smaller dtypes to reduce memory usage.

    The dataframe is modified in place and also returned. Per column:

    * signed / unsigned NumPy integers are cast to the narrowest integer type
      of the same signedness whose range contains the column's min and max;
    * NumPy floats are cast to float32 when the value range fits (float16 is
      considered first only when ``use_float16`` is true);
    * object and pandas string columns are converted to ``category``;
    * every other dtype (bool, datetime, timedelta, category, nullable
      extension dtypes, ...) is left untouched.

    A column is only ever narrowed, never widened, so calling the function
    again on its own output is safe.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to shrink. Mutated in place.
    use_float16 : bool, default False
        Allow float16 as a target for float columns. This is off by default
        because float16 keeps only about three significant decimal digits
        (e.g. 1234.56 is stored as 1235.0), which silently corrupts monetary
        amounts and any statistics derived from them. Pass True to get the
        most aggressive (lossy) compression.

    Returns
    -------
    pandas.DataFrame
        The same object that was passed in.
    """
    # deep=True also counts the Python string payload of object columns, so
    # the saving from the object -> category conversion shows up in the report.
    start_mem = df.memory_usage(deep=True).sum() / 1024**2
    print('Memory usage of dataframe is {:.2f} MB'.format(start_mem))

    for col in df.columns:
        col_type = df[col].dtype
        # Extension dtypes (category, nullable Int64, ...) are not np.dtype
        # instances and therefore have no NumPy kind code here.
        kind = col_type.kind if isinstance(col_type, np.dtype) else None

        if kind == 'O' or isinstance(col_type, pd.StringDtype):
            df[col] = df[col].astype('category')
            continue

        if kind not in ('i', 'u', 'f'):
            # bool, datetime, timedelta, category, ...: a numeric range check
            # is meaningless (or raises) for these, so leave them alone.
            continue

        c_min = df[col].min()
        c_max = df[col].max()
        if pd.isna(c_min) or pd.isna(c_max):
            # Empty or all-NaN column: there is no value range to test.
            continue

        target = _smallest_numeric_dtype(kind, c_min, c_max, use_float16)
        if target is not None and target.itemsize < col_type.itemsize:
            df[col] = df[col].astype(target)

    end_mem = df.memory_usage(deep=True).sum() / 1024**2
    print('Memory usage after optimization is: {:.2f} MB'.format(end_mem))
    # Guard the percentage so a zero-byte frame does not divide by zero.
    decrease_pct = 100 * (start_mem - end_mem) / start_mem if start_mem > 0 else 0.0
    print('Decreased by {:.1f}%'.format(decrease_pct))

    return df


In [None]:
# 2. Load the data (make sure the CSV files are under ../data/).
# Each frame is passed through reduce_mem_usage right after being read.
print("正在读取 Transaction 数据...")
train_trans = reduce_mem_usage(pd.read_csv('../data/train_transaction.csv'))

print("正在读取 Identity 数据...")
train_id = reduce_mem_usage(pd.read_csv('../data/train_identity.csv'))

# 3. Merge: left-join the identity table onto the transaction table by
# TransactionID, so every transaction row is kept.
print("正在合并数据...")
train = pd.merge(train_trans, train_id, how='left', on='TransactionID')

# 4. Free memory: drop the two source frames now that `train` holds the result.
del train_trans
del train_id
gc.collect()
print(f"✅ 准备就绪！当前数据集形状: {train.shape}")

In [None]:
import matplotlib.pyplot as plt
import seaborn as sns
# 1. Derive the hour of day (0-23) from TransactionDT.
# TransactionDT is treated as an offset in seconds from some reference time;
# no calendar date is reconstructed here, only the position within a 24h cycle.
# NOTE(review): the original note assumed a 2017-12-01 reference; the result is
# a true clock hour only if that reference falls on midnight - confirm.
train['hour'] = ((train['TransactionDT'] // 3600) % 24).astype(np.int8)

# 2. Compare how non-fraud (isFraud=0) and fraud (isFraud=1) transactions are
# distributed over the hours of the day.
fig, ax = plt.subplots(figsize=(14, 6))
sns.histplot(
    data=train,
    x='hour',
    hue='isFraud',
    hue_order=[0, 1],
    multiple='stack',
    # discrete=True gives one unit-wide bar centred on each integer hour;
    # bins=24 over the range 0..23 produced bins of width 23/24 that drift
    # away from the integer tick marks.
    discrete=True,
    palette={0: 'blue', 1: 'red'},
    ax=ax,
)
ax.set_title('Transaction Hour Distribution by Fraud Status')
ax.set_xlabel('Hour of Day')
ax.set_ylabel('Number of Transactions')
ax.set_xticks(range(0, 24))

# Relabel the legend seaborn already built instead of calling
# plt.legend(labels=[...]): that call discards the hue legend and pairs the
# labels with whatever artists happen to come first on the axes, so the
# label/colour association is not tied to the isFraud value.
legend = ax.get_legend()
if legend is not None:
    legend.set_title('isFraud')
    fraud_labels = {'0': 'not Fraud', '1': 'Fraud'}
    for text in legend.get_texts():
        text.set_text(fraud_labels.get(text.get_text(), text.get_text()))
plt.show()

In [None]:
# 1. Mean transaction amount per card (card1).
# Rows sharing the same card1 value are grouped and the group mean of
# TransactionAmt is broadcast back to every row of that group. The statistic is
# taken over all rows currently in `train`, the row itself included.
amt_by_card = train.groupby('card1')['TransactionAmt']
train['card1_amt_mean'] = amt_by_card.transform('mean')

# 2. Standard deviation of the transaction amount per card (card1): is the
# spending on this card steady (small std) or erratic (large std)?
train['card1_amt_std'] = amt_by_card.transform('std')

# 3. Core feature: current amount minus the card's mean amount.
# A very large gap (e.g. the mean is 100 and this transaction is 10000) is
# treated as a high-risk signal.
train['amt_minus_mean'] = train['TransactionAmt'].sub(train['card1_amt_mean'])

# 4. Preview the newly created features.
preview_cols = ['TransactionAmt', 'card1_amt_mean', 'amt_minus_mean']
print(train[preview_cols].head())

In [None]:
# Count encoding: how many times each card (card1) appears.
# card1 can be thought of as the card number (it is anonymised, but the logic
# is the same). The count is the number of non-null TransactionID values in
# each card1 group, broadcast back to every row.
txn_ids_by_card = train.groupby('card1')['TransactionID']
train['card1_count'] = txn_ids_by_card.transform('count')

# Are some cards used far more often than others? Show the highest counts.
cards_by_count = train[['card1', 'card1_count']].sort_values(by='card1_count', ascending=False)
print(cards_by_count.head())

#1