## 信用卡欺诈数据集 二分类问题

In [1]:
import pandas as pd

In [2]:
data = pd.read_csv(r'../data/credit-a.csv')
data.head()

Unnamed: 0,0,30.83,0.1,0.2,0.3,9,0.4,1.25,0.5,0.6,1,1.1,0.7,202,0.8,-1
0,1,58.67,4.46,0,0,8,1,3.04,0,0,6,1,0,43,560.0,-1
1,1,24.5,0.5,0,0,8,1,1.5,0,1,0,1,0,280,824.0,-1
2,0,27.83,1.54,0,0,9,0,3.75,0,0,5,0,0,100,3.0,-1
3,0,20.17,5.625,0,0,9,0,1.71,0,1,0,1,2,120,0.0,-1
4,0,32.08,4.0,0,0,6,0,2.5,0,1,0,0,0,360,0.0,-1


### 特征值为前15列
### 输出为最后一列，需要把-1替换为0

In [5]:
all_features = data.iloc[:,:-1]
all_labels = data.iloc[:,-1].replace(-1, 0)

### 将特征值标准化

In [6]:
all_features = all_features.apply(
    lambda x: (x - x.mean()) / (x.std())
)

In [7]:
import torch

In [8]:
X = torch.from_numpy(all_features.values).type(torch.float32)
Y = torch.from_numpy(all_labels.values.reshape(-1, 1)).type(torch.float32)

In [9]:
from torch.utils.data import TensorDataset, DataLoader

### 当数据集很小时，可以使用k折验证法验证模型的效果
### k折验证法：将数据集分割为K个部分，取其中一部分作为测试集、剩余部分作为训练集

In [10]:
def k_folder_split(k, i, features, labels, batch_size):
    assert i <= k
    folder_size = features.shape[0] // k
    l = i * folder_size
    r = (i + 1) * folder_size
    train_features = torch.cat([features[:l], features[r:]])
    test_features = features[l:r]
    train_labels = torch.cat([labels[:l], labels[r:]])
    test_labels = labels[l:r]
    return (
        DataLoader(TensorDataset(train_features, train_labels), batch_size=batch_size, shuffle=True),
        DataLoader(TensorDataset(test_features, test_labels), batch_size=batch_size)
    )

In [11]:
from torch import nn

### 使用两层Linear层，然后sigmoid激活
### 二分类问题使用nn.BCELoss()计算损失

In [17]:
in_features = X.shape[1]
model = nn.Sequential(
    nn.Linear(in_features, 32),
    nn.Linear(32, 1),
    nn.Sigmoid()
)
loss = nn.BCELoss()
opt = torch.optim.Adam(model.parameters(), lr=0.01)

In [18]:
k = 5
num_epochs = 50
BATCHSIZE = 16

for i in range(k):
    train_iter, test_iter = k_folder_split(k, i, X, Y, batch_size=BATCHSIZE)
    acc = 0
    count = 0
    for epoch in range(num_epochs):
        for x, y in train_iter:
            opt.zero_grad()
            l = loss(model(x), y)
            l.backward()
            opt.step()
        with torch.no_grad():
            total = 0
            correct = 0
            for x, y in test_iter:
                acc += ((model(x).data.numpy() > 0.5).astype('int') == y.numpy()).mean()
                count += 1
    print(f'{i + 1} of {k} accuracy: {round((acc / count) * 100, 3)}%')



1 of 5 accuracy: 68.347%
2 of 5 accuracy: 84.583%
3 of 5 accuracy: 93.069%
4 of 5 accuracy: 85.528%
5 of 5 accuracy: 92.125%
