In [1]:
import numpy as np
import pandas as pd
from collections import Counter

from sklearn.preprocessing import LabelEncoder
import torch
from torch.utils.data import Dataset, DataLoader
import torch.nn.functional as F
import torch.nn as nn
torch.__version__

'1.6.0'

# 5.2 Pytorch处理结构化数据
## 简介

首先明确下什么是结构化数据。结构化数据，是高度组织和整齐格式化的数据，是可以放入表格的数据类型。
结构化数据可以理解为是2维表格，如一个csv文件，就是结构化数据。
一般被称作**Tabular Data**,或叫**structured data**，下面来看一下结构化数据例子。

文件来自fastai的自带数据集：
https://github.com/fastai/fastai/blob/master/examples/tabular.ipynb
fastai样例

## 数据预处理

结构化数据一般是一个csv文件或数据库中的一张表格。对于结构化的数据，直接用pasdas库处理

In [2]:
# 读入文件
df = pd.read_csv('./data/adult.csv')

# salary是数据集最后要分类的结果
df['salary'].unique()

array(['>=50k', '<50k'], dtype=object)

In [3]:
# 查看数据类型
df.head()

Unnamed: 0,age,workclass,fnlwgt,education,education-num,marital-status,occupation,relationship,race,sex,capital-gain,capital-loss,hours-per-week,native-country,salary
0,49,Private,101320,Assoc-acdm,12.0,Married-civ-spouse,,Wife,White,Female,0,1902,40,United-States,>=50k
1,44,Private,236746,Masters,14.0,Divorced,Exec-managerial,Not-in-family,White,Male,10520,0,45,United-States,>=50k
2,38,Private,96185,HS-grad,,Divorced,,Unmarried,Black,Female,0,0,32,United-States,<50k
3,38,Self-emp-inc,112847,Prof-school,15.0,Married-civ-spouse,Prof-specialty,Husband,Asian-Pac-Islander,Male,0,0,40,United-States,>=50k
4,42,Self-emp-not-inc,82297,7th-8th,,Married-civ-spouse,Other-service,Wife,Black,Female,0,0,50,United-States,<50k


In [4]:
# pandas的describe 告诉整个数据集的大概结构
df.describe()

Unnamed: 0,age,fnlwgt,education-num,capital-gain,capital-loss,hours-per-week
count,32561.0,32561.0,32074.0,32561.0,32561.0,32561.0
mean,38.581647,189778.4,10.079815,1077.648844,87.30383,40.437456
std,13.640433,105550.0,2.572999,7385.292085,402.960219,12.347429
min,17.0,12285.0,1.0,0.0,0.0,1.0
25%,28.0,117827.0,9.0,0.0,0.0,40.0
50%,37.0,178356.0,10.0,0.0,0.0,40.0
75%,48.0,237051.0,12.0,0.0,0.0,45.0
max,90.0,1484705.0,16.0,99999.0,4356.0,99.0


In [4]:
# 查看一共有多少数据
len(df)

32561

对于模型的训练，只能够处理数字类型数据，首先要将数据分成三个类别

- **训练的结果标签**：即训练结果，通过这个结果就能够明确知道这次训练的任务，是分类还是回归。
- **分类数据**：这类数据离散，无法通过直接输入模型训练。所以预处理时，需要优先处理这部分，这也是数据预处理的主要工作之一
- **数值型数据**：这类数据直接可以输入模型，但是这部分数据有可能还是离散的。如果需要也可以处理，处理后会对训练精度有很大提升

In [6]:
df.columns

Index(['age', 'workclass', 'fnlwgt', 'education', 'education-num',
       'marital-status', 'occupation', 'relationship', 'race', 'sex',
       'capital-gain', 'capital-loss', 'hours-per-week', 'native-country',
       'salary'],
      dtype='object')

In [7]:
df.head(1)

Unnamed: 0,age,workclass,fnlwgt,education,education-num,marital-status,occupation,relationship,race,sex,capital-gain,capital-loss,hours-per-week,native-country,salary
0,49,Private,101320,Assoc-acdm,12.0,Married-civ-spouse,,Wife,White,Female,0,1902,40,United-States,>=50k


In [5]:
# 训练结果
result_var = 'salary'

# 分类型数据
cat_names = [
    'workclass', 'education', 'marital-status', 'occupation', 'relationship',
    'race', 'sex', 'native-country'
]

# 数值型数据
cont_names = [
    'age', 'fnlwgt', 'education-num', 'capital-gain', 'capital-loss',
    'hours-per-week'
]

人工确认完数据类型后，看一下分类类型数据的数量和分布情况

In [10]:
for col in df.columns:
    if col in cat_names:
        ccol = Counter(df[col])
        print(col, len(ccol), ccol)
        print("\n")

workclass 9 Counter({' Private': 22696, ' Self-emp-not-inc': 2541, ' Local-gov': 2093, ' ?': 1836, ' State-gov': 1298, ' Self-emp-inc': 1116, ' Federal-gov': 960, ' Without-pay': 14, ' Never-worked': 7})


education 16 Counter({' HS-grad': 10501, ' Some-college': 7291, ' Bachelors': 5355, ' Masters': 1723, ' Assoc-voc': 1382, ' 11th': 1175, ' Assoc-acdm': 1067, ' 10th': 933, ' 7th-8th': 646, ' Prof-school': 576, ' 9th': 514, ' 12th': 433, ' Doctorate': 413, ' 5th-6th': 333, ' 1st-4th': 168, ' Preschool': 51})


marital-status 7 Counter({' Married-civ-spouse': 14976, ' Never-married': 10683, ' Divorced': 4443, ' Separated': 1025, ' Widowed': 993, ' Married-spouse-absent': 418, ' Married-AF-spouse': 23})


occupation 16 Counter({' Prof-specialty': 4073, ' Craft-repair': 4028, ' Exec-managerial': 4009, ' Adm-clerical': 3720, ' Sales': 3590, ' Other-service': 3247, ' Machine-op-inspct': 1968, ' ?': 1820, ' Transport-moving': 1566, ' Handlers-cleaners': 1347, ' Farming-fishing': 977, ' Tech

下一步将分类型数据转成数字型数据，这部分里还做了对于缺失数据的填充

用pandas的**fillna函数**对分类数据做**空值填充**，标识成一个与其他现有值不一样的值就可以。

这里使用三个中划线 --- ，然后用sklearn的**LabelEncoder函数**进行数据处理

对数值型数据做**0填充处理**，对于数值型数据的填充也可以用平均值或者其他方式。

In [6]:
for col in df.columns:
    if col in cat_names:
        df[col].fillna('---')
        df[col] = LabelEncoder().fit_transform(df[col].astype(str))

    if col in cont_names:
        df[col] = df[col].fillna(0)

In [13]:
df.head()

Unnamed: 0,age,workclass,fnlwgt,education,education-num,marital-status,occupation,relationship,race,sex,capital-gain,capital-loss,hours-per-week,native-country,salary
0,49,4,101320,13,12.0,2,7,5,4,0,0,1902,40,33,>=50k
1,44,4,236746,4,14.0,0,10,1,4,1,10520,0,45,33,>=50k
2,38,4,96185,3,0.0,0,7,4,2,0,0,0,32,33,<50k
3,38,5,112847,6,15.0,2,2,0,1,1,0,0,40,33,>=50k
4,42,6,82297,11,0.0,2,14,5,2,0,0,0,50,33,<50k


数据处理完成，现在所有数据都是数字类型，可以直接输入到模型进行训练

In [7]:
# 分割训练数据和标签
Y = df['salary']
Y_label = LabelEncoder()
Y = Y_label.fit_transform(Y)
Y

array([1, 1, 0, ..., 1, 0, 0])

In [8]:
X = df.drop(columns=result_var)
X

Unnamed: 0,age,workclass,fnlwgt,education,education-num,marital-status,occupation,relationship,race,sex,capital-gain,capital-loss,hours-per-week,native-country
0,49,4,101320,7,12.0,2,15,5,4,0,0,1902,40,39
1,44,4,236746,12,14.0,0,4,1,4,1,10520,0,45,39
2,38,4,96185,11,0.0,0,15,4,2,0,0,0,32,39
3,38,5,112847,14,15.0,2,10,0,1,1,0,0,40,39
4,42,6,82297,5,0.0,2,8,5,2,0,0,0,50,39
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
32556,36,4,297449,9,13.0,0,10,1,4,1,14084,0,40,39
32557,23,0,123983,9,13.0,4,0,3,3,1,0,0,40,39
32558,53,4,157069,7,12.0,2,7,0,4,1,0,0,40,39
32559,32,2,217296,11,9.0,2,14,5,4,0,4064,0,22,39


基本的数据预处理已经完成，只是一些必要处理。如果要提高训练准确率还有很多技巧。

## 定义数据集
用pytorch处理数据，肯定要用**Dataset进行数据集的定义**，定义一个简单的数据集

In [9]:
class tabularDataset(Dataset):
    def __init__(self, X, Y):
        self.x = X.values
        self.y = Y

    def __len__(self):
        return len(self.y)

    def __getitem__(self, idx):
        return (self.x[idx], self.y[idx])

train_ds = tabularDataset(X, Y)

直接用**索引访问**定义好数据集的数据

In [18]:
train_ds[0]

(array([4.9000e+01, 4.0000e+00, 1.0132e+05, 1.3000e+01, 1.2000e+01,
        2.0000e+00, 7.0000e+00, 5.0000e+00, 4.0000e+00, 0.0000e+00,
        0.0000e+00, 1.9020e+03, 4.0000e+01, 3.3000e+01]),
 1)

In [10]:
X.shape

(32561, 14)

In [11]:
Y.shape

(32561,)

## 定义模型

数据已经准备完毕，下一步就是要定义模型，这里使用3层线性层的简单模型

定义模型时候加入**Batch Normalization**做批量归一化：

批量归一化见这篇文章：https://mp.weixin.qq.com/s/FFLQBocTZGqnyN79JbSYcQ

In [12]:
class tabularModel(nn.Module):
    def __init__(self):
        super().__init__()

        # Linear1
        self.lin1 = nn.Linear(14, 500)

        # Linear2
        self.lin2 = nn.Linear(500, 100)

        # Linear3
        self.lin3 = nn.Linear(100, 2)

        self.bn_in = nn.BatchNorm1d(14)
        self.bn1 = nn.BatchNorm1d(500)
        self.bn2 = nn.BatchNorm1d(100)

    def forward(self, x_in):

        # print(x_in.shape)

        # BatchNorm in
        x = self.bn_in(x_in)

        # Linear1 + relu
        x = F.relu(self.lin1(x))

        # BatchNorm 1
        x = self.bn1(x)
        #print(x)

        # Linear2 + relu
        x = F.relu(self.lin2(x))

        # BatchNorm 2
        x = self.bn2(x)
        #print(x)

        # Linear3 + sigmoid
        x = self.lin3(x)
        x = torch.sigmoid(x)
        return x

## 训练

In [14]:
# 训练前指定使用设备
DEVICE = torch.device("cpu")
if torch.cuda.is_available():
    DEVICE = torch.device("cuda")
print(DEVICE)

cpu


In [15]:
# 损失函数
criterion = nn.CrossEntropyLoss()

In [16]:
#实例化模型
model = tabularModel().to(DEVICE)
print(model)

tabularModel(
  (lin1): Linear(in_features=14, out_features=500, bias=True)
  (lin2): Linear(in_features=500, out_features=100, bias=True)
  (lin3): Linear(in_features=100, out_features=2, bias=True)
  (bn_in): BatchNorm1d(14, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
  (bn1): BatchNorm1d(500, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
  (bn2): BatchNorm1d(100, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
)


In [17]:
# 测试模型
rn = torch.rand(3, 14).to(DEVICE)
model(rn)

tensor([[0.5634, 0.3531],
        [0.4887, 0.5639],
        [0.5049, 0.6096]], grad_fn=<SigmoidBackward>)

In [18]:
# 学习率
LEARNING_RATE = 0.01

# BS
batch_size = 1024

# 优化器
optimizer = torch.optim.Adam(model.parameters(), lr=LEARNING_RATE)

In [21]:
# DataLoader加载数据
train_dl = DataLoader(train_ds, batch_size=batch_size, shuffle=True)
train_dl

<torch.utils.data.dataloader.DataLoader at 0x128f02a60>

以上的基本步骤每个训练过程都需要，下面开始模型的训练

In [22]:
%%time

model.train()

# 训练10轮
TOTAL_EPOCHS = 100

# 记录损失函数
losses = []

for epoch in range(TOTAL_EPOCHS):
    for i, (x, y) in enumerate(train_dl):

        #输入float类型
        x = x.float().to(DEVICE)

        #结果标签long类型
        y = y.long().to(DEVICE)

        # 清零
        optimizer.zero_grad()

        # 计算网络输出
        outputs = model(x)

        # 计算损失函数
        loss = criterion(outputs, y)

        # 梯度反向传播
        loss.backward()

        # 优化算法
        optimizer.step()

        losses.append(loss.cpu().data.item())
    if epoch % 10 == 9:
        print('Epoch : %d/%d,   Loss: %.4f' %
              (epoch + 1, TOTAL_EPOCHS, np.mean(losses)))

Epoch : 10/100,   Loss: 0.4564
Epoch : 20/100,   Loss: 0.4514
Epoch : 30/100,   Loss: 0.4482
Epoch : 40/100,   Loss: 0.4456
Epoch : 50/100,   Loss: 0.4429
Epoch : 60/100,   Loss: 0.4405
Epoch : 70/100,   Loss: 0.4382
Epoch : 80/100,   Loss: 0.4360
Epoch : 90/100,   Loss: 0.4339
Epoch : 100/100,   Loss: 0.4320
CPU times: user 6min 9s, sys: 4.74 s, total: 6min 13s
Wall time: 1min 34s


训练完成后看一下模型的准确率

In [30]:
model.eval()
correct = 0
total = 0

for i, (x, y) in enumerate(train_dl):
    x = x.float().to(DEVICE)
    y = y.long()
    
    outputs = model(x).cpu()
    _, predicted = torch.max(outputs.data, 1)
    total += y.size(0)
    correct += (predicted == y).sum()
    
print("准确率: %0.2f%%"  % (100.0*correct / total))

准确率: 90.06%


以上就是基本流程