In [1]:
import torch
from torch import nn
from torch.nn import init
import numpy as np
import pandas as pd
import sys
sys.path.append('../')
import ML as ml

In [4]:
# Read the complete training table from disk, then keep the eight input
# columns followed by the two target columns (ahb, phb), in that order.
data = pd.read_csv('train_dataset.csv')
df = data.loc[:, ['m1', 'm2', 't1', 't2', 'L1', 'L2', 'e_b', 'a_p', 'ahb', 'phb']]
df

Unnamed: 0,m1,m2,t1,t2,L1,L2,e_b,a_p,ahb,phb
0,0.67,0.33,4587.1738,3608.2662,0.214719,0.015262,0.20,0.93,0,2
1,0.69,0.31,4644.7566,3550.6834,0.241070,0.012173,0.34,1.07,2,2
2,0.71,0.29,4702.3394,3493.1006,0.269922,0.009563,0.25,1.16,2,2
3,0.63,0.37,4472.0082,3723.4318,0.168852,0.023102,0.17,0.93,2,2
4,0.53,0.47,4184.0942,4011.3458,0.087130,0.055564,0.06,0.93,2,2
...,...,...,...,...,...,...,...,...,...,...
299995,0.65,0.35,4529.5910,3665.8490,0.190700,0.018884,0.09,1.15,2,2
299996,0.56,0.44,4270.4684,3924.9716,0.107359,0.043531,0.10,1.00,2,2
299997,0.79,0.21,4932.6706,3262.7694,0.414005,0.002908,0.52,0.72,0,0
299998,0.54,0.46,4212.8856,3982.5544,0.093511,0.051303,0.08,0.71,0,0


In [5]:
# Hold-out split by position: the first n_train rows are used for training,
# every remaining row is used for validation.
n_train = 270000
train_set = df.iloc[:n_train]
val_set = df.iloc[n_train:]

# Drop the last two columns (the ahb / phb targets) to obtain the model inputs.
# An explicit copy is taken because this frame is modified column-wise in a
# later cell; writing into a bare slice of `df` raises SettingWithCopyWarning
# and leaves it ambiguous whether `df` itself is being changed.
all_features = df.iloc[:, :-2].copy()
all_features.head(10)

Unnamed: 0,m1,m2,t1,t2,L1,L2,e_b,a_p
0,0.67,0.33,4587.1738,3608.2662,0.214719,0.015262,0.2,0.93
1,0.69,0.31,4644.7566,3550.6834,0.24107,0.012173,0.34,1.07
2,0.71,0.29,4702.3394,3493.1006,0.269922,0.009563,0.25,1.16
3,0.63,0.37,4472.0082,3723.4318,0.168852,0.023102,0.17,0.93
4,0.53,0.47,4184.0942,4011.3458,0.08713,0.055564,0.06,0.93
5,0.75,0.25,4817.505,3377.935,0.335842,0.005569,0.23,0.81
6,0.74,0.26,4788.7136,3406.7264,0.318276,0.006428,0.03,0.66
7,0.53,0.47,4184.0942,4011.3458,0.08713,0.055564,0.22,1.39
8,0.61,0.39,4414.4254,3781.0146,0.149023,0.027979,0.0,1.21
9,0.59,0.41,4356.8426,3838.5974,0.13107,0.033586,0.36,0.57


In [6]:
def _standardize(column):
    """Return the column shifted to zero mean and divided by its standard deviation."""
    return (column - column.mean()) / (column.std())


# Every column whose dtype is not `object` is treated as numeric.
not_object = all_features.dtypes != 'object'
numeric_features = all_features.dtypes[not_object].index

# Z-score each numeric column independently. After standardization every
# feature has mean 0, so missing values can simply be replaced with 0.
scaled = all_features[numeric_features].apply(_standardize)
all_features[numeric_features] = scaled.fillna(0)

In [7]:
# One-hot encode any non-numeric columns. With dummy_na=False a missing value
# does not get an indicator column of its own.
all_features = pd.get_dummies(data=all_features, dummy_na=False)
all_features

Unnamed: 0,m1,m2,t1,t2,L1,L2,e_b,a_p
0,0.052350,-0.052350,0.052350,-0.052350,-0.147837,-0.339999,-0.157966,-0.226514
1,0.279597,-0.279597,0.279597,-0.279597,0.092073,-0.504408,0.858060,0.258776
2,0.506844,-0.506844,0.506844,-0.506844,0.354745,-0.643391,0.204901,0.570748
3,-0.402143,0.402143,-0.402143,0.402143,-0.565428,0.077334,-0.375685,-0.226514
4,-1.538377,1.538377,-1.538377,1.538377,-1.309453,1.805365,-1.173991,-0.226514
...,...,...,...,...,...,...,...,...
299995,-0.174897,0.174897,-0.174897,0.174897,-0.366518,-0.147162,-0.956271,0.536084
299996,-1.197507,1.197507,-1.197507,1.197507,-1.125278,1.164857,-0.883698,0.016131
299997,1.415831,-1.415831,1.415831,-1.415831,1.666531,-0.997637,2.164378,-0.954448
299998,-1.424754,1.424754,-1.424754,1.424754,-1.251354,1.578554,-1.028845,-0.989112


In [8]:
# Split the preprocessed feature table by row position (same cut as the
# train/validation frames) and convert each part to a float32 tensor.
train_features = torch.tensor(all_features.iloc[:n_train].to_numpy(), dtype=torch.float32)
val_features = torch.tensor(all_features.iloc[n_train:].to_numpy(), dtype=torch.float32)

# Integer class labels are read from the `phb` column
# (the other target column in the table is `ahb`).
train_labels = torch.tensor(train_set['phb'].to_numpy(), dtype=torch.int64)
val_labels = torch.tensor(val_set['phb'].to_numpy(), dtype=torch.int64)

In [9]:
# Fix torch's global RNG before the first stochastic step so that parameter
# initialisation (and any later torch-driven shuffling) is repeatable when the
# notebook is re-run from a fresh kernel.
torch.manual_seed(0)

# 8 input features -> 3 output logits, with 128 units in each hidden layer.
num_inputs, num_outputs, num_hiddens = 8, 3, 128

# Fully connected classifier: three ReLU hidden layers followed by a linear
# output layer that emits raw logits (no softmax here).
net = nn.Sequential(
    ml.FlattenLayer(),  # project-local layer; presumably flattens each sample to 1-D -- confirm in ML
    nn.Linear(num_inputs, num_hiddens),
    nn.ReLU(),
    nn.Linear(num_hiddens, num_hiddens),
    nn.ReLU(),
    nn.Linear(num_hiddens, num_hiddens),
    nn.ReLU(),
    nn.Linear(num_hiddens, num_outputs),
)

# Replace the default initialisation: every parameter tensor (weights and
# biases alike) is redrawn from a normal distribution with mean 0, std 0.01.
for params in net.parameters():
    init.normal_(params, mean=0, std=0.01)


In [10]:
batch_size = 512

# Training batches are drawn in a fresh random order every epoch.
dataset = torch.utils.data.TensorDataset(train_features, train_labels)
train_iter = torch.utils.data.DataLoader(dataset, batch_size, shuffle=True)

# Validation batches keep their original order: evaluation gains nothing from
# random ordering, and a fixed order keeps evaluation passes deterministic.
valset = torch.utils.data.TensorDataset(val_features, val_labels)
val_iter = torch.utils.data.DataLoader(valset, batch_size, shuffle=False)

In [11]:
# Cross-entropy criterion applied to the network's raw logits, minimised by
# stochastic gradient descent with momentum over all network parameters.
loss = nn.CrossEntropyLoss()
optimizer = torch.optim.SGD(params=net.parameters(), momentum=0.8, lr=0.5)

# Number of full passes over the training loader.
num_epochs = 100

In [12]:
# Run the training loop from the project-local ML helper module.
# NOTE(review): the two None arguments are presumably placeholders for a manual
# parameter list and learning rate that are unused when an optimizer is
# supplied -- confirm against ML.train.
train_ls, test_ls = ml.train(
    net,
    train_iter,
    val_iter,
    loss,
    num_epochs,
    batch_size,
    None,
    None,
    optimizer,
)

epoch 1, loss 0.0009, train acc 0.798, test acc 0.958
epoch 2, loss 0.0002, train acc 0.965, test acc 0.973
epoch 3, loss 0.0001, train acc 0.973, test acc 0.977
epoch 4, loss 0.0001, train acc 0.977, test acc 0.977
epoch 5, loss 0.0001, train acc 0.980, test acc 0.982
epoch 6, loss 0.0001, train acc 0.980, test acc 0.986
epoch 7, loss 0.0001, train acc 0.982, test acc 0.980
epoch 8, loss 0.0001, train acc 0.982, test acc 0.978
epoch 9, loss 0.0001, train acc 0.983, test acc 0.985
epoch 10, loss 0.0001, train acc 0.983, test acc 0.981
epoch 11, loss 0.0001, train acc 0.984, test acc 0.984
epoch 12, loss 0.0001, train acc 0.984, test acc 0.988
epoch 13, loss 0.0001, train acc 0.985, test acc 0.985
epoch 14, loss 0.0001, train acc 0.985, test acc 0.984
epoch 15, loss 0.0001, train acc 0.985, test acc 0.984
epoch 16, loss 0.0001, train acc 0.985, test acc 0.989
epoch 17, loss 0.0001, train acc 0.986, test acc 0.982
epoch 18, loss 0.0001, train acc 0.986, test acc 0.984
epoch 19, loss 0.00

In [13]:
# Persist the trained model as a whole pickled module object (not only a
# state_dict). Loading it back therefore requires the classes used inside the
# network, including the project-local flatten layer, to be importable.
PATH = "./phb_net.pt"
torch.save(obj=net, f=PATH)