In [12]:
import torch
import torch.nn as nn
import torch.nn.functional as F

import numpy as np
import random
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score
import matplotlib.pyplot as plt
%matplotlib inline

infile = "../../data/run0/run0.npz"
#outfile = "../../data/balanced_adult_matched_pairs/balanced_adult_matched_pairs.npz"

random.seed(0)

In [6]:
data = np.load(infile)

In [19]:
list(data.keys())

['x_train',
 'x_test',
 'y_train',
 'y_test',
 'attr_train',
 'attr_test',
 'train_inds',
 'valid_inds']

In [20]:
x_train = data['x_train'][data['train_inds']]
y_train = data['y_train'][data['train_inds']]
attr_train = data['attr_train'][data['train_inds']]

x_valid = data['x_train'][data['valid_inds']]
y_valid = data['y_train'][data['valid_inds']]
attr_valid = data['attr_train'][data['valid_inds']]

x_test = data['x_test']
y_test = data['y_test']
attr_test = data['attr_test']

xa_train = np.concatenate((x_train, attr_train), axis = 1)
xa_valid = np.concatenate((x_valid, attr_valid), axis = 1)
xa_test = np.concatenate((x_test, attr_test), axis = 1)

In [6]:
model = LogisticRegression()

In [14]:
model.fit(xa_train, y_train[:,1])

LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='ovr', n_jobs=1,
          penalty='l2', random_state=None, solver='liblinear', tol=0.0001,
          verbose=0, warm_start=False)

In [15]:
model.score(xa_train, y_train[:,1])

0.78565

In [16]:
model.score(xa_test, y_test[:,1])

0.8302

In [17]:
model.score(xa_valid, y_valid[:,1])

0.7871

In [24]:
for i in range(model.coef_.shape[1]):
    print(i, model.coef_[0][i])

0 -0.5393916578290149
1 0.4816547065464337
2 -0.5293972178167717
3 -0.08352527237617917
4 0.51407033002563
5 -0.5264150004270932
6 0.47945838388000933
7 -0.47469836769114365
8 0.44012826547932343
9 -0.46917460801216143
10 0.43272211424107093
11 0.0759706061788131
12 -0.04423772285933652
13 -0.46542061621778363
14 0.4762274248920727
15 -0.08070263623938101
16 0.1236681835080698
17 0.49607565021004524
18 -0.08480440334159293
19 -0.1003181407535637
20 0.14804395984250848
21 -0.10554777513273798
22 0.066868786534361
23 -0.4927205366350978
24 -0.5292446830119528
25 0.44874435360054016
26 0.4464875099902162
27 0.45545971836147625
28 0.07782469582260987
29 -0.13170146132590455
30 -0.019075997742463865
31 0.05124548441545944
32 0.0002733957184938747
33 -0.06160709094232852
34 0.056044373742144854
35 -0.06031702737734131
36 0.00875307118881495
37 0.04414452159913376
38 -0.07246063335281216
39 0.05163936947003219


In [23]:
model.coef_[0][1]

0.4816547065464337

In [63]:
x_test.shape

(5000, 30)

In [103]:
print(attr_test.shape)
print(attr_test)

(5000, 10)
[[1. 0. 0. ... 0. 0. 0.]
 [1. 0. 0. ... 0. 0. 0.]
 [1. 0. 0. ... 0. 0. 0.]
 ...
 [1. 0. 0. ... 0. 0. 0.]
 [1. 0. 0. ... 0. 0. 0.]
 [1. 0. 0. ... 0. 0. 0.]]


In [123]:
attr_test2 = np.concatenate( (np.zeros((5000,4)), np.ones((5000,1)), np.zeros((5000,5))  ), axis=1 )
attr_test2 = np.zeros((5000,10))

# Make training set into mini-batches

In [34]:
test = np.array([[1,1,1,1], [3,3,3,3], [4,4,4,4], [50,50,50,50]])
ytest = np.array([0, 0, 1, 1])
xytest = np.concatenate( (test, np.expand_dims(ytest, axis=1) ) , axis = 1)
print(xytest)
print(np.split(xytest, 2))

[[ 1  1  1  1  0]
 [ 3  3  3  3  0]
 [ 4  4  4  4  1]
 [50 50 50 50  1]]
[array([[1, 1, 1, 1, 0],
       [3, 3, 3, 3, 0]]), array([[ 4,  4,  4,  4,  1],
       [50, 50, 50, 50,  1]])]


In [93]:
x_train_w_y = np.concatenate( (x_train, attr_train, np.expand_dims(y_train[:, 1], axis=1) ) , axis = 1)


In [94]:
np.random.shuffle(x_train_w_y)
print(x_train_w_y.shape)

(40000, 41)


In [95]:
x_train_batches = np.split(x_train_w_y, 400)
print(x_train_batches[0].shape)

(100, 41)


In [96]:
x_train_batches = torch.tensor(x_train_batches, dtype=torch.float)

In [124]:
x_test_w_y = np.concatenate( (x_test, attr_test2, np.expand_dims(y_test[:, 1], axis=1) ) , axis = 1)
np.random.shuffle(x_test_w_y)
x_test_batches = np.split(x_test_w_y, 50)
x_test_batches = torch.tensor(x_test_batches, dtype=torch.float)

In [98]:
class Net(nn.Module):

    def __init__(self):
        super(Net, self).__init__()
        self.fc1 = nn.Linear(40, 10)
        self.fc2 = nn.Linear(10, 10)
        self.fc3 = nn.Linear(10, 10)
        self.fc4 = nn.Linear(10, 2)

    def forward(self, x):
        # Max pooling over a (2, 2) window
        #x = x.view(-1, self.num_flat_features(x))
        x = F.leaky_relu(self.fc1(x))
        x = F.leaky_relu(self.fc2(x))
        x = F.leaky_relu(self.fc3(x))
        x = F.sigmoid(self.fc4(x))
        return x

    def num_flat_features(self, x):
        size = x.size()[1:]  # all dimensions except the batch dimension
        num_features = 1
        for s in size:
            num_features *= s
        return num_features


net = Net()
print(net)

Net(
  (fc1): Linear(in_features=40, out_features=10, bias=True)
  (fc2): Linear(in_features=10, out_features=10, bias=True)
  (fc3): Linear(in_features=10, out_features=10, bias=True)
  (fc4): Linear(in_features=10, out_features=2, bias=True)
)


In [99]:
import torch.optim as optim

# create your optimizer
criterion = nn.CrossEntropyLoss()


optimizer = optim.Adam(net.parameters(), lr=0.001)



In [126]:
for epoch in range(100):  # loop over the dataset multiple times

    running_loss = 0.0
    for i, data in enumerate(x_train_batches, 0):
        # get the inputs
        inputs = data[:,:-1]
        labels = data[:,-1].type(torch.long)

        # zero the parameter gradients
        optimizer.zero_grad()

        # forward + backward + optimize
        outputs = net(inputs)
        loss = criterion(outputs, labels)
        loss.backward()
        optimizer.step()

        # print statistics
        running_loss += loss.item()
        if epoch % 10 == 0 and i % 400 == 399:    # print every 400 mini-batches
            print('[%d, %5d] loss: %.3f' %
                  (epoch + 1, i + 1, running_loss / 400))
            running_loss = 0.0
            
    np.random.shuffle(x_train_batches)
    x_train_batches = torch.tensor(x_train_batches, dtype=torch.float)

print('Finished Training')

[1,   400] loss: 0.313




[11,   400] loss: 0.313
[21,   400] loss: 0.313
[31,   400] loss: 0.313
[41,   400] loss: 0.313
[51,   400] loss: 0.313
[61,   400] loss: 0.313
[71,   400] loss: 0.313
[81,   400] loss: 0.313
[91,   400] loss: 0.313
Finished Training


In [127]:
correct = 0
total = 0
with torch.no_grad():
    for data in x_test_batches:
        inputs = data[:,:-1]
        labels = data[:,-1].type(torch.long)
        
        outputs = net(inputs)
        _, predicted = torch.max(outputs.data, 1)
        total += labels.size(0)
        correct += (predicted == labels).sum().item()

print('Accuracy of the network on the test data: %d %%' % (
    100 * correct / total))

Accuracy of the network on the test data: 79 %
