In [1]:
import numpy as np

import pandas as pd
pd.set_option('display.max_rows', 100)
pd.set_option('display.max_columns', 115)
pd.set_option('display.width', 250)

import seaborn as sns
import matplotlib.pyplot as plt

import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import Dataset, DataLoader

from sklearn.preprocessing import StandardScaler

In [91]:
trainRaw = pd.read_csv('train_final.csv')
trainData = trainRaw
#Remove Duplicate Inputs in Training Data
#trainData = trainRaw.drop_duplicates(subset=trainRaw.columns.difference(['income>50K']),keep=False).reset_index(drop=True)
x = len(trainData.index)

testData = pd.read_csv('test_final.csv')

In [77]:
#Ignore Training Data with ?
trainData = trainData.replace(to_replace = "?",value=np.nan)
trainData = trainData.dropna().reset_index(drop=True)
x = len(trainData.index)
data = trainData.append(testData.iloc[:,1:],ignore_index=True)

In [92]:
#Replace missing data with mode of column
data = trainData.append(testData.iloc[:,1:],ignore_index=True)
for col in data.columns:
    data[col] = data[col].replace(to_replace="?",value=data[col].mode()[0])

In [20]:
#Replace missing data with mode matching output
for row in range(len(trainData.index)):
    for col in range(len(trainData.columns)):
        if(trainData.iloc[row,col] == "?"):
            subTrainData = trainData[trainData['income>50K'] == trainData.iloc[row,-1]]
            mode = subTrainData[trainData.columns[col]].mode()[0]
            trainData.iloc[row,col] = mode

data = trainData.append(testData.iloc[:,1:],ignore_index=True)
for col in data.columns:
    data[col] = data[col].replace(to_replace="?",value=data[col].mode()[0])
    
#print(data)

In [93]:
#Drop fnlwgt and eduction.num, the former because random, the latter because redundant
data = data.drop(['fnlwgt','education.num'],axis=1)
#print(data)

In [94]:
datDummies = pd.get_dummies(data.iloc[:,:-1])
datDummies = (datDummies-datDummies.min())/(datDummies.max()-datDummies.min())
#print(datDummies)

X = datDummies.iloc[:x,:]
y = data['income>50K'][:x]
test = datDummies.iloc[x:,:]
print(len(datDummies.index))
print(X.shape)
N = X.shape[1]
print(len(y.index))
print(len(test.index))

48842
(25000, 103)
25000
23842


In [95]:
EPOCHS = 100
BATCH_SIZE = 200
LEARNING_RATE = 0.01

In [96]:
class TrainData(Dataset):
    
    def __init__(self, X_data, y_data):
        self.X_data = X_data
        self.y_data = y_data
        
    def __getitem__(self, index):
        return self.X_data[index], self.y_data[index]
        
    def __len__ (self):
        return len(self.X_data)
    
train_data = TrainData(torch.FloatTensor(X.to_numpy()), torch.FloatTensor(y.to_numpy()))

class TestData(Dataset):
    
    def __init__(self, X_data):
        self.X_data = X_data
        
    def __getitem__(self, index):
        return self.X_data[index]
        
    def __len__ (self):
        return len(self.X_data)
    

test_data = TestData(torch.FloatTensor(test.to_numpy()))

train_loader = DataLoader(dataset=train_data, batch_size=BATCH_SIZE, shuffle=True)
test_loader = DataLoader(dataset=test_data, batch_size=1)

In [97]:
class BinaryClassification(nn.Module):
    def __init__(self,N):
        super(BinaryClassification, self).__init__()        
        # Number of input features is 105
        self.layer_1 = nn.Linear(N, 16) 
        self.layer_2 = nn.Linear(16,16)
        self.layer_out = nn.Linear(16, 1) 
        
        self.relu = nn.ReLU()
        self.sigmoid = nn.Sigmoid()
        self.tanh = nn.Tanh()
        self.dropout = nn.Dropout(p=0.1)
        self.batchnorm1 = nn.BatchNorm1d(16)
        self.batchnorm2 = nn.BatchNorm1d(16)
        
    def forward(self, inputs):
        x = self.relu(self.layer_1(inputs))
        x = self.batchnorm1(x)
        x = self.relu(self.layer_2(x))
        x = self.batchnorm2(x)
        x = self.dropout(x)
        x = self.layer_out(x)
        
        return x

In [98]:
device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
print(device)

cpu


In [99]:
model = BinaryClassification(N)
model.to(device)
print(model)
criterion = nn.BCEWithLogitsLoss()
optimizer = optim.Adam(model.parameters(), lr=LEARNING_RATE)
lambda1 = lambda epoch: 1/(1+epoch/30)
scheduler = optim.lr_scheduler.LambdaLR(optimizer, lr_lambda=[lambda1])

BinaryClassification(
  (layer_1): Linear(in_features=103, out_features=16, bias=True)
  (layer_2): Linear(in_features=16, out_features=16, bias=True)
  (layer_out): Linear(in_features=16, out_features=1, bias=True)
  (relu): ReLU()
  (sigmoid): Sigmoid()
  (tanh): Tanh()
  (dropout): Dropout(p=0.1, inplace=False)
  (batchnorm1): BatchNorm1d(16, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
  (batchnorm2): BatchNorm1d(16, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
)


In [100]:
def binary_acc(y_pred, y_test):
    y_pred_tag = torch.round(torch.sigmoid(y_pred))

    correct_results_sum = (y_pred_tag == y_test).sum().float()
    acc = correct_results_sum/y_test.shape[0]
    acc = torch.round(acc * 100)
    
    return acc

In [101]:
model.train()
for e in range(1, EPOCHS+1):
    epoch_loss = 0
    epoch_acc = 0
    for X_batch, y_batch in train_loader:
        X_batch, y_batch = X_batch.to(device), y_batch.to(device)
        optimizer.zero_grad()
        
        y_pred = model(X_batch)
        
        loss = criterion(y_pred, y_batch.unsqueeze(1))
        acc = binary_acc(y_pred, y_batch.unsqueeze(1))
        
        loss.backward()
        optimizer.step()
        
        epoch_loss += loss.item()
        epoch_acc += acc.item()
        

    print(f'Epoch {e+0:05}: | Loss: {epoch_loss/len(train_loader):.5f} | Acc: {epoch_acc/len(train_loader):.3f}')
    scheduler.step()

Epoch 00001: | Loss: 0.39538 | Acc: 81.520
Epoch 00002: | Loss: 0.33605 | Acc: 84.440
Epoch 00003: | Loss: 0.33205 | Acc: 84.496
Epoch 00004: | Loss: 0.32644 | Acc: 84.896
Epoch 00005: | Loss: 0.32544 | Acc: 84.800
Epoch 00006: | Loss: 0.31960 | Acc: 85.088
Epoch 00007: | Loss: 0.31582 | Acc: 85.248
Epoch 00008: | Loss: 0.31439 | Acc: 85.528
Epoch 00009: | Loss: 0.31078 | Acc: 85.640
Epoch 00010: | Loss: 0.30998 | Acc: 85.472
Epoch 00011: | Loss: 0.30624 | Acc: 85.664
Epoch 00012: | Loss: 0.30678 | Acc: 85.632
Epoch 00013: | Loss: 0.30454 | Acc: 85.856
Epoch 00014: | Loss: 0.30458 | Acc: 85.752
Epoch 00015: | Loss: 0.30351 | Acc: 85.640
Epoch 00016: | Loss: 0.30337 | Acc: 85.832
Epoch 00017: | Loss: 0.29970 | Acc: 85.832
Epoch 00018: | Loss: 0.29891 | Acc: 85.904
Epoch 00019: | Loss: 0.30038 | Acc: 86.064
Epoch 00020: | Loss: 0.29790 | Acc: 86.272
Epoch 00021: | Loss: 0.29660 | Acc: 86.152
Epoch 00022: | Loss: 0.29807 | Acc: 86.112
Epoch 00023: | Loss: 0.29411 | Acc: 86.272
Epoch 00024

In [102]:
y_pred_list = []

model.eval()
with torch.no_grad():
    for X_batch in test_loader:
        X_batch = X_batch.to(device)
        y_test_pred = model(X_batch)
        y_test_pred = torch.sigmoid(y_test_pred)
        y_pred_tag = torch.round(y_test_pred)
        y_pred_list.append(y_pred_tag.cpu().numpy())
        
y_pred_list = [a.squeeze().tolist() for a in y_pred_list]
y_pred_list = np.array(y_pred_list)
y_pred_list = torch.from_numpy(y_pred_list)
print(y_pred_list)

tensor([0., 0., 0.,  ..., 1., 0., 1.], dtype=torch.float64)


In [103]:
valid = np.genfromtxt('validation.csv')
valid = torch.from_numpy(valid)
print(valid)

tensor([0., 0., 0.,  ..., 1., 0., 0.], dtype=torch.float64)


In [104]:
acc = binary_acc(y_pred_list,valid)

print(f'{acc.item()/100*.91784:.5f}')

0.78016


In [105]:
y_pred_list = [a.squeeze().tolist() for a in y_pred_list]
with open('NNSub.csv', 'w') as f:
    f.write("ID,Prediction\n")
    for i in range(len(y_pred_list)):
        f.write(str(i+1) + "," + str(y_pred_list[i]) + "\n")