In [1]:
import json
import os
import sys
import time

import numpy as np
import pandas as pd
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
import torchvision
from torch.utils.data import TensorDataset, DataLoader
from torchvision import transforms, datasets

Model Definition

In [2]:

class Net(nn.Module):
    """Fully connected regressor: input layer, three hidden layers, scalar output.

    A ReLU follows every layer except the last one, so each input row is
    mapped to a single unbounded real value.

    Args:
        in_features: Width of each input row. Defaults to 573, the value this
            notebook was written with.
        hidden_features: Output width of the input layer and width of every
            hidden layer. Defaults to 128.
    """

    def __init__(self, in_features=573, hidden_features=128):
        super().__init__()

        # The attribute names below become the state_dict keys, so they must
        # stay stable for previously saved checkpoints to keep loading.
        self.input = nn.Linear(in_features, hidden_features)
        self.hidden1 = nn.Linear(hidden_features, hidden_features)
        self.hidden2 = nn.Linear(hidden_features, hidden_features)
        self.hidden3 = nn.Linear(hidden_features, hidden_features)
        self.output = nn.Linear(hidden_features, 1)

    def forward(self, x):
        """Return predictions of shape (..., 1) for inputs of shape (..., in_features)."""
        for layer in (self.input, self.hidden1, self.hidden2, self.hidden3):
            x = F.relu(layer(x))
        # No activation on the final layer: this is a regression head.
        return self.output(x)


Variable Definitions

In [3]:
# --- Run configuration: the values a reader is most likely to tune ---

# Input data and training hyperparameters.
dataPath = "../large_field_preprocessed_data.csv"
batchSize = 32
epochs = 100

# Set to False to force CPU execution even when a GPU is available.
useCUDA = True

# Unix timestamp (whole seconds) taken when this cell runs; it gives each
# run's checkpoint a distinct file name.
t = int(time.time())
modelPath = f"models/{t}"

Device Check

In [4]:
# Use the first GPU only when it is both requested (useCUDA) and actually
# available; otherwise fall back to the CPU.
device = torch.device('cuda:0' if useCUDA and torch.cuda.is_available() else 'cpu')

Load data

In [5]:
# Read the preprocessed table from the CSV file at dataPath.
data = pd.read_csv(filepath_or_buffer=dataPath)

Additional Formatting

In [6]:
# Columns removed from the frame before it is converted to tensors.
dropColumns = [
    "SERIAL",
    "PERNUM",
    "HHWT",
    "CLUSTER",
    "STRATA",
    "PERWT",
    "YRMARR",
    "YRNATUR",
    "RACNUM",
    "index",
]

data = data.drop(columns=dropColumns)

# Replace missing values with False across the whole frame in one call.
# A per-column `data[column].fillna(False, inplace=True)` is a chained
# in-place call on a column selection: under pandas Copy-on-Write it never
# updates `data`, so NaNs would silently survive. Reassigning the result of
# a frame-level fillna works on every pandas version.
data = data.fillna(False)

# Drop every row whose wage is zero. Missing wages were just filled with
# False, which compares equal to 0, so those rows are removed here as well.
zeroWageRows = data.index[data['INCWAGE_CPIU_2010'] == 0]
data = data.drop(zeroWageRows)


Data Typing

In [7]:
# Quick look at the frame that is about to be converted to tensors.
print(data.shape)
print(data.columns)

# Optional debugging aid: dump the column names and the first row to JSON.
# with open("columns.json", "w") as f:
#     json.dump(list(data.columns), f)
#
# with open("heads.json", "w") as f:
#     json.dump(list(data.head(1).values[0]), f)

# Cast every column, target included, to float32 in a single call.
# The target and the feature columns all end up with the same dtype, so a
# per-column loop that special-cases the target is unnecessary; one
# frame-level astype also avoids hundreds of individual column reassignments.
data = data.astype(np.float32)

# print(data.head(1))

(1369570, 574)
Index(['AGE', 'WKSWORK1', 'UHRSWORK', 'INCWAGE_CPIU_2010', 'TRANTIME',
       'isFemale', 'isAmericanIndian', 'isAsian', 'isBlack',
       'isPacificIslander',
       ...
       'occupation_Machine_Feeders_and_Offbearers',
       'occupation_Packers_and_Packagers_Hand',
       'occupation_Pumping_Station_Operators',
       'occupation_Refuse_and_Recyclable_Material_Collectors',
       'occupation_Material_moving_workers_nec',
       'occupation_Military_Officer_Special_and_Tactical_Operations_Leaders',
       'occupation_First-Line_Enlisted_Military_Supervisors',
       'occupation_Military_Enlisted_Tactical_Operations_and_Air/Weapons_Specialists_and_Crew_Members',
       'occupation_Military_Rank_Not_Specified',
       'occupation_Unemployed_with_No_Work_Experience_in_the_Last_5_Years_or_Earlier_or_Never_Worked'],
      dtype='object', length=574)


Tensor Creation

In [8]:
# Inputs: every column except the wage target. Target: the wage column alone.
# torch.tensor copies the NumPy data, so the tensors do not alias the frame.
xTensor = torch.tensor(
    data.drop(columns=['INCWAGE_CPIU_2010']).to_numpy(),
    dtype=torch.float32,
)
yTensor = torch.tensor(
    data['INCWAGE_CPIU_2010'].to_numpy(),
    dtype=torch.float32,
)

In [9]:

# Sanity check: print the input tensor's shape, then the target tensor's shape.
print(xTensor.shape, yTensor.shape, sep="\n")


torch.Size([1369570, 573])
torch.Size([1369570])


Model and Dataset Creation

In [10]:
net = Net()
net = net.to(device)

# Move the whole dataset to the training device once, so individual batches
# need no per-step host-to-device copy.
xTensor = xTensor.to(device)
yTensor = yTensor.to(device)

print(device)

# create dataset
dataset = TensorDataset(xTensor, yTensor)

# 90/10 train/validation split. The validation size is computed as the
# remainder: truncating both 0.9*n and 0.1*n independently can leave the two
# lengths short of len(dataset), which makes random_split raise ValueError.
trainSize = int(len(dataset) * 0.9)
valSize = len(dataset) - trainSize
trainset, valset = torch.utils.data.random_split(dataset, [trainSize, valSize])

# Only the training loader is built in this cell; valset is not wrapped in a
# DataLoader here.
trainLoader = DataLoader(trainset, batch_size=batchSize, shuffle=True)

cuda:0


Training

In [11]:

optimizer = optim.Adam(net.parameters(), lr=1e-5)

criterion = nn.MSELoss(reduction='mean')

# Create the checkpoint directory before training starts, so a missing
# folder cannot make torch.save fail after all epochs have already run.
checkpointDir = os.path.dirname(modelPath)
if checkpointDir:
    os.makedirs(checkpointDir, exist_ok=True)

print("Epochs Started")

net.train()
for epoch in range(epochs):
    running_loss = 0.0
    # Unpack each batch straight into X and y. Binding the batch to a
    # variable named `data` would overwrite the DataFrame of the same name
    # and break earlier cells if they are re-run.
    for i, (X, y) in enumerate(trainLoader):
        # Targets arrive as (batch,); the network outputs (batch, 1).
        # Without this reshape MSELoss would broadcast the two shapes.
        y = y.unsqueeze(1)

        optimizer.zero_grad()

        output = net(X)

        loss = criterion(output, y)
        loss.backward()
        optimizer.step()

        running_loss += loss.item()
        if i % 1000 == 999:    # print every 1000 mini-batches
            print(f'[{epoch + 1}, {i + 1:5d}] loss: {running_loss / 1000:.3f}')
            running_loss = 0.0

# Persist only the learned parameters, then release cached GPU memory.
torch.save(net.state_dict(), modelPath)
torch.cuda.empty_cache()


Epochs Started
[1,  1000] loss: 5767753158.784
[1,  2000] loss: 5624029776.000
[1,  3000] loss: 5256510432.512
[1,  4000] loss: 4383660381.184
[1,  5000] loss: 3597965256.512
[1,  6000] loss: 3106972671.424
[1,  7000] loss: 3094358615.936
[1,  8000] loss: 3044637310.784
[1,  9000] loss: 3090047224.896
[2,  1000] loss: 3003518104.576
[2,  2000] loss: 2985892675.392
[2,  3000] loss: 2962963092.032
[2,  4000] loss: 2964452622.720
[2,  5000] loss: 3019813149.952
[2,  6000] loss: 2909359171.136
[2,  7000] loss: 2944000918.144
[2,  8000] loss: 2916749591.936
[2,  9000] loss: 2892786915.008
[3,  1000] loss: 2849717237.056
[3,  2000] loss: 2855673243.776
[3,  3000] loss: 2833132499.328
[3,  4000] loss: 2814346601.728
[3,  5000] loss: 2920898713.280
[3,  6000] loss: 2760360731.904
[3,  7000] loss: 2797274491.968
[3,  8000] loss: 2878870439.936
[3,  9000] loss: 2795007251.776
[4,  1000] loss: 2749597036.416
[4,  2000] loss: 2774491621.824
[4,  3000] loss: 2757092952.960
[4,  4000] loss: 26523672

In [12]:
newNet = Net()
# Load the saved parameters. map_location remaps the stored tensors onto the
# current device, so a checkpoint written during a GPU run still loads when
# `device` is the CPU.
newNet.load_state_dict(torch.load(modelPath, map_location=device))
newNet = newNet.to(device)

# Switch to evaluation mode. As the cell's last expression, the returned
# module is also displayed as the cell output.
newNet.eval()


Net(
  (input): Linear(in_features=573, out_features=128, bias=True)
  (hidden1): Linear(in_features=128, out_features=128, bias=True)
  (hidden2): Linear(in_features=128, out_features=128, bias=True)
  (hidden3): Linear(in_features=128, out_features=128, bias=True)
  (output): Linear(in_features=128, out_features=1, bias=True)
)