In [1]:
import json
import os
import sys
import time

import numpy as np
import pandas as pd
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
import torchvision
from torch.utils.data import TensorDataset, DataLoader
from torchvision import transforms, datasets

class Net(nn.Module):
    """Fully connected regressor with three hidden layers and a scalar output.

    The network is ``input -> hidden1 -> hidden2 -> hidden3 -> output``; every
    layer except ``output`` is followed by a ReLU. Attribute names and their
    registration order are part of the saved ``state_dict`` layout, so they
    must not be renamed or reordered.

    Args:
        in_features: Number of features per sample. Defaults to 573, the
            width of the feature matrix built in this notebook.
        hidden_features: Width of the input layer's output and of every
            hidden layer. Defaults to 128.
    """

    def __init__(self, in_features=573, hidden_features=128):
        super().__init__()

        self.input = nn.Linear(in_features, hidden_features)
        self.hidden1 = nn.Linear(hidden_features, hidden_features)
        self.hidden2 = nn.Linear(hidden_features, hidden_features)
        self.hidden3 = nn.Linear(hidden_features, hidden_features)
        self.output = nn.Linear(hidden_features, 1)

    def forward(self, x):
        """Map a ``(..., in_features)`` tensor to a ``(..., 1)`` prediction.

        No activation is applied to the final layer, so the output is an
        unbounded real value.
        """
        for layer in (self.input, self.hidden1, self.hidden2, self.hidden3):
            x = F.relu(layer(x))
        return self.output(x)


In [2]:
# --- Run configuration -------------------------------------------------
# False forces CPU execution even when a CUDA device is available.
useCUDA = False
epochs = 20
batchSize = 128
dataPath = "../large_field_preprocessed_data.csv"

# Unix timestamp (whole seconds) taken when this cell runs; it is used as
# the file name of the saved weights so separate runs do not overwrite
# each other.
t = int(time.time())
modelPath = f"models/{t}"

In [3]:
# Use the first CUDA device only when it is both requested (useCUDA) and
# actually available; in every other case run on the CPU.
wantsGpu = useCUDA and torch.cuda.is_available()
device = torch.device('cuda:0' if wantsGpu else 'cpu')

In [4]:
# Load the preprocessed table from the CSV file configured in `dataPath`.
data = pd.read_csv(filepath_or_buffer=dataPath)

In [5]:
# Columns that are removed before training and never fed to the model.
dropColumns = [
    "SERIAL",
    "PERNUM",
    "HHWT",
    "CLUSTER",
    "STRATA",
    "PERWT",
    "YRMARR",
    "YRNATUR",
    "RACNUM",
    "index",
]

data = data.drop(columns=dropColumns)

# Replace missing values with False (which becomes 0.0 once the frame is cast
# to float32). The result is assigned back instead of calling
# `data[column].fillna(..., inplace=True)`: that form is chained assignment on
# a temporary Series, which emits a FutureWarning on pandas 2.x and leaves
# `data` unmodified under copy-on-write.
data = data.fillna(False)

# Discard rows whose wage target is zero (this includes rows whose target was
# missing, since those were just filled with False, which compares equal to 0).
data = data.drop(index=data.index[data['INCWAGE_CPIU_2010'] == 0])


In [6]:
print(data.shape)
print(data.columns)

# Persist the column order and the first row as JSON files in the working
# directory so they can be inspected outside the notebook.
with open("columns.json", "w") as f:
    json.dump(list(data.columns), f)

with open("heads.json", "w") as f:
    json.dump(list(data.head(1).values[0]), f)

# Cast the whole frame (target and features alike) to float32 in one call.
# This gives the same result as converting column by column, without
# reassigning hundreds of columns one at a time.
data = data.astype(np.float32)

# print(data.head(1))

(1369570, 574)
Index(['AGE', 'WKSWORK1', 'UHRSWORK', 'INCWAGE_CPIU_2010', 'TRANTIME',
       'isFemale', 'isAmericanIndian', 'isAsian', 'isBlack',
       'isPacificIslander',
       ...
       'occupation_Machine_Feeders_and_Offbearers',
       'occupation_Packers_and_Packagers_Hand',
       'occupation_Pumping_Station_Operators',
       'occupation_Refuse_and_Recyclable_Material_Collectors',
       'occupation_Material_moving_workers_nec',
       'occupation_Military_Officer_Special_and_Tactical_Operations_Leaders',
       'occupation_First-Line_Enlisted_Military_Supervisors',
       'occupation_Military_Enlisted_Tactical_Operations_and_Air/Weapons_Specialists_and_Crew_Members',
       'occupation_Military_Rank_Not_Specified',
       'occupation_Unemployed_with_No_Work_Experience_in_the_Last_5_Years_or_Earlier_or_Never_Worked'],
      dtype='object', length=574)


In [7]:
# Feature matrix: every column except the wage target, copied into a
# float32 tensor.
xTensor = torch.tensor(
    data.drop(columns=['INCWAGE_CPIU_2010']).to_numpy(),
    dtype=torch.float32,
)

In [8]:

# Regression target: the wage column as a 1-D float32 tensor.
yTensor = torch.tensor(
    data['INCWAGE_CPIU_2010'].to_numpy(),
    dtype=torch.float32,
)


In [9]:

# Sanity check: features should be (rows, n_features), target (rows,).
for tensor in (xTensor, yTensor):
    print(tensor.shape)


torch.Size([1369570, 573])
torch.Size([1369570])


In [10]:
# Pair each feature row with its target value.
dataset = TensorDataset(xTensor, yTensor)

# Shuffled mini-batch loader over the full dataset. Only a training loader is
# built here; no rows are held out for validation or testing.
trainLoader = DataLoader(
    dataset,
    batch_size=batchSize,
    shuffle=True,
)

In [11]:
# Build the model and move its parameters to the selected device
# (nn.Module.to returns the module itself, so the chained form is equivalent).
net = Net().to(device)
# Optional smoke test on one random sample (573 input features):
# print(net(torch.rand((1, 573), device=device)))
net

Net(
  (input): Linear(in_features=573, out_features=128, bias=True)
  (hidden1): Linear(in_features=128, out_features=128, bias=True)
  (hidden2): Linear(in_features=128, out_features=128, bias=True)
  (hidden3): Linear(in_features=128, out_features=128, bias=True)
  (output): Linear(in_features=128, out_features=1, bias=True)
)

In [12]:

optimizer = optim.Adam(net.parameters(), lr=1e-5)

criterion = nn.MSELoss(reduction='mean')

print("Epochs Started")

for epoch in range(epochs):
    running_loss = 0.0
    # Unpack the batch straight into X / y so the loop does not rebind the
    # `data` DataFrame that earlier cells depend on.
    for i, (X, y) in enumerate(trainLoader):
        # Tensor.to() is not in-place: it returns the moved tensor, so the
        # result has to be assigned or the batch stays on its original device.
        X = X.to(device)
        # Targets arrive as (batch,); the model emits (batch, 1). Add the
        # trailing dimension so MSELoss compares element-wise instead of
        # broadcasting to (batch, batch).
        y = y.to(device).unsqueeze(1)

        # The optimizer was built from net.parameters(), so this clears
        # exactly the gradients that net.zero_grad() would.
        optimizer.zero_grad()

        output = net(X)

        loss = criterion(output, y)
        loss.backward()
        optimizer.step()

        running_loss += loss.item()
        if i % 1000 == 999:    # print every 1000 mini-batches
            print(f'[{epoch + 1}, {i + 1:5d}] loss: {running_loss / 1000:.3f}')
            running_loss = 0.0

# torch.save does not create missing parent directories; create them first so
# a finished training run is not lost to a failed save.
os.makedirs(os.path.dirname(modelPath) or ".", exist_ok=True)
torch.save(net.state_dict(), modelPath)
torch.cuda.empty_cache()


Epochs Started
[1,  1000] loss: 5612911177.344
[1,  2000] loss: 5512800308.864
[1,  3000] loss: 5275259403.264
[1,  4000] loss: 4431717682.176
[1,  5000] loss: 3571730664.032
[1,  6000] loss: 3241885942.528
[1,  7000] loss: 3040397265.920
[1,  8000] loss: 3087872453.120
[1,  9000] loss: 3021061003.648
[1, 10000] loss: 2962593602.624
[2,  1000] loss: 2974626615.680
[2,  2000] loss: 3020916042.112
[2,  3000] loss: 2878301240.000
[2,  4000] loss: 2842450719.872
[2,  5000] loss: 2919820770.816
[2,  6000] loss: 2948778435.264
[2,  7000] loss: 2877514332.992
[2,  8000] loss: 2928461682.624
[2,  9000] loss: 2837417644.288
[2, 10000] loss: 2856140328.896
[3,  1000] loss: 2804650355.584
[3,  2000] loss: 2832945164.288
[3,  3000] loss: 2758494549.632
[3,  4000] loss: 2787479975.808
[3,  5000] loss: 2766017796.928
[3,  6000] loss: 2743823345.792
[3,  7000] loss: 2755527212.928
[3,  8000] loss: 2655693886.656
[3,  9000] loss: 2664162387.776
[3, 10000] loss: 2707041591.936
[4,  1000] loss: 26313402