In [1]:
# Environment setup and module import
import torch
from torch.utils import data
import torch.nn as nn
import pandas as pd
import numpy as np

device = 'cuda' if torch.cuda.is_available() else 'cpu'

In [2]:
debug_encoding = True # if True, uses existing mini_train_encoded.csv file so this runs fast
save_data = False # default: False. This is for diagnostic purposes.  Remove later.

In [3]:
class Model(nn.Module):

    def __init__(self, numerical_cols, output_size, layers, p=0.4):
        print("starting")
        super().__init__()
        self.batch_norm_num = nn.BatchNorm1d(numerical_cols)
        all_layers = []
        input_size = numerical_cols

        for i in layers:
            all_layers.append(nn.Linear(input_size, i))
            all_layers.append(nn.ReLU(inplace=True))
            all_layers.append(nn.BatchNorm1d(i))
            all_layers.append(nn.Dropout(p))
            input_size = i

        all_layers.append(nn.Linear(layers[-1], output_size))

        self.layers = nn.Sequential(*all_layers)

    def forward(self, x_numerical):
        x_numerical = self.batch_norm_num(x_numerical)
        x = torch.cat([x_numerical], 1)
        x = self.layers(x)
        return x

In [4]:
params = {'batch_size': 10,
          'shuffle': True,
          'num_workers': 6}
max_epochs = 100

In [5]:
# Import DF from CSV

if debug_encoding == True:
    dataset = 'data/mini_train_encoded.csv'
else:
    dataset = 'data/train_encoded.csv'

df = pd.read_csv(dataset)


In [13]:
# Remove hasdetections from the df.  Make it a new df for labels.
label_df = pd.DataFrame()
label_df = df['HasDetections'].to_numpy()
df.drop(columns=['HasDetections'])

# Convert dtypes to be all identifical for pytorch
df.astype('float64').dtypes
print(type(df))

# Iterate and collect all data values
cols = []
for c in df.columns:
    cols.append(df[c].values)
    
data = np.stack(cols, 1)
data = torch.tensor(data, dtype=torch.float)
output = torch.tensor(label_df)

# Flatten all other variables into a single element
#train_df = pd.DataFrame()
#df['concat'] = pd.Series(df.fillna('').values.tolist()).map(lambda x: ''.join(map(str,x)))
#df['concat'] = df['concat'].str.replace('.','. ')
#df['concat'] = df['concat'].str.replace('-','')
#train_df = df['concat'].to_numpy()
print(data.shape)
print(output.shape)



<class 'pandas.core.frame.DataFrame'>
torch.Size([312251, 150])
torch.Size([312251])


In [9]:
model = Model(data.shape[1], 2, [200,100,50], p=0.4)
print(model)

loss_function = nn.CrossEntropyLoss()
optimizer = torch.optim.Adam(model.parameters(), lr=0.001)

starting
Model(
  (batch_norm_num): BatchNorm1d(150, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
  (layers): Sequential(
    (0): Linear(in_features=150, out_features=200, bias=True)
    (1): ReLU(inplace=True)
    (2): BatchNorm1d(200, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
    (3): Dropout(p=0.4, inplace=False)
    (4): Linear(in_features=200, out_features=100, bias=True)
    (5): ReLU(inplace=True)
    (6): BatchNorm1d(100, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
    (7): Dropout(p=0.4, inplace=False)
    (8): Linear(in_features=100, out_features=50, bias=True)
    (9): ReLU(inplace=True)
    (10): BatchNorm1d(50, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
    (11): Dropout(p=0.4, inplace=False)
    (12): Linear(in_features=50, out_features=2, bias=True)
  )
)


In [14]:
epochs = 300
aggregated_losses = []

for i in range(epochs):
    i += 1
    y_pred = model(data)
    single_loss = loss_function(y_pred, label_df)
    aggregated_losses.append(single_loss)

    if i%25 == 1:
        print(f'epoch: {i:3} loss: {single_loss.item():10.8f}')

    optimizer.zero_grad()
    single_loss.backward()
    optimizer.step()

print(f'epoch: {i:3} loss: {single_loss.item():10.10f}')

TypeError: 'int' object is not callable

In [8]:
if save_data == True:
    pd.DataFrame(label_df).to_csv("data/pt-temp-label.csv")
    pd.DataFrame(df).to_csv("data/pt-temp-train.csv")