In [1]:
import pandas as pd
import numpy as np

import torch
import torch.nn as nn
import torch.nn.functional as F
from torch.utils.data import Dataset, DataLoader

In [2]:
# Run on the GPU when PyTorch can see one, otherwise fall back to the CPU.
if torch.cuda.is_available():
    device = torch.device("cuda")
else:
    device = torch.device("cpu")
print(device)

cuda


In [3]:
# Sanity check: display whether PyTorch reports a usable CUDA device.
torch.cuda.is_available()

True

In [21]:
# reference: https://github.com/yashu-seth/pytorch-tabular/blob/master/pytorch_tabular.py
class TabularDataset(Dataset):
    """Map-style PyTorch dataset over a tabular pandas DataFrame.

    Each sample is the list ``[y, continuous_features, categorical_features]``.
    """

    def __init__(self, data, label, cat_cols=None):
        """
        Characterizes a Dataset for PyTorch

        Parameters
        ----------
        data: pandas data frame
            The data frame object holding the input features. Every column
            not listed in ``cat_cols`` is treated as a continuous feature.
        label: pandas Series or None
            The output labels, one per row of ``data``. Pass ``None`` when no
            labels exist (e.g. at prediction time); a zero-filled placeholder
            target is then used.
        cat_cols: List of strings
            The names of the categorical columns in the data.
            These columns will be passed through the embedding
            layers in the model. These columns must be
            label encoded beforehand.
        """

        self.n = data.shape[0]

        # `if label:` is ambiguous for a pandas Series and raises ValueError,
        # so the presence of labels is tested explicitly against None.
        if label is not None:
            self.y = label.astype(np.float32).values.reshape(-1, 1)
        else:
            self.y = np.zeros((self.n, 1), dtype=np.float32)

        self.cat_cols = list(cat_cols) if cat_cols else []
        self.cont_cols = [
            col for col in data.columns if col not in self.cat_cols
        ]

        # Continuous features are stored as float32 so that batches match the
        # default dtype of torch layers (float64 inputs would be rejected).
        if self.cont_cols:
            self.cont_X = data[self.cont_cols].values.astype(np.float32)
        else:
            self.cont_X = np.zeros((self.n, 1), dtype=np.float32)

        # Categorical features are integer codes usable as embedding indices.
        if self.cat_cols:
            self.cat_X = data[self.cat_cols].astype(np.int64).values
        else:
            self.cat_X = np.zeros((self.n, 1), dtype=np.int64)

    def __len__(self):
        """
        Denotes the total number of samples.
        """
        return self.n

    def __getitem__(self, idx):
        """
        Generates one sample of data as ``[y, cont_x, cat_x]``.
        """
        return [self.y[idx], self.cont_X[idx], self.cat_X[idx]]

In [5]:
class FeedForwardNN(nn.Module):
    """Feed-forward network over embedded categorical and continuous inputs.

    Categorical columns go through one embedding layer each, continuous
    columns go through a batch-norm layer, and the concatenation is fed to a
    stack of Linear -> ReLU -> BatchNorm -> Dropout blocks followed by a
    linear output layer.
    """

    def __init__(
        self,
        emb_dims,
        no_of_cont,
        lin_layer_sizes,
        output_size,
        emb_dropout,
        lin_layer_dropouts,
    ):

        """
        Parameters
        ----------
        emb_dims: List of two element tuples
        This list will contain a two element tuple for each
        categorical feature. The first element of a tuple will
        denote the number of unique values of the categorical
        feature. The second element will denote the embedding
        dimension to be used for that feature.
        no_of_cont: Integer
        The number of continuous features in the data.
        lin_layer_sizes: List of integers.
        The size of each linear layer. The length will be equal
        to the total number
        of linear layers in the network.
        output_size: Integer
        The size of the final output.
        emb_dropout: Float
        The dropout to be used after the embedding layers.
        lin_layer_dropouts: List of floats
        The dropouts to be used after each linear layer. Must have
        the same length as lin_layer_sizes.

        Raises
        ------
        ValueError
        If lin_layer_sizes is empty, if lin_layer_dropouts does not
        have one entry per linear layer, or if the network would
        receive no input features at all.
        """

        super().__init__()

        if len(lin_layer_sizes) == 0:
            raise ValueError("lin_layer_sizes must contain at least one layer size")
        # forward() zips layers with dropouts; a length mismatch would silently
        # skip layers instead of failing, so it is rejected up front.
        if len(lin_layer_dropouts) != len(lin_layer_sizes):
            raise ValueError(
                "lin_layer_dropouts must have one entry per linear layer "
                "(got {} dropouts for {} layers)".format(
                    len(lin_layer_dropouts), len(lin_layer_sizes)
                )
            )

        # Embedding layers
        self.emb_layers = nn.ModuleList([nn.Embedding(x, y) for x, y in emb_dims])

        self.no_of_embs = sum(y for _, y in emb_dims)
        self.no_of_cont = no_of_cont

        if self.no_of_embs + self.no_of_cont == 0:
            raise ValueError(
                "the network needs at least one embedding or continuous input"
            )

        # Linear Layers
        first_lin_layer = nn.Linear(
            self.no_of_embs + self.no_of_cont, lin_layer_sizes[0]
        )

        self.lin_layers = nn.ModuleList(
            [first_lin_layer]
            + [
                nn.Linear(lin_layer_sizes[i], lin_layer_sizes[i + 1])
                for i in range(len(lin_layer_sizes) - 1)
            ]
        )

        for lin_layer in self.lin_layers:
            nn.init.kaiming_normal_(lin_layer.weight)

        # Output Layer
        self.output_layer = nn.Linear(lin_layer_sizes[-1], output_size)
        nn.init.kaiming_normal_(self.output_layer.weight)

        # Batch Norm Layers
        self.first_bn_layer = nn.BatchNorm1d(self.no_of_cont)
        self.bn_layers = nn.ModuleList(
            [nn.BatchNorm1d(size) for size in lin_layer_sizes]
        )

        # Dropout Layers
        # NOTE: the attribute name `droput_layers` (sic) is kept unchanged so
        # that existing code referring to it keeps working.
        self.emb_dropout_layer = nn.Dropout(emb_dropout)
        self.droput_layers = nn.ModuleList(
            [nn.Dropout(rate) for rate in lin_layer_dropouts]
        )

    def forward(self, cont_data, cat_data):
        """
        Compute the network output for one batch.

        Parameters
        ----------
        cont_data: 2-D tensor whose second dimension holds the
        no_of_cont continuous features. Ignored when no_of_cont is 0.
        cat_data: 2-D tensor whose column i holds the index fed to
        the i-th embedding layer. Ignored when there are no embeddings.

        Returns
        -------
        Tensor produced by the final linear layer (output_size columns).
        """
        features = []

        if self.no_of_embs != 0:
            embedded = [
                emb_layer(cat_data[:, i]) for i, emb_layer in enumerate(self.emb_layers)
            ]
            features.append(self.emb_dropout_layer(torch.cat(embedded, 1)))

        if self.no_of_cont != 0:
            features.append(self.first_bn_layer(cont_data))

        # __init__ guarantees at least one of the two feature groups exists.
        x = features[0] if len(features) == 1 else torch.cat(features, 1)

        for lin_layer, dropout_layer, bn_layer in zip(
            self.lin_layers, self.droput_layers, self.bn_layers
        ):
            x = F.relu(lin_layer(x))
            x = bn_layer(x)
            x = dropout_layer(x)

        return self.output_layer(x)


In [6]:
data = pd.read_hdf('data_feat.h5', 'table')

# Split by `date_block_num`: blocks below 33 are used for training, block 33
# for validation and block 34 for the test set (which has no target).
# The target and the block index itself are removed from the feature frames,
# and missing feature values are replaced with -1.
non_feature_cols = ['item_cnt_month', 'date_block_num']
train_rows = data.date_block_num < 33
valid_rows = data.date_block_num == 33
test_rows = data.date_block_num == 34

X_train = data[train_rows].drop(non_feature_cols, axis=1).fillna(-1)
Y_train = data[train_rows]['item_cnt_month']
X_valid = data[valid_rows].drop(non_feature_cols, axis=1).fillna(-1)
Y_valid = data[valid_rows]['item_cnt_month']
X_test = data[test_rows].drop(non_feature_cols, axis=1).fillna(-1)

In [7]:
# Bound the training and validation targets to the range [0, 20].
Y_train, Y_valid = (target.clip(lower=0, upper=20) for target in (Y_train, Y_valid))

In [8]:
categorical_features = ['shop_id', 'item_id', 'shop_category', 'shop_city', 'item_category_id', 'name2', 'name3', 'subtype_code', 'type_code', 'month', 'days']

from sklearn.preprocessing import LabelEncoder

# Label-encode every categorical column using the categories seen in the
# training split only. Values that appear only in validation/test are mapped
# to one shared extra index (`unseen`), one past the largest training code.
for cat_col in categorical_features:
    le = LabelEncoder()
    le.fit(X_train[cat_col])
    le_dict = dict(zip(le.classes_, le.transform(le.classes_)))
    unseen = max(le_dict.values()) + 1

    # Vectorised lookup: `.map` yields NaN for values missing from the
    # mapping, which are then replaced by the `unseen` index. This avoids
    # calling a Python lambda once per row.
    X_train[cat_col] = X_train[cat_col].map(le_dict).fillna(unseen).astype(np.int64)
    X_valid[cat_col] = X_valid[cat_col].map(le_dict).fillna(unseen).astype(np.int64)
    X_test[cat_col] = X_test[cat_col].map(le_dict).fillna(unseen).astype(np.int64)

    print("#unseen {} in valid: {}".format(cat_col, (X_valid[cat_col] == unseen).sum()))
    print("#unseen {} in test: {}".format(cat_col, (X_test[cat_col] == unseen).sum()))



#unseen shop_id in valid: 5413
#unseen shop_id in test: 5100
#unseen item_id in valid: 21076
#unseen item_id in test: 27174
#unseen shop_category in valid: 0
#unseen shop_category in test: 0
#unseen shop_city in valid: 0
#unseen shop_city in test: 0
#unseen item_category_id in valid: 0
#unseen item_category_id in test: 42
#unseen name2 in valid: 176
#unseen name2 in test: 924
#unseen name3 in valid: 1232
#unseen name3 in test: 2310
#unseen subtype_code in valid: 0
#unseen subtype_code in test: 42
#unseen type_code in valid: 0
#unseen type_code in test: 0
#unseen month in valid: 0
#unseen month in test: 0
#unseen days in valid: 0
#unseen days in test: 0


In [9]:
# Wrap the feature frames and their targets in datasets; the columns listed
# in `categorical_features` are served separately from the remaining ones.
train_dataset = TabularDataset(
    data=X_train,
    label=Y_train,
    cat_cols=categorical_features,
)
valid_dataset = TabularDataset(
    data=X_valid,
    label=Y_valid,
    cat_cols=categorical_features,
)

In [10]:
batchsize = 64
# Training batches are reshuffled every epoch.
dataloader = DataLoader(train_dataset, batchsize, shuffle=True, num_workers=1)
# Evaluation does not benefit from shuffling; a fixed order keeps the
# validation pass deterministic and avoids drawing a random permutation.
valid_loader = DataLoader(valid_dataset, batchsize, shuffle=False, num_workers=1)

In [11]:
# Size of each embedding table: the number of distinct values in the training
# column plus one extra slot.
cat_dims = [1 + int(X_train[name].nunique()) for name in categorical_features]
# Embedding width: half the table size (rounded up), capped at 50.
emb_dims = [(table_size, min(50, (table_size + 1) // 2)) for table_size in cat_dims]

print(cat_dims)
print(emb_dims)

[55, 20041, 6, 32, 82, 157, 1530, 65, 11, 13, 4]
[(55, 28), (20041, 50), (6, 3), (32, 16), (82, 41), (157, 50), (1530, 50), (65, 33), (11, 6), (13, 7), (4, 2)]


In [16]:
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(device)

# Derive the number of continuous inputs from the data instead of hard-coding
# it: every column that is not categorical is treated as continuous, which is
# the same rule TabularDataset applies when it builds its batches.
no_of_cont = len([col for col in X_train.columns if col not in categorical_features])

model = FeedForwardNN(emb_dims, no_of_cont=no_of_cont, lin_layer_sizes=[50, 100],
                          output_size=1, emb_dropout=0.2,
                          lin_layer_dropouts=[0.2,0.2]).to(device)

cuda


In [17]:
import math
import time
no_of_epochs = 100

# Best validation loss seen so far. `np.inf` is used because the `np.Inf`
# alias no longer exists in NumPy 2.0.
valid_loss_min = np.inf

criterion = nn.MSELoss()
optimizer = torch.optim.Adam(model.parameters(), lr=0.1)
for epoch in range(no_of_epochs):
    ts = time.time()
    # running sums of per-sample squared error for this epoch
    train_loss = 0.0
    valid_loss = 0.0
    
    ####
    # train the model
    ####
    model.train()
    for y, cont_x, cat_x in dataloader:
        cat_x = cat_x.to(device)
        cont_x = cont_x.to(device)
        y  = y.to(device)
        
        optimizer.zero_grad()
        output = model(cont_x, cat_x)
        loss = criterion(output, y)
        loss.backward()
        optimizer.step()
        
        # criterion returns the batch mean; weight it by the batch size so
        # that a smaller last batch is accounted for correctly
        train_loss += loss.item() * cat_x.size(0)
          
    ####
    # validate the model
    ####
    model.eval()
    # No gradients are needed for evaluation; disabling autograd avoids
    # building a graph for every validation batch.
    with torch.no_grad():
        for y, cont_x, cat_x in valid_loader:
            cat_x = cat_x.to(device)
            cont_x = cont_x.to(device)
            y = y.to(device)
            output = model(cont_x, cat_x)
            # predictions are limited to [0, 20] before scoring
            output = torch.clamp(output, min=0, max=20)
            loss = criterion(output, y)
            valid_loss += loss.item() * cat_x.size(0)
    
    # convert the summed squared errors into RMSE over the whole split
    train_loss = math.sqrt(train_loss/len(dataloader.sampler))
    valid_loss = math.sqrt(valid_loss/len(valid_loader.sampler))
    
    print('Epoch: {} \tTraining Loss: {:.6f} \tValidation Loss: {:.6f}'.format(
        epoch+1, 
        train_loss,
        valid_loss
        ))
    print(time.time() - ts)
    
    # save model if validation loss has decreased
    if valid_loss <= valid_loss_min:
        print('Validation loss decreased ({:.6f} --> {:.6f}).  Saving model ...'.format(
        valid_loss_min,
        valid_loss))
        torch.save(model.state_dict(), 'model.pt')
        valid_loss_min = valid_loss

Epoch: 1 	Training Loss: 1.124400 	Validation Loss: 0.966090
1018.3296625614166
Validation loss decreased (inf --> 0.966090).  Saving model ...
Epoch: 2 	Training Loss: 1.045522 	Validation Loss: 1.012167
1009.1624648571014
Epoch: 3 	Training Loss: 1.027238 	Validation Loss: 0.994586
1009.5293979644775
Epoch: 4 	Training Loss: 1.016786 	Validation Loss: 0.973187
1006.6833546161652
Epoch: 5 	Training Loss: 1.009292 	Validation Loss: 0.992188
1008.3474681377411
Epoch: 6 	Training Loss: 1.005113 	Validation Loss: 1.003348
1008.9519309997559
Epoch: 7 	Training Loss: 1.002839 	Validation Loss: 0.995489
1007.4971539974213
Epoch: 8 	Training Loss: 1.000478 	Validation Loss: 1.023904
1016.1724345684052
Epoch: 9 	Training Loss: 0.998852 	Validation Loss: 0.944548
1024.5817074775696
Validation loss decreased (0.966090 --> 0.944548).  Saving model ...
Epoch: 10 	Training Loss: 0.995400 	Validation Loss: 0.950251
1032.1049454212189
Epoch: 11 	Training Loss: 0.997604 	Validation Loss: 0.956379
1030

KeyboardInterrupt: 

## Predict on the test data

In [20]:
# Restore the weights saved during training. `map_location` remaps the stored
# tensors onto the current device, so a checkpoint written on a GPU machine
# can also be loaded when only a CPU is available.
with open('model.pt', 'rb') as f:
    model.load_state_dict(torch.load(f, map_location=device))

In [22]:
# No targets are supplied for the test split, hence label=None.
test_dataset = TabularDataset(
    data=X_test,
    label=None,
    cat_cols=categorical_features,
)

In [25]:
# Keep the original row order so predictions line up with X_test.
test_loader = DataLoader(test_dataset, batchsize, shuffle=False, num_workers=1)

Y_test = []
model.eval()

# Inference only: disable autograd so no graph is built per batch.
with torch.no_grad():
    # the first item of each batch is the target slot, which is unused here
    for _, cont_x, cat_x in test_loader:
        cat_x = cat_x.to(device)
        cont_x = cont_x.to(device)

        output = model(cont_x, cat_x)
        output = torch.clamp(output, min=0, max=20)
        # collect one numpy array of predictions per batch
        Y_test.append(output.detach().cpu().numpy())
    

In [29]:
# Stack the per-batch prediction arrays row-wise. `np.array` on the raw list
# is avoided because the batches need not all have the same number of rows
# (the last one can be smaller), which does not form a regular array.
Y_test = np.vstack(Y_test)

In [35]:
# Join all prediction chunks along the first axis into one 2-D array.
Y_test = np.concatenate([np.atleast_2d(chunk) for chunk in Y_test], axis=0)

In [38]:
# Write one row per prediction: a running 0-based ID plus the predicted value.
# NOTE(review): this assumes the row order of X_test matches the expected ID
# order of the submission — confirm.
predicted_counts = Y_test.flatten()
row_ids = np.arange(len(Y_test))
submission = pd.DataFrame({"ID": row_ids, "item_cnt_month": predicted_counts})
submission.to_csv('neural_networks_submission.csv', index=False)