In [2]:
import torch
import pandas as pd
import numpy as np
from sklearn import metrics, preprocessing
import torch.nn as nn
import torch.utils.data as data
import torch.optim as optim
import tqdm
import torch.nn.functional as F

In [5]:
device = torch.device('cuda:0' if torch.cuda.is_available() else 'cpu')

In [6]:
train = pd.read_csv("../Data/catfeature/train.csv")
test = pd.read_csv("../Data/catfeature/test.csv")
sample = pd.read_csv("../Data/catfeature/sample_submission.csv")
# sample = pd.read_csv("../input/cat-in-the-dat-ii/sample_submission.csv")

In [7]:
le = preprocessing.LabelEncoder()

In [8]:
test['target'] = 1
features = [x for x in test.columns if x not in ['id', 'target']]
data_ori = pd.concat([train, test]).reset_index(drop=True)
for f in features:
    data_ori[f] = le.fit_transform(data_ori[f].fillna("-1").astype(str).values)

In [9]:
data_ori.shape

(1000000, 25)

In [10]:
train = data_ori[data_ori.target != -1].reset_index(drop=True)
test = data_ori[data_ori.target == -1].reset_index(drop=True)

In [11]:
train_sub = train[['id','bin_0', 'bin_1', 'target']].iloc[:100, :].copy()
test_sub = test[['id','bin_0', 'bin_1', 'target']].iloc[:100, :].copy()

In [12]:
train_sub = train[['id','bin_0', 'bin_1', 'target']].copy()
test_sub = test[['id','bin_0', 'bin_1', 'target']].copy()

In [13]:
# train_sub['id_1'] = train['id']
# test_sub['id_1'] = test['id']
# train_sub['id_2'] = train['id']
# test_sub['id_2'] = test['id']

In [14]:
train_sub.columns

Index(['id', 'bin_0', 'bin_1', 'target'], dtype='object')

In [15]:
cat_feature = ['bin_0', 'bin_1']
# cat_feature = []
num_feature = ['id']

In [16]:
embedding_size = []
for c in cat_feature:
    num_unique_values = int(data_ori[c].nunique())
    embed_dim = int(min(np.ceil(num_unique_values/2), 50))
    embedding_size.append([num_unique_values, embed_dim])  

In [17]:
embedding_size

[[3, 2], [3, 2]]

In [18]:
train_sub.shape

(1000000, 4)

In [19]:
class entity(nn.Module):
    def __init__(self, embedding_size, num_numerical_cols, 
                 output_size, layers=[5], p = 0.4):
        '''
        embedding_size: Contains the embedding size for the categorical columns
        num_numerical_cols: Stores the total number of numerical columns
        output_size: The size of the output layer or the number of possible outputs.
        layers: List which contains number of neurons for all the layers.
        p: Dropout with the default value of 0.5
        
        '''
        super(entity, self).__init__()
        #list of ModuleList objects for all categorical columns
        self.all_embeddings = nn.ModuleList([nn.Embedding(ni, nf) for ni, nf in embedding_size])
        
        #drop out value for all layers
        self.embedding_dropout = nn.Dropout(p)
        
        #list of 1 dimension batch normalization objects for all numerical columns
#         self.batch_norm_num = nn.BatchNorm1d(num_numerical_cols)

        #the number of categorical and numerical columns are added together and stored in input_size
        all_layers = nn.ModuleList() 
        num_categorical_cols = sum((nf for ni, nf in embedding_size))
        input_size = num_categorical_cols + num_numerical_cols
        
        #loop iterates to add corresonding layers to all_layers list above
        for i in layers:
            all_layers.append(nn.Linear(input_size, i))
            all_layers.append(nn.ReLU(inplace=True))
#             all_layers.append(nn.BatchNorm1d(i))
            all_layers.append(nn.Dropout(p))
            input_size = i
            
        #append output layer to list of layers    
        all_layers.append(nn.Linear(layers[-1], output_size))
        
        #pass all layers to the sequential class
        self.layers = nn.Sequential(*all_layers)
        
#     def _init_weight_(self):

    def forward(self, x_categorical, x_numerical):
        #this starts the embedding of categorical columns
        embeddings = []
        for i,e in enumerate(self.all_embeddings):
            embeddings.append(e(x_categorical[:,i]))

        x = torch.cat(embeddings, 1)
#         print(x.shape)
        x = self.embedding_dropout(x)

        #normalizing numerical columns
#         x_numerical = self.batch_norm_num(x_numerical)
        #concatenating numerical and categorical columns        
#         x_numerical = x_numerical.reshape(-1,1)
#         print('hao--------1', x.shape)
#         print('hao--------2',x_numerical.shape)
        x = torch.cat([x, x_numerical], 1)
        x = self.layers(x)
        x = torch.sigmoid(x)
        return x

In [20]:
class catData(data.Dataset):
    '''
    num_f: the numerical features
    cat_f: the categorical features 
    '''
    def __init__(self, inp_data, num_f,cat_f):
        self.data = inp_data
        self.label = inp_data['target']
        self.cat_f = cat_f
        self.num_f = num_f
    def __len__(self):
        return len(self.label)
    def __getitem__(self, idx):
#         num_data = np.array(self.data.iloc[idx, 0]).reshape(-1,1)
#         cat_data = np.array(self.data.iloc[idx, 1:-1])
        num_data = np.array(self.data[self.num_f])[idx]
        cat_data = np.array(self.data[self.cat_f])[idx]
        label = self.label[idx]
#         print('hao-------!!', label)
        return num_data, cat_data, label
        

In [27]:
train_dataset = catData(train_sub, num_feature, cat_feature)
test_dataset = catData(test_sub, num_feature, cat_feature) 
train_loader = data.DataLoader(train_dataset, batch_size=100, shuffle=False, num_workers=0)
test_loader = data.DataLoader(test_dataset, batch_size=100, shuffle=False, num_workers=0 )

In [28]:
# def __init__(self, embedding_size, num_numerical_cols, output_size, layers=[2], p = 0.4):
# 

In [29]:
model = entity(embedding_size = embedding_size, num_numerical_cols = len(num_feature),
               output_size = 2)
model.to(device)
loss_function = nn.BCEWithLogitsLoss()
optimizer = optim.Adam(model.parameters(), lr=0.001)

In [30]:
print(model)

entity(
  (all_embeddings): ModuleList(
    (0): Embedding(3, 2)
    (1): Embedding(3, 2)
  )
  (embedding_dropout): Dropout(p=0.4, inplace=False)
  (layers): Sequential(
    (0): Linear(in_features=5, out_features=5, bias=True)
    (1): ReLU(inplace=True)
    (2): Dropout(p=0.4, inplace=False)
    (3): Linear(in_features=5, out_features=2, bias=True)
  )
)


In [31]:
for epoch in tqdm.tqdm(range(20)):
    model.train()
    for batch, (num_data, cat_data, label) in enumerate(train_loader):
        model.zero_grad()
        num_data = num_data.to(device)
        cat_data = cat_data.to(device)
        label = label.to(device)
        prediction = model(cat_data,num_data)[:,1]
#         print(prediction)
        label = label.float()
        loss = loss_function(prediction, label)
        loss.backward()
        optimizer.step()

  0%|                                                    | 0/20 [03:34<?, ?it/s]


KeyboardInterrupt: 