In [3]:
# Pin NumPy to version 1.15.4 for this notebook.
# NOTE(review): `!pip3` runs whichever pip3 is first on PATH; if the `%pip` magic is
# available it installs into the running kernel's environment instead — consider switching.
!pip3 install numpy==1.15.4



In [4]:
import pandas as pd
import numpy as np
import re
import json
import pickle

import torch
import torch.nn as nn
from torch.utils.data import Dataset
import torch.optim as optim
import torch.utils.data
import torch.nn.functional as F

from tqdm import tqdm
from tqdm import tqdm_notebook
from tqdm import notebook

import h5py
from google.colab import drive

In [5]:
# Mount Google Drive at /content/drive so files stored there can be read from the notebook.
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [6]:
cd drive/My\ Drive

/content/drive/My Drive


In [7]:
def labels_map(y):
    """Encode the values of a pandas Series as consecutive integer class ids.

    The distinct values are sorted before ids are assigned, so the same set of
    categories always yields the same mapping (iterating a bare ``set`` of
    strings gives an order that can change from one interpreter run to the next).

    Parameters
    ----------
    y : pandas.Series
        Categorical values (e.g. category names).

    Returns
    -------
    numpy.ndarray
        One integer id per element of ``y``, in the original row order.
    """
    try:
        classes = sorted(set(y))
    except TypeError:
        # Values that cannot be compared with each other: fall back to a
        # stable ordering based on their textual representation.
        classes = sorted(set(y), key=repr)
    dic = {word: i for i, word in enumerate(classes)}

    labels = y.map(dic)

    return labels.values

# All names and categories

In [8]:
# Read the table stored under key 'data' in name_category.h5 (opened read-only) into `df`.
df=pd.read_hdf('name_category.h5',key='data',mode='r')

In [9]:
# Lower-case the 'name' column (the result replaces the original column).
df['name']=df['name'].str.lower()

In [None]:
# Count the distinct values in the 'category' column; the bare name displays the count.
num_classes=len(set(df['category'].values))
num_classes

1186

In [None]:
# Encode every value of the 'category' column as an integer id via labels_map.
labels=labels_map(df['category'])

### Small subset

In [10]:
# Keep only the rows whose category is one of these six values; `.copy()` makes
# `df_reduced` an independent frame so the assignment below does not touch `df`.
lst=['Bouwbedrijf','Computersoftware','Kapper','Reclame','Administratiekantoor','Adviesbureaus']
df_reduced=df[df['category'].isin(lst)].copy()
# Lower-case the names of the subset.
df_reduced.loc[:,'name']=df_reduced['name'].str.lower()

In [12]:
# Encode the categories of the reduced frame and count the distinct ids.
# This rebinds the notebook-level names `labels` and `num_classes`.
labels=labels_map(df_reduced['category'])
num_classes=len(set(labels))

# Experimental categories

In [93]:
# Read the table stored under key 'data' in data_conv_experimental.h5 (read-only).
# NOTE: this rebinds `df`, replacing whatever frame was bound to that name before.
df=pd.read_hdf('data_conv_experimental.h5',key='data',mode='r')
# Lower-case the 'name' column.
df.loc[:,'name']=df['name'].str.lower()
# Show how many rows fall under each 'category0' value.
df['category0'].value_counts()

opleiding          355211
diensten           262805
winkel             259861
vrije tijd         140906
gezondheidszorg    136865
gereedschap        127734
regering            99355
reizen              25868
boodschappen         3901
Name: category0, dtype: int64

In [None]:
# Encode the 'category0' column as integer ids and report how many distinct ids exist.
# This rebinds the notebook-level names `labels` and `num_classes`.
labels=labels_map(df['category0'])
num_classes=len(set(labels))
print("Number of classes: ",num_classes)

Number of classes:  9


### Small subset

In [94]:
# Restrict the data to three 'category0' values.
lst=['diensten','winkel','gereedschap']
# Boolean mask selecting the rows whose 'category0' is in `lst`.
idx=  df['category0'].isin(lst)
# Independent copy of the selected rows; this rebinds `df_reduced`.
df_reduced=df[idx].copy()

In [95]:
# Encode the 'category0' values of the reduced frame and count the distinct ids.
# This rebinds the notebook-level names `labels` and `num_classes`.
labels=labels_map(df_reduced['category0'])
num_classes=len(set(labels))

# PyTorch model

In [107]:
# Number of character positions per encoded name (MyDataset truncates or zero-pads to this length).
max_length=40
# Characters that are one-hot encoded; any other character is skipped by MyDataset.
# NOTE(review): '-' occurs twice in this string (after the digits and inside "+-="), so
# len(alphabet) counts it twice while str.index() only ever returns the first position.
alphabet="abcdefghijklmnopqrstuvwxyz0123456789-,;.!?:'\"/\\|_@#$%^&*~`+-=<>()[]{} "
# Settings shared by the dataset and the model. 'number_of_classes' captures the value of
# `num_classes` at the moment this cell runs — re-run the cell after `num_classes` changes.
# NOTE(review): 'dropout_input' is not read by CNN_Embedding or MyDataset as defined in this notebook.
args={'alphabet':alphabet,'number_of_characters':len(alphabet),'max_length':max_length,'dropout_input':0.2,'number_of_classes':num_classes}

In [None]:
# Display the settings dictionary as it is currently bound.
args

{'alphabet': 'abcdefghijklmnopqrstuvwxyz0123456789-,;.!?:\'"/\\|_@#$%^&*~`+-=<>()[]{} ',
 'dropout_input': 0.2,
 'max_length': 40,
 'number_of_characters': 70,
 'number_of_classes': 1186}

In [128]:
class CNN_Embedding(nn.Module):
    """Character-level 1-D CNN that scores inputs by cosine similarity to class vectors.

    A batch of one-hot encoded strings with shape
    ``(batch, max_length, number_of_characters)`` goes through two
    convolution + max-pooling stages and three fully connected layers, giving
    an ``embedding_dim``-dimensional vector per sample. That vector is
    L2-normalised and compared with one learned weight row per category
    (``self.embed.weight``, also L2-normalised), so every output entry is a
    cosine similarity.

    Parameters
    ----------
    num_categories : int
        Number of output classes (rows of ``self.embed.weight``).
    embedding_dim : int
        Size of the embedding produced by the last fully connected layer.
    args : dict
        Must provide ``'number_of_characters'`` (input channels of the first
        convolution) and ``'max_length'`` (sequence length used to size the
        first fully connected layer). No other key is read here.
    """

    def __init__(self, num_categories, embedding_dim, args):
        super().__init__()

        self.num_categories = num_categories
        self.embed_dim = embedding_dim
        # One weight row per category; the rows act as class prototypes in forward().
        self.embed = nn.Linear(self.embed_dim, self.num_categories, bias=False)
        # Dropout over dimension 1 of the (batch, max_length, chars) input, i.e. whole
        # character positions are zeroed. Dropout1d is the dedicated module for this on
        # 3-D inputs; older PyTorch releases only offer the same behaviour via Dropout2d.
        dropout_cls = getattr(nn, 'Dropout1d', nn.Dropout2d)
        self.drop_input = dropout_cls(0.3)

        # define conv layers
        self.conv1 = nn.Sequential(
            nn.Conv1d(args['number_of_characters'], 500, kernel_size=6, padding=0),
            nn.ReLU(),
            nn.MaxPool1d(2),
        )

        self.conv2 = nn.Sequential(
            nn.Conv1d(500, 500, kernel_size=3, padding=0),
            nn.ReLU(),
            nn.MaxPool1d(2),
        )

        # Not applied in the forward pass (its 100 input channels do not match the 500
        # channels produced by conv2). Kept so the module's parameter set and
        # state_dict keys stay unchanged.
        self.conv3 = nn.Sequential(
            nn.Conv1d(100, 100, kernel_size=3, padding=0),
            nn.ReLU(),
        )

        # compute the output shape after forwarding an input to the conv layers;
        # a single dummy sample is enough because only the feature size is needed
        input_shape = (1,
                       args['max_length'],
                       args['number_of_characters'])

        self.output_dimension = self._get_conv_output(input_shape)

        # define linear layers
        self.fc1 = nn.Sequential(
            nn.Linear(self.output_dimension, 500),
            nn.ReLU(),
            nn.Dropout(0.2),
        )

        self.fc2 = nn.Sequential(
            nn.Linear(500, 500),
            nn.ReLU(),
        )

        self.fc3 = nn.Sequential(nn.Linear(500, self.embed_dim))

        # initialize weights
        self._create_weights()

    # utility private functions

    def _create_weights(self, mean=0.0, std=0.05):
        """Draw the weights of every Conv1d and Linear layer from N(mean, std**2).

        Biases are left at their PyTorch defaults.
        """
        for module in self.modules():
            if isinstance(module, (nn.Conv1d, nn.Linear)):
                nn.init.normal_(module.weight, mean, std)

    def _features(self, x):
        """Apply the convolutional stack and flatten the result.

        ``x`` has shape (batch, length, channels); Conv1d expects channels in
        dimension 1, hence the transpose. Returns a (batch, features) tensor.
        """
        x = x.transpose(1, 2)
        x = self.conv1(x)
        x = self.conv2(x)
        return x.view(x.size(0), -1)

    def _get_conv_output(self, shape):
        """Return the flattened feature size the conv stack yields for an input of ``shape``.

        ``shape`` is ``(batch, length, channels)``. The probe runs without
        gradient tracking and its values are irrelevant, only its shape matters.
        """
        with torch.no_grad():
            return self._features(torch.zeros(shape)).size(1)

    # forward

    def forward(self, x):
        """Return cosine similarities of shape ``(batch, num_categories)``.

        ``x`` is a float tensor of shape (batch, max_length, number_of_characters).
        """
        x = self.drop_input(x)
        x = self._features(x)
        x = self.fc1(x)
        x = self.fc2(x)
        x = self.fc3(x)
        # Normalise both the sample embedding and the class rows; F.normalize clamps
        # the norm with a small epsilon, so an all-zero row cannot cause a division by zero.
        x = F.normalize(x, p=2, dim=1)
        prototypes = F.normalize(self.embed.weight, p=2, dim=1)
        return F.linear(x, prototypes)

In [108]:
class MyDataset(Dataset):
    """Dataset that one-hot encodes strings character by character.

    Each item is a pair ``(data, label)``:

    * ``data`` is a float32 tensor of shape ``(max_length, number_of_characters)``.
      Characters that are not in the alphabet are skipped, the remaining ones
      are truncated to ``max_length`` and the rest of the matrix is zero.
    * ``label`` is a float tensor of length ``number_of_classes`` filled with
      ``-1`` except for a ``1`` at the index given by ``labels[index]``.

    Parameters
    ----------
    texts : sequence
        Indexable collection of strings.
    labels : sequence
        Indexable collection of integer class ids, aligned with ``texts``.
    args : dict
        Must provide ``'number_of_classes'``, ``'alphabet'``,
        ``'number_of_characters'`` and ``'max_length'``.
    """

    def __init__(self, texts, labels, args):
        self.texts = texts
        self.labels = labels
        self.length = len(self.texts)
        self.num_classes = args['number_of_classes']
        self.vocabulary = args['alphabet']
        self.number_of_characters = args['number_of_characters']
        self.max_length = args['max_length']
        # No longer used for encoding; kept because it is a public attribute.
        self.identity_mat = np.identity(self.number_of_characters)
        # Character -> column lookup. setdefault keeps the FIRST position of a
        # repeated character, which is what str.index() returns.
        self._char_to_index = {}
        for position, char in enumerate(self.vocabulary):
            self._char_to_index.setdefault(char, position)

    def __len__(self):
        return self.length

    def __getitem__(self, index):
        raw_text = self.texts[index]
        # Missing names (None or a float NaN) are encoded like an empty string.
        if raw_text is None or isinstance(raw_text, float):
            raw_text = ''

        # Drop unknown characters first, then truncate — same order as before.
        columns = [self._char_to_index[char]
                   for char in raw_text
                   if char in self._char_to_index]
        columns = columns[:self.max_length]

        # Start from the zero padding and switch on one column per kept character.
        data = np.zeros((self.max_length, self.number_of_characters), dtype=np.float32)
        if columns:
            data[np.arange(len(columns)), columns] = 1.0

        # One-vs-rest target: -1 everywhere, +1 at the true class.
        label = torch.full((self.num_classes,), -1.0)
        label[self.labels[index]] = 1

        return torch.from_numpy(data), label

### Focal Loss

In [None]:
from torch.autograd import Variable

class FocalLoss(nn.Module):
    """Focal loss: cross-entropy scaled per sample by ``(1 - p_t) ** gamma``.

    Parameters
    ----------
    gamma : float
        Focusing exponent; ``gamma=0`` gives plain cross-entropy.
    alpha : None, float, int, list or Tensor
        Optional per-class weights. A single number ``a`` becomes the
        two-class weights ``[a, 1 - a]``; a list becomes a tensor.
    size_average : bool
        Return the mean over samples when True, otherwise the sum.
    """

    def __init__(self, gamma=0, alpha=None, size_average=True):
        super().__init__()
        self.gamma = gamma
        self.alpha = alpha
        if isinstance(alpha, (float, int)):
            self.alpha = torch.Tensor([alpha, 1 - alpha])
        if isinstance(alpha, list):
            self.alpha = torch.Tensor(alpha)
        self.size_average = size_average

    def forward(self, x, target):
        """Compute the loss.

        ``x`` holds unnormalised class scores with classes in dimension 1,
        either (N, C) or (N, C, ...). ``target`` holds integer class indices
        with one entry per scored position. A floating-point ``target`` with
        the same shape as the 2-D scores (for example the ±1 one-vs-rest
        vectors produced by MyDataset) is also accepted and is converted to
        class indices with ``argmax``.
        """
        if x.dim() > 2:
            x = x.view(x.size(0), x.size(1), -1)  # N,C,H,W => N,C,H*W
            x = x.transpose(1, 2)    # N,C,H*W => N,H*W,C
            x = x.contiguous().view(-1, x.size(2))   # N,H*W,C => N*H*W,C

        # gather() needs integer indices; a float target shaped like the scores
        # cannot be an index tensor, so treat it as a one-hot style encoding.
        if target.is_floating_point() and target.shape == x.shape:
            target = target.argmax(dim=1)
        target = target.view(-1, 1)

        # Log-probability of the true class for every sample.
        logpt = F.log_softmax(x, dim=1).gather(1, target).view(-1)
        # The modulating factor is computed from a detached copy, so no gradient
        # flows through (1 - pt) ** gamma (same as the previous implementation).
        pt = logpt.detach().exp()

        if self.alpha is not None:
            # Keep the class weights on the same device / dtype as the scores.
            if self.alpha.dtype != x.dtype or self.alpha.device != x.device:
                self.alpha = self.alpha.to(device=x.device, dtype=x.dtype)
            at = self.alpha.gather(0, target.view(-1))
            logpt = logpt * at

        loss = -1 * (1 - pt) ** self.gamma * logpt
        if self.size_average:
            return loss.mean()
        return loss.sum()

# Train

In [16]:
def train(model, train_loader, epochs, criterion, optimizer, device):
    """Train ``model`` for ``epochs`` passes over ``train_loader``.

    After every epoch the summed batch losses divided by the number of samples
    in ``train_loader.dataset`` are printed. That figure is a per-sample loss
    only when ``criterion`` returns a sum over the batch.

    Parameters
    ----------
    model : torch.nn.Module
        Network to optimise; it is put in training mode at the start of every epoch.
    train_loader : torch.utils.data.DataLoader
        Yields ``(inputs, targets)`` batches.
    epochs : int
        Number of passes over the loader.
    criterion : callable
        ``criterion(predictions, targets)`` returning a scalar tensor.
    optimizer : torch.optim.Optimizer
        Optimiser bound to the model's parameters.
    device : torch.device
        Device the batches are moved to before the forward pass.
    """
    # Avoid a ZeroDivisionError in the epoch report when the dataset is empty.
    L = max(len(train_loader.dataset), 1)
    for epoch in range(1, epochs + 1):
        model.train()  # Make sure that the model is in training mode.

        total_loss = 0.0
        # The context manager closes the progress bar even if a batch raises.
        with notebook.tqdm(train_loader) as iterator:
            for batch_x, batch_y in iterator:
                batch_x = batch_x.to(device)
                batch_y = batch_y.to(device)

                optimizer.zero_grad()

                # get predictions from model
                y_pred = model(batch_x)

                # perform backprop
                loss = criterion(y_pred, batch_y)
                loss.backward()
                optimizer.step()

                total_loss += loss.item()
        print("Epoch: {}, Loss: {}".format(epoch, total_loss / L))

# Train/Test data processing

In [109]:
# Shuffle the rows once with a fixed seed so the train/test split is the same on every run.
SPLIT_SEED = 42
indices = np.random.RandomState(SPLIT_SEED).permutation(df_reduced.shape[0])
data_x = df_reduced['name'].values[indices]
data_y = labels[indices]

# Hold out the last 20% of the shuffled rows for evaluation.
nb_validation = int(0.2 * len(indices))
# Slice by an explicit split point: `[:-nb_validation]` would yield an empty
# training set when nb_validation is 0.
n_train = len(indices) - nb_validation

x_train = data_x[:n_train]
y_train = data_y[:n_train]

x_test = data_x[n_train:]
y_test = data_y[n_train:]

train_ds = MyDataset(x_train, y_train, args)
test_ds = MyDataset(x_test, y_test, args)
test_dl = torch.utils.data.DataLoader(test_ds, batch_size=128)
# Reshuffle the training samples every epoch; evaluation order does not matter.
train_dl = torch.utils.data.DataLoader(train_ds, batch_size=128, shuffle=True)

In [97]:
# Sanity check: total number of samples across the training and test datasets.
len(train_ds)+len(test_ds)

650400

In [98]:
# Display the number of classes currently bound to `num_classes`.
num_classes

3

## Initialize model and train

In [20]:
# Use the GPU when CUDA is available, otherwise fall back to the CPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print("Using device {}.".format(device))

Using device cuda.


In [129]:
# Build the network with `num_classes` outputs and a 20-dimensional embedding,
# then move its parameters to the selected device.
model=CNN_Embedding(num_classes,20,args).to(device)

In [130]:
# Training setup A: Adam with learning rate 0.1 and a squared-error loss that is
# summed (not averaged) over all elements of the batch.
# NOTE(review): 0.1 is a large step size for Adam — confirm it is intended.
optimizer = optim.Adam(model.parameters(),lr=0.1)
loss_fn = torch.nn.MSELoss(reduction='sum')

In [None]:
# Training setup B: Adam with learning rate 1e-4 and focal loss with gamma=2.
# Running this cell rebinds `optimizer` and `loss_fn`, so whichever setup cell
# was executed last decides what the training call below uses.
optimizer = optim.Adam(model.parameters(),lr=0.0001)
loss_fn = FocalLoss(gamma=2)

In [1]:
# Train for 30 epochs with the currently bound `loss_fn` and `optimizer`.
# Requires the cells defining model, train_dl, loss_fn, optimizer and device to have run first.
train(model, train_dl, 30, loss_fn,optimizer, device)

NameError: ignored

In [25]:
def test_MSE(model,val_dl):
  model.eval()
  with torch.no_grad():
        loss=0
        for batch in val_dl:
            val_X,val_y=batch
            val_X=val_X.to(device)
            val_y=val_y.to(device)
            x=model(val_X)
            loss+=loss_fn(x,val_y)
    
  loss=loss/len(val_dl.dataset)
  return loss

In [126]:
# Per-sample loss of the current model on the held-out loader, using the currently bound `loss_fn`.
test_MSE(model,test_dl)

tensor(1.8195, device='cuda:0')

In [27]:
def topk_accuracy(model, val_dl, k, device=None):
    """Return the fraction of samples whose true class is among the ``k`` best scores.

    Parameters
    ----------
    model : torch.nn.Module
        Network producing one score per class; it is switched to evaluation mode.
    val_dl : torch.utils.data.DataLoader
        Yields ``(inputs, targets)`` batches where each target row has its
        maximum at the true class (e.g. one-hot or ±1 vectors).
    k : int
        Number of top-scoring classes to consider; values larger than the
        number of classes are clamped.
    device : torch.device, optional
        Device the batches are moved to. When omitted, the device of the
        model's first parameter is used (CPU for a parameter-free model).

    Returns
    -------
    float
        Number of hits divided by ``len(val_dl.dataset)``.
    """
    if device is None:
        first_param = next(model.parameters(), None)
        device = first_param.device if first_param is not None else torch.device("cpu")

    model.eval()
    hits = 0
    with torch.no_grad():
        for val_X, val_y in val_dl:
            val_X = val_X.to(device)
            val_y = val_y.to(device)
            # Column vector of true class indices, broadcast against the top-k indices.
            true_class = val_y.argmax(dim=1, keepdim=True)
            preds = model(val_X)
            top = torch.topk(preds, min(k, preds.size(1)), dim=1).indices
            hits += (top == true_class).any(dim=1).sum().item()

    return hits / len(val_dl.dataset)

In [132]:
# Top-1 accuracy of the current model on the held-out loader.
topk_accuracy(model,test_dl,1)

0.6720940959409594

In [91]:
# Top-1 accuracy of the current model on the training loader, for comparison with the held-out score.
topk_accuracy(model,train_dl,1)

0.6884263071222725

In [None]:
# Take the first batch from the training loader for manual inspection; the loop
# exits after one iteration. As a side effect the model is switched to eval mode.
for b in train_dl:
  model.eval()
  # Unpack inputs and targets and move both to the selected device.
  x,y=b
  x=x.to(device)
  y=y.to(device)
  break

In [None]:
# Forward the inspected batch through the model.
# NOTE: there is no torch.no_grad() here, so autograd tracks this call.
z=model(x)

In [None]:
# Target vector of the first sample in the batch (MyDataset fills it with -1 and sets the true class to 1).
y[0]

tensor([-1., -1., -1.,  ..., -1., -1., -1.], device='cuda:0')

In [None]:
# Index of the largest entry of that target vector, i.e. the position MyDataset set to 1.
y[0].argmax()

tensor(399, device='cuda:0')

In [None]:
def get_n_params(model):
    """Return the total number of scalar values across all parameters of ``model``.

    Every tensor yielded by ``model.parameters()`` is counted, whether or not
    it requires gradients.
    """
    # numel() is the product of a tensor's dimensions; summing it avoids the
    # hand-written product loop (whose local variable shadowed the `nn` alias).
    return sum(p.numel() for p in model.parameters())

In [None]:
# Total number of parameters of the model currently bound to `model`.
get_n_params(model)

2242400