In [1]:
import numpy as np
import torch
import torch.nn as nn
from torch.utils.data import Dataset
import sys
import pickle
import nltk
from nltk.corpus import stopwords
from torch.utils.data import DataLoader

sys.path.append('..')
# nltk.download('stopwords')

In [95]:
def load(filename) -> 'Any':
    """Read a single pickled object from ``filename`` and return it.

    The file is opened in binary mode and closed again before returning.

    SECURITY: unpickling executes arbitrary code embedded in the file, so
    only point this at files you created or otherwise trust.
    """
    with open(filename, 'rb') as stream:
        payload = pickle.load(stream)
    return payload

# Notebook-wide configuration object. The cells below read `args.batch_size`,
# `args.workers`, `args.vocab` and `args.max_length` from it.
args = load('../config.p')

In [96]:
# Sample paragraph used as toy input when building the dataset further down.
text = '''We are training our model on CUB dataset. CUB contains 200 bird species with 11,788 images. Since 80% of birds in this dataset have object-image size ratios of less than 0.5, as a pre-processing step, we crop all images to ensure that bounding boxes of birds have greater-than-0.75 object-image size ratios.'''
text

'We are training our model on CUB dataset. CUB contains 200 bird species with 11,788 images. Since 80% of birds in this dataset have object-image size ratios of less than 0.5, as a pre-processing step, we crop all images to ensure that bounding boxes of birds have greater-than-0.75 object-image size ratios.'

In [97]:
# Keyword arguments unpacked into the DataLoader constructor below.
params = dict(
    batch_size=args.batch_size,
    shuffle=True,
    num_workers=args.workers,
)

In [98]:
# Character alphabet: lowercase letters, digits, punctuation, space and newline.
# NOTE(review): this name is not referenced by any of the cells shown here;
# TextDataset and CharCNN read `args.vocab` instead. Presumably this is the
# same alphabet that was stored in the config -- confirm, or drop the cell.
vocab = "abcdefghijklmnopqrstuvwxyz0123456789-,;.!?:'\"/\\|_@#$%^&*~`+ =<>()[]{}\n"

In [99]:
class TextDataset(Dataset):
    """Character-level one-hot dataset over a list of raw strings.

    Each item is ``(data, label)``. ``data`` is a float tensor of shape
    ``(max_len, len(vocab))``: row ``t`` is the one-hot encoding of the
    ``t``-th kept character and rows past the end of the text are all zero.
    Characters that are not in the vocabulary are skipped, and texts longer
    than ``max_len`` characters are truncated.

    Note that ``nn.Conv1d`` consumes ``(batch, channels, length)``, so a
    collated batch must have its last two axes swapped
    (``batch.permute(0, 2, 1)``) before it is fed to a 1-D CNN; ``.view``
    does not do that.

    Args:
        texts: Sequence of raw strings.
        labels: Sequence of labels, indexed in parallel with ``texts``.
        args: Config object; only ``args.vocab`` (the ordered alphabet) and
            ``args.max_length`` (rows per sample) are read.
    """

    def __init__(self, texts, labels, args):
        self.vocab = args.vocab
        self.max_len = args.max_length

        self.length = len(texts)
        self.texts = texts
        self.labels = labels
        self.identity = np.eye(len(self.vocab))

        # char -> row of ``identity``. ``setdefault`` keeps the FIRST position
        # of a repeated character, which is what ``vocab.index`` returns.
        self._char_to_index = {}
        for position, char in enumerate(self.vocab):
            self._char_to_index.setdefault(char, position)

        # Filled lazily on the first ``preprocess`` call so that constructing
        # the dataset does not require the NLTK corpus to be installed.
        self._stop_words = None

    def __len__(self):
        """Return the number of texts in the dataset."""
        return self.length

    def preprocess(self, text):
        """Lower-case ``text``, drop English stop words and re-join the tokens.

        Returns:
            The remaining tokens concatenated with no separator.
        """
        if self._stop_words is None:
            # Load the stop-word list once per dataset instance instead of
            # once per token (each call re-reads the corpus from disk).
            self._stop_words = frozenset(stopwords.words('english'))

        tokens = nltk.word_tokenize(text.lower())
        # NOTE(review): tokens are joined WITHOUT a separator, so word
        # boundaries are lost even though the alphabet may contain a space.
        # Kept as-is to preserve existing behaviour -- confirm it is intended.
        return ''.join(tok for tok in tokens if tok not in self._stop_words)

    def __getitem__(self, index):
        """Return ``(one_hot, label)`` for the text at ``index``.

        ``one_hot`` has shape ``(max_len, len(vocab))``; ``label`` is
        ``labels[index]`` unchanged.
        """
        raw_text = self.preprocess(self.texts[index])

        lookup = self._char_to_index
        # Keep only known characters, then truncate to the fixed length.
        rows = [lookup[char] for char in raw_text if char in lookup][:self.max_len]

        # Start from the zero padding and overwrite the leading rows; this
        # covers the truncated, padded, exact-length and empty cases at once.
        data = np.zeros((self.max_len, len(self.vocab)), dtype=np.float64)
        if rows:
            data[:len(rows)] = self.identity[rows]

        label = self.labels[index]
        data = torch.Tensor(data)

        return data, label


In [100]:
# Two-sample toy dataset: the paragraph and its character-reversed copy,
# labelled 1 and 2 respectively.
dataset = TextDataset([text,text[::-1]], [1,2],args)

In [101]:
# Batch the toy dataset using the batch size / shuffle / worker settings in `params`.
dataloader = DataLoader(dataset,**params)

In [143]:
def weights_init_uniform(m, mean=0.0, var=0.05):
    """Re-initialise linear layers: weights ~ U(mean, var), bias = 0.

    Meant to be passed to ``module.apply(...)``. A module is treated as a
    linear layer when its class name contains ``"Linear"``; every other
    module is left untouched.

    Args:
        m: The module to (possibly) re-initialise in place.
        mean: LOWER bound of the uniform distribution. The name is kept for
            backward compatibility; it is not the mean of the distribution.
        var: UPPER bound of the uniform distribution (likewise not a variance).
    """
    if 'Linear' not in m.__class__.__name__:
        return

    # nn.init.* runs under no_grad, so no autograd history is recorded.
    nn.init.uniform_(m.weight, mean, var)

    # Layers built with bias=False expose ``bias is None``; skip them instead
    # of crashing.
    bias = getattr(m, 'bias', None)
    if bias is not None:
        nn.init.zeros_(bias)

In [144]:
class CharCNN(nn.Module):
    """Character-level CNN encoder: six Conv1d layers and two dense layers.

    Input:  float tensor of shape ``(batch, vocab_size, max_length)``.
    Output: float tensor of shape ``(batch, 1024)``.

    Args:
        vocab_size: Number of input channels (alphabet size). When omitted it
            is taken from ``len(args.vocab)``, i.e. the notebook-level config
            object must exist at construction time.
        max_length: Sequence length the network is built for. The default of
            1024 reproduces the original hard-coded ``256 * 34`` flatten size;
            pass the dataset's max length if it differs.

    Raises:
        ValueError: If ``max_length`` is too short to survive the conv/pool
            stack.
    """

    # (layer kind, kernel size) for every length-changing layer, in order.
    _LENGTH_OPS = (
        ('conv', 7), ('pool', 3),
        ('conv', 7), ('pool', 3),
        ('conv', 3), ('conv', 3), ('conv', 3), ('conv', 3), ('pool', 3),
    )

    def __init__(self, vocab_size=None, max_length=1024):
        super().__init__()

        if vocab_size is None:
            vocab_size = len(args.vocab)
        flat_length = self._conv_output_length(max_length)

        # Layer order (and therefore the state_dict keys) is unchanged.
        self.model = nn.Sequential(
            nn.Conv1d(vocab_size, 256, kernel_size=7, padding=0),
            nn.ReLU(),
            nn.MaxPool1d(3),
            nn.Conv1d(256, 256, kernel_size=7, padding=0),
            nn.ReLU(),
            nn.MaxPool1d(3),
            nn.Conv1d(256, 256, kernel_size=3, padding=0),
            nn.ReLU(),
            nn.Conv1d(256, 256, kernel_size=3, padding=0),
            nn.ReLU(),
            nn.Conv1d(256, 256, kernel_size=3, padding=0),
            nn.ReLU(),
            nn.Conv1d(256, 256, kernel_size=3, padding=0),
            nn.ReLU(),
            nn.MaxPool1d(3),
            nn.Flatten(),
            nn.Linear(256 * flat_length, 1024),
            nn.ReLU(),
            nn.Dropout(0.5),
            nn.Linear(1024, 1024),
            nn.ReLU(),
            nn.Dropout(0.5),
        )

    @classmethod
    def _conv_output_length(cls, length):
        """Return the sequence length left after the conv/pool stack."""
        remaining = length
        for kind, kernel in cls._LENGTH_OPS:
            # Unpadded stride-1 conv trims kernel-1 steps; MaxPool1d(k) floors L/k.
            remaining = remaining - (kernel - 1) if kind == 'conv' else remaining // kernel
            if remaining < 1:
                raise ValueError(
                    'max_length=%d is too short for the CharCNN conv/pool stack' % length)
        return remaining

    def forward(self, x):
        """Encode ``x`` of shape ``(batch, vocab_size, max_length)``."""
        return self.model(x)

In [145]:
# Build the encoder and re-initialise its Linear layers. `Module.apply` calls
# the function on every submodule and returns the module itself, which is why
# the model repr is displayed as this cell's output.
net = CharCNN()
net.apply(weights_init_uniform)

CharCNN(
  (model): Sequential(
    (0): Conv1d(70, 256, kernel_size=(7,), stride=(1,))
    (1): ReLU()
    (2): MaxPool1d(kernel_size=3, stride=3, padding=0, dilation=1, ceil_mode=False)
    (3): Conv1d(256, 256, kernel_size=(7,), stride=(1,))
    (4): ReLU()
    (5): MaxPool1d(kernel_size=3, stride=3, padding=0, dilation=1, ceil_mode=False)
    (6): Conv1d(256, 256, kernel_size=(3,), stride=(1,))
    (7): ReLU()
    (8): Conv1d(256, 256, kernel_size=(3,), stride=(1,))
    (9): ReLU()
    (10): Conv1d(256, 256, kernel_size=(3,), stride=(1,))
    (11): ReLU()
    (12): Conv1d(256, 256, kernel_size=(3,), stride=(1,))
    (13): ReLU()
    (14): MaxPool1d(kernel_size=3, stride=3, padding=0, dilation=1, ceil_mode=False)
    (15): Flatten()
    (16): Linear(in_features=8704, out_features=1024, bias=True)
    (17): ReLU()
    (18): Dropout(p=0.5, inplace=False)
    (19): Linear(in_features=1024, out_features=1024, bias=True)
    (20): ReLU()
    (21): Dropout(p=0.5, inplace=False)
  )
)

In [146]:
# Fetch one (data, labels) batch explicitly so this cell also works after
# Restart & Run All; `x` was not defined by any earlier cell.
x = next(iter(dataloader))

# x[0] is (batch, max_len, vocab) but Conv1d expects (batch, channels, length).
# permute swaps the two axes; `.view(-1, 70, 1024)` would only reinterpret the
# same memory and scramble which channel belongs to which character.
net(x[0].permute(0, 2, 1)).shape

torch.Size([2, 1024])