deeply refered on  
https://github.com/bentrevett/pytorch-sentiment-analysis/blob/master/1%20-%20Simple%20Sentiment%20Analysis.ipynb

In [1]:
import torch.nn as nn
import torch
from torch.autograd import Variable
import torch.nn.functional as F
import torch.optim as optim
from konlpy.tag import Okt
okt = Okt()

In [2]:
from torchtext.data import Field
from torchtext.data import TabularDataset
from torchtext.data import Iterator, BucketIterator

## Declaring the Fields
- Torchtext uses the method of declaring data during import.
     - Declare what format the data will have, and torchtext will load the data accordingly.

In [4]:
SEED = 1234

torch.manual_seed(SEED)
torch.cuda.manual_seed(SEED)
torch.backends.cudnn.deterministic = True

In [16]:
max_length = 30
def tokenizer(text):
    token = okt.nouns(text)
    if len(token) < max_length:
        for i in range(0, max_length - len(token)):
            token.append('<PAD>')
    else : 
        token = token[:max_length]
    return token

# If it is shorter than max_length, add the pad token, and if it is long, cut it to max_length.

TEXT = Field(sequential=True, tokenize=tokenizer,use_vocab=True) 
LABEL = Field(sequential=False,unk_token=None, use_vocab=True,dtype=torch.float)

## Constructing the Dataset
- The fields object contains a declaration of how to import the raw data.
- Declare where and what data to fetch through the TabularDataset object.
- The object created through the source code below is in the form of a generator.

In [17]:
%%time 

datafields = [("X", TEXT), ("y", LABEL)]

train_data,test_data = TabularDataset.splits(
                            path=".", 
                            train='train_df.csv',
                            test='test_df.csv',
                            format='csv',
                            skip_header=True,
                            fields=datafields)

CPU times: user 9min 34s, sys: 7.08 s, total: 9min 41s
Wall time: 7min 8s


In [18]:
TEXT.build_vocab(train_data);LABEL.build_vocab(train_data)
TEXT.build_vocab(test_data);LABEL.build_vocab(test_data)

- through TabularDatset, tokenizing, but the word_to_integar process has not been done yet.
- In our case, word_to_integar converting is required for the TEXT part for train and text datasets.
- You can convert by using the code `TEXT.build_vocab (trn)`.
- The above operation will create torchtext for all elements in all training sets. Torchtext has a class called Vocab that handles vocabulary. The Vocab class maps the word and id to the stoi attribute and the itos attribute to reverse mappings.
- stoi: word_to_idx default dictionary
- itos: word list

In [19]:
BATCH_SIZE = 64
# device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
device = 'cpu'

train_iter, test_iter = BucketIterator.splits(datasets=(train_data,test_data),
                                            batch_sizes=(BATCH_SIZE,BATCH_SIZE),  
                                            device=device,
                                            repeat=False)

In [73]:
class CNN(nn.Module) : 
    
    def __init__(self,VOCAB_SIZE , EMBED_SIZE , HID_SIZE , DROPOUT ,KERNEL_SIZE , NUM_FILTER , N_CLASS ) : 
        super(CNN, self).__init__()
        self.vocab_size = VOCAB_SIZE # whole vocab we use.
        self.embed_size = EMBED_SIZE # Volumn of Embedding dimension this is hyperparameter.
        self.hid_size = HID_SIZE # Volumn of Hidden layer dimension this is also hyperparameter.
        self.dropout = DROPOUT # probability of occurence of dropout.
        if type(KERNEL_SIZE) !=list :
            self.kernel_size = list(KERNEL_SIZE) # size of kernel, we can assign kernel_size as type of list.
        else : self.kernel_size = KERNEL_SIZE # longer size, more complex.
        self.num_filter = NUM_FILTER # parameter how many kernel is. the larger more complex.
        self.num_class = N_CLASS # argument about output_dimension, this is 1 becuz we gonna use sigmoid below.
#         self.device = torch.device('cuda:0' if torch.cuda.is_available() else 'cpu')
        self.device = 'cpu'
        
        self.embedding = nn.Embedding(
            num_embeddings = self.vocab_size,
            embedding_dim = self.embed_size,
            padding_idx = 1) 

        # padding_idx : we ignore pad token during training.
        # embedding vector's dimension : vocab_size * embed_size , similar with LOOKUP TABLE.
        
        self.convs = nn.ModuleList([(nn.Conv2d(in_channels = 1,out_channels = self.num_filter,\
        kernel_size = (kernel,self.embed_size))) for kernel in self.kernel_size])
        # in_channels : in field of computer_vision, the user used to use more than 1, but in nlp, 1 in in_channels in more common.
        # out_channels : we might catch more feature if we use larger out_channels potentially. so it's on your decision.
        # kernel_size : size of kernel
        
        self.fully_connect = nn.Sequential(
        nn.Linear(self.num_filter * len(self.kernel_size),self.hid_size),nn.ReLU(),
        nn.Dropout(self.dropout),nn.Linear(self.hid_size , self.num_class),
        )
        
        # 1. nn.Linear: concatenate num_filter * len (kernel_size) multiply a dimensional vector by a hidden layer.
        # 2. nn.ReLU: To ensure non-linearity, insert the activation function Relu.
        # 3. nn.Dropout: applies dropout to self.dropout probability.
        # 4. nn.Linear: Match the final output with n-class classification and apply softmax or sigmoid etc.
    
    def forward(self,x) : 
        # x's dimension : [max_length, batch_size], max_length gonna be 30, becuz of tokenizer func(we made it at first).
        if len(x.shape) == 1 :
            x.unsqueeze_(0) 
        # [1, max_length, batch_size] The reason for squeezing 1 dim in dimension 0 is for embedding operations together.        
        
        embed = self.embedding(x) #[max_length, batch_size, embedding_dim]
        embed = embed.unsqueeze(1) # [max_length, 1, batch_size, embedding_dim], for convolution.
        embed = embed.permute(2,1,0,3)
        # [batch_size, 1 , max_length , embedding_dim]
        # You can think of a rectangle as batch_size. Since the depth of the kernel does not exist,
        # Also put this information in unschEzE_ because you previously assigned this information to in_channels.
        
        
        convolution = [conv(embed).squeeze(3) for conv in self.convs]
        # [batch_size, num_filter, dimension after convolution(stride 값에 따라 변화할 수 있습니다!)]
        
        # Turning the convolution on nlp will result in 1 because the column size (length) of feature_map is equal to the embedding dimension.
        # dimension of embedding dimension Since the index was 3, it squeezes the third index.
        # Also, the second index, which had a length of max_length, changed due to the kernel size. This value will vary depending on the stride size.
        # Rather than having a rectangle of batch_size, the vector has num_filter,
        # The reason that such vectors 'vectors' are vectors is that after the convolution,
        # Depending on the num_kernels parameter, there are several such vectors. The vectors in a dataset have the same dimensions, but the value of the kernel
        # Note that the values of the elements of the vector are different. The words are longer, but the vectors are equal to num_kernels.)
        # It seems better to think that these datasets are batch_size, because they are going to be a bunch at a time.
        
        
        pooled = [F.max_pool1d(conv,(conv.size(2))).squeeze(2) for conv in convolution]
        # [batch_size, num_filter]
        # max_polling applies to the same filter with the same kernel size.
        # So when we do pooling, the second parameter is conv.size (2), which is the length of the vector.
        # Thus, only the max value of the vector value is extracted, and through squeeze, one vector becomes one scalar. That is, it becomes one-dimensional.
        
        dropout = [F.dropout(pool,self.dropout) for pool in pooled]
        # The second argument, dropout, is a probability value, dropout% dropout% of hidden neurons whenever the epoch changes.
        # This is a process that is a must for complex models with a large number of parameters, such as CNN.
        
        concatenate = torch.cat(dropout, dim = 1) 
        # [batch_size, num_filter * num_kernel]
        #concatenate and return NN. Concat the kernel. So, one dataset has one long vector.
        # It looks like an end, but you have to add nonlinearity, and you have to convert the output_dimension to a linear transform.
        
        logit = self.fully_connect(concatenate)
        # NN is the process of putting in layer. Put it in linear layer according to the dimension of concat,
        # Put in relu activation function for nonlinearity, then dropout
        # The last value is a vector with the same num_of_class.
        # if  binary classfication is a vector of length 2
        
        return logit

In [66]:
class fit() : 
    
    def __init__(self, model, train_iter, test_iter, epoch = 5) : 
        self.optimizer = optim.SGD(model.parameters(), lr=1e-3)
        self.device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
        self.model = model.to(device)
        self.criterion = nn.BCEWithLogitsLoss().to(device)
        self.train_iter = train_iter
        self.test_iter = test_iter
        self.epoch = epoch
        
    def binary_accuracy(self, preds, y):
        """
        Returns accuracy per batch, i.e. if you get 8/10 right, this returns 0.8, NOT 8
        """

        rounded_preds = torch.round(torch.sigmoid(preds))
        # In model, since we are fully connecting, we put sigmoid on it, and we do rounding.
        correct = (rounded_preds == y).float() # If they are the same, they are 1
        acc = correct.sum()/len(correct) # In one batch, the ratio is the ratio, or accuracy!
        return acc
    
    def train(self, model, iterator):
    
        epoch_loss = 0 # loss per epoch
        epoch_acc = 0 # accuracy per epoch

        model.train()
        for batch in iterator:
            if batch.X.size(0) == 0 : continue #If there is no data, continue. It does not matter because you applied padding manually.
            self.optimizer.zero_grad() # we should initialize manually gradient of optimizer in pytorch.

            predictions = model(batch.X).squeeze(1) # becuz the size of fc from model is [batch_size, num_layer].            
            loss = self.criterion(predictions, batch.y) # calculating the loss 

            acc = self.binary_accuracy(predictions, batch.y) # return the accracuy in form of ratio

            loss.backward() # back propagation

            self.optimizer.step() # update the SGD

            epoch_loss += loss.item()
            epoch_acc += acc.item()

        return epoch_loss / len(iterator), epoch_acc / len(iterator)
    
    def evaluate(self, model, iterator):
    
        epoch_loss = 0
        epoch_acc = 0

        model.eval() # stop the every change in gradient of model

        with torch.no_grad():
            
            for batch in iterator:
                if batch.X.size(0) == 0 : continue
                predictions = model(batch.X).squeeze(1)
                loss = self.criterion(predictions, batch.y)

                acc = self.binary_accuracy(predictions, batch.y)

                epoch_loss += loss.item()
                epoch_acc += acc.item()

        return epoch_loss / len(iterator), epoch_acc / len(iterator)
    
    def fit_by_iterate(self) : 
        
        for epoch in range(self.epoch+1):
            print('epoch : ',epoch+1,end='\r')
            train_loss, train_acc = self.train(self.model, self.train_iter)

        valid_loss, valid_acc = self.evaluate(self.model, self.test_iter)
        print(f'| Epoch: {epoch+1:02} | Train Loss: {train_loss:.3f} | Train Acc: {train_acc*100:.2f}% | Val. Loss: {valid_loss:.3f} | Val. Acc: {valid_acc*100:.2f}% |')

In [70]:
VOCAB_SIZE = len(TEXT.vocab)
EMBED_SIZE = 256
HID_SIZE = 128
DROPOUT = 0.5
KERNEL_SIZE = [2,3,4,5]
NUM_FILTER = 4
N_CLASS = 1

model = CNN(VOCAB_SIZE, EMBED_SIZE, HID_SIZE, DROPOUT, KERNEL_SIZE, NUM_FILTER, N_CLASS)
model

CNN(
  (embedding): Embedding(23044, 256, padding_idx=1)
  (convs): ModuleList(
    (0): Conv2d(1, 4, kernel_size=(2, 256), stride=(1, 1))
    (1): Conv2d(1, 4, kernel_size=(3, 256), stride=(1, 1))
    (2): Conv2d(1, 4, kernel_size=(4, 256), stride=(1, 1))
    (3): Conv2d(1, 4, kernel_size=(5, 256), stride=(1, 1))
  )
  (fully_connect): Sequential(
    (0): Linear(in_features=16, out_features=128, bias=True)
    (1): ReLU()
    (2): Dropout(p=0.5)
    (3): Linear(in_features=128, out_features=1, bias=True)
  )
)

In [71]:
fitting_process = fit(model,train_iter,test_iter,epoch=5)

In [72]:
fitting_process.fit_by_iterate()

| Epoch: 05 | Train Loss: 0.639 | Train Acc: 62.87% | Val. Loss: 0.628 | Val. Acc: 64.05% |
