<a href="https://colab.research.google.com/github/leohsuofnthu/Pytorch-TextCNN/blob/master/textCNN_IMDB.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [0]:
import re


import pandas as pd
from numpy.random import RandomState

import torchtext
from torchtext.data import Field
from torchtext.data import TabularDataset
from torchtext.vocab import GloVe
from torchtext.data import Iterator, BucketIterator
import torchtext.datasets
from torchtext.datasets import IMDB, SST

import spacy

import torch
import torch.nn as nn
import torch.optim as optim
import torch.nn.functional as F

In [0]:
#%% Split whole dataset into train and valid set
df = pd.read_csv('./IMDB Dataset.csv')
rng = RandomState()

tr = df.sample(frac=0.7, random_state=rng)
tst = df.loc[~df.index.isin(tr.index)]
tr.to_csv('train.csv', index=False)
tst.to_csv('test.csv', index=False)

In [0]:
#%% Prepare the dataset via torchtext
spacy_en = spacy.load('en')

def tokenizer(text):
    return [tok.text for tok in spacy_en.tokenizer(text)]
  
# set up fields
def clean_str(string):
    """
    Tokenization/string cleaning for all datasets except for SST.
    Original taken from https://github.com/yoonkim/CNN_sentence/blob/master/process_data.py
    """
    string = re.sub(r"[^A-Za-z0-9(),!?\'\`]", " ", string)
    string = re.sub(r"\'s", " \'s", string)
    string = re.sub(r"\'ve", " \'ve", string)
    string = re.sub(r"n\'t", " n\'t", string)
    string = re.sub(r"\'re", " \'re", string)
    string = re.sub(r"\'d", " \'d", string)
    string = re.sub(r"\'ll", " \'ll", string)
    string = re.sub(r",", " , ", string)
    string = re.sub(r"!", " ! ", string)
    string = re.sub(r"\(", " \( ", string)
    string = re.sub(r"\)", " \) ", string)
    string = re.sub(r"\?", " \? ", string)
    string = re.sub(r"\s{2,}", " ", string)
    return string.strip()
  
  


#Creating field for text and label
REVIEW = Field(sequential=True, tokenize=tokenizer, lower=True)
LABEL = Field(sequential=False)

#clean the text
REVIEW.preprocessing = torchtext.data.Pipeline(clean_str)

In [0]:
#%%
train_datafield = [('review', REVIEW),  ('sentiment', LABEL)]
train = TabularDataset(path ='./train.csv',  
                             format='csv',
                             skip_header=True,
                             fields=train_datafield)


#%%
test_datafield = [('review', REVIEW),  ('sentiment', LABEL)]

test = TabularDataset(path ='./test.csv', 
                       format='csv',
                       skip_header=True,
                       fields=test_datafield)

In [0]:
#%% Check the dataset
REVIEW.build_vocab(train, vectors=GloVe(name="6B", dim=300))
LABEL.build_vocab(train)
#%% load the pretrained embedding
vocab = REVIEW.vocab

#%% Create the Iterator for datasets (Iterator works like dataloader)

train_iter = Iterator(
        train, 
        batch_size=64,
        device=torch.device('cuda'), 
        sort_within_batch=False,
        repeat=False)

test_iter = Iterator(test, batch_size=64, device=torch.device('cuda'), 
                     sort_within_batch=False, repeat=False)

In [0]:
#%% Text CNN model
class textCNN(nn.Module):
    
    def __init__(self, vocab_built, emb_dim, dim_channel, kernel_wins, num_class):
        super(textCNN, self).__init__()
        #load pretrained embedding in embedding layer.
        self.embed = nn.Embedding(len(vocab_built), emb_dim)
        self.embed.weight.data.copy_(vocab_built.vectors)
    
        #Convolutional Layers with different window size kernels
#        self.convs = nn.ModuleList([nn.Conv2d(1, dim_channel, w, emb_dim) for w in kernel_wins])
        self.conv_w3 = nn.Conv2d(1, dim_channel, (3, emb_dim))
        self.conv_w4 = nn.Conv2d(1, dim_channel, (4, emb_dim))
        self.conv_w5 = nn.Conv2d(1, dim_channel, (5, emb_dim))
    
        #dropout layer
        self.dropout = nn.Dropout(0.6)
        
        #FC layer
        self.fc = nn.Linear(len(kernel_wins)*dim_channel, num_class)
        
    def forward(self, x):
        x = self.embed(x)
        x = x.unsqueeze(1)
        x_3 = F.relu(self.conv_w3(x))
        x_4 = F.relu(self.conv_w4(x))
        x_5 = F.relu(self.conv_w5(x))
 
        
        x_3 = F.max_pool1d(x_3.squeeze(-1), x_3.size()[2])
        x_4 = F.max_pool1d(x_4.squeeze(-1), x_4.size()[2])
        x_5 = F.max_pool1d(x_5.squeeze(-1), x_5.size()[2])
        
        xx = torch.cat((x_3,x_4,x_5), dim=1)
        xx = xx.squeeze(-1)
        xx = self.dropout(xx)
        logit = self.fc(xx)
        return logit
        

#%% Training the Model
def train(model, device, train_itr, optimizer, epoch, max_epoch):
    model.train()
    train_loss = 0
    for batch in train_itr:
        text, target = batch.review, batch.sentiment
        text = torch.transpose(text,0, 1)
        target.data.sub_(1)
        text, target = text.to(device), target.to(device)
        
        optimizer.zero_grad()
        logit = model(text)
        
        
        loss = F.cross_entropy(logit, target)
        loss.backward()
        optimizer.step()
        train_loss+= loss.item()
        
        return train_loss
    
def eval(model, device, data_iter):
    model.eval()
    corrects, avg_loss = 0,0
    for batch in data_iter:
        text, target = batch.review, batch.sentiment
        text = torch.transpose(text,0, 1)
        target.data.sub_(1)
        text, target = text.to(device), target.to(device)
        
        logit = model(text)
        loss = F.cross_entropy(logit, target)

        
        avg_loss += loss.item()
        result = torch.max(logit,1)[1]
        corrects += (result.view(target.size()).data == target.data).sum()
    
    size = len(data_iter.dataset)
    avg_loss /= size 
    accuracy = 100.0 * corrects/size
    print('\nEvaluation - loss: {:.6f} acc: {:.4f}%({}/{}) \n'.format(avg_loss,accuracy,corrects,size))
    
    return accuracy

In [15]:
#%%
print(next(iter(train_iter)))

model = textCNN(vocab, 300, 100, [6, 12 , 24] , 2).to('cuda')
# print the model summery
print(model)    
    
train_loss = []
test_dicescore = []
best_test_dicescore = -1

# Use GPU if it is available
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    

#optimizer
optimizer = optim.Adam(model.parameters(), lr=0.001)

for epoch in range(1, 500+1):
    #train loss
    t_loss = train(model, device, train_iter, optimizer, epoch, 500)
    print('Train Epoch: {} \t Loss: {}'.format(epoch, t_loss))
    
    test_acc = eval(model, device, test_iter)
    
#    torch.cuda.empty_cache()
#    print('current memory allocated: {}'.format(torch.cuda.memory_allocated() / 1024 ** 2))
#    print('max memory allocated: {}'.format(torch.cuda.max_memory_allocated() / 1024 ** 2))
#    print('cached memory: {}'.format(torch.cuda.memory_cached() / 1024 ** 2))

    




[torchtext.data.batch.Batch of size 64]
	[.review]:[torch.cuda.LongTensor of size 656x64 (GPU 0)]
	[.sentiment]:[torch.cuda.LongTensor of size 64 (GPU 0)]
textCNN(
  (embed): Embedding(111629, 300)
  (conv_w3): Conv2d(1, 100, kernel_size=(3, 300), stride=(1, 1))
  (conv_w4): Conv2d(1, 100, kernel_size=(4, 300), stride=(1, 1))
  (conv_w5): Conv2d(1, 100, kernel_size=(5, 300), stride=(1, 1))
  (dropout): Dropout(p=0.6)
  (fc): Linear(in_features=300, out_features=2, bias=True)
)
Train Epoch: 1 	 Loss: 0.7648428678512573

Evaluation - loss: 0.011033 acc: 49.0000%(7430/15000) 

Train Epoch: 2 	 Loss: 0.7414646148681641

Evaluation - loss: 0.011097 acc: 49.0000%(7429/15000) 

Train Epoch: 3 	 Loss: 0.7350847125053406

Evaluation - loss: 0.010740 acc: 57.0000%(8619/15000) 

Train Epoch: 4 	 Loss: 0.7557792663574219

Evaluation - loss: 0.010891 acc: 50.0000%(7574/15000) 

Train Epoch: 5 	 Loss: 0.7046993970870972

Evaluation - loss: 0.011524 acc: 50.0000%(7571/15000) 

Train Epoch: 6 	 Loss: