<a href="https://colab.research.google.com/github/leohsuofnthu/Pytorch-TextCNN/blob/master/textCNN_IMDB.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [0]:
import re


import pandas as pd
from numpy.random import RandomState

import torchtext
from torchtext.data import Field
from torchtext.data import TabularDataset
from torchtext.vocab import GloVe
from torchtext.data import Iterator, BucketIterator
import torchtext.datasets
from torchtext.datasets import IMDB, SST

import spacy

import torch
import torch.nn as nn
import torch.optim as optim
import torch.nn.functional as F

In [0]:
#%% Split whole dataset into train and valid set
df = pd.read_csv('./IMDB_Dataset.csv')
rng = RandomState()

tr = df.sample(frac=0.8, random_state=rng)
tst = df.loc[~df.index.isin(tr.index)]
tr.to_csv('train.csv', index=False)
tst.to_csv('valid.csv', index=False)

In [0]:
#%% Prepare the dataset via torchtext
spacy_en = spacy.load('en', disable=['tagger', 'parser', 'ner', 'textcat'
                                     'entity_ruler', 'sentencizer', 
                                     'merge_noun_chunks', 'merge_entities',
                                     'merge_subtokens'])

def tokenizer(text):
    return [tok.text for tok in spacy_en.tokenizer(text)]
  
# set up fields
def clean_str(string):
    """
    Tokenization/string cleaning for all datasets except for SST.
    Original taken from https://github.com/yoonkim/CNN_sentence/blob/master/process_data.py
    """
    string = re.sub(r"[^A-Za-z0-9(),!?\'\`]", " ", string)
    string = re.sub(r"\'s", " \'s", string)
    string = re.sub(r"\'ve", " \'ve", string)
    string = re.sub(r"n\'t", " n\'t", string)
    string = re.sub(r"\'re", " \'re", string)
    string = re.sub(r"\'d", " \'d", string)
    string = re.sub(r"\'ll", " \'ll", string)
    string = re.sub(r",", " , ", string)
    string = re.sub(r"!", " ! ", string)
    string = re.sub(r"\(", " \( ", string)
    string = re.sub(r"\)", " \) ", string)
    string = re.sub(r"\?", " \? ", string)
    string = re.sub(r"\s{2,}", " ", string)
    return string.strip()
  
  


#Creating field for text and label
TEXT = Field(sequential=True, tokenize=tokenizer, lower=True)
LABEL = Field(sequential=False)

#clean the text
TEXT.preprocessing = torchtext.data.Pipeline(clean_str)

In [0]:
#%%
train_datafield = [('text', TEXT),  ('label', LABEL)]
train = TabularDataset(path ='./train.csv',  
                             format='csv',
                             skip_header=True,
                             fields=train_datafield)


#%%
test_datafield = [('text', TEXT),  ('label',LABEL)]

test = TabularDataset(path ='./valid.csv', 
                       format='csv',
                       skip_header=True,
                       fields=test_datafield)

In [5]:
#%%Show some example to show the dataset
print(train[0].text,  train[0].label)
print(test[0].text,  test[0].label)

['crackerjack', 'is', 'a', 'simple', 'but', 'feelgood', 'movie', 'where', 'the', 'good', 'guys', 'are', 'very', 'good', 'and', 'the', 'bad', 'guys', 'are', 'very', 'bad', 'and', 'the', 'central', 'character', 'is', 'tempted', 'by', 'both', 'sides br', 'br', 'the', 'combination', 'of', 'the', 'central', 'character', 'being', 'played', 'by', 'mick', 'malloy', 'and', 'the', 'central', 'setting', 'being', 'the', 'local', 'lawn', 'bowls', 'clubs', 'drew', 'an', 'unusually', 'broad', 'crowd', 'ranging', 'from', 'large', 'numbers', 'of', 'teenagers', 'to', 'large', 'numbers', 'of', 'senior', 'citizens', '', 'and', 'all', 'laughed', 'at', 'the', 'comedy br', 'br', 'as', 'would', 'be', 'expected', 'of', 'a', 'movie', 'with', 'mick', 'malloy', 'and', 'judith', 'lucy', 'there', 'was', 'quite', 'a', 'bit', 'of', 'swearing', ',', 'but', 'it', 'was', 'not', 'overdone', 'and', 'the', 'audience', 'i', 'sat', 'with', 'certainly', 'enjoyed', 'it ! br', 'br', 'mick', 'malloy', 'did', 'a', 'good', 'job', 

In [0]:
#%% Check the dataset
TEXT.build_vocab(train, vectors= 'glove.6B.300d')
LABEL.build_vocab(train)
#%% load the pretrained embedding
vocab = TEXT.vocab

#%% Create the Iterator for datasets (Iterator works like dataloader)

train_iter = Iterator(
        train, 
        batch_size=64,
        device=torch.device('cuda'), 
        sort_within_batch=False,
        repeat=False)

test_iter = Iterator(test, batch_size=64, device=torch.device('cuda'), 
                     sort_within_batch=False, repeat=False)

In [0]:
#%% Text CNN model
class textCNN(nn.Module):
    
    def __init__(self, vocab_built, emb_dim, dim_channel, kernel_wins, num_class):
        super(textCNN, self).__init__()
        #load pretrained embedding in embedding layer.
        self.embed = nn.Embedding(len(vocab_built), emb_dim)
        self.embed.weight.data.copy_(vocab_built.vectors)
    
        #Convolutional Layers with different window size kernels
        print([w for w in kernel_wins])
        self.convs = nn.ModuleList([nn.Conv2d(1, dim_channel, (w, emb_dim)) for w in kernel_wins])
        #Dropout layer
        self.dropout = nn.Dropout(0.6)
        
        #FC layer
        self.fc = nn.Linear(len(kernel_wins)*dim_channel, num_class)
        
    def forward(self, x):
        emb_x = self.embed(x)
        emb_x = emb_x.unsqueeze(1)

        con_x = [conv(emb_x) for conv in self.convs]

        pool_x = [F.max_pool1d(x.squeeze(-1), x.size()[2]) for x in con_x]
        print(pool_x[0].size())
        
        fc_x = torch.cat(pool_x, dim=1)
        
        fc_x = fc_x.squeeze(-1)
        print(fc_x.size())
        fc_x = self.dropout(fc_x)
        logit = self.fc(fc_x)
        return logit
        

#%% Training the Model
def train(model, device, train_itr, optimizer, epoch, max_epoch):
    model.train()
    corrects, train_loss = 0.0,0
    for batch in train_itr:
        text, target = batch.text, batch.label
        text = torch.transpose(text,0, 1)
        target.data.sub_(1)
        text, target = text.to(device), target.to(device)
        optimizer.zero_grad()
        logit = model(text)
        
        loss = F.cross_entropy(logit, target)
        loss.backward()
        optimizer.step()
        
        train_loss+= loss.item()
        result = torch.max(logit,1)[1]
        corrects += (result.view(target.size()).data == target.data).sum()
    
    size = len(train_itr.dataset)
    train_loss /= size 
    accuracy = 100.0 * corrects/size
  
    return train_loss, accuracy
    
def valid(model, device, test_itr):
    model.eval()
    corrects, test_loss = 0.0,0
    for batch in test_itr:
        text, target = batch.text, batch.label
        text = torch.transpose(text,0, 1)
        target.data.sub_(1)
        text, target = text.to(device), target.to(device)
        
        logit = model(text)
        loss = F.cross_entropy(logit, target)

        
        test_loss += loss.item()
        result = torch.max(logit,1)[1]
        corrects += (result.view(target.size()).data == target.data).sum()
    
    size = len(test_itr.dataset)
    test_loss /= size 
    accuracy = 100.0 * corrects/size
    
    return test_loss, accuracy

In [42]:
#%%
model = textCNN(vocab, 300, 100, [3, 4 , 5] , 2).to('cuda')
# print the model summery
print(model)    
    
train_loss = []
train_acc = []
test_loss = []
test_acc = []
best_test_acc = -1

# Use GPU if it is available
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    

#optimizer
optimizer = optim.Adam(model.parameters(), lr=0.001)

for epoch in range(1, 10+1):
    #train loss
    tr_loss, tr_acc = train(model, device, train_iter, optimizer, epoch, 100)
    print('Train Epoch: {} \t Loss: {} \t Accuracy: {}'.format(epoch, tr_loss, tr_acc))
    
    ts_loss, ts_acc = valid(model, device, test_iter)
    print('Valid Epoch: {} \t Loss: {} \t Accuracy: {}'.format(epoch, ts_loss, ts_acc))
    
    if ts_acc > best_test_acc:
        best_test_acc = ts_acc
        #save paras(snapshot)
        print("model saves at {} accuracy".format(best_test_acc))
        torch.save(model.state_dict(), "textCNN_IMDB_best_valid")
        
    train_loss.append(tr_loss)
    train_acc.append(tr_acc)
    test_loss.append(ts_loss)
    test_acc.append(ts_acc)



[3, 4, 5]
textCNN(
  (embed): Embedding(119131, 300)
  (convs): ModuleList(
    (0): Conv2d(1, 100, kernel_size=(3, 300), stride=(1, 1))
    (1): Conv2d(1, 100, kernel_size=(4, 300), stride=(1, 1))
    (2): Conv2d(1, 100, kernel_size=(5, 300), stride=(1, 1))
  )
  (dropout): Dropout(p=0.6)
  (fc): Linear(in_features=300, out_features=2, bias=True)
)
torch.Size([64, 100, 1])
torch.Size([64, 300])
torch.Size([64, 100, 1])
torch.Size([64, 300])
torch.Size([64, 100, 1])
torch.Size([64, 300])
torch.Size([64, 100, 1])
torch.Size([64, 300])
torch.Size([64, 100, 1])
torch.Size([64, 300])
torch.Size([64, 100, 1])
torch.Size([64, 300])
torch.Size([64, 100, 1])
torch.Size([64, 300])
torch.Size([64, 100, 1])
torch.Size([64, 300])
torch.Size([64, 100, 1])
torch.Size([64, 300])
torch.Size([64, 100, 1])
torch.Size([64, 300])
torch.Size([64, 100, 1])
torch.Size([64, 300])


KeyboardInterrupt: ignored

In [0]:
    #plot train/validation loss versus epoch
    x = list(range(1, 10))
    plt.figure()
    plt.title("train/validation loss versus epoch")
    plt.xlabel("epoch")
    plt.ylabel("total loss")
    plt.plot(x, train_loss,label="train loss")
    plt.plot(x, test_loss, color='red', label="test loss")
    plt.legend(loc='upper right')
    plt.grid(True)
    plt.show()
    
    #plot train/validation accuracy versus epoch
    x = list(range(1, 10))
    plt.figure()
    plt.title("train/validation loss versus epoch")
    plt.xlabel("epoch")
    plt.ylabel("total loss")
    plt.plot(x, train_loss,label="train loss")
    plt.plot(x, test_loss, color='red', label="test loss")
    plt.legend(loc='upper right')
    plt.grid(True)
    plt.show()