In [1]:
#!mkdir ./data
#!wget https://ai.stanford.edu/~amaas/data/sentiment/aclImdb_v1.tar.gz -O ./data/dataset.tar.gz
#%cd data
#!tar xvfz dataset.tar.gz
#%cd ..

## TextCNN for text classification

In [3]:
import numpy as np
import matplotlib.pyplot as plt
import glob

from gensim.utils import tokenize, deaccent, simple_preprocess
from collections import Counter
from gensim.models import Word2Vec
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score 

from sklearn import preprocessing
from sklearn.metrics.pairwise import cosine_similarity
import time
import copy
import torch
import torch.nn.functional as F

torch.__version__

'1.1.0'

In [4]:
# Gather the review file paths, grouped by split (train/test) and by
# sentiment label (pos/neg).
positive_train, negative_train = (
    glob.glob("./data/aclImdb/train/pos/*.txt"),
    glob.glob("./data/aclImdb/train/neg/*.txt"),
)
positive_test, negative_test = (
    glob.glob("./data/aclImdb/test/pos/*.txt"),
    glob.glob("./data/aclImdb/test/neg/*.txt"),
)

# Sanity check: number of files found in each group.
tuple(map(len, (negative_test, positive_test, positive_train, negative_train)))

(12500, 12500, 12500, 12500)

### Build vocabulary

In [6]:
from collections import Counter

# Count token frequencies over every review and keep the 5000 most common
# tokens; `vocab` is a list of (token, count) pairs, most frequent first.
# NOTE(review): the counts include the test files, so the vocabulary is not
# derived from the training split alone — confirm this is intended.
c = Counter()

for file in positive_train+negative_train+positive_test+negative_test:
    # Context manager closes each handle right away instead of leaving
    # tens of thousands of open files for the garbage collector.
    with open(file) as fh:
        c.update(simple_preprocess(fh.read()))
vocab = c.most_common(5000)

In [7]:
len(vocab)

5000

In [8]:
class MyCorpus(object):
    """An iterable that yields sentences (lists of str).

    Every path in ``files`` is read, lower-cased and tokenized with
    ``simple_preprocess``; one token list is yielded per file. Because
    ``__iter__`` is a generator, the corpus can be iterated several times.
    """

    def __init__(self, files):
        # Paths of the text files to stream, in iteration order.
        self.files = files

    def __iter__(self):
        for file in self.files:
            # Close the handle as soon as the text is read rather than
            # leaking one open file per document.
            with open(file) as fh:
                text = fh.read().lower()

            yield simple_preprocess(text)

            

In [10]:
# Indices 0 and 1 are reserved for the special tokens; vocabulary words are
# numbered from 2 upwards in the order they appear in `vocab`.
word2idx = {token: position for position, (token, _) in enumerate(vocab, start=2)}
word2idx.update({'<PAD>': 0, '<UNK>': 1})

# Reverse lookup: index -> token.
idx2word = dict((index, token) for token, index in word2idx.items())

# Fixed number of token positions used for every review.
SENTENCE_LENGTH = 80
len(word2idx)

5002

In [11]:
def get_index( word, vocab):
    """Return the index ``vocab`` stores for ``word``.

    Words that are not keys of ``vocab`` map to the ``'<UNK>'`` entry.
    """
    # Resolve the key first so '<UNK>' is only looked up when it is needed.
    key = word if word in vocab else '<UNK>'
    return vocab[key]

In [12]:
from torch.utils.data import Dataset, DataLoader
class TextDataset(Dataset):
    """IMDB review dataset yielding fixed-length token-index tensors.

    Args:
        positives: list of file paths of positive reviews (label 1).
        negatives: list of file paths of negative reviews (label 0).
        word2idx: mapping from token to integer index; unknown tokens are
            resolved through ``get_index``, which falls back to ``'<UNK>'``.
        sentence_length: number of token positions in every returned tensor.
    """

    def __init__(self, positives, negatives, word2idx, sentence_length):
        self.dataset = positives + negatives
        self.word2idx = word2idx
        # Labels follow the same order as `self.dataset`: positives first.
        self.labels = [1] * len(positives) + [0] * len(negatives)
        self.sentence_length = sentence_length

    def __len__(self):
        return len(self.dataset)

    def __getitem__(self, idx):
        """Return ``(indices, label)`` for the review at position ``idx``.

        ``indices`` is an int64 tensor of length ``sentence_length``: the
        review is truncated to that many tokens and right-padded with 0.
        ``label`` is a float32 scalar tensor (1.0 positive, 0.0 negative).
        """
        if torch.is_tensor(idx):
            idx = idx.tolist()

        # Close the file promptly; this method runs once per sample per epoch.
        with open(self.dataset[idx]) as fh:
            text = fh.read().lower()
        tokens = simple_preprocess(text)[:self.sentence_length]

        # Build the whole row as a list and convert once, instead of writing
        # tensor elements one at a time.
        indices = [get_index(token, self.word2idx) for token in tokens]
        indices += [0] * (self.sentence_length - len(indices))

        return (torch.tensor(indices, dtype=torch.int64),
                torch.tensor(self.labels[idx], dtype=torch.float32))
            

In [13]:

# Wrap the train and test file lists in datasets that share one vocabulary
# and one fixed sequence length.
train_dataset = TextDataset(
    positives=positive_train,
    negatives=negative_train,
    word2idx=word2idx,
    sentence_length=SENTENCE_LENGTH,
)
test_dataset = TextDataset(
    positives=positive_test,
    negatives=negative_test,
    word2idx=word2idx,
    sentence_length=SENTENCE_LENGTH,
)

# Both loaders serve shuffled mini-batches of 32 samples.
train_dataloader = DataLoader(train_dataset, shuffle=True, batch_size=32, num_workers=0)
test_dataloader = DataLoader(test_dataset, shuffle=True, batch_size=32, num_workers=0)

# The test split is what the training loop consumes under the "val" key.
dataloaders = {
    "train": train_dataloader,
    "val": test_dataloader,
}
dataset_sizes = {
    "train": len(train_dataset),
    "val": len(test_dataset),
}

# Show the first training sample: (token-index tensor, label).
train_dataset[0]

(tensor([1765,  617,   42,    2,    1,    1,    4,    2,  354,    1,    1,    1,
            4,    1,    9,    1,    1,    9,    1,    1,    2, 2598,    4,  354,
            3,  403,   35, 1370,    4,    1,  554,    9,   10,    1, 1121,  113,
          331,   25,  208,   74,    1,  119,  354, 1816,    5,    1, 1075,   26,
           48,   67,  798,    2,    1,    4,    1,    1, 4504, 2598,   15,  354,
            3,  403,   98,  152,    4, 1181, 1880,    1,    4,    2,    1, 1645,
          993,    5,  552,  169, 2700,    3,    2, 1238]),
 tensor(1.))

In [14]:
# Pull a single mini-batch from the (shuffled) training loader; `x_in` is
# reused further down to smoke-test the network's forward pass.
x_in, label = next(iter(train_dataloader))
x_in, label

(tensor([[  10,   18,    1,  ...,  615,  827, 2667],
         [ 332,    2,  263,  ...,    3,    1,    3],
         [   2, 1134,  109,  ...,   81,  704,    3],
         ...,
         [   1,   17, 2492,  ...,  252, 3038, 2802],
         [1509,  147,   36,  ...,    4,    1, 1299],
         [  45,   16,    2,  ...,  148,  595,    3]]),
 tensor([0., 0., 0., 1., 1., 1., 1., 0., 0., 0., 0., 1., 1., 0., 0., 0., 0., 0.,
         1., 1., 1., 0., 1., 0., 0., 0., 1., 1., 1., 1., 1., 0.]))

In [130]:
x_in.shape

torch.Size([32, 80])

In [15]:
import torch
import torch.nn as nn
 

In [20]:
class TextClassifier(nn.Module):
    """2-D CNN binary classifier on top of frozen pretrained word vectors.

    The token indices are embedded, treated as a one-channel image of shape
    (sentence_length, vector_size), passed through two conv/ReLU/max-pool
    stages, flattened, and fed to a two-layer fully connected head that ends
    in a sigmoid.

    Args:
        embedding_dim: unused; the embedding width comes from
            ``wv_model.wv.vectors``. Kept so existing calls keep working.
        n_hidden: width of the hidden fully connected layer.
        num_embeddings: unused; the number of embedding rows comes from
            ``wv_model.wv.vectors``. Kept so existing calls keep working.
        sentence_length: number of token indices per input row; used to size
            the first fully connected layer.
        wv_model: object exposing ``wv.vectors`` as a 2-D array of word
            vectors (one row per index).
    """

    def __init__(self, embedding_dim , n_hidden, 
                 num_embeddings, sentence_length, wv_model):
        
        super(TextClassifier, self).__init__()
        
        self.sentence_length = sentence_length
        
        self.relu = torch.nn.ReLU()
        self.dropout = torch.nn.Dropout()

        # Pretrained vectors are loaded as-is and excluded from training.
        weights = torch.FloatTensor(wv_model.wv.vectors)
        self.embedding = nn.Embedding.from_pretrained(weights, freeze=True)

        self.conv1 = nn.Conv2d(1, 8, kernel_size=3, padding=1)
        self.conv2 = nn.Conv2d(8, 4, kernel_size=3, padding=1)
        self.mp = nn.MaxPool2d(8, 3)

        # The flattened feature count depends on both the sentence length and
        # the word-vector width (80 tokens x 200-d vectors give 4*6*20 = 480),
        # so measure it with a dummy pass instead of hard-coding it.
        with torch.no_grad():
            probe = torch.zeros(1, sentence_length, dtype=torch.long)
            n_features = self._extract_features(probe).shape[1]

        self.fc1 = nn.Linear(n_features, n_hidden)
        self.fc2 = nn.Linear(n_hidden, 1)

    def _extract_features(self, X):
        """Embed ``X`` and run the conv/pool stack; returns (batch, features)."""
        out = self.embedding(X)
        # Add the single input channel expected by Conv2d.
        out = out.unsqueeze(1)

        out = self.mp(self.relu(self.conv1(out)))
        out = self.mp(self.relu(self.conv2(out)))

        return out.view(X.shape[0], -1)

    def forward(self, X):
        """Map a (batch, sentence_length) index tensor to (batch, 1) probabilities."""
        out = self._extract_features(X)
        out = self.fc1(out)
        out = self.dropout(out)
        out = self.fc2(out)
        out = torch.sigmoid(out)

        return out



class TextCNN(nn.Module):
    """Convolutional text classifier with parallel filters of several widths.

    Token indices are embedded, convolved with filters that span the full
    embedding width and ``k`` consecutive tokens (one filter bank per ``k`` in
    ``kernel_sizes``), max-pooled over time, concatenated, passed through
    dropout and a linear layer, and squashed with a sigmoid.

    Args:
        embed_num: number of rows in the embedding table (vocabulary size).
        embed_dim: dimensionality of each embedding vector.
        class_num: number of output units.
        kernel_num: number of filters per kernel size.
        kernel_sizes: iterable of filter heights (tokens covered per filter).
        dropout: dropout probability applied before the output layer.
    """

    def __init__(self, embed_num,  embed_dim=200, class_num=1,
                 kernel_num=3, kernel_sizes=(3, 4, 5), dropout=0.5):
        super(TextCNN, self).__init__()

        kernel_sizes = list(kernel_sizes)

        self.embedding = nn.Embedding(embed_num, embed_dim)

        # One Conv2d per kernel size; each kernel covers the whole embedding
        # width, so the last output dimension collapses to 1.
        self.convs1 = nn.ModuleList(
            [nn.Conv2d(1, kernel_num, (k, embed_dim)) for k in kernel_sizes]
        )

        self.dropout = nn.Dropout(dropout)
        self.fc1 = nn.Linear(len(kernel_sizes) * kernel_num, class_num)

    def conv_and_pool(self, x, conv):
        """Apply ``conv`` + ReLU to ``x`` and max-pool over the token axis.

        ``x`` is the embedded batch with a channel axis, (batch, 1, seq, dim);
        the result has shape (batch, conv.out_channels).
        """
        x = F.relu(conv(x)).squeeze(3)
        x = F.max_pool1d(x, x.size(2)).squeeze(2)
        return x

    def forward(self, x):
        """Map a (batch, seq) tensor of token indices to (batch, class_num) scores in [0, 1]."""
        x = self.embedding(x)
        # Add the single input channel expected by Conv2d.
        x = x.unsqueeze(1)

        # One pooled feature vector per kernel size, concatenated per sample.
        x = torch.cat([self.conv_and_pool(x, conv) for conv in self.convs1], 1)

        x = self.dropout(x)
        out = self.fc1(x)
        return torch.sigmoid(out)
    

In [21]:

# Smoke test: an untrained TextCNN should map the sample batch to one score
# per review. `word2idx` already contains the '<PAD>' and '<UNK>' entries, so
# its length is the full embedding-table size — no extra rows are needed.
net = TextCNN(embed_num=len(word2idx))
net(x_in).shape

torch.Size([32, 1])

In [22]:
"""
net = TextClassifier( n_hidden = 100, embedding_dim = 200, 
                     num_embeddings=len(word2idx), sentence_length=SENTENCE_LENGTH,
                    wv_model=model)

"""


'\nnet = TextClassifier( n_hidden = 100, embedding_dim = 200, \n                     num_embeddings=len(word2idx), sentence_length=SENTENCE_LENGTH,\n                    wv_model=model)\n\n'

In [23]:
def train_model(model, criterion, optimizer, dataloaders, scheduler,  num_epochs=10):
    """Train ``model`` and evaluate it on the validation loader after every epoch.

    Args:
        model: network to train; batches are moved to the device its
            parameters live on (CPU if it has no parameters).
        criterion: loss called as ``criterion(outputs, labels)`` with labels
            reshaped to ``(-1, 1)``.
        optimizer: optimizer whose ``zero_grad()`` / ``step()`` are called
            once per training batch.
        dataloaders: mapping with ``"train"`` and ``"val"`` iterables, each
            yielding ``(inputs, labels)`` batches.
        scheduler: optional learning-rate scheduler; when not ``None`` its
            ``step()`` is called (without arguments) once per epoch, after
            the training pass.
        num_epochs: number of passes over the training data.

    Returns:
        The same ``model`` object, holding the weights of the epoch with the
        highest validation accuracy (unchanged if no epoch was run).
    """
    since = time.time()

    # Follow the model's own device rather than a notebook-level global, so
    # inputs and weights can never end up on different devices.
    first_param = next(model.parameters(), None)
    device = first_param.device if first_param is not None else torch.device("cpu")

    best_model_wts = None
    best_acc = 0.0

    for epoch in range(num_epochs):
        print('Epoch {}/{}'.format(epoch, num_epochs - 1))
        print('-' * 10)
        
        # ---- training pass ----
        model.train()
        epoch_loss = []
        for inputs, labels in dataloaders["train"]:

            inputs = inputs.to(device)
            labels = labels.reshape(-1,1).to(device)
            
            optimizer.zero_grad()
            outputs = model(inputs)
            loss = criterion(outputs, labels)
            loss.backward()
            optimizer.step()

            epoch_loss.append(loss.item())

        if scheduler is not None:
            scheduler.step()

        # ---- validation pass ----
        model.eval()
        val_losses = []
        correct = 0
        total = 0
        # No gradients are needed for evaluation; skipping them saves memory.
        with torch.no_grad():
            for inputs, labels in dataloaders["val"]:

                inputs = inputs.to(device)
                labels = labels.reshape(-1,1).to(device)

                outputs = model(inputs)
                val_losses.append(criterion(outputs, labels).item())

                # Threshold the probabilities at 0.5 and count matches on the
                # tensors' own device (no NumPy conversion, so CUDA works).
                preds = (outputs >= 0.5).to(labels.dtype)
                correct += (preds.reshape(-1) == labels.reshape(-1)).sum().item()
                total += labels.numel()

        epoch_acc = correct / total if total else 0.0

        # Snapshot the weights whenever validation accuracy improves (the
        # first epoch is always recorded).
        if best_model_wts is None or epoch_acc > best_acc:
            best_acc = epoch_acc
            best_model_wts = copy.deepcopy(model.state_dict())
        print( f"epoch {epoch}, mean loss = {np.mean(epoch_loss):.3}, validation loss={np.mean(val_losses):.3}, epoch acc={epoch_acc:.3}")

    time_elapsed = time.time() - since
    print('Training complete in {:.0f}m {:.0f}s'.format(
        time_elapsed // 60, time_elapsed % 60))
    print('Best val Acc: {:4f}'.format(best_acc))

    # load best model weights
    if best_model_wts is not None:
        model.load_state_dict(best_model_wts)
    return model

In [24]:
# Use the first CUDA GPU when one is available, otherwise fall back to CPU.
device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")

In [28]:
# Build a fresh model and place it on the selected device *before* creating
# the optimizer, so the batches moved to `device` during training meet
# weights on the same device.
net = TextCNN(embed_num=len(word2idx)).to(device)
optimizer = torch.optim.Adam(net.parameters())
# Binary cross-entropy on probabilities (the network ends in a sigmoid).
criterion = torch.nn.BCELoss()
net

TextCNN(
  (embedding): Embedding(5002, 200)
  (convs1): ModuleList(
    (0): Conv2d(1, 3, kernel_size=(3, 200), stride=(1, 1))
    (1): Conv2d(1, 3, kernel_size=(4, 200), stride=(1, 1))
    (2): Conv2d(1, 3, kernel_size=(5, 200), stride=(1, 1))
  )
  (dropout): Dropout(p=0.5)
  (fc1): Linear(in_features=9, out_features=1, bias=True)
)

In [None]:
# Train for 40 epochs with no learning-rate scheduler. train_model returns the
# model object it was given, so `m` and `net` refer to the same network.
m = train_model(net, criterion, optimizer, dataloaders, scheduler=None, num_epochs=40)

Epoch 0/39
----------
epoch 0, mean loss = 0.672, validation loss=0.584, epoch acc=0.708
Epoch 1/39
----------
epoch 1, mean loss = 0.591, validation loss=0.539, epoch acc=0.737
Epoch 2/39
----------
epoch 2, mean loss = 0.547, validation loss=0.51, epoch acc=0.75
Epoch 3/39
----------


In [None]:
# Print the shape of every parameter tensor registered on the network.
for p in net.parameters():
    print(p.shape)