In [110]:
import os
import numpy as np
import torch
import torch.nn.functional as F

In [2]:
from torchnlp.datasets import imdb_dataset 

In [3]:
train = imdb_dataset(train=True)  # doctest: +SKIP


In [4]:
test = imdb_dataset(test=True)  # doctest: +SKIP


## препроцессинг

In [5]:
from string import punctuation

def preprocess(text):
    text = text.lower()
    text = "".join([ch for ch in text if ch not in punctuation])
    text=text.replace("\t", "")
    all_reviews = text
    all_words = text.split(" ")
    
    return all_reviews, all_words

In [6]:
def text_labels_separate(train):
    labels=[]
    text=[]
    words=[]
    for i in range(len(train)):
        if train[i]['sentiment'] == 'pos':
            labels.append(1)
        else:
            labels.append(0)
        text.append(preprocess(train[i]['text'])[0])
        words=words+preprocess(train[i]['text'])[1]
    return text, labels, words
            
        
        

In [7]:
train_texts, train_labels, train_words= text_labels_separate(train)

In [8]:
test_texts, test_labels, test_words= text_labels_separate(test)

In [9]:
all_words=train_words+test_words

## Создание словаря

In [10]:
from collections import Counter

word_counts = Counter(all_words)
word_list = sorted(word_counts, key=word_counts.get, reverse = True)


In [11]:
word_list[-10:]

['clichélandmine',
 'asymmetric',
 'metaessay',
 'warnabys',
 'browfurrowingbr',
 'secdonly',
 'rena',
 'bryces',
 'wagonthey',
 '5m']

In [12]:
vocab_to_int = {word:idx+1 for idx, word in enumerate(word_list)}


In [13]:
int_to_vocab = {idx:word for word, idx in vocab_to_int.items()}


In [14]:
encoded_reviews_train = [[vocab_to_int[word] for word in review.split(' ')] for review in train_texts]


In [15]:
encoded_reviews_test = [[vocab_to_int[word] for word in review.split(' ')] for review in test_texts]

In [16]:
length = []
for review in encoded_reviews_train + encoded_reviews_test:
    length.append(len(review))

In [21]:
import numpy as np
print('max:', max(length), 'mean:' , np.mean(length), np.std(length))

max: 2470 mean: 231.14594 171.3247057534797


In [22]:
def pad_text(encoded_reviews, seq_length):
    
    reviews = []
    
    for review in encoded_reviews:
        if len(review) <= seq_length:
            reviews.append([0]*(seq_length-len(review)) + review )
        else:
            reviews.append(review[:seq_length])
        
    return np.array(reviews)


padded_reviews_train = pad_text(encoded_reviews_train, seq_length = 250)
padded_reviews_test = pad_text(encoded_reviews_test, seq_length = 250)

In [23]:
padded_reviews_train[0]

array([    0,     0,     0,     0,     0,     0,     0,     0,     0,
           0,     0,     0,     0,     0,     0,     0,     0,     0,
           0,     0,     0,     0,     0,     0,     0,     0,     0,
           0,     0,     0,     0,     0,     0,     0,     0,     0,
           0,     0,     0,     0,     0,     0,     0,     0,     0,
           0,     0,     0,     0,     0,     0,     0,     0,     0,
           0,     0,     0,     0,     0,     0,     0,     0,     0,
           0,     0,     0,     0,     0,     0,     0,     0,     0,
           0,     0,     0,     0,     0,     0,     0,     0,     0,
           0,     0,     0,     0,     0,     0,     0,     0,     0,
           0,     0,     0,     0,     0,     0,     0,     0,     0,
           0,     0,     0,     0,     0,     0,     0,     0,     0,
           0,     0, 32611,   320,     6,     3,  1061,   214,     8,
        2033,    30,     1,   163,    60,    14,    47,    78,  6088,
          43,   376,

In [24]:
train_labels = np.array( [label for idx, label in enumerate(train_labels) if len(padded_reviews_train[idx]) > 0] ).astype(float)
reviews_train = [review for review in padded_reviews_train if len(review) > 0]

In [25]:
test_labels = np.array( [label for idx, label in enumerate(test_labels) if len(padded_reviews_test[idx]) > 0] ).astype(float)
reviews_test = [review for review in padded_reviews_test if len(review) > 0]


## Shuffle

In [26]:
from sklearn.utils import shuffle

In [27]:
reviews_test, test_labels = shuffle(reviews_test, test_labels, random_state=42)

In [28]:
test_labels[-10:]

array([0., 1., 0., 1., 1., 0., 1., 1., 0., 0.])

In [29]:
type(reviews_train[0][0])

numpy.int32

In [30]:
type(train_labels[0])

numpy.float64

In [31]:
valid_ratio = 0.3
total = len(reviews_test)
valid_cutoff = int(total * valid_ratio)

train_x = reviews_train
train_y = train_labels


In [32]:
train_x=torch.Tensor(train_x)

In [36]:
print(train_x[0].shape)

torch.Size([250])


In [37]:
train_y=torch.Tensor(train_y)

In [38]:
valid_x, valid_y = torch.Tensor(reviews_test[:valid_cutoff]), torch.Tensor(test_labels[:valid_cutoff])
test_x, test_y = torch.Tensor(reviews_test[valid_cutoff:]), torch.Tensor(test_labels[valid_cutoff:])




In [39]:
from torch.utils.data import TensorDataset, DataLoader

train_data = TensorDataset(train_x, train_y)
valid_data = TensorDataset(valid_x, valid_y)
test_data = TensorDataset(test_x, test_y)



In [40]:
batch_size = 50
train_loader = DataLoader(train_data, batch_size = batch_size, shuffle = True)
valid_loader = DataLoader(valid_data, batch_size = batch_size, shuffle = True)
test_loader = DataLoader(test_data, batch_size = batch_size, shuffle = True)

## cnn structure

In [251]:
from torch import nn

class ConvNet(nn.Module): 
    
    def __init__(self, n_vocab, n_embed, n_hidden, n_output):
        super().__init__()
        # params: "n_" means dimension
        self.n_vocab = n_vocab     # number of unique words in vocabulary
        self.n_hidden = n_hidden   # number of layers 
          
        
        self.embedding = nn.Embedding(n_vocab, n_embed)
        self.layer1 = nn.Sequential( nn.Conv1d(250, 248, kernel_size=2, stride=1), 
            nn.ReLU(), nn.MaxPool1d(kernel_size=2, stride=2))
        self.layer2 = nn.Sequential( nn.Conv1d(248, 50, kernel_size=5, stride=4, padding=2), 
            nn.ReLU(), nn.MaxPool1d(kernel_size=2, stride=2))
        self.layer3 = nn.Sequential( nn.Conv1d(50, 25, kernel_size=5, stride=4, padding=2), 
            nn.ReLU(), nn.MaxPool1d(kernel_size=2, stride=2))
        self.dropout = nn.Dropout()
        self.fc = nn.Linear(n_hidden, n_output)
        #self.sigmoid = nn.Sigmoid()
        self.soft=nn.Softmax()
    
    def forward (self, input_words):
                                             # INPUT   :  (batch_size, seq_length)
        embedded_words = self.embedding(input_words)    # (batch_size, seq_length, n_embed)
        out = self.layer1(embedded_words)          # (batch_size, seq_length, n_hidden)
        out = self.layer2(out)
        out = out.reshape(out.size(0), -1)  # (batch_size*seq_length, n_hidden)
        out = self.dropout(out) 
        fc_out = self.fc(out)                      # (batch_size*seq_length, n_output)
        sigmoid_out = self.soft(fc_out)              # (batch_size*seq_length, n_output)
        sigmoid_out = sigmoid_out.view(batch_size, -1)  # (batch_size, seq_length*n_output)
        
        # extract the output of ONLY the LAST output of the LAST element of the sequence
        #sigmoid_last = sigmoid_out[:, -1]               # (batch_size, 1)
        
        return sigmoid_out
    
    
    
    


In [252]:
n_vocab = len(vocab_to_int)
n_embed = 100
n_hidden = 300
n_output = 2   # 1 ("positive") or 0 ("negative")



In [253]:
from torch import optim
net = ConvNet(n_vocab, n_embed, n_hidden, n_output)
criterion = nn.NLLLoss()
optimizer = torch.optim.Adam(net.parameters(), lr=0.001 )
# create a stochastic gradient descent optimizer


In [254]:

print_every = 200
step = 0
n_epochs = 10  # validation loss increases from ~ epoch 3 or 4
clip = 5  # for gradient clip to prevent exploding gradient problem in LSTM/RNN
device = 'cuda' if torch.cuda.is_available() else 'cpu'

for epoch in range(n_epochs):
    
    
    for inputs, labels in train_loader:
        step += 1
        inputs, labels = inputs.to(device).long(), labels.to(device)
        
        
        
        
        net.zero_grad()
        output= net(inputs)
        loss = criterion(output, labels.long())
        loss.backward()
        #nn.utils.clip_grad_norm(net.parameters(), clip)
        optimizer.step()
        
        if (step % print_every) == 0:            
            ######################
            ##### VALIDATION #####
            ######################
            net.eval()
            valid_losses = []
            
            for v_inputs, v_labels in valid_loader:
                v_inputs, v_labels = inputs.to(device).long(), labels.to(device)
        
                
                
                v_output = net(v_inputs)
                v_loss = criterion(v_output.squeeze(), v_labels.long())
                valid_losses.append(v_loss.item())
                

            print("Epoch: {}/{}".format((epoch+1), n_epochs),
                  "Step: {}".format(step),
                  "Training Loss: {:.4f}".format(loss.item()),
                  "Validation Loss: {:.4f}".format(np.mean(valid_losses)))
            net.train()





Epoch: 1/10 Step: 200 Training Loss: -0.4800 Validation Loss: -0.4800
Epoch: 1/10 Step: 400 Training Loss: -0.4400 Validation Loss: -0.4400
Epoch: 2/10 Step: 600 Training Loss: -0.3800 Validation Loss: -0.3800
Epoch: 2/10 Step: 800 Training Loss: -0.4000 Validation Loss: -0.4000
Epoch: 2/10 Step: 1000 Training Loss: -0.7000 Validation Loss: -0.7000
Epoch: 3/10 Step: 1200 Training Loss: -0.4800 Validation Loss: -0.4800
Epoch: 3/10 Step: 1400 Training Loss: -0.4800 Validation Loss: -0.4800
Epoch: 4/10 Step: 1600 Training Loss: -0.4201 Validation Loss: -0.4200
Epoch: 4/10 Step: 1800 Training Loss: -0.4200 Validation Loss: -0.4200
Epoch: 4/10 Step: 2000 Training Loss: -0.4401 Validation Loss: -0.4400
Epoch: 5/10 Step: 2200 Training Loss: -0.5800 Validation Loss: -0.5800
Epoch: 5/10 Step: 2400 Training Loss: -0.5600 Validation Loss: -0.5600
Epoch: 6/10 Step: 2600 Training Loss: -0.5400 Validation Loss: -0.5400
Epoch: 6/10 Step: 2800 Training Loss: -0.5000 Validation Loss: -0.5000
Epoch: 6/1

In [255]:
net.eval()
with torch.no_grad():
    correct = 0
    total = 0
    for test_in, labels in test_loader:
        outputs = net(test_in.long())
        _, predicted = torch.max(outputs.data, 1)
        total += labels.size(0)
        correct += (predicted == labels).sum().item()

    print('Test Accuracy of the model on the 10000 test inputs: {} %'.format((correct / total) * 100))




Test Accuracy of the model on the 10000 test inputs: 50.011428571428574 %


In [171]:
net.eval()
test_losses = []
num_correct = 0

acc_list = []

for inputs, labels in test_loader:
    
    test_output = net(inputs.long())
    loss = criterion(test_output, labels.long())
    test_losses.append(loss.item())
    
    #preds = torch.round(test_output)
    #correct_tensor = preds.eq(labels.float())
    #correct = np.squeeze(correct_tensor.numpy())
    #num_correct += np.sum(correct)
    #test_loss += criterion(test_output, labels.long()).data
    #pred = test_output.data.max(1)[1] 
    _, predicted = torch.max(test_output.data, 1)
    correct = (predicted == labels).sum().item()
    num_correct += np.sum(correct)
    # get the index of the max log-probability
    #num_correct += pred.eq(labels.data).sum()
    
print("Test Loss: {:.4f}".format(np.mean(test_losses)))
print("Test Accuracy: {:.4f}".format(num_correct/len(test_loader.dataset)))



Test Loss: 0.4722
Test Accuracy: 0.8350


In [163]:
num_epochs=10
total_step = len(train_loader)
loss_list = []
acc_list = []
for epoch in range(num_epochs):
    for i, (reviews, labels) in enumerate(train_loader):
        # Прямой запуск
        outputs = model(reviews)
        loss = criterion(outputs, labels)
        loss_list.append(loss.item())

        # Обратное распространение и оптимизатор
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

        # Отслеживание точности
        total = labels.size(0)
        _, predicted = torch.max(outputs.data, 1)
        correct = (predicted == labels).sum().item()
        acc_list.append(correct / total)

        if (i + 1) % 100 == 0:
            print('Epoch [{}/{}], Step [{}/{}], Loss: {:.4f}, Accuracy: {:.2f}%'
                  .format(epoch + 1, num_epochs, i + 1, total_step, loss.item(),
                          (correct / total) * 100))

RuntimeError: Expected tensor for argument #1 'indices' to have scalar type Long; but got torch.FloatTensor instead (while checking arguments for embedding)

In [205]:
loss = nn.NLLLoss()
a = torch.tensor(([0.88, 0.12], [0.51, 0.49]), dtype = torch.float)
target = torch.tensor([1, 0])
output = loss(a, target)
print(output)


tensor(-0.3150)


In [204]:
losss=nn.CrossEntropyLoss()
losss(a, target)


tensor(0.9134)