In [140]:
import torch
from torchtext import data, datasets
import torch.nn as nn
import time
from pymystem3 import Mystem

In [141]:
GLOBAL_SEED = 42
torch.manual_seed(GLOBAL_SEED)
torch.backends.cudnn.deterministic = True

In [142]:
mystem = Mystem()

def tokenize(line):
    line = line.replace(',', ' ')
    line = line.replace('.', ' ')
    line = line.replace('!', ' ')
    line = line.replace('?', ' ')
    line = line.replace('\n', ' ')
    line = line.replace(')', ' ')
    line = line.replace('(', ' ')
    line = line.replace(':', ' ')
    line = line.replace(';', ' ')
    line = line.replace('"', ' ')
    line = line.lower()
    line = line.replace('ё', 'е')
    words = mystem.lemmatize(line)
    return list(filter(lambda x: x not in ' \n-', words))[:200]

In [143]:
TEXT = data.Field(tokenize=tokenize)
LABEL = data.Field(sequential=False, use_vocab=False, dtype=torch.float)

In [144]:
def load_data():
    with open('train.txt') as data:
        return data.readlines()
    
def load_test():
    with open('test.txt') as data:
        return data.readlines()

def load_labels():
    with open('labels.txt') as data:
        return list(map(lambda x: float(x), data.readlines()))
    
labels = load_labels()
    
def load_dataset(f):
    pairs = zip(f(), labels)
    return list(map(lambda x: data.Example.fromlist(x, fields=[('text', TEXT), ('label', LABEL)]), pairs))

In [145]:
all_data = data.Dataset(examples=load_dataset(load_data), fields=[('text', TEXT), ('label', LABEL)])
test_data = data.Dataset(examples=load_dataset(load_test), fields=[('text', TEXT)])

In [146]:
import random

train_data, valid_data = all_data.split(random_state = random.seed(GLOBAL_SEED))

In [147]:
print(f'Number of training examples: {len(train_data)}')
print(f'Number of validation examples: {len(valid_data)}')
print(vars(train_data.examples[0]))

Number of training examples: 14000
Number of validation examples: 6000
{'text': ['боже', '  ', 'какой', 'нудятина', 'тоска', 'смертный'], 'label': 3.0}


In [148]:
MAX_VOCAB_SIZE = 10000 # cut top words for vectorization

TEXT.build_vocab(train_data, max_size=MAX_VOCAB_SIZE, min_freq=10)
LABEL.build_vocab(train_data)

In [149]:
print(TEXT.vocab.freqs.most_common(10000)[-10:])

[('панда', 3), ('беззаботность', 3), ('буддийский', 3), ('уравновешивать', 3), ('правота', 3), ('воспитываться', 3), ('развлекуха', 3), ('романс', 3), ('неповоротливый', 3), ('            ', 3)]


In [150]:
class RNN(nn.Module):
    
    def __init__(self, input_dim, embedding_dim, hidden_dim, output_dim):
        super().__init__()
        self.embedding = nn.Embedding(input_dim, embedding_dim)
#         self.rnn = nn.RNN(embedding_dim, hidden_dim)
        self.rnn = nn.GRU(embedding_dim, hidden_dim, num_layers=1, bias=False, 
#                           dropout=0.33
                         )
#         self.rnn = nn.RNN(embedding_dim, hidden_dim, nonlinearity='relu')
        self.fc = nn.Linear(hidden_dim, output_dim)
        
    def forward(self, text):
        embedded = self.embedding(text)
        output, hidden = self.rnn(embedded)
        
#         print('output', output.size())
#         print('output -1', output[-1,:,:].size())
#         print('hidden', hidden.squeeze(0).size())
#         print()

#         assert torch.equal(output[-1,:,:], hidden.squeeze(0))
#         return self.fc(hidden.squeeze(0))
        return self.fc(output[-1,:,:].squeeze(0))

In [151]:
INPUT_DIM = len(TEXT.vocab)
EMBEDDING_DIM = 100
HIDDEN_DIM = 256
OUTPUT_DIM = 1

model = RNN(INPUT_DIM, EMBEDDING_DIM, HIDDEN_DIM, OUTPUT_DIM)

In [152]:
BATCH_SIZE = 64

device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

train_iterator, valid_iterator, test_iterator = data.BucketIterator.splits(
    (train_data, valid_data, test_data), 
    batch_size=BATCH_SIZE,
    device=device,
    sort_key=lambda x: len(x.text))

In [153]:
import torch.optim as optim

# optimizer = optim.SGD(model.parameters(), lr=0.004)
optimizer = optim.Adam(model.parameters(), lr=0.005)

In [154]:
criterion = nn.MSELoss()

In [155]:
model = model.to(device)
criterion = criterion.to(device)

In [156]:
def accuracy(preds, y):
    rounded_preds = torch.round(preds)
    rounded_y = torch.round(y)
    equal = rounded_preds == rounded_y
    
    return torch.mean(equal.float())

In [157]:
def train(model, iterator, optimizer, criterion):
    epoch_loss = 0
    epoch_acc = 0
    
    model.train()
    
    for batch in iterator:
        optimizer.zero_grad()
    
        predictions = model(batch.text).squeeze(1)
        
        loss = criterion(predictions, batch.label)
        acc = accuracy(predictions, batch.label)
        
        loss.backward()
        
        torch.nn.utils.clip_grad_norm_(model.parameters(), 4.0)
        
        optimizer.step()
        
        epoch_loss += loss.item()
        epoch_acc += acc.item()
        
    return epoch_loss / len(iterator), epoch_acc / len(iterator)

In [158]:
def evaluate(model, iterator, criterion):
    epoch_loss = 0
    epoch_acc = 0
    
    model.eval()
    
    with torch.no_grad():
        for batch in iterator:
            predictions = model(batch.text).squeeze(1)
            loss = criterion(predictions, batch.label)
            acc = accuracy(predictions, batch.label)

            epoch_loss += loss.item()
            epoch_acc += acc.item()
        
    return epoch_loss / len(iterator), epoch_acc / len(iterator)

In [159]:
def epoch_time(start_time, end_time):
    elapsed_time = end_time - start_time
    elapsed_mins = int(elapsed_time / 60)
    elapsed_secs = int(elapsed_time - (elapsed_mins * 60))
    return elapsed_mins, elapsed_secs

In [160]:
best_valid_loss = float('inf')

In [161]:
N_EPOCHS = 30

# model.load_state_dict(torch.load('model1.pt'))

for epoch in range(N_EPOCHS):
    start_time = time.time()
    
    train_loss, train_acc = train(model, train_iterator, optimizer, criterion)
    valid_loss, valid_acc = evaluate(model, valid_iterator, criterion)
    
    end_time = time.time()

    epoch_mins, epoch_secs = epoch_time(start_time, end_time)
    
    if valid_loss < best_valid_loss:
        best_valid_loss = valid_loss
        torch.save(model.state_dict(), 'model1.pt')
    
    print(f'Epoch: {epoch+1:02} | Epoch Time: {epoch_mins}m {epoch_secs}s')
    print(f'\tTrain Loss: {train_loss:.3f} | Train Acc: {train_acc*100:.2f}%')
    print(f'\t Val. Loss: {valid_loss:.3f} |  Validation Acc: {valid_acc*100:.2f}%')

Epoch: 01 | Epoch Time: 0m 7s
	Train Loss: 5.039 | Train Acc: 18.40%
	 Val. Loss: 6.103 |  Validation Acc: 11.49%
Epoch: 02 | Epoch Time: 0m 7s
	Train Loss: 3.532 | Train Acc: 25.06%
	 Val. Loss: 5.407 |  Validation Acc: 15.98%
Epoch: 03 | Epoch Time: 0m 7s
	Train Loss: 2.695 | Train Acc: 28.44%
	 Val. Loss: 5.915 |  Validation Acc: 15.65%
Epoch: 04 | Epoch Time: 0m 7s
	Train Loss: 2.148 | Train Acc: 31.84%
	 Val. Loss: 5.279 |  Validation Acc: 21.91%
Epoch: 05 | Epoch Time: 0m 7s
	Train Loss: 1.713 | Train Acc: 35.03%
	 Val. Loss: 5.803 |  Validation Acc: 19.76%
Epoch: 06 | Epoch Time: 0m 7s
	Train Loss: 1.444 | Train Acc: 37.17%
	 Val. Loss: 5.571 |  Validation Acc: 20.48%
Epoch: 07 | Epoch Time: 0m 7s
	Train Loss: 1.179 | Train Acc: 40.84%
	 Val. Loss: 5.863 |  Validation Acc: 18.55%
Epoch: 08 | Epoch Time: 0m 7s
	Train Loss: 1.007 | Train Acc: 43.27%
	 Val. Loss: 5.667 |  Validation Acc: 19.11%
Epoch: 09 | Epoch Time: 0m 7s
	Train Loss: 0.879 | Train Acc: 45.54%
	 Val. Loss: 5.554 

In [163]:
model.eval()

with torch.no_grad():
    for batch in test_iterator:
        predictions = model(batch.text).squeeze(1)
        rounded_preds = torch.round(predictions)
        for x in rounded_preds:
            val = int(x.tolist())
            print(max(val, 1))
        
        #print(batch.label)
        #print(rounded_preds, '\n\n\n\n\n')

5
9
6
2
8
3
4
9
6
7
9
6
7
7
9
9
8
4
5
5
7
8
9
9
5
7
7
8
8
4
7
7
1
7
9
7
9
9
7
9
9
5
9
9
1
7
9
9
8
8
8
9
9
9
9
1
2
5
8
9
9
9
6
5
8
10
6
2
5
8
4
8
9
1
9
6
10
4
7
9
8
8
10
8
4
9
7
9
6
6
4
9
9
9
10
11
7
9
9
9
9
9
10
10
8
1
8
9
8
6
2
3
9
9
7
2
9
7
9
7
9
9
8
6
9
9
5
7
3
7
7
3
4
8
7
9
2
4
4
7
7
9
6
8
7
9
8
9
6
7
8
7
6
8
8
7
8
9
9
5
8
7
2
9
7
9
10
7
9
8
9
9
9
2
9
8
9
7
10
8
7
9
9
8
7
9
4
10
8
8
9
10
5
10
3
8
2
5
9
8
6
8
7
5
8
8
3
9
6
9
10
7
8
5
8
8
6
9
9
5
9
8
3
7
8
8
4
7
4
9
6
10
7
9
3
9
4
8
2
9
10
9
7
9
3
10
8
9
4
9
9
7
8
8
1
7
6
8
9
4
9
9
10
7
9
9
6
9
8
10
9
6
7
4
10
9
8
7
7
8
8
9
8
7
3
7
3
10
7
9
1
5
8
9
5
9
5
8
9
6
6
9
10
10
9
7
10
10
9
7
9
6
10
10
9
3
3
9
8
8
11
10
8
5
6
9
8
9
9
2
3
4
6
8
8
10
8
9
9
8
9
8
8
7
11
7
10
9
9
8
8
8
9
8
9
9
6
5
6
8
6
10
9
9
10
10
10
9
8
5
9
9
9
9
4
3
6
8
9
6
8
9
11
9
6
8
8
3
5
7
8
9
2
7
2
8
7
7
8
10
6
9
7
7
3
5
3
8
8
9
9
3
9
9
9
8
8
7
7
2
10
10
9
10
9
9
9
9
7
7
7
8
8
10
9
7
9
9
10
3
6
9
8
7
7
10
9
9
9
7
3
7
4
3
10
7
8
6
9
10
7
8
8
4
10
5
7
7
8
8
7
9
6
5
5
9
8


In [84]:
torch.cuda.empty_cache()