## Preprocess data for experimentation

Steps for sentence preprocessing:
1. Get content words for each sentence in the corpus
2. Create vocabulary for bag of words from those words
3. Remove sentences without content words
4. Get embeddings of these sentences
5. Save embeddings, content words of sentences and vocabulary

In [31]:
# imports
import torch
from torch.utils.data import DataLoader
from torch import nn
from torch import optim
from content_model_pytorch.config import ModelConfig
from content_model_pytorch.data_loader import SentencesDataset
from content_model_pytorch.model import ContentModel
from tqdm import tqdm, trange

In [32]:
# Root directory that holds every dataset. Keep the trailing slash: the cells
# below build file paths by interpolating this value directly into f-strings,
# without inserting a path separator.
DATASETS_PATH = "./datasets/"

### Train subset

In [None]:
dataset = "formality-score"

# Load the content words of the train split (one sentence per line).
# The file is only read, so open it read-only instead of 'r+', and build the
# path from `dataset` so the name is not duplicated in a hard-coded string.
with open(f"{DATASETS_PATH}{dataset}/sentences_content_words/train.txt", 'r') as fd:
    sentences_content_words = fd.readlines()

# Build a cumulative table: acc_table[i] is the number of sentences without
# content words (lines equal to '\n') found among lines 0..i of the file.
cont = 0
acc_table = []
for sentence in sentences_content_words:
    if sentence == '\n':
        cont += 1
    acc_table.append(cont)

# Drop the sentences without content words. Each surviving entry is a
# (sentence, idx) tuple where idx is the line's position in the original file.
sentences_content_words = [
    (sentence, idx)
    for idx, sentence in enumerate(sentences_content_words)
    if sentence != '\n'
]

### Validation dataset

In [None]:
dataset = "formality-score"

# Load the content words of the validation split (one sentence per line).
# The file is only read, so open it read-only instead of 'r+', and build the
# path from `dataset` so the name is not duplicated in a hard-coded string.
with open(f"{DATASETS_PATH}{dataset}/sentences_content_words/test.txt", 'r') as fd:
    sentences_content_words_val = fd.readlines()

# Build the cumulative table for validation: acc_table_val[i] is the number of
# sentences without content words (lines equal to '\n') among lines 0..i.
cont = 0
acc_table_val = []
for sentence in sentences_content_words_val:
    if sentence == '\n':
        cont += 1
    acc_table_val.append(cont)

# Drop the sentences without content words. Each surviving entry is a
# (sentence, idx) tuple where idx is the line's position in the original file.
sentences_content_words_val = [
    (sentence, idx)
    for idx, sentence in enumerate(sentences_content_words_val)
    if sentence != '\n'
]

In [None]:
# Directory with the sentence embeddings of the selected dataset
# (presumably precomputed by an earlier step; not shown in this notebook).
embeddings_path = f"{DATASETS_PATH}{dataset}/embeddings/"

# Train split. NOTE: this rebinds `dataset` from the dataset name (a string)
# to the SentencesDataset instance, so `embeddings_path` must be built first.
dataset = SentencesDataset(
    embeddings_path,
    sentences_content_words,
    mode="train",
    batches_size=100,
)

# Validation split, indexed with the word -> index mapping of the train split.
dataset_val = SentencesDataset(
    embeddings_path,
    sentences_content_words_val,
    mode="test",
    word2indx=dataset.vocab.word2indx,
    batches_size=100,
)

# Model configuration; the first argument is the size of the train vocabulary.
config = ModelConfig(
    len(dataset.vocab.word2indx),
    embedding_size=100,
    number_of_layers=10,
)

# Batch iterators over both splits (batch size 100).
data_loader = DataLoader(dataset, 100)
data_loader_val = DataLoader(dataset_val, 100)

# Use the first GPU when one is available, otherwise fall back to the CPU.
device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")

# Instantiate the model and move its parameters to the chosen device.
model = ContentModel(config)
model.to(device)

# Adam optimizer using the learning rate stored in the model configuration.
optimizer = optim.Adam(model.parameters(), lr=config.lr)

In [None]:
# Train for 20 epochs, recording the mean batch loss of the train and
# validation splits after every epoch in loss_list / loss_list_val.
print("Training started!")
loss_list = []
loss_list_val = []
for epoch in trange(20, desc="Epoch"):
    total_loss = 0
    total_loss_val = 0

    # ---- training pass ----
    model.train()
    for sentences_embeddings, sentences_bow in tqdm(data_loader, desc="Iteration"):
        # Tensor.to() returns a new tensor instead of moving the tensor in
        # place, so the result must be rebound; otherwise the batch stays on
        # its original device while the model lives on `device`.
        sentences_embeddings = sentences_embeddings.to(device)
        sentences_bow = sentences_bow.to(device)

        optimizer.zero_grad()
        # The model call returns the loss directly (backward() is called on it).
        loss = model(sentences_embeddings, sentences_bow)
        loss.backward()
        optimizer.step()
        total_loss += loss.item()

    # ---- validation pass (no gradient tracking, no weight updates) ----
    model.eval()
    with torch.no_grad():
        for sentences_embeddings, sentences_bow in tqdm(data_loader_val, desc="Iteration"):
            sentences_embeddings = sentences_embeddings.to(device)
            sentences_bow = sentences_bow.to(device)
            loss = model(sentences_embeddings, sentences_bow)
            total_loss_val += loss.item()

    # Mean loss per batch for this epoch.
    mean_loss = total_loss / len(data_loader)
    mean_loss_val = total_loss_val / len(data_loader_val)
    loss_list.append(mean_loss)
    loss_list_val.append(mean_loss_val)
    print(f"Epoch: {epoch}, Loss: {mean_loss}, Loss_val: {mean_loss_val}")


In [None]:
from matplotlib import pyplot as plt

# Training-loss curve: loss_list holds one mean batch loss per epoch.
# The validation curve is left disabled; uncomment the next line to draw it.
# plt.plot(loss_list_val)
plt.plot(loss_list)