In [1]:
import torch
import torch.nn as nn
import torchtext

import numpy as np
import pandas as pd 
from transformers import BertModel

import spacy

In [2]:
from torch.utils.data import Dataset, DataLoader

class CustomTextDataset(Dataset):
    def __init__(self, text_data, labels, transform=None):
        self.data = text_data
        self.labels = labels
        self.transform = transform

    def __len__(self):
        return len(self.data)

    def __getitem__(self, idx):
        text = self.data[idx]
        label = self.labels[idx]
        if self.transform:
            text = self.transform(text)
        return text, label


In [119]:
x_train_og = pd.read_csv('../data/processed/train_og.csv')
x_train_aug = pd.read_csv('../data/processed/train_augmented.csv')
x_train_fin = pd.read_csv('../data/processed/train_final.csv')
x_test = pd.read_csv('../data/processed/test.csv')

In [143]:
### Defining the feature processing

TEXT = torchtext.data.Field(
    tokenize='spacy', # default splits on whitespace
    tokenizer_language='pt_core_news_sm', 
    # stop_words='pt_core_news_sm',
    lower=True
)

### Defining the label processing
LABEL = torchtext.data.LabelField(dtype=torch.int8)

fields = [('titulo', TEXT), 
          ('genero', TEXT),  
          ('alt_title', TEXT), 
          ('label', LABEL), 
          ('sf_historia', LABEL),
          ('sf_administracao', LABEL), 
          ('sf_geografia', LABEL), 
          ('sf_biologia', LABEL), 
          ('sf_literatura', LABEL),
          ('sf_artes', LABEL), 
          ('sf_matematica', LABEL)]

x_train_og = torchtext.data.TabularDataset(
    path='../data/processed/train_og.csv', format='csv',
    skip_header=True, fields=fields)

x_train_aug = torchtext.data.TabularDataset(
    path='../data/processed/train_augmented.csv', format='csv',
    skip_header=True, fields=fields)

x_train_fin = torchtext.data.TabularDataset(
    path='../data/processed/train_final.csv', format='csv',
    skip_header=True, fields=fields)

x_test = torchtext.data.TabularDataset(
    path='../data/processed/test.csv', format='csv',
    skip_header=True, fields=fields)

vars(x_test.examples[0])

{'titulo': ['introdução', 'à', 'teoria', 'da', 'literatura'],
 'genero': ['literatura'],
 'alt_title': ['introducao', 'teoria', 'literatura'],
 'label': '6'}

In [144]:
del x_train_og.fields['titulo']
del x_train_og.fields['genero']

del x_train_aug.fields['titulo']
del x_train_aug.fields['genero']

del x_train_fin.fields['titulo']
del x_train_fin.fields['genero']

del x_test.fields['titulo']
del x_test.fields['genero']

In [145]:
TEXT.build_vocab(x_train_og)

print(TEXT.vocab.freqs.most_common(10))

[('historia', 128), ('brasil', 85), ('matematica', 64), ('geografia', 60), ('biologia', 49), ('geral', 44), ('administracao', 44), ('arte', 26), ('mundo', 23), ('espaco', 20)]


In [146]:
BATCH_SIZE = 32

In [147]:
train_loader ,test_loader = \
torchtext.data.BucketIterator.splits(
        (x_train_og, x_test),
         batch_size=BATCH_SIZE,
         sort_within_batch=False,
         sort_key=lambda x: len(x.alt_title),
         ) 
         

In [148]:
train_loader

<torchtext.data.iterator.BucketIterator at 0x1cf99fa9050>

In [149]:
vars(train_loader.dataset.examples[0])

{'titulo': ['biologia', 'dos', 'organismos'],
 'genero': ['biologia'],
 'alt_title': ['biologia', 'organismos'],
 'label': '3'}

In [152]:
print('Train')
for batch in train_loader:
    print(f'Text matrix size: {batch.alt_title.size()}')
    print(f'Target vector size: {batch.lable.size()}')
    break
    
print('\nTest:')
for batch in test_loader:
    print(f'Text matrix size: {batch.alt_title.size()}')
    print(f'Target vector size: {batch.lable.size()}')
    break

Train


AttributeError: 'LabelField' object has no attribute 'vocab'

In [4]:

class RNN(nn.Module):
    def __init__(self, input_size, hidden_size, output_size):
        super(RNN, self).__init__()
        self.hidden_size = hidden_size
        self.i2h = nn.Linear(input_size + hidden_size, hidden_size)
        self.i2o = nn.Linear(input_size + hidden_size, output_size)
        self.softmax = nn.LogSoftmax(dim=1)

    def forward(self, input_tensor, hidden_tensor):
        combined = torch.cat((input_tensor, hidden_tensor), 1)
        hidden = self.i2h(combined)
        output = self.i2o(combined)
        output = self.softmax(output)
        return output, hidden

    def init_hidden(self):
        return torch.zeros(1, self.hidden_size)

class LSTM(nn.Module):
    def __init__(self, input_size, hidden_size, num_layers, output_size):
        super(LSTM, self).__init__()
        self.hidden_size = hidden_size
        self.num_layers = num_layers
        self.lstm = nn.LSTM(input_size, hidden_size, num_layers)
        self.fc = nn.Linear(hidden_size, output_size)

    def forward(self, x):
        h0 = torch.zeros(self.num_layers, x.size(0), self.hidden_size).requires_grad_()
        c0 = torch.zeros(self.num_layers, x.size(0), self.hidden_size).requires_grad_()
        out, (hn, cn) = self.lstm(x, (h0.detach(), c0.detach()))
        out = self.fc(out[:, -1, :])
        return out

class GRU(nn.Module):
    def __init__(self, input_size, hidden_size, num_layers, output_size):
        super(GRU, self).__init__()
        self.hidden_size = hidden_size
        self.num_layers = num_layers
        self.gru = nn.GRU(input_size, hidden_size, num_layers)
        self.fc = nn.Linear(hidden_size, output_size)

    def forward(self, x):
        h0 = torch.zeros(self.num_layers, x.size(0), self.hidden_size).requires_grad_()
        out, hn = self.gru(x, h0.detach())
        out = self.fc(out[:, -1, :])
        return out


class TransformerModel(nn.Module):
    def __init__(self, ntoken, ninp, nhead, nhid, nlayers, dropout=0.5):
        super(TransformerModel, self).__init__()
        from torch.nn import TransformerEncoder, TransformerEncoderLayer
        self.model_type = 'Transformer'
        self.src_mask = None
        self.pos_encoder = nn.Sequential(
            nn.Linear(ninp, nhid),
            nn.ReLU(),
            nn.Linear(nhid, nhead))
        encoder_layers = TransformerEncoderLayer(nhead, nhid, dropout)
        self.transformer_encoder = TransformerEncoder(encoder_layers, nlayers)
        self.encoder = nn.Embedding(ntoken, ninp)
        self.ninp = ninp
        self.decoder = nn.Linear(ninp, ntoken)

        self.init_weights()

    def _generate_square_subsequent_mask(self, sz):
        mask = (torch.triu(torch.ones(sz, sz)) == 1).transpose(0, 1)
        mask = mask.float().masked_fill(mask == 0, float('-inf')).masked_fill(mask == 1, float(0.0))
        return mask

    def init_weights(self):
        initrange = 0.1
        self.encoder.weight.data.uniform_(-initrange, initrange)
        self.decoder.bias.data.zero_()
        self.decoder.weight.data.uniform_(-initrange, initrange)

    def forward(self, src):
        if self.src_mask is None or self.src_mask.size(0) != len(src):
            device = src.device
            mask = self._generate_square_subsequent_mask(len(src)).to(device)
            self.src_mask = mask

        src = self.encoder(src) * math.sqrt(self.ninp)
        src = self.pos_encoder(src)
        output = self.transformer_encoder(src, self.src_mask)
        output = self.decoder(output)
        return output

from transformers import BertModel

class BertClassifier(nn.Module):
    def __init__(self, num_classes):
        super(BertClassifier, self).__init__()
        self.bert = BertModel.from_pretrained('bert-base-uncased')
        self.dropout = nn.Dropout(0.1)
        self.fc = nn.Linear(self.bert.config.hidden_size, num_classes)

    def forward(self, input_ids, attention_mask):
        outputs = self.bert(input_ids=input_ids, attention_mask=attention_mask)
        pooled_output = outputs[1]
        pooled_output = self.dropout(pooled_output)
        logits = self.fc(pooled_output)
        return logits

