In [5]:
import random
from functools import partial

import pandas as pd
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
from nltk.tokenize.casual import casual_tokenize
from torchtext import data
from torchtext import vocab

import warnings
# Silence every warning for the rest of the session (the filter is global,
# not limited to a single library).
warnings.filterwarnings('ignore')

# Seed value handed to the random number generators further down.
SEED = 777

In [6]:
# Tokenizer shared by the text field and by `predict`: NLTK's casual_tokenize
# called with preserve_case=False.
tokenizer = partial(casual_tokenize, preserve_case=False)

In [7]:
# Seed every RNG the pipeline relies on so that repeated runs are comparable.
# Legacy torchtext iterators shuffle using the state of Python's `random`
# module, so seeding torch alone does not fix the batch order.
random.seed(SEED)
torch.manual_seed(SEED)

# Ask cuDNN for deterministic kernels and disable its autotuner, which may
# otherwise select different algorithms from one run to the next.
torch.backends.cudnn.deterministic = True
torch.backends.cudnn.benchmark = False

In [8]:
# Text column: tokenised with `tokenizer`; tensors come out batch-first.
TEXT = data.Field(batch_first=True, tokenize=tokenizer)

# Label column, numericalised as int64 class indices.
LABEL = data.LabelField(batch_first=True, dtype=torch.long)

# (column name, field) pairs handed to the dataset loaders; expected to follow
# the column order of the CSV files.
fields = [
    ('text', TEXT),
    ('i_label', LABEL),
]

In [9]:
# Read the training and validation splits; the header row of each CSV is skipped.
train_data, valid_data = data.TabularDataset.splits(
    path='',
    train='train.csv',
    validation='valid.csv',
    format='csv',
    fields=fields,
    skip_header=True,
)

In [14]:
# Open question: also use min_freq? tune max_size?

# Pretrained Spanish word vectors, see
# https://github.com/dccuchile/spanish-word-embeddings
vec = vocab.Vectors('glove-sbwc.i25.vec')

# Vocabulary is built from the training split only; tokens without a
# pretrained vector are initialised with `unk_init`.
TEXT.build_vocab(
    train_data,
    max_size=15000,
    vectors=vec,
    unk_init=torch.Tensor.normal_,
)
LABEL.build_vocab(train_data)

In [15]:
# Sanity check of the vocabulary: its size and the ten most frequent tokens
# (with their counts).
print(len(TEXT.vocab))
print(TEXT.vocab.freqs.most_common(10))

15002
[('de', 56807), (',', 54810), ('.', 36094), ('la', 33320), ('el', 29547), ('que', 28338), ('en', 27223), ('y', 19726), ('a', 19136), ('los', 14248)]


In [16]:
BATCH_SIZE = 32

# Prefer the GPU when one is visible to PyTorch.
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
print(device)

# Batches are created directly on `device`; validation uses twice the
# training batch size.
train_iterator = data.BucketIterator(train_data, batch_size=BATCH_SIZE, device=device)
valid_iterator = data.BucketIterator(valid_data, batch_size=BATCH_SIZE*2, device=device)

cuda


In [17]:
class CNN(nn.Module):
    """Convolutional text classifier on top of pretrained word embeddings.

    One `Conv2d` per kernel height slides over the embedded token sequence;
    each feature map is max-pooled over time, the pooled features are
    concatenated, passed through dropout and a linear layer, and returned as
    log-probabilities (pair with `nn.NLLLoss`).

    Args:
        n_filters: number of feature maps produced by each convolution.
        output_dim: number of output classes.
        emb_vec: 2-D tensor `(vocab_size, emb_dim)` with the pretrained
            embedding matrix.
        filter_sizes: iterable of kernel heights (number of consecutive tokens
            each convolution looks at).
        freeze: when True the embedding matrix is not updated during training.
        dropout: dropout probability applied to the pooled features.
    """

    def __init__(self, n_filters, output_dim, emb_vec,
                 filter_sizes=(3, 4, 5, 6, 7), freeze=True, dropout=0.3):
        super().__init__()

        # Immutable default + local copy: the signature no longer carries a
        # shared mutable list, and any iterable of sizes is accepted.
        filter_sizes = tuple(filter_sizes)
        emb_dim = emb_vec.size(1)

        # Shortest sequence every convolution can process.
        self.min_seq_len = max(filter_sizes, default=0)

        self.embedding = nn.Embedding.from_pretrained(emb_vec, freeze=freeze)
        self.convs = nn.ModuleList([
            nn.Conv2d(in_channels=1,
                      out_channels=n_filters,
                      kernel_size=(fs, emb_dim))
            for fs in filter_sizes
        ])

        self.fc = nn.Linear(len(filter_sizes) * n_filters, output_dim)

        self.dropout = nn.Dropout(dropout)
        self.act = nn.LogSoftmax(dim=1)

    def forward(self, text):
        """Return log-probabilities of shape `(batch, output_dim)`.

        Args:
            text: integer tensor `(batch, seq_len)` of token indices.
        """
        embedded = self.embedding(text)            # (batch, seq_len, emb_dim)

        # A kernel taller than the sequence makes Conv2d raise, so sequences
        # shorter than the largest kernel are right-padded with zero vectors.
        missing = self.min_seq_len - embedded.size(1)
        if missing > 0:
            embedded = F.pad(embedded, (0, 0, 0, missing))

        embedded = embedded.unsqueeze(1)           # (batch, 1, seq_len, emb_dim)
        conved = [F.relu(conv(embedded)).squeeze(3) for conv in self.convs]
        pooled = [F.max_pool1d(fmap, fmap.shape[2]).squeeze(2) for fmap in conved]
        cat = self.dropout(torch.cat(pooled, dim=1))

        # No squeeze on the logits: it was a no-op for output_dim > 1 and made
        # LogSoftmax(dim=1) fail for output_dim == 1.
        return self.act(self.fc(cat))


In [18]:
N_FILTERS = 100   # feature maps per kernel size
OUTPUT_DIM = 7    # size of the classifier's output layer

# Embedding matrix aligned with the text vocabulary.
pretrained_embeddings = TEXT.vocab.vectors
model = CNN(n_filters=N_FILTERS,
            output_dim=OUTPUT_DIM,
            emb_vec=pretrained_embeddings)

In [19]:
# Put the network and the loss on the selected device, then build the
# optimizer over the network's parameters. NLLLoss matches the LogSoftmax
# output of the model.
model = model.to(device)
criterion = nn.NLLLoss().to(device)
optimizer = optim.Adam(model.parameters())

In [20]:
# Show the module tree of the instantiated network.
print(model)

def count_parameters(model):
    """Return the total number of elements in the parameters of `model`
    that have `requires_grad` set."""
    total = 0
    for param in model.parameters():
        if param.requires_grad:
            total += param.numel()
    return total
    
# Report the number of parameters with requires_grad=True (thousands separated by commas).
print(f'The model has {count_parameters(model):,} trainable parameters')

CNN(
  (embedding): Embedding(15002, 300)
  (convs): ModuleList(
    (0): Conv2d(1, 100, kernel_size=(3, 300), stride=(1, 1))
    (1): Conv2d(1, 100, kernel_size=(4, 300), stride=(1, 1))
    (2): Conv2d(1, 100, kernel_size=(5, 300), stride=(1, 1))
    (3): Conv2d(1, 100, kernel_size=(6, 300), stride=(1, 1))
    (4): Conv2d(1, 100, kernel_size=(7, 300), stride=(1, 1))
  )
  (fc): Linear(in_features=500, out_features=7, bias=True)
  (dropout): Dropout(p=0.3, inplace=False)
  (act): LogSoftmax(dim=1)
)
The model has 754,007 trainable parameters


In [21]:
from ignite.engine import Events, create_supervised_trainer, create_supervised_evaluator
from ignite.metrics import Loss, Accuracy
from ignite.handlers import ModelCheckpoint
from ignite.handlers import Timer

max_epochs = 10
t = Timer(average=True)

# Engine that runs the optimisation steps.
trainer = create_supervised_trainer(model, optimizer, criterion)

# Engine that computes loss and accuracy on held-out data.
metrics = {'Loss': Loss(criterion), 'Acc': Accuracy()}
evaluator = create_supervised_evaluator(model, metrics=metrics)


def log_results(engine):
    """Evaluate on the validation set and print loss, accuracy and the
    timer's averaged value for the epoch that just finished."""
    evaluator.run(valid_iterator)
    val_metrics = evaluator.state.metrics
    loss, acc = val_metrics['Loss'], val_metrics['Acc']
    t.step()
    print(f"Epoca:{engine.state.epoch}"
          +f" \t Loss: {loss:.2f} \t  Accuracy: {acc:.2f}\ttiempo {t.value():.2f}")


# Validate after every epoch.
trainer.add_event_handler(Events.EPOCH_COMPLETED(every=1), log_results)

# Keep only the checkpoint with the best score, i.e. the lowest validation
# loss (the score is the negated loss).
best_model_handler = ModelCheckpoint(
    dirname='.',
    require_empty=False,
    filename_prefix="best",
    n_saved=1,
    score_function=lambda engine: -engine.state.metrics['Loss'],
    score_name="val_loss",
)

# The checkpoint handler fires every time a validation run completes.
evaluator.add_event_handler(Events.COMPLETED, best_model_handler, {'mymodel': model})

trainer.run(train_iterator, max_epochs=max_epochs)

Epoca:1 	 Loss: 0.84 	  Accuracy: 0.71	tiempo 30.73
Epoca:2 	 Loss: 0.73 	  Accuracy: 0.75	tiempo 30.05
Epoca:3 	 Loss: 0.69 	  Accuracy: 0.76	tiempo 30.12
Epoca:4 	 Loss: 0.66 	  Accuracy: 0.77	tiempo 29.94
Epoca:5 	 Loss: 0.65 	  Accuracy: 0.78	tiempo 30.01
Epoca:6 	 Loss: 0.65 	  Accuracy: 0.78	tiempo 29.98
Epoca:7 	 Loss: 0.67 	  Accuracy: 0.78	tiempo 29.91
Epoca:8 	 Loss: 0.67 	  Accuracy: 0.79	tiempo 29.92
Epoca:9 	 Loss: 0.72 	  Accuracy: 0.77	tiempo 29.94
Epoca:10 	 Loss: 0.69 	  Accuracy: 0.79	tiempo 29.98


State:
	iteration: 1460
	epoch: 10
	epoch_length: 146
	max_epochs: 10
	output: 0.03309398517012596
	batch: <class 'torchtext.data.batch.Batch'>
	metrics: <class 'dict'>
	dataloader: <class 'torchtext.data.iterator.BucketIterator'>
	seed: <class 'NoneType'>
	times: <class 'dict'>

In [40]:
# Held-out test split, parsed with the same fields as the training data.
test_data = data.TabularDataset(
    path='test.csv',
    format='csv',
    fields=fields,
    skip_header=True,
)

In [85]:
# Batches of the test split, created on `device`, at twice the training batch size.
test_iterator = data.BucketIterator(test_data, batch_size=BATCH_SIZE*2, device=device)

In [90]:
# Restore the best weights saved during training. The checkpoint filename
# embeds the validation loss of that particular run, so it is taken from the
# checkpoint handler instead of being hard-coded.
best_checkpoint = best_model_handler.last_checkpoint
if best_checkpoint is None:
    raise RuntimeError("No checkpoint has been saved yet; run the training cell first.")

# map_location lets a checkpoint written on GPU be loaded on a CPU-only machine.
model.load_state_dict(torch.load(best_checkpoint, map_location=device))

# Fresh evaluator without the checkpoint handler, so scoring the test set
# never writes a new "best" file.
evaluator = create_supervised_evaluator(model, metrics=metrics)
evaluator.run(test_iterator)
print(evaluator.state.metrics['Acc'])

0.7882117882117882


In [109]:
# Human-readable class names, indexed by the integer returned by `predict`.
lbls = [
    'cultura',
    'deportes',
    'economia',
    'mundo',
    'pais',
    'tecnologias',
    'tendencias',
]


def predict(model, sentence, min_len=10):
    """Classify a single raw text and return the index of the best class.

    Args:
        model: network mapping a `(batch, seq_len)` tensor of token indices
            to per-class scores.
        sentence: raw text to classify.
        min_len: texts with fewer tokens are right-padded up to this length so
            that every convolution kernel fits.

    Returns:
        int: position of the highest score in the model output.
        NOTE(review): this is an index in `LABEL.vocab` order; callers assume
        it coincides with the position in `lbls` / the raw label value —
        confirm against `LABEL.vocab.itos`.
    """
    tokens = list(tokenizer(sentence))

    # Pad with the field's own padding token, the same one used when batches
    # are built for training. The previous " " filler is not in the
    # vocabulary and was therefore looked up as the unknown-word index.
    tokens += [TEXT.pad_token] * (min_len - len(tokens))

    indexed = [TEXT.vocab.stoi[tok] for tok in tokens]
    batch = torch.LongTensor(indexed).unsqueeze(0).to(device)   # (1, seq_len)

    # Inference must not depend on whatever mode the model was left in:
    # switch dropout off, skip gradient tracking, then restore the mode.
    was_training = model.training
    model.eval()
    try:
        with torch.no_grad():
            prediction = model(batch)
    finally:
        model.train(was_training)

    return int(torch.argmax(prediction[0]))

In [110]:
# Sample news text (a piece about the Dakar Rally) used as a quick manual
# check of `predict`; the predicted index is mapped to its name via `lbls`.
xd2 = """
Los pilotos chilenos Francisco “Chaleco” López e Ignacio “Perro” Casale aseguraron este
martes, mediante una video conferencia, estar listos y en cuarentena preventiva para
viajar a competir en el Rally Dakar 2021, que en su cuadragésima tercera edición 
y por segundo año consecutivo, se vivirá en las exigentes tierras y desiertos de 
Arabia Saudita.
Ambos pilotos admitieron que les acomoda y agrada la sede de la exigente competición, 
recordando el haber competido en enero del presente año en dichas tierras y caminos. 
De hecho, Casale terminó en el primer lugar en la Categoría Quads y López concluyó 
tercero en la de Side by Side, lo cual les brinda cierta ventaja y conocimiento de los 
duros caminos y dunas a los que se deberán enfrentar.
"""
lbls[predict(model, xd2)]

'deportes'

In [111]:
import pandas as pd
from sklearn.metrics import classification_report

# Raw test split as a DataFrame, so the original texts can be fed to `predict`.
test_df = pd.read_csv('test.csv')

# One predicted class index per article, in file order.
predictions = []
for noticia in test_df['text']:
    predictions.append(predict(model, noticia))

In [112]:
# Per-class precision / recall / F1 on the test split, with rows named after `lbls`.
# NOTE(review): assumes the predicted indices use the same numbering as the
# `i_label` column — confirm.
print(classification_report(test_df['i_label'], predictions, target_names=lbls))

              precision    recall  f1-score   support

     cultura       0.82      0.92      0.87       143
    deportes       0.91      0.88      0.90       143
    economia       0.71      0.77      0.74       143
       mundo       0.82      0.77      0.79       143
        pais       0.74      0.76      0.75       143
 tecnologias       0.80      0.69      0.74       143
  tendencias       0.68      0.69      0.68       143

    accuracy                           0.78      1001
   macro avg       0.78      0.78      0.78      1001
weighted avg       0.78      0.78      0.78      1001

