In [1]:
from fastai.text import * 
from fastai.callbacks import *
from sklearn.model_selection import train_test_split
from pathlib import Path
import pickle as pkl

torch.cuda.set_device(1)

In [2]:
import unicodedata
import pandas as pd
import re
import spacy
import json

In [3]:
PATH = Path('.')

In [4]:
def normalize_title(title):
    s = re.sub(r'[^a-zA-Z0-9ñç% ]', ' ', unicodedata.normalize('NFKD', title.lower()).encode('ascii', 'ignore').decode("utf-8"))
#    s = re.sub(r'[\d]+', "1", s)
    s = re.sub(r's |s$', ' ', s)
    s = re.sub(r' +', ' ', s)
    s = re.sub(r'(1 )+', '1 ', s)
    return re.sub(r'o |o$', 'a ', s).strip()

In [7]:
with open("main_categories.json", "r") as f:
    cats = json.load(f)

In [12]:
vocab = Vocab.load('models/v4_vocab_es' + '.pkl')

In [18]:
df = pd.read_csv(PATH / 'train.csv')
df = df[df.language == 'spanish']
df.main_category = df.category.map(cats)
df.title = df.title.apply(normalize_title)
df = df[~df.title.isna() & (df.title != 'nan') & (df.title != '')]

In [30]:
df['main_category'] = df.category.map(cats)

In [33]:
for cat in set(cats.values()):
    d = df[df.main_category == cat]
    print(cat, len(d))
    if len(d):
        train, valid = train_test_split(d, test_size=0.1, stratify=d.category, random_state = 42)
        data_class = ItemLists('', TextList.from_df(train, vocab=data_lm.vocab, cols=['title'], processor=SPProcessor.load('',tmp_dir='lm_es_v4')),
                                TextList.from_df(valid, vocab=data_lm.vocab, cols=['title'], processor=SPProcessor.load('',tmp_dir='lm_es_v4')))
        data_class = data_class.label_from_df(cols=['category'])
        data_class = data_class.databunch(bs=800, num_workers=1)      
        data_class.save(f'/data/anime/ml/{cat}.pkl')

In [5]:
from sklearn.metrics import f1_score

@np_func
def f1(inp,targ): 
    y_pred = np.argmax(inp, axis=-1)
    return f1_score(targ, y_pred, average='weighted',  labels=np.unique(y_pred))

class WeightedLabelSmoothingCrossEntropy(nn.Module):
    def __init__(self, weight, eps:float=0.1, reduction='mean'):
        super().__init__()
        self.weight,self.eps,self.reduction = weight,eps,reduction
        
    def forward(self, output, target):
        c = output.size()[-1]
        log_preds = F.log_softmax(output, dim=-1)
        if self.reduction=='sum': loss = -log_preds.sum()
        else:
            loss = -log_preds.sum(dim=-1)
            if self.reduction=='mean':  loss = loss.mean()
        return loss*self.eps/c + (1-self.eps) * F.nll_loss(log_preds, target, weight=self.weight, reduction=self.reduction)

In [8]:
for cat in set(cats.values()):
    print(cat)
    if not (Path(f'/data/anime/ml/{cat}_model.pth')).exists() or True:
        data_class = load_data('/data/anime/ml/', cat + '.pkl', bs=500)
        num_trn = len(data_class.train_ds.x)
        num_val = len(data_class.valid_ds.x)
        num_trn, num_val, num_trn+num_val
        trn_LabelCounts = np.unique(data_class.train_ds.y.items, return_counts=True)[1]
        val_LabelCounts = np.unique(data_class.valid_ds.y.items, return_counts=True)[1]
        trn_LabelCounts, val_LabelCounts
        trn_weights = [1 - count/num_trn for count in trn_LabelCounts]
        val_weights = [1 - count/num_val for count in val_LabelCounts]
        trn_weights, val_weights
        
        config = awd_lstm_clas_config.copy()
        config['qrnn'] = True
        config['n_hid'] = 1550 #default 1152
        config['n_layers'] = 4 #default 3      
        
        learn_c = text_classifier_learner(data_class, AWD_LSTM, pretrained=False, config=config, drop_mult=0.3)
        learn_c.load_encoder('best_lm_es_v4_encoder');
        
        loss_weights = torch.FloatTensor(trn_weights).cuda()
        learn_c.loss_func = FlattenedLoss(WeightedLabelSmoothingCrossEntropy, weight=loss_weights)
        
        learn_c.freeze()
        learn_c.fit_one_cycle(2, 3e-2, moms=(0.8,0.7))
        
        learn_c.freeze_to(-2)
        learn_c.fit_one_cycle(2, 3e-2, wd=0.01, moms=(0.8,0.7))
        
        lr = 3e-2
        wd = 0.01
        learn_c.freeze_to(-3)
        learn_c.fit_one_cycle(2, slice(lr/(2.6**4),lr), wd=wd, moms=(0.8,0.7))
        
        learn_c.unfreeze()
        learn_c.fit_one_cycle(2, slice(lr/10/(2.6**4),lr/10), wd=wd, moms=(0.8,0.7))
        
        learn_c.save(f'/data/anime/ml/{cat}_model')
        del learn_c
        torch.cuda.empty_cache()
        gc.collect()

Acessorios para Veiculos


epoch,train_loss,valid_loss,accuracy,time
0,3.109625,1.973844,0.761047,03:50
1,3.014726,1.927265,0.782062,03:40


epoch,train_loss,valid_loss,accuracy,time
0,1.931784,1.487716,0.89172,04:19
1,1.74526,1.384331,0.900834,04:04


epoch,train_loss,valid_loss,accuracy,time
0,1.539113,1.309836,0.906958,04:55
1,1.47067,1.245372,0.915326,05:12


epoch,train_loss,valid_loss,accuracy,time
0,1.374025,1.226636,0.917202,08:49
1,1.343664,1.222797,0.918359,08:09


Casa, Moveis e Decoracao


epoch,train_loss,valid_loss,accuracy,time
0,2.924116,2.006023,0.752154,03:13
1,2.825181,1.963591,0.769447,03:01


epoch,train_loss,valid_loss,accuracy,time
0,2.109592,1.673738,0.826178,03:09
1,2.019976,1.594168,0.836846,03:15


epoch,train_loss,valid_loss,accuracy,time
0,1.800165,1.50706,0.84527,03:48
1,1.665766,1.449819,0.856899,04:03


epoch,train_loss,valid_loss,accuracy,time
0,1.648047,1.435166,0.859073,06:24
1,1.601861,1.429371,0.860179,07:05


Instrumentos Musicais


epoch,train_loss,valid_loss,accuracy,time
0,1.85992,1.314034,0.875186,00:38
1,1.771311,1.272795,0.890366,00:36


epoch,train_loss,valid_loss,accuracy,time
0,1.328103,1.037855,0.918841,00:39
1,1.202771,0.991644,0.92658,00:40


epoch,train_loss,valid_loss,accuracy,time
0,1.113141,0.939394,0.932368,00:52
1,1.0513,0.91228,0.937626,00:48


epoch,train_loss,valid_loss,accuracy,time
0,1.011343,0.904784,0.937692,01:25
1,0.993267,0.900517,0.939412,01:22


Mais Categorias


epoch,train_loss,valid_loss,accuracy,time
0,1.803051,1.304162,0.823849,00:13
1,1.700268,1.253738,0.838528,00:13


epoch,train_loss,valid_loss,accuracy,time
0,1.463226,1.136481,0.866378,00:13
1,1.316795,1.069333,0.879147,00:14


epoch,train_loss,valid_loss,accuracy,time
0,1.248753,1.035648,0.884376,00:17
1,1.168155,0.99074,0.895234,00:16


epoch,train_loss,valid_loss,accuracy,time
0,1.138371,0.981754,0.896742,00:25
1,1.112226,0.96976,0.897044,00:28


Ferramentas e Construcao


epoch,train_loss,valid_loss,accuracy,time
0,2.409366,1.661872,0.829722,01:26


KeyboardInterrupt: 

In [11]:
for cat in set(cats.values()):
    config = awd_lstm_clas_config.copy()
    config['qrnn'] = True
    config['n_hid'] = 1550 #default 1152
    config['n_layers'] = 4 #default 3 
    data_class = load_data('/data/anime/ml/', cat + '.pkl', bs=500)
    learn_c = text_classifier_learner(data_class, AWD_LSTM, pretrained=False, config=config, drop_mult=0.3)
    learn_c.load(f'/data/anime/ml/{cat}_model')
    learn_c.export(f'/data/anime/ml/es/{cat}_model_export')
    del learn_c
    torch.cuda.empty_cache()
    gc.collect()