In [39]:
%reload_ext autoreload
%autoreload 2
%matplotlib inline

In [40]:
import torchtext
from torchtext import vocab, data
from torchtext.datasets import language_modeling

from fastai.rnn_reg import *
from fastai.rnn_train import *
from fastai.nlp import *
from fastai.lm_rnn import *
from fastai.structured import *
from fastai import sgdr

import spacy
spacy_en = spacy.load('en')

import dill as pickle

# pandas and plotting config
pd.set_option('display.max_rows', 1000)
pd.set_option('display.max_columns', 1000)
pd.set_option('display.max_colwidth', -1)

In [41]:
PATH = 'data/toxic'

In [42]:
raw_train_df = pd.read_csv(f'{PATH}/train.csv')
test_df = pd.read_csv(f'{PATH}/test.csv')
sample_subm_df = pd.read_csv(f'{PATH}/sample_submission.csv')

In [43]:
print(f'Train size: {len(raw_train_df)} | Test size: {len(test_df)}')
display(raw_train_df.head(3))
display(test_df.head(3))
display(sample_subm_df.head(3))

Train size: 159571 | Test size: 153164


Unnamed: 0,id,comment_text,toxic,severe_toxic,obscene,threat,insult,identity_hate
0,0000997932d777bf,"Explanation\nWhy the edits made under my username Hardcore Metallica Fan were reverted? They weren't vandalisms, just closure on some GAs after I voted at New York Dolls FAC. And please don't remove the template from the talk page since I'm retired now.89.205.38.27",0,0,0,0,0,0
1,000103f0d9cfb60f,"D'aww! He matches this background colour I'm seemingly stuck with. Thanks. (talk) 21:51, January 11, 2016 (UTC)",0,0,0,0,0,0
2,000113f07ec002fd,"Hey man, I'm really not trying to edit war. It's just that this guy is constantly removing relevant information and talking to me through edits instead of my talk page. He seems to care more about the formatting than the actual info.",0,0,0,0,0,0


Unnamed: 0,id,comment_text
0,00001cee341fdb12,"Yo bitch Ja Rule is more succesful then you'll ever be whats up with you and hating you sad mofuckas...i should bitch slap ur pethedic white faces and get you to kiss my ass you guys sicken me. Ja rule is about pride in da music man. dont diss that shit on him. and nothin is wrong bein like tupac he was a brother too...fuckin white boys get things right next time.,"
1,0000247867823ef7,"== From RfC == \n\n The title is fine as it is, IMO."
2,00013b17ad220c46,""" \n\n == Sources == \n\n * Zawe Ashton on Lapland — / """


Unnamed: 0,id,toxic,severe_toxic,obscene,threat,insult,identity_hate
0,00001cee341fdb12,0.5,0.5,0.5,0.5,0.5,0.5
1,0000247867823ef7,0.5,0.5,0.5,0.5,0.5,0.5
2,00013b17ad220c46,0.5,0.5,0.5,0.5,0.5,0.5


In [44]:
label_cols = ['toxic', 'severe_toxic', 'obscene', 'threat', 'insult', 'identity_hate']
#raw_train_df['none'] = 1 - raw_train_df[label_cols].max(axis=1)

In [45]:
class TextMultiLabelDataset(torchtext.data.Dataset):
    def __init__(self, df, tt_text_field, tt_label_field, txt_col, lbl_cols, **kwargs):
        # torchtext Field objects
        fields = [('text', tt_text_field)]
        for l in lbl_cols: fields.append((l, tt_label_field))
            
        is_test = False if lbl_cols[0] in df.columns else True
        n_labels = len(lbl_cols)
        
        examples = []
        for idx, row in df.iterrows():
            if not is_test:
                lbls = list(map(float,[ row[l] for l in lbl_cols ]))
            else:
                lbls = [0.0] * n_labels
                
            txt = str(row[txt_col])
            examples.append(data.Example.fromlist([txt]+lbls, fields))
                            
        super().__init__(examples, fields, **kwargs)

    @staticmethod
    def sort_key(example): 
        return len(example.text)
    
    @classmethod
    def splits(cls, text_field, label_field, train_df, txt_col, lbl_cols, val_df=None, test_df=None, **kwargs):
        # build train, val, and test data
        train_data, val_data, test_data = (None, None, None)
        
        if train_df is not None: 
            train_data = cls(train_df.copy(), text_field, label_field, txt_col, lbl_cols, **kwargs)
        if val_df is not None: 
            val_data = cls(val_df.copy(), text_field, label_field, txt_col, lbl_cols, **kwargs)
        if test_df is not None: 
            test_data = cls(test_df.copy(), text_field, label_field, txt_col, lbl_cols, **kwargs)

        return tuple(d for d in (train_data, val_data, test_data) if d is not None)

In [46]:
class TextMultiLabelDataLoader():
    def __init__(self, src, x_fld, y_flds, y_dtype='torch.cuda.FloatTensor'):
        self.src, self.x_fld, self.y_flds = src, x_fld, y_flds
        self.y_dtype = y_dtype

    def __len__(self): return len(self.src)#-1

    def __iter__(self):
        it = iter(self.src)
        for i in range(len(self)):
            b = next(it)
            
            if (len(self.y_flds) > 1):
                targ = [ getattr(b, y) for y in self.y_flds ] 
                targ = torch.stack(targ, dim=1).type(self.y_dtype)
            else: 
                targ = getattr(b, self.y_flds[0])
                targ = targ.type(self.y_dtype)

            yield getattr(b, self.x_fld), targ

class TextMultiLabelData(ModelData):

    @classmethod
    def from_splits(cls, path, splits, bs, text_name='text', label_names=['label'], 
                    target_dtype='torch.cuda.FloatTensor'):
        
        text_fld = splits[0].fields[text_name]
        
        label_flds = []
        if (len(label_names) == 1): 
            label_fld = splits[0].fields[label_names[0]]
            label_flds.append(label_fld)
            if (label_fld.use_vocab): 
                label_fld.build_vocab(splits[0])
                target_dtype = 'torch.cuda.FloatTensor'
        else:
            for n in label_names:
                label_fld = splits[0].fields[n]
                label_flds.append(label_fld)

        iters = torchtext.data.BucketIterator.splits(splits, batch_size=bs)
        trn_iter,val_iter,test_iter = iters[0],iters[1],None
        test_dl = None
        if len(iters) == 3:
            test_iter = iters[2]
            test_dl = TextMultiLabelDataLoader(test_iter, text_name, label_names, target_dtype)
        trn_dl = TextMultiLabelDataLoader(trn_iter, text_name, label_names, target_dtype)
        val_dl = TextMultiLabelDataLoader(val_iter, text_name, label_names, target_dtype)

        obj = cls.from_dls(path, trn_dl, val_dl, test_dl)
        obj.bs = bs
        obj.pad_idx = text_fld.vocab.stoi[text_fld.pad_token]
        obj.nt = len(text_fld.vocab)

        # if multiple labels, assume the # of classes = the # of labels 
        if (len(label_names) > 1):
            c = len(label_names)
        # if label has a vocab, assume the vocab represents the # of classes
        elif (hasattr(label_flds[0], 'vocab')): 
            c = len(label_flds[0].vocab)
        else:
            c = 1
            
        obj.c = c

        return obj

In [47]:
import re, string
re_tok = re.compile(f'([{string.punctuation}“”¨«»®´·º½¾¿¡§£₤‘’])')
def tokenizer(s): return re_tok.sub(r' \1 ', s).split()

In [90]:
bsz = 32 #16 #32 #64

max_tokens = 20000 # max number of words based on frequency
max_len = None #100      # max length of each comment to look at

n_hidden = 256
n_fac = 128

In [91]:
class LstmClassifier(nn.Module):
    def __init__(self, vocab_size, n_fac, bsz, 
                 fc_szs=[], fc_drops=[], n_lstm_hidden=256, n_lstm_layers=1, out_sz=1,
                 lstm_drop=0.5, is_multi=False, y_range=None, use_bn=False):
        
        super().__init__()
        
        self.vocab_size, self.out_sz = vocab_size, out_sz
        self.n_lstm_layers, self.n_lstm_hidden = n_lstm_layers, n_lstm_hidden
        
        self.is_multi = is_multi
        self.y_range = y_range
        self.use_bn = use_bn
        
        self.e = nn.Embedding(vocab_size, n_fac)
        self.e.weight.data.uniform_(-0.1, 0.1)
        
        self.rnn = nn.LSTM(n_fac, n_lstm_hidden, n_lstm_layers, dropout=lstm_drop)
        
        fc_szs = [n_lstm_hidden] + fc_szs
        
        self.linears, self.linear_drops, self.linear_bns = [], [], []
        if (len(fc_szs) > 1):
            self.linears = nn.ModuleList(
                [ nn.Linear(fc_szs[idx], sz) for idx, sz in enumerate(fc_szs[1:]) ]
            )
            
            self.linear_bns = nn.ModuleList(
                [ nn.BatchNorm1d(sz) for sz in fc_szs[1:] ]
            )
            
            for l in self.linears: kaiming_normal(l.weight.data)
                
        if (len(fc_drops) > 0):
            self.linear_drops = nn.ModuleList([ nn.Dropout(d) for d in fc_drops ])
                
        self.outp = nn.Linear(fc_szs[-1], out_sz)
        kaiming_normal(self.outp.weight.data)
            
        self.h = self.init_hidden(bsz)
        
    def forward(self, words):
        bsz = words[0].size(0)
        if (self.h[0].size(1) != bsz): self.h = self.init_hidden(bsz)
            
        x, h = self.rnn(self.e(words), self.h)
        self.h = repackage_var(h)
        
        x = x[-1]
        
        for l, d, b in zip(self.linears, self.linear_drops, self.linear_bns):
            x = F.relu(l(x))
            if (self.use_bn): x = b(x)
            x = d(x)
        
        x = self.outp(x)

        if (self.is_multi):
            return F.sigmoid(x)
        
        if (not self.is_multi and self.out_sz > 1):
            return F.log_softmax(x)
        
        if (self.y_range):
            x = F.sigmoid(x)
            x = x * (self.y_range[1] - self.y_range[0])
            x = x + self.y_range[0]
            
        return x
    
    def init_hidden(self, bsz):
        return(V(torch.zeros(self.n_lstm_layers, bsz, self.n_lstm_hidden)), 
               V(torch.zeros(self.n_lstm_layers, bsz, self.n_lstm_hidden)))

In [93]:


txt_col = 'comment_text'
val_idxs = get_cv_idxs(len(raw_train_df), val_pct=0.2)

train_df =  raw_train_df.drop(val_idxs)
val_df = raw_train_df.copy().iloc[val_idxs]

len(train_df), len(val_df), len(test_df)



(127657, 31914, 153164)

In [94]:
train_df[label_cols] = train_df[label_cols].astype(np.float32)
val_df[label_cols] = val_df[label_cols].astype(np.float32)

train_df.dtypes

id               object 
comment_text     object 
toxic            float32
severe_toxic     float32
obscene          float32
threat           float32
insult           float32
identity_hate    float32
dtype: object

In [95]:
val_df.dtypes

id               object 
comment_text     object 
toxic            float32
severe_toxic     float32
obscene          float32
threat           float32
insult           float32
identity_hate    float32
dtype: object

In [96]:
tt_TEXT = data.Field(sequential=True, tokenize=tokenizer, fix_length=max_len)
tt_LABEL = data.Field(sequential=False, use_vocab=False,tensor_type=torch.cuda.FloatTensor)

In [97]:

splits = TextMultiLabelDataset.splits(tt_TEXT, tt_LABEL, train_df, 
                                      'comment_text', label_cols, val_df, test_df)

In [98]:
t = splits[0].examples[0]
t.toxic, t.insult, ' '.join(t.text)

(0.0,
 0.0,
 "D ' aww ! He matches this background colour I ' m seemingly stuck with . Thanks . ( talk ) 21 : 51 , January 11 , 2016 ( UTC )")

In [99]:
tt_TEXT.build_vocab(splits[0])

In [100]:
md = TextMultiLabelData.from_splits(PATH, splits, bsz, text_name='text', label_names=label_cols)
# md = TextMultiLabelData.from_splits(PATH, splits, bsz, text_name='text', label_names=label_cols)

len(md.trn_dl), md.nt, md.c

(3990, 204998, 6)

In [101]:
print(tt_TEXT.vocab.stoi[tt_TEXT.pad_token])
print(tt_TEXT.vocab.itos[:10])

1
['<unk>', '<pad>', '.', ',', 'the', '"', 'to', 'I', 'of', "'"]


In [102]:
print('most common words: ')
tt_TEXT.vocab.freqs.most_common(10)

most common words: 


[('.', 545820),
 (',', 380544),
 ('the', 360798),
 ('"', 315858),
 ('to', 234613),
 ('I', 180612),
 ('of', 177584),
 ("'", 176148),
 ('and', 170990),
 ('a', 163203)]

In [103]:
print('least common words: ')
tt_TEXT.vocab.freqs.most_common()[-10:]

least common words: 


[('Timoshenko', 1),
 ('amblocked', 1),
 ('2K15', 1),
 ('gratest', 1),
 ('Gag01001', 1),
 ('Aberration', 1),
 ('automakers', 1),
 ('Boastful', 1),
 ('Superlatives', 1),
 ('Classifying', 1)]

In [104]:
m = LstmClassifier(md.nt, n_fac, bsz, [512, 256], [0.1, 0.1],
                   n_hidden, n_lstm_layers=1, out_sz=md.c, is_multi=True, use_bn=False).cuda()

lo = LayerOptimizer(optim.Adam, m, 1e-3, 1e-4)

In [112]:
fit(m, md, 1, lo.opt, F.binary_cross_entropy_with_logits) 

epoch:   0, train_loss: 0.693232, val_loss: 0.691652           



In [109]:
predstest = predict(m, md.test_dl)


In [110]:
predstest[:6,:]

array([[0.21456, 0.05024, 0.16414, 0.04271, 0.1032 , 0.04697],
       [0.16591, 0.04566, 0.12862, 0.03937, 0.08501, 0.04313],
       [0.1685 , 0.04593, 0.13051, 0.03957, 0.086  , 0.04336],
       [0.1685 , 0.04593, 0.13051, 0.03957, 0.086  , 0.04336],
       [0.16535, 0.04561, 0.12821, 0.03934, 0.0848 , 0.04309],
       [0.1685 , 0.04593, 0.13051, 0.03957, 0.086  , 0.04336]], dtype=float32)

In [87]:
subm_df = pd.DataFrame(data=predstest, columns=label_cols)
subm_df.insert(0, 'id', test_df.id)

In [113]:
on_end = lambda sched, cycle: save_model(m, f'{PATH}/models/multi-label-cyc_{cycle}-of-4')
cb = [CosAnneal(lo, len(md.trn_dl), cycle_mult=2, on_cycle_end=on_end)]

fit(m, md, 2**4-1, lo.opt, F.binary_cross_entropy_with_logits, callbacks=cb)

epoch:   0, train_loss: 0.693168, val_loss: 0.693163           
epoch:   1, train_loss: 0.694071, val_loss: 0.693147           
epoch:   2, train_loss: 0.693166, val_loss: 0.693793           
epoch:   3, train_loss: 0.693953, val_loss: 0.693147           
epoch:   4, train_loss: 0.694330, val_loss: 0.693147           
epoch:   5, train_loss: 0.693323, val_loss: 0.693147           
epoch:   6, train_loss: 0.693168, val_loss: 0.693166           
epoch:   7, train_loss: 0.693167, val_loss: 0.693164           
epoch:   8, train_loss: 0.694215, val_loss: 0.693147           
epoch:   9, train_loss: 0.693457, val_loss: 0.693147           
epoch:  10, train_loss: 0.693169, val_loss: 0.693166           
epoch:  11, train_loss: 0.693167, val_loss: 0.693163           
epoch:  12, train_loss: 0.693196, val_loss: 0.693147           
epoch:  13, train_loss: 0.693420, val_loss: 0.693147           
epoch:  14, train_loss: 0.694372, val_loss: 0.693165           



In [None]:
on_end = lambda sched, cycle: save_model(m, f'{PATH}/models/multi-label-2-cyc_{cycle}-of-1')
cb = [CosAnneal(lo, len(md.trn_dl) * 10, on_cycle_end=on_end)]

fit(m, md, 10, lo.opt, F.binary_cross_entropy, callbacks=cb)

In [None]:
load_model(m, f'{PATH}/models/multi-label-2-cyc_0-of-1')

In [None]:
preds = predict(m, md.test_dl)
len(preds), len(test_df), len(train_df), len(val_df)

In [None]:
#subm_df = pd.DataFrame(data=preds, columns=label_cols) # -- if "none" included
subm_df = pd.DataFrame(data=preds[:,:6], columns=label_cols) # -- if "none" included



In [None]:
len(preds), len(test_df)

In [89]:
subm_df.to_csv(f'{PATH}/toxic_submission2.csv', index=None)

In [79]:
subm_df.head(10)

Unnamed: 0,id,toxic,severe_toxic,obscene,threat,insult,identity_hate
0,00001cee341fdb12,0.140499,0.010649,0.084255,0.013067,0.065696,0.019294
1,0000247867823ef7,0.093443,0.005213,0.052726,0.006935,0.040252,0.011026
2,00013b17ad220c46,0.140499,0.010649,0.084255,0.013067,0.065696,0.019294
3,00017563c3f7919a,0.282951,0.033744,0.183968,0.033563,0.148715,0.045684
4,00017695ad8997eb,0.140499,0.010649,0.084255,0.013067,0.065696,0.019294
5,0001ea8717f6de06,0.106706,0.006539,0.06106,0.008538,0.04703,0.013383
6,00024115d4cbde0f,0.140499,0.010649,0.084255,0.013067,0.065696,0.019294
7,000247e83dcc1211,0.114128,0.007326,0.065793,0.009434,0.050913,0.01463
8,00025358d4737918,0.253572,0.026371,0.160273,0.027289,0.128577,0.037742
9,00026d1092fe71cc,0.090045,0.00467,0.04998,0.006279,0.038038,0.010096


In [88]:
subm_df.head(10)

Unnamed: 0,id,toxic,severe_toxic,obscene,threat,insult,identity_hate
0,00001cee341fdb12,0.177592,0.018022,0.10854,0.021443,0.086809,0.036835
1,0000247867823ef7,0.131184,0.008682,0.072391,0.011113,0.053415,0.020531
2,00013b17ad220c46,0.177592,0.018022,0.10854,0.021443,0.086809,0.036835
3,00017563c3f7919a,0.173275,0.01699,0.105047,0.020346,0.083488,0.035156
4,00017695ad8997eb,0.177592,0.018022,0.10854,0.021443,0.086809,0.036835
5,0001ea8717f6de06,0.122864,0.007998,0.067738,0.010409,0.048911,0.018991
6,00024115d4cbde0f,0.177592,0.018022,0.10854,0.021443,0.086809,0.036835
7,000247e83dcc1211,0.156944,0.014638,0.094323,0.018067,0.072798,0.030888
8,00025358d4737918,0.689917,0.285251,0.572844,0.099842,0.523914,0.178238
9,00026d1092fe71cc,0.1277,0.008241,0.070161,0.010535,0.051482,0.019643
