<a href="https://colab.research.google.com/github/jameszlj/NLP_with_python/blob/master/fast_ai_text_multi_label.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [0]:
from fastai.text import *

In [8]:
!pip install pytorch-transformers



In [0]:
from pytorch_transformers import BertTokenizer, BertForSequenceClassification
from sklearn.metrics import classification_report, confusion_matrix
from sklearn.model_selection import train_test_split
class BertFastaiTokenizer(BaseTokenizer):
    """Adapter exposing a pretrained BERT tokenizer through fastai's tokenizer interface.

    Each text is split by the wrapped tokenizer, truncated to fit the
    sequence-length budget, and wrapped in the "[CLS]" / "[SEP]" markers.
    """

    def __init__(self, tokenizer, max_seq_len=128, **kwargs):
        """Store the wrapped tokenizer and the length budget.

        Args:
            tokenizer: object providing ``tokenize(text)`` that returns a list of tokens.
            max_seq_len: maximum tokens per sequence, counting "[CLS]" and "[SEP]".
            **kwargs: accepted for signature compatibility and ignored.
        """
        self.pretrained_tokenizer = tokenizer
        self.max_seq_len = max_seq_len

    def __call__(self, *args, **kwargs):
        """Return this instance, ignoring all arguments.

        Presumably this lets an already-built instance be passed where fastai
        expects a tokenizer factory (``tok_func``) -- confirm against fastai.
        """
        return self

    def tokenizer(self, t):
        """Tokenize ``t`` and return ``["[CLS]", *tokens, "[SEP]"]``.

        At most ``max_seq_len - 2`` tokens from the wrapped tokenizer are kept,
        leaving room for the two marker tokens.
        """
        # Clamp at zero: a budget below 2 would otherwise become a negative
        # slice bound and drop only the trailing token(s) instead of truncating.
        n_word_pieces = max(self.max_seq_len - 2, 0)
        return ["[CLS]"] + self.pretrained_tokenizer.tokenize(t)[:n_word_pieces] + ["[SEP]"]


class MyNoTupleModel(BertForSequenceClassification):
    """BertForSequenceClassification whose forward returns only the first output element."""

    def forward(self, *args, **kwargs):
        """Run the parent forward pass and return element 0 of its result.

        Presumably the parent returns a tuple whose first item is the logits
        when no labels are passed -- confirm against the library version in use.
        """
        outputs = super().forward(*args, **kwargs)
        return outputs[0]

In [0]:
# Name of the pretrained checkpoint used for both the tokenizer and the model.
bert_model = "bert-base-uncased"
# Working directory that holds train.csv / test.csv.
path = Path(".")
# Token budget per example (including special tokens) and mini-batch size.
max_seq_len, batch_size = 256, 32

In [0]:
# Read the labelled and unlabelled CSVs from the working directory.
train = pd.read_csv(path / "train.csv")
test = pd.read_csv(path / "test.csv")
# Hold out part of the labelled data for validation; the fixed seed keeps the split repeatable.
train, valid = train_test_split(train, random_state=2)

In [13]:
# Preview the first 10 rows of the training split (text column plus the six label columns).
train.head(10)

Unnamed: 0,id,comment_text,toxic,severe_toxic,obscene,threat,insult,identity_hate
5400,0e65327b5bbee0a5,"""\n\nI think it is both a Khmer character and ...",0,0,0,0,0,0
122278,8e162cbf88bfc540,"""\nI have added information to the article cre...",0,0,0,0,0,0
115832,6b646f8e87da9b5f,"Thanks kmccoy, I could really use your help on...",0,0,0,0,0,0
125459,9f1ab871e0d50caf,"""\nI never heard that, and I can't find any re...",0,0,0,0,0,0
131403,befbff9b502b865f,"""\n\nWell objectively, 200 PKK militants train...",0,0,0,0,0,0
158699,f218b5761c26faaa,I was thinking just adding a bullet point to t...,0,0,0,0,0,0
75648,ca678d42144fbb2d,Your objections to that sentence are quite cor...,0,0,0,0,0,0
118733,7a8f77565e0df03e,Thanks for being late to the party \n\nbut I h...,1,1,1,0,1,0
115153,67b7eec36480cd1c,I give up trying to write a page today. You lo...,0,0,0,0,0,0
109009,46dd6c0f2cdca00c,"""\n\n Robots everywhere! \n\nI see you have me...",0,0,0,0,0,0


In [14]:
# Fetch the pretrained WordPiece tokenizer that matches the chosen checkpoint.
bert_tokenizer = BertTokenizer.from_pretrained(bert_model)
# Sanity check: show a handful of (token, id) pairs from the vocabulary.
print(list(bert_tokenizer.vocab.items())[1000:1005])
# Build the fastai Vocab from BERT's tokens in their original order so ids line up.
bert_vocab = Vocab(list(bert_tokenizer.vocab))
# fastai-compatible wrapper that truncates and adds [CLS]/[SEP].
tok_func = BertFastaiTokenizer(tokenizer=bert_tokenizer, max_seq_len=max_seq_len)

100%|██████████| 231508/231508 [00:00<00:00, 1227939.08B/s]


[('"', 1000), ('#', 1001), ('$', 1002), ('%', 1003), ('&', 1004)]


In [15]:
# The six binary label columns; passing several columns makes this a multi-label problem.
label_cols = ["toxic", "severe_toxic", "obscene", "threat", "insult", "identity_hate"]
# fastai Tokenizer driven by the BERT wrapper, with fastai's own pre/post rules disabled.
bert_fastai_tokenizer = Tokenizer(tok_func=tok_func, pre_rules=[], post_rules=[])
# Build the DataBunch from the three DataFrames.
databunch = TextClasDataBunch.from_df(
    path,
    train_df=train,
    valid_df=valid,
    test_df=test,
    text_cols="comment_text",
    label_cols=label_cols,
    tokenizer=bert_fastai_tokenizer,
    vocab=bert_vocab,
    # The wrapper already adds [CLS]/[SEP], so fastai's BOS/EOS markers are turned off.
    include_bos=False,
    include_eos=False,
    bs=batch_size,
    # Pad at the end with id 0 -- presumably BERT's [PAD] id; confirm against the vocab.
    collate_fn=partial(pad_collate, pad_first=False, pad_idx=0),
)
databunch.show_batch()

text,target
"[CLS] english english vs american english vs . . . in an earlier edit changing petrol to gas , i wrote in the edit comment that i was changing the word ##ing to american english ( gas ) because that ' s the standard for this wikipedia . another user pointed out that my assertion of an american english standard for this wikipedia was incorrect according to the wikipedia manual",
"[CLS] you ' re fucking gay . get a job and quit arguing with people on the internet . it doesn ' t make you cool , you aren ' t adding shit , you ' re just making it worse . get a fucking life . you ' re fucking gay . get a job and quit arguing with people on the internet . it doesn ' t make",toxic;obscene;insult;identity_hate
"[CLS] "" thanks for the help hey , thanks for directing me to the breeding ##bet ##tas page , even though it didn ' t have the exact info i was looking for , it did help with some other questions i had . porsche ##9 ##9 ##7 ##sb ##s . . . big thanks for the automobiles user ##box ##es . hope a mustang ( usa ) will turn",
[CLS] hello fran ##ia . i never mentioned anything yet about charles vii . it had no con ##tri ##dict ##ing in fact ##n with sal ##ic law as it was only su ##cc ##est ##ion for the private norm and in fact had nothing to do with france . it was used as a cover up rather then a legal law in a council held in 131 ##7 .,
"[CLS] london boroughs thanks for reviewing some of the london boroughs pages , as you ' ve noticed most are in a distress ##ing state - and it ' s a good day when the page is extended from harmless to mostly harmless . i would however urge you to use the fact tag spa ##ring ##ly - either for things that don ' t make sense , or that",


In [16]:
# Load the pretrained encoder with a classification head sized to the number of label columns.
bert_pretrained_model = MyNoTupleModel.from_pretrained(bert_model, num_labels=len(label_cols))
# Independent sigmoid + binary cross-entropy per label (multi-label objective).
loss_func = nn.BCEWithLogitsLoss()
# `accuracy` takes an argmax over outputs, which only makes sense for single-label
# targets; `accuracy_thresh` applies a sigmoid and thresholds each label separately,
# matching the multi-label loss above.
learn = Learner(databunch,
                bert_pretrained_model,
                loss_func=loss_func,
                metrics=accuracy_thresh)

100%|██████████| 313/313 [00:00<00:00, 74791.61B/s]
100%|██████████| 440473133/440473133 [00:11<00:00, 36750213.67B/s]


In [1]:
# Run the learning-rate range test. Requires `learn` from the Learner cell; the recorded
# NameError presumably comes from running this cell in a fresh kernel before that cell.
learn.lr_find()

NameError: ignored

In [0]:
# Plot what the recorder captured during the preceding lr_find() run.
learn.recorder.plot()

In [0]:
# Train with the one-cycle policy: 4 epochs, peak learning rate 3e-5.
learn.fit_one_cycle(cyc_len=4, max_lr=3e-5)