In [1]:
import spacy
from spacy.util import minibatch
import random
import torch
from tqdm import tqdm
import pandas as pd

In [2]:
# Display the installed spaCy version for reproducibility (the recorded run
# shows '2.2.3'; the `trf_*` pipes used below come from the loaded model).
spacy.__version__

'2.2.3'

In [3]:
# Load the training split. The recorded preview shows an unnamed index column
# plus `text` (commit message) and `label` (0/1) columns.
train_df=pd.read_csv('dataset/train.csv')
train_df.head()

Unnamed: 0,text,label
0,rCriticalSection & rSemaphore removed ...,1
1,[ALSA] Improve SPDIF playback via the P16V...,0
2,KVM: Change the emulator_{read write cmpxc...,0
3,Bump test projects up to .NET 4.5.2 - ...,0
4,Update proto name to fix Windows portabili...,1


In [4]:
# Ask spaCy to use a GPU when one is available; the returned flag is truthy
# only if the GPU was activated (the recorded run returned False, i.e. CPU).
is_using_gpu = spacy.prefer_gpu()
if is_using_gpu:
    # Make CUDA float tensors PyTorch's default type so newly created tensors
    # presumably land on the same device as the spaCy model -- confirm.
    torch.set_default_tensor_type("torch.cuda.FloatTensor")
is_using_gpu

False

In [5]:
def create_dataset(df):
    """Convert a labelled DataFrame into spaCy text-classification examples.

    Args:
        df: Object exposing ``itertuples()`` whose rows have ``text`` and
            ``label`` attributes.

    Returns:
        A list of ``(text, {"cats": {...}})`` tuples, one per row. Rows whose
        label equals 1 get ``BugFix`` = 1.0 and ``NotBugFix`` = 0.0; every
        other label value gets the opposite assignment.
    """
    def _cats(is_bugfix):
        # Build a fresh dict per example so no two examples share state.
        if is_bugfix:
            return {"BugFix": 1.0, "NotBugFix": 0.0}
        return {"BugFix": 0.0, "NotBugFix": 1.0}

    return [
        (row.text, {"cats": _cats(row.label == 1)})
        for row in df.itertuples()
    ]

In [6]:
# Load the pretrained DistilBERT pipeline package and show its components.
nlp = spacy.load("en_trf_distilbertbaseuncased_lg")
print(nlp.pipe_names) # ["sentencizer", "trf_wordpiecer", "trf_tok2vec"]
# Create the transformer text-classification component; it is only created
# here, not yet added to the pipeline.
# NOTE(review): BugFix/NotBugFix are fed as complementary 1.0/0.0 targets, so
# `exclusive_classes: True` may be the intended setting -- confirm before
# changing, since a saved model may depend on this configuration.
textcat = nlp.create_pipe("trf_textcat", config={"exclusive_classes": False})

I0101 15:54:14.536913 140125099353920 file_utils.py:39] PyTorch version 1.3.0 available.
I0101 15:54:15.604322 140125099353920 modeling_xlnet.py:194] Better speed can be achieved with apex installed from https://www.github.com/nvidia/apex .


['sentencizer', 'trf_wordpiecer', 'trf_tok2vec']


In [7]:
# Register both category names on the classifier, then append it to the
# pipeline. The names must match the keys used in each example's "cats" dict.
for label in ["BugFix","NotBugFix"]:
    textcat.add_label(label)
nlp.add_pipe(textcat)

In [8]:
# Build the training examples and set the hyperparameters.
TRAIN_DATA=create_dataset(train_df)
batch_size=8
# NOTE(review): `learn_rate` is not referenced by any other visible cell; the
# rate actually applied comes from lr_rate() -- confirm and consolidate.
learn_rate=2e-5
# Number of batches per epoch as a float; the fractional part corresponds to a
# final, smaller batch.
len(TRAIN_DATA)/batch_size

228.875

In [9]:
def lr_rate(rate=2e-5):
    """Yield a constant learning rate forever.

    Args:
        rate: The learning rate to emit on every step. Defaults to 2e-5, the
            value that was previously hard-coded, so ``lr_rate()`` behaves
            exactly as before.

    Yields:
        ``rate``, indefinitely. The generator never terminates, so consume it
        with ``next()`` rather than iterating it to exhaustion.
    """
    while True:
        yield rate

In [10]:
# Obtain an optimizer for continuing training of the already-initialised
# pipeline (rather than re-initialising its weights).
optimizer = nlp.resume_training()
# Take the first value from the constant schedule (2e-5) and store it on the
# optimizer; `trf_lr` is presumably the learning rate spacy-transformers
# applies to the transformer weights -- confirm.
optimizer.trf_lr = next(lr_rate())

In [32]:
# Fine-tune the pipeline. Each epoch reshuffles TRAIN_DATA in place, walks it
# in mini-batches, and prints the loss accumulated over that epoch.
n_epochs = 4
for i in tqdm(range(n_epochs)):
    random.shuffle(TRAIN_DATA)
    # Fresh dict per epoch so the printed loss is not cumulative across epochs.
    losses = {}
    # Use the shared `batch_size` setting instead of a duplicated literal so
    # the batch size is controlled from one place.
    for batch in tqdm(minibatch(TRAIN_DATA, size=batch_size)):
        # Split [(text, annotations), ...] into parallel tuples for update().
        texts, cats = zip(*batch)
        nlp.update(texts, cats, sgd=optimizer, losses=losses)
    print(i, losses)

  0%|          | 0/1 [00:00<?, ?it/s]
0it [00:00, ?it/s][A
1it [00:06,  6.63s/it][A
2it [00:09,  5.60s/it][A
3it [00:14,  5.45s/it][A
4it [00:25,  6.97s/it][A
5it [00:34,  7.74s/it][A
6it [00:44,  8.41s/it][A
7it [00:48,  6.96s/it][A
8it [00:57,  7.46s/it][A
9it [01:01,  6.66s/it][A
10it [01:12,  7.80s/it][A
11it [01:18,  7.32s/it][A
12it [01:27,  7.85s/it][A
13it [01:36,  8.24s/it][A
14it [01:42,  7.58s/it][A
15it [01:51,  7.76s/it][A
16it [02:02,  8.96s/it][A
17it [02:13,  9.40s/it][A
18it [02:22,  9.25s/it][A
19it [02:36, 10.64s/it][A
20it [02:40,  8.78s/it][A
21it [02:51,  9.58s/it][A
22it [02:59,  8.84s/it][A
23it [03:08,  9.09s/it][A
24it [03:21, 10.11s/it][A
25it [03:27,  9.02s/it][A
26it [03:36,  9.07s/it][A
27it [03:40,  7.58s/it][A
28it [03:51,  8.58s/it][A
29it [03:55,  7.05s/it][A
30it [04:06,  8.20s/it][A
31it [04:18,  9.27s/it][A
32it [04:30, 10.29s/it][A
33it [04:41, 10.42s/it][A
34it [04:47,  9.22s/it][A
35it [04:53,  8.16s/it][A
36it

0 {'trf_textcat': 0.15650997367637803}





In [11]:
# Load pipeline state from the local "distilbert-textcat" directory into `nlp`.
# NOTE(review): no visible cell writes this directory (there is no to_disk
# call), so it must already exist; loading presumably replaces whatever
# weights `nlp` held in memory beforehand -- confirm this is intended.
nlp.from_disk("distilbert-textcat")

<spacy_transformers.language.TransformersLanguage at 0x7f712af46978>

In [12]:
def evaluate(df):
    """Tally confusion-matrix counts for the global ``nlp`` pipeline on ``df``.

    Every row's ``text`` is run through ``nlp`` and the 'BugFix' and
    'NotBugFix' scores in ``doc.cats`` are compared, with label 1 treated as
    the positive (bug-fix) class.

    Rows are left out of all four counts when the two scores are equal or when
    the label is neither 0 nor 1.

    Args:
        df: Object exposing ``itertuples()`` whose rows have ``text`` and
            ``label`` attributes.

    Returns:
        Tuple ``(tp, fp, tn, fn)`` of integer counts.
    """
    tally = {"tp": 0, "fp": 0, "tn": 0, "fn": 0}
    for row in tqdm(df.itertuples()):
        doc = nlp(row.text)
        # Choose which counters apply to this row based on its gold label.
        if row.label == 1:
            when_bugfix_wins, when_notbugfix_wins = "tp", "fn"
        elif row.label == 0:
            when_bugfix_wins, when_notbugfix_wins = "fp", "tn"
        else:
            # Unknown label: contributes to no counter.
            continue
        bugfix_score = doc.cats['BugFix']
        notbugfix_score = doc.cats['NotBugFix']
        if bugfix_score > notbugfix_score:
            tally[when_bugfix_wins] += 1
        elif notbugfix_score > bugfix_score:
            tally[when_notbugfix_wins] += 1
        # Exact ties fall through without being counted.
    return tally["tp"], tally["fp"], tally["tn"], tally["fn"]

In [13]:
# Evaluate the current `nlp` pipeline on dataset/test.csv: print the raw
# (tp, fp, tn, fn) counts, then accuracy = (tp + tn) / all counted rows.
# Raises ZeroDivisionError if evaluate() counted no rows at all.
eval_df=pd.read_csv('dataset/test.csv')
tp,fp,tn,fn=evaluate(eval_df)
print(tp,fp,tn,fn)
print((tp+tn)/(tp+tn+fp+fn))

324it [00:14, 22.57it/s]

98 25 192 9
0.8950617283950617





In [14]:
# Evaluate the current `nlp` pipeline on 'dataset/levin et al.csv': print the
# raw (tp, fp, tn, fn) counts, then accuracy = (tp + tn) / all counted rows.
# Overwrites the notebook-global eval_df/tp/fp/tn/fn from any earlier run.
eval_df=pd.read_csv('dataset/levin et al.csv')
tp,fp,tn,fn=evaluate(eval_df)
print(tp,fp,tn,fn)
print((tp+tn)/(tp+tn+fp+fn))

1151it [00:43, 26.33it/s]

294 36 615 206
0.789748045178106





In [15]:
# Evaluate the current `nlp` pipeline on 'dataset/berger et al.csv': print the
# raw (tp, fp, tn, fn) counts, then accuracy = (tp + tn) / all counted rows.
# Overwrites the notebook-global eval_df/tp/fp/tn/fn from any earlier run.
eval_df=pd.read_csv('dataset/berger et al.csv')
tp,fp,tn,fn=evaluate(eval_df)
print(tp,fp,tn,fn)
print((tp+tn)/(tp+tn+fp+fn))

375it [00:12, 30.53it/s]

114 39 196 26
0.8266666666666667





In [16]:
# Evaluate the current `nlp` pipeline on 'dataset/berger et al subset.csv':
# print the raw (tp, fp, tn, fn) counts, then accuracy = (tp + tn) / all
# counted rows.
# Overwrites the notebook-global eval_df/tp/fp/tn/fn from any earlier run.
eval_df=pd.read_csv('dataset/berger et al subset.csv')
tp,fp,tn,fn=evaluate(eval_df)
print(tp,fp,tn,fn)
print((tp+tn)/(tp+tn+fp+fn))

271it [00:09, 28.86it/s]

83 16 165 7
0.915129151291513



