In [0]:
# Show the GPU attached to this runtime (model, driver/CUDA version, memory in use).
!nvidia-smi

Mon Dec 16 04:33:26 2019       
+-----------------------------------------------------------------------------+
| NVIDIA-SMI 440.44       Driver Version: 418.67       CUDA Version: 10.1     |
|-------------------------------+----------------------+----------------------+
| GPU  Name        Persistence-M| Bus-Id        Disp.A | Volatile Uncorr. ECC |
| Fan  Temp  Perf  Pwr:Usage/Cap|         Memory-Usage | GPU-Util  Compute M. |
|   0  Tesla P100-PCIE...  Off  | 00000000:00:04.0 Off |                    0 |
| N/A   32C    P0    26W / 250W |      0MiB / 16280MiB |      0%      Default |
+-------------------------------+----------------------+----------------------+
                                                                               
+-----------------------------------------------------------------------------+
| Processes:                                                       GPU Memory |
|  GPU       PID   Type   Process name                             Usage      |
|  No ru

In [0]:
#this can be skipped if you've already setup these libraries and dowloaded the model
!pip install -U spacy[cuda100] 
!pip install -U tqdm
!python -m spacy download en_trf_distilbertbaseuncased_lg

Requirement already up-to-date: spacy[cuda100] in /usr/local/lib/python3.6/dist-packages (2.2.3)
Requirement already up-to-date: tqdm in /usr/local/lib/python3.6/dist-packages (4.40.2)
[38;5;2m✔ Download and installation successful[0m
You can now load the model via spacy.load('en_trf_distilbertbaseuncased_lg')


In [0]:
import spacy
from spacy.util import minibatch
import random
import torch
from tqdm import tqdm
import pandas as pd

In [0]:
# Ask spaCy to use the GPU if one is available; the returned flag is shown as the cell output.
is_using_gpu = spacy.prefer_gpu()
if is_using_gpu:
    # Make torch allocate new tensors as CUDA float tensors by default.
    torch.set_default_tensor_type("torch.cuda.FloatTensor")
is_using_gpu

True

In [None]:
# Load the pretrained DistilBERT pipeline downloaded by the setup cell.
nlp=spacy.load('en_trf_distilbertbaseuncased_lg')

In [0]:
# Pipeline of the freshly loaded base model, before the classifier is attached:
# ["sentencizer", "trf_wordpiecer", "trf_tok2vec"]
print(nlp.pipe_names)

# Attach the text classifier only once. Re-running this cell after the component has
# already been added would otherwise make add_pipe() fail on the duplicate name.
if "trf_textcat" not in nlp.pipe_names:
    # NOTE(review): exclusive_classes=False even though the two labels look mutually
    # exclusive; presumably this matches how the saved weights were trained - confirm
    # before changing it.
    textcat = nlp.create_pipe("trf_textcat", config={"exclusive_classes": False})
    for label in ["BugFix", "NotBugFix"]:
        textcat.add_label(label)
    nlp.add_pipe(textcat)

# Restore the fine-tuned weights saved in the local "distilbert-textcat/" directory.
nlp = nlp.from_disk("distilbert-textcat/")

['sentencizer', 'trf_wordpiecer', 'trf_tok2vec']


In [0]:
# Load the training split and preview its first rows (text and label columns).
train_df=pd.read_csv('dataset/train.csv')
train_df.head()

Unnamed: 0,text,label
0,rCriticalSection & rSemaphore removed ...,1
1,[ALSA] Improve SPDIF playback via the P16V...,0
2,KVM: Change the emulator_{read write cmpxc...,0
3,Bump test projects up to .NET 4.5.2 - ...,0
4,Update proto name to fix Windows portabili...,1


In [0]:
def create_dataset(df):
    """Convert a labelled frame into (text, annotations) training tuples.

    Each row of ``df`` must expose ``text`` and ``label`` attributes. A label
    equal to 1 is encoded as BugFix; every other label is encoded as NotBugFix.

    Returns a list of ``(text, {"cats": {...}})`` tuples in row order.
    """
    return [
        (
            row.text,
            {"cats": {"BugFix": 1.0, "NotBugFix": 0.0}}
            if row.label == 1
            else {"cats": {"BugFix": 0.0, "NotBugFix": 1.0}},
        )
        for row in df.itertuples()
    ]

In [0]:
def evaluate(df):
    """Run the global ``nlp`` pipeline on each row and tally confusion counts.

    ``df`` must provide ``text`` and ``label`` per row. BugFix (label 1) is the
    positive class. A row is counted only when one category score is strictly
    greater than the other and its label is 0 or 1; ties and other labels fall
    into none of the four buckets.

    Returns the tuple ``(tp, fp, tn, fn)``.
    """
    tp = fp = tn = fn = 0
    for row in tqdm(df.itertuples()):
        scores = nlp(row.text).cats
        bugfix_score = scores['BugFix']
        other_score = scores['NotBugFix']
        if bugfix_score > other_score:
            # Model predicts BugFix.
            if row.label == 1:
                tp += 1
            elif row.label == 0:
                fp += 1
        elif other_score > bugfix_score:
            # Model predicts NotBugFix.
            if row.label == 0:
                tn += 1
            elif row.label == 1:
                fn += 1
    return tp, fp, tn, fn

In [0]:
# Evaluate on the test split: print the confusion counts (tp fp tn fn), then accuracy.
eval_df=pd.read_csv('dataset/test.csv')
tp,fp,tn,fn=evaluate(eval_df)
print(tp,fp,tn,fn)
print((tp+tn)/(tp+tn+fp+fn))

324it [00:05, 63.85it/s]

98 25 192 9
0.8950617283950617





In [0]:
# Evaluate on 'levin et al.csv' (presumably an external benchmark - confirm provenance):
# print the confusion counts (tp fp tn fn), then accuracy.
eval_df=pd.read_csv('dataset/levin et al.csv')
tp,fp,tn,fn=evaluate(eval_df)
print(tp,fp,tn,fn)
print((tp+tn)/(tp+tn+fp+fn))

1151it [00:15, 76.70it/s]

294 36 615 206
0.789748045178106





In [0]:
# Evaluate on 'berger et al.csv' (presumably an external benchmark - confirm provenance):
# print the confusion counts (tp fp tn fn), then accuracy.
eval_df=pd.read_csv('dataset/berger et al.csv')
tp,fp,tn,fn=evaluate(eval_df)
print(tp,fp,tn,fn)
print((tp+tn)/(tp+tn+fp+fn))

375it [00:04, 78.18it/s]

114 39 196 26
0.8266666666666667





In [0]:
# Evaluate on 'berger et al subset.csv' (how the subset was chosen is not shown here):
# print the confusion counts (tp fp tn fn), then accuracy.
eval_df=pd.read_csv('dataset/berger et al subset.csv')
tp,fp,tn,fn=evaluate(eval_df)
print(tp,fp,tn,fn)
print((tp+tn)/(tp+tn+fp+fn))

271it [00:03, 79.75it/s]

83 16 165 7
0.915129151291513





In [0]:
# Unpack the commit-message archive; it contains dataset_message.csv.
!tar -xzvf dataset_message.tgz 

dataset_message.csv


In [0]:
# Load the commit messages to classify.
df_pred=pd.read_csv('dataset_message.csv')

In [0]:
# Keep only the first 384 characters of each message, then drop rows whose
# truncated message duplicates an earlier row.
df_pred['message']=df_pred['message'].str[:384]
df_pred=df_pred.drop_duplicates(subset='message')

In [0]:
def get_prediction(df,batch=16):
    """Classify every commit message in ``df`` as bug fix (1) or not (0).

    Parameters
    ----------
    df : pandas.DataFrame
        Frame with a ``message`` column holding the texts to classify.
    batch : int, default 16
        Number of texts buffered per batch; forwarded to ``nlp.pipe`` as
        ``batch_size``. Pass a larger value to trade memory for throughput.

    Returns
    -------
    list of int
        One prediction per row, in row order: 1 when the BugFix score is
        strictly greater than the NotBugFix score, otherwise 0.
    """
    texts = df['message'].tolist()
    # Components that are irrelevant for text classification. Each name must be
    # its own list element: adjacent string literals without a comma are
    # silently concatenated into a single, non-matching name.
    disabled = ["tagger", "parser", "merge_noun_chunks",
                "merge_entities", "merge_subtokens",
                "ner", "entity_linker", "entity_ruler"]
    docs = nlp.pipe(texts, batch_size=batch, disable=disabled)

    preds = []
    # nlp.pipe returns a generator, so give tqdm the total explicitly to get a
    # percentage and an ETA instead of a bare counter.
    for doc in tqdm(docs, total=len(texts)):
        if doc.cats['BugFix'] > doc.cats['NotBugFix']:
            preds.append(1)
        else:
            preds.append(0)
    return preds

In [0]:
# Make sure every message is a Python str before it is handed to nlp.pipe.
df_pred['message'] = df_pred['message'].astype(str)

In [0]:
# Register tqdm's pandas integration (progress_apply and friends).
tqdm.pandas()

In [0]:
# Long-running: classify every de-duplicated commit message.
# Author's timing note: about 9 hrs 8 mins for ~4.90M commits truncated to 384 chars.
preds=get_prediction(df_pred)

4909387it [9:19:58, 146.12it/s]


In [0]:
# Persist the predictions (zlib-compressed) so the long inference run need not be repeated.
import joblib
joblib.dump(preds,'preds.joblib', compress='zlib')

['preds.joblib']

In [0]:
# Sanity check: number of predictions produced.
len(preds)

4909387