In [1]:
import pandas
from sklearn.model_selection import train_test_split
import pandas
from transformers import pipeline
import math
from sklearn.metrics import confusion_matrix,precision_score,recall_score,roc_curve,auc,f1_score,accuracy_score
import numpy as np
import tqdm

In [57]:
# Load the labelled training CSV; the path is relative to the notebook's working directory.
data_train = pandas.read_csv("research_paper_dataset/train.csv")

In [58]:
# Show the loaded frame (text columns plus one 0/1 column per topic).
data_train

Unnamed: 0,ID,TITLE,ABSTRACT,Computer Science,Physics,Mathematics,Statistics,Quantitative Biology,Quantitative Finance
0,1,Reconstructing Subject-Specific Effect Maps,Predictive models allow subject-specific inf...,1,0,0,0,0,0
1,2,Rotation Invariance Neural Network,Rotation invariance and translation invarian...,1,0,0,0,0,0
2,3,Spherical polyharmonics and Poisson kernels fo...,We introduce and develop the notion of spher...,0,0,1,0,0,0
3,4,A finite element approximation for the stochas...,The stochastic Landau--Lifshitz--Gilbert (LL...,0,0,1,0,0,0
4,5,Comparative study of Discrete Wavelet Transfor...,Fourier-transform infra-red (FTIR) spectra o...,1,0,0,1,0,0
...,...,...,...,...,...,...,...,...,...
20967,20968,Contemporary machine learning: a guide for pra...,Machine learning is finding increasingly bro...,1,1,0,0,0,0
20968,20969,Uniform diamond coatings on WC-Co hard alloy c...,Polycrystalline diamond coatings have been g...,0,1,0,0,0,0
20969,20970,Analysing Soccer Games with Clustering and Con...,We present a new approach for identifying si...,1,0,0,0,0,0
20970,20971,On the Efficient Simulation of the Left-Tail o...,The sum of Log-normal variates is encountere...,0,0,1,1,0,0


In [28]:
# Topic names offered to the zero-shot classifier; each one matches a label
# column of the training CSV.
candidate_labels = [
    "Computer Science",
    "Physics",
    "Mathematics",
    "Statistics",
    "Quantitative Biology",
    "Quantitative Finance",
]

In [30]:
# Build the zero-shot classification pipeline from the roberta-large-mnli checkpoint.
# NOTE(review): device=0 presumably selects the first GPU — confirm a CUDA device is available.
classifier = pipeline("zero-shot-classification", model="roberta-large-mnli", device=0, batch_size=10)

Some weights of the model checkpoint at roberta-large-mnli were not used when initializing RobertaForSequenceClassification: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
- This IS expected if you are initializing RobertaForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing RobertaForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


In [45]:
def get_df_with_prediction(df, inp_columns):
    """Run the zero-shot classifier on every row and attach both outputs.

    For each row the values of ``inp_columns`` are joined with a single space
    and passed to the module-level ``classifier`` twice together with
    ``candidate_labels``: once with default arguments and once with
    ``multi_label=True``.

    Args:
        df: DataFrame containing the text columns named in ``inp_columns``.
        inp_columns: Column names whose string values form the input
            sequence, in the given order.

    Returns:
        A copy of ``df`` with two extra columns: ``pred_multi_class`` (the
        ``multi_label=True`` result) and ``pred`` (the default result). The
        frame passed in is not modified.

    Raises:
        TypeError: If any joined cell is not a string (e.g. a missing value).
    """
    # Work on a copy so re-running the cell never mutates the caller's frame.
    small_df = df.copy()
    arr_pred_multi_class = []
    arr_pred = []

    # Iterate positionally so a non-unique index cannot turn a cell lookup
    # into a Series.
    text_rows = small_df[list(inp_columns)].itertuples(index=False, name=None)
    for values in tqdm.tqdm(text_rows, total=len(small_df)):
        sequence = " ".join(values)
        arr_pred.append(classifier(sequence, candidate_labels))
        arr_pred_multi_class.append(
            classifier(sequence, candidate_labels, multi_label=True)
        )

    small_df['pred_multi_class'] = arr_pred_multi_class
    small_df['pred'] = arr_pred
    return small_df

def metrics(topic, y_true, y_pred, y_prob=None):
    """Compute binary classification metrics for a single topic.

    Args:
        topic: Value stored under the ``'topic'`` key of the result.
        y_true: Ground-truth binary labels (0/1).
        y_pred: Predicted binary labels (0/1 or booleans).
        y_prob: Optional scores for the positive class. When provided, the
            area under the ROC curve is added under the ``'auc'`` key.

    Returns:
        dict with the keys ``'topic'``, ``'precision'``, ``'recall'``,
        ``'f1_score'``, ``'accuracy'``, ``'true negative'``,
        ``'false positive'``, ``'false negative'``, ``'true positive'`` and,
        if ``y_prob`` is given, ``'auc'``.
    """
    # Pin both classes so the matrix is always 2x2; without `labels`, a sample
    # containing a single class yields a 1x1 matrix and the unpacking fails.
    tn, fp, fn, tp = confusion_matrix(y_true, y_pred, labels=[0, 1]).ravel()

    metric_map = {
        'topic': topic,
        'precision': precision_score(y_true, y_pred),
        'recall': recall_score(y_true, y_pred),
        'f1_score': f1_score(y_true, y_pred),
        'accuracy': accuracy_score(y_true, y_pred),
        'true negative': tn,
        'false positive': fp,
        'false negative': fn,
        'true positive': tp,
    }

    if y_prob is not None:
        # Thresholds are not needed; only the curve itself feeds the AUC.
        fpr, tpr, _ = roc_curve(y_true, y_prob)
        metric_map['auc'] = auc(fpr, tpr)
    return metric_map
    

def evaluate(small_df, threshold=0.3, top_k=3):
    """Score the stored predictions against the ground-truth topic columns.

    Two decision rules are evaluated per topic in ``candidate_labels``:

    * multi-label rule: the topic is predicted when its score in the
      ``pred_multi_class`` result is greater than ``threshold``; that score is
      also passed to ``metrics`` as the probability for the AUC.
    * ranking rule: the topic is predicted when it is among the first
      ``top_k`` labels of the ``pred`` result.

    Args:
        small_df: DataFrame with one 0/1 column per topic plus the ``pred``
            and ``pred_multi_class`` columns, each cell a dict with
            ``'labels'`` and ``'scores'`` lists.
        threshold: Score cutoff for the multi-label rule (default 0.3).
        top_k: Number of leading labels counted as positive for the ranking
            rule (default 3).

    Returns:
        Tuple ``(eval_df, eval_df1)`` of DataFrames with one row per topic:
        ``eval_df`` for the multi-label rule (includes ``auc``) and
        ``eval_df1`` for the ranking rule.
    """
    result = {
        topic: {
            "y_true": small_df[topic].tolist(),
            "y_pred": [],
            "y_pred_1": [],
            "y_prob": [],
        }
        for topic in candidate_labels
    }

    # Walk the two prediction columns positionally; this stays aligned with
    # the y_true lists above regardless of the frame's index.
    for pred, pred_multi_class in zip(small_df["pred"], small_df["pred_multi_class"]):
        top_labels = pred['labels'][:top_k]
        # One dict lookup per topic instead of a list scan.
        multi_scores = dict(
            zip(pred_multi_class['labels'], pred_multi_class['scores'])
        )
        for topic in candidate_labels:
            # A topic absent from the classifier output counts as score 0.
            y_prob = multi_scores.get(topic, 0.0)
            bucket = result[topic]
            bucket["y_pred_1"].append(1 if topic in top_labels else 0)
            bucket["y_prob"].append(y_prob)
            # Stored as int so y_pred has the same 0/1 encoding as y_true.
            bucket["y_pred"].append(int(y_prob > threshold))

    eval_result = []
    eval_result_1 = []
    for topic, bucket in result.items():
        eval_result.append(
            metrics(topic, bucket["y_true"], bucket["y_pred"], bucket["y_prob"])
        )
        eval_result_1.append(
            metrics(topic, bucket["y_true"], bucket["y_pred_1"])
        )

    eval_df = pandas.DataFrame(eval_result)
    eval_df1 = pandas.DataFrame(eval_result_1)
    return (eval_df, eval_df1)

def find_samples(df, topic, top_k=3):
    """Return the false-positive rows for ``topic``.

    A row is a false positive when its ground-truth column ``topic`` equals 0
    while ``topic`` appears among the first ``top_k`` labels of the row's
    ``pred`` result.

    NOTE(review): the original predicate was left unfinished
    (``lambda x: x.``), which made the whole cell a SyntaxError. The top-k
    membership rule used here mirrors the ranking rule in ``evaluate`` —
    confirm this is the intended definition.

    Args:
        df: DataFrame with a 0/1 column named ``topic`` and a ``pred`` column
            whose cells are dicts containing a ``'labels'`` list.
        topic: Topic (column) name to inspect.
        top_k: Number of leading predicted labels treated as positive
            (default 3).

    Returns:
        The subset of ``df`` rows that are false positives for ``topic``.
    """
    is_negative = df[topic] == 0
    # astype(bool) keeps the mask boolean even when the frame is empty.
    is_predicted = df["pred"].apply(
        lambda pred: topic in pred["labels"][:top_k]
    ).astype(bool)
    # A single combined mask avoids chained boolean indexing.
    return df.loc[is_negative & is_predicted]

In [20]:
# NOTE(review): `data` is not defined in any cell visible in this notebook, so this call
# depends on leftover kernel state — confirm which frame it refers to.
df = get_df_with_prediction(data, ["TITLE","ABSTRACT"])

1 2 3 4 5 6 



7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 

In [24]:
# Dump the first 100 single-label results, one dict per line.
for pred in df["pred"].iloc[:100]:
    print(pred)

{'sequence': 'Closed-form Marginal Likelihood in Gamma-Poisson Matrix Factorization   We present novel understandings of the Gamma-Poisson (GaP) model, a\nprobabilistic matrix factorization model for count data. We show that GaP can\nbe rewritten free of the score/activation matrix. This gives us new insights\nabout the estimation of the topic/dictionary matrix by maximum marginal\nlikelihood estimation. In particular, this explains the robustness of this\nestimator to over-specified values of the factorization rank, especially its\nability to automatically prune irrelevant dictionary columns, as empirically\nobserved in previous work. The marginalization of the activation matrix leads\nin turn to a new Monte Carlo Expectation-Maximization algorithm with favorable\nproperties.\n', 'labels': ['Computer Science', 'Statistics', 'Mathematics', 'Quantitative Biology', 'Physics', 'Quantitative Finance'], 'scores': [0.26426297426223755, 0.22897480428218842, 0.21868237853050232, 0.102888904511

In [39]:
# Classify every training row from its title and abstract (long-running; see the progress bar below).
df_train = get_df_with_prediction(data_train, ["TITLE","ABSTRACT"])

100%|██████████| 20972/20972 [1:08:51<00:00,  5.08it/s]


In [41]:
# Save the frame together with its prediction columns to a pickle file in the working directory.
df_train.to_pickle("df_nli_research_articles.pickle")


In [46]:
# First table: per-topic metrics for the thresholded multi-label scores (includes AUC).
evaluate(df_train)[0]

Unnamed: 0,topic,precision,recall,f1_score,accuracy,true negative,false positive,false negative,true positive,auc
0,Computer Science,0.566576,0.603561,0.584484,0.648341,8410,3968,3407,5187,0.681397
1,Physics,0.61959,0.588059,0.603413,0.778371,12788,2171,2477,3536,0.793087
2,Mathematics,0.432217,0.443218,0.437648,0.694879,12083,3271,3128,2490,0.677942
3,Statistics,0.226117,0.36554,0.279401,0.531947,9253,6513,3303,1903,0.472664
4,Quantitative Biology,0.067989,0.396934,0.116094,0.830822,17191,3194,354,233,0.686921
5,Quantitative Finance,0.010253,0.128514,0.018991,0.842361,17634,3089,217,32,0.511334


In [61]:
# Second table: per-topic metrics when the top-ranked labels of the single-label run count as positive.
evaluate(df_train)[1]

Unnamed: 0,topic,precision,recall,f1_score,accuracy,true negative,false positive,false negative,true positive
0,Computer Science,0.536758,0.964277,0.689635,0.644335,5226,7152,307,8287
1,Physics,0.610252,0.879095,0.720409,0.804358,11583,3376,727,5286
2,Mathematics,0.420419,0.925774,0.578242,0.638232,8184,7170,417,5201
3,Statistics,0.253616,0.990204,0.403807,0.274175,595,15171,51,5155
4,Quantitative Biology,0.114056,0.725724,0.197131,0.834541,17076,3309,161,426
5,Quantitative Finance,0.053294,0.51004,0.096505,0.886611,18467,2256,122,127


In [59]:
# Hold out 20% of the rows for validation; the fixed seed makes the split
# reproducible across kernel restarts.
train, val = train_test_split(data_train, test_size=0.2, random_state=42)

In [60]:
# Show the held-out validation rows.
val

Unnamed: 0,ID,TITLE,ABSTRACT,Computer Science,Physics,Mathematics,Statistics,Quantitative Biology,Quantitative Finance
1680,1681,From LiDAR to Underground Maps via 5G - Busine...,With ever-increasing productivity targets in...,1,0,0,0,0,0
768,769,FluxMarker: Enhancing Tactile Graphics with Dy...,"For people with visual impairments, tactile ...",1,0,0,0,0,0
2499,2500,Virtual Crystals and Nakajima Monomials,An explicit description of the virtualizatio...,0,0,1,0,0,0
6321,6322,Lock-Free Parallel Perceptron for Graph-based ...,Dependency parsing is an important NLP task....,1,0,0,0,0,0
2027,2028,Optimal boundary gradient estimates for Lamé s...,"In this paper, we derive the pointwise upper...",0,0,1,0,0,0
...,...,...,...,...,...,...,...,...,...
2356,2357,Absence of cyclotron resonance in the anomalou...,It is observed that many thin superconductin...,0,1,0,0,0,0
14569,14570,Modeling Semantic Expectation: Using Script Kn...,Recent research in psycholinguistics has pro...,1,0,0,1,0,0
10349,10350,Learning to Grasp from a Single Demonstration,Learning-based approaches for robotic graspi...,1,0,0,0,0,0
13622,13623,A new algorithm for fast generalized DFTs,We give an new arithmetic algorithm to compu...,1,0,1,0,0,0


In [63]:
# Inspect the single-label result stored for the row labelled 0.
df_train['pred'][0]

{'sequence': "Reconstructing Subject-Specific Effect Maps   Predictive models allow subject-specific inference when analyzing disease\nrelated alterations in neuroimaging data. Given a subject's data, inference can\nbe made at two levels: global, i.e. identifiying condition presence for the\nsubject, and local, i.e. detecting condition effect on each individual\nmeasurement extracted from the subject's data. While global inference is widely\nused, local inference, which can be used to form subject-specific effect maps,\nis rarely used because existing models often yield noisy detections composed of\ndispersed isolated islands. In this article, we propose a reconstruction\nmethod, named RSM, to improve subject-specific detections of predictive\nmodeling approaches and in particular, binary classifiers. RSM specifically\naims to reduce noise due to sampling error associated with using a finite\nsample of examples to train classifiers. The proposed method is a wrapper-type\nalgorithm that

In [64]:
# Inspect the multi-label result stored for the same row.
df_train['pred_multi_class'][0]

{'sequence': "Reconstructing Subject-Specific Effect Maps   Predictive models allow subject-specific inference when analyzing disease\nrelated alterations in neuroimaging data. Given a subject's data, inference can\nbe made at two levels: global, i.e. identifiying condition presence for the\nsubject, and local, i.e. detecting condition effect on each individual\nmeasurement extracted from the subject's data. While global inference is widely\nused, local inference, which can be used to form subject-specific effect maps,\nis rarely used because existing models often yield noisy detections composed of\ndispersed isolated islands. In this article, we propose a reconstruction\nmethod, named RSM, to improve subject-specific detections of predictive\nmodeling approaches and in particular, binary classifiers. RSM specifically\naims to reduce noise due to sampling error associated with using a finite\nsample of examples to train classifiers. The proposed method is a wrapper-type\nalgorithm that

In [65]:
# Show the frame with the two prediction columns attached.
df_train

Unnamed: 0,ID,TITLE,ABSTRACT,Computer Science,Physics,Mathematics,Statistics,Quantitative Biology,Quantitative Finance,pred_multi_class,pred
0,1,Reconstructing Subject-Specific Effect Maps,Predictive models allow subject-specific inf...,1,0,0,0,0,0,{'sequence': 'Reconstructing Subject-Specific ...,{'sequence': 'Reconstructing Subject-Specific ...
1,2,Rotation Invariance Neural Network,Rotation invariance and translation invarian...,1,0,0,0,0,0,{'sequence': 'Rotation Invariance Neural Netwo...,{'sequence': 'Rotation Invariance Neural Netwo...
2,3,Spherical polyharmonics and Poisson kernels fo...,We introduce and develop the notion of spher...,0,0,1,0,0,0,{'sequence': 'Spherical polyharmonics and Pois...,{'sequence': 'Spherical polyharmonics and Pois...
3,4,A finite element approximation for the stochas...,The stochastic Landau--Lifshitz--Gilbert (LL...,0,0,1,0,0,0,{'sequence': 'A finite element approximation f...,{'sequence': 'A finite element approximation f...
4,5,Comparative study of Discrete Wavelet Transfor...,Fourier-transform infra-red (FTIR) spectra o...,1,0,0,1,0,0,{'sequence': 'Comparative study of Discrete Wa...,{'sequence': 'Comparative study of Discrete Wa...
...,...,...,...,...,...,...,...,...,...,...,...
20967,20968,Contemporary machine learning: a guide for pra...,Machine learning is finding increasingly bro...,1,1,0,0,0,0,{'sequence': 'Contemporary machine learning: a...,{'sequence': 'Contemporary machine learning: a...
20968,20969,Uniform diamond coatings on WC-Co hard alloy c...,Polycrystalline diamond coatings have been g...,0,1,0,0,0,0,{'sequence': 'Uniform diamond coatings on WC-C...,{'sequence': 'Uniform diamond coatings on WC-C...
20969,20970,Analysing Soccer Games with Clustering and Con...,We present a new approach for identifying si...,1,0,0,0,0,0,{'sequence': 'Analysing Soccer Games with Clus...,{'sequence': 'Analysing Soccer Games with Clus...
20970,20971,On the Efficient Simulation of the Left-Tail o...,The sum of Log-normal variates is encountere...,0,0,1,1,0,0,{'sequence': 'On the Efficient Simulation of t...,{'sequence': 'On the Efficient Simulation of t...
