In [1]:
#CODE FOR LOADING THE DATASET dataset_aueb_argument_v3
import json
import pandas as pd

label2id = {
    'NONE': 0,
    'EVIDENCE': 1,
    'CLAIM': 2}

def load_corpus(path, label_mapping=None):
    with open(path) as fp:
        corpus = json.load(fp)

    documents, texts, labels = [], [], []
    for abstract in corpus:
        documents.append(abstract)
        texts.append(corpus[abstract]['sentences'])
        if isinstance(label_mapping, dict):
            labels.append(
                [label_mapping[str(l).upper()]
                    for l in corpus[abstract]['labels']])
        else:
            labels.append([str(l).upper() for l in corpus[abstract]['labels']])

    assert len(texts) == len(labels)
    data = pd.DataFrame(
        zip(documents, texts, labels),
        columns=['document', 'sentences', 'labels'])

    return data

data = load_corpus('dataset_aueb_argument_v3.json') #, label_mapping=label2id)
print(f'Dataset length: {len(data)} abstracts')
data.sample(5)

Dataset length: 1017 abstracts


Unnamed: 0,document,sentences,labels
809,doi: 10.1186/s13075-019-1902-2,[ACPA-negative RA consists of subgroups: patie...,"[NEITHER, NEITHER, NEITHER, NEITHER, NEITHER, ..."
896,doi: 10.3389/fmicb.2019.00811,[The Human Upper Respiratory Tract Epithelium ...,"[NEITHER, NEITHER, NEITHER, NEITHER, NEITHER, ..."
55,doi: 10.1002/hep.30986,[qFIBS: A Novel Automated Technique for Quanti...,"[NEITHER, NEITHER, NEITHER, EVIDENCE, CLAIM]"
196,doi: 10.1016/j.gca.2020.03.042,[Main controls on the stable carbon isotope co...,"[NEITHER, NEITHER, NEITHER, NEITHER, NEITHER, ..."
712,doi: 10.1126/scitranslmed.aaj1701,[A dose-dependent plasma signature of the safe...,"[NEITHER, NEITHER, CLAIM, EVIDENCE, NEITHER, N..."


In [2]:
#CODE FOR LOADING THE DATASET.json
label2id = {
    'NONE': 0,
    'EVIDENCE': 1,
    'CLAIM': 2}

def load_corpus(path, label_mapping=None):
    with open(path) as fp:
        corpus = json.load(fp)

    documents, texts, labels = [], [], []
    for abstract in corpus:
        documents.append(abstract)
        texts.append(corpus[abstract]['sentences'])
        if isinstance(label_mapping, dict):
            labels.append(
                [label_mapping[str(l).upper()]
                    for l in corpus[abstract]['labels']])
        else:
            labels.append([str(l).upper() for l in corpus[abstract]['labels']])

    assert len(texts) == len(labels)
    data = pd.DataFrame(
        zip(documents, texts, labels),
        columns=['document', 'sentences', 'labels'])

    return data

data2 = load_corpus('dataset.json') #, label_mapping=label2id)
print(f'Dataset length: {len(data2)} abstracts')
data2.sample(5)

Dataset length: 1669 abstracts


Unnamed: 0,document,sentences,labels
1541,21905158,[Home-based physical activity intervention for...,"[NONE, NONE, NONE, NONE, NONE, NONE, EVIDENCE,..."
1115,16179098,[Multicenter phase II trial of carboplatin/vin...,"[NONE, NONE, NONE, NONE, NONE, NONE, NONE, NON..."
790,LNR_G5B2_10.1038_ngeo2894.txt,[Title: Extreme winds and precipitation during...,"[NONE, NONE, NONE, NONE, EVIDENCE, NONE, NONE,..."
1576,9640214,[Quality of life of early-stage breast cancer ...,"[NONE, NONE, NONE, NONE, NONE, NONE, NONE, NON..."
71,DIJ_G2B2_14.txt,[The (Not So) Changing Man: Dynamic Gender Ste...,"[NONE, NONE, NONE, NONE, NONE, NONE, NONE, NON..."


In [3]:
#concatenate the 2 datasets
data_final = pd.concat([data,data2], ignore_index=True)

In [4]:
#check the rows
print(f'Dataset length: {len(data_final)} abstracts')

Dataset length: 2686 abstracts


In [5]:
#preview
data_final

Unnamed: 0,document,sentences,labels
0,doi: 10.1001/jamaneurol.2017.2814,[Concordance Between Different Amyloid Immunoa...,"[NEITHER, NEITHER, NEITHER, NEITHER, NEITHER, ..."
1,doi: 10.1001/jamaneurol.2017.4913,[Association of Changes in Plasma Neurofilamen...,"[NEITHER, NEITHER, NEITHER, NEITHER, NEITHER, ..."
2,doi: 10.1002/2015gl067056,[Dynamically triggered slip leading to sustain...,"[NEITHER, NEITHER, NEITHER, EVIDENCE, EVIDENCE..."
3,doi: 10.1002/2015ms000564,[Impacts of parameterized orographic drag on t...,"[NEITHER, NEITHER, NEITHER, NEITHER, NEITHER, ..."
4,doi: 10.1002/2016gl069551,"[Climate model biases in jet streams, blocking...","[NEITHER, NEITHER, NEITHER, NEITHER, NEITHER, ..."
...,...,...,...
2681,17947826,[Long-term intraocular pressure control of tra...,"[NONE, NONE, EVIDENCE, EVIDENCE, CLAIM]"
2682,19383599,[Combined trabeculectomy and cataract extracti...,"[NONE, NONE, NONE, NONE, NONE, NONE, NONE, NON..."
2683,17601060,[[Ocular hypotensive effect of 1% carteolol lo...,"[NONE, NONE, NONE, NONE, NONE, NONE, EVIDENCE,..."
2684,12383808,[A prospective randomized trial comparing intr...,"[NONE, NONE, NONE, NONE, NONE, NONE, NONE, NON..."


In [6]:
#Explode to sentences
sentences = data_final['sentences'].explode().reset_index().rename(
    columns={'index': 'doc_id', 'sentences': 'sentence'})
sentences

Unnamed: 0,doc_id,sentence
0,0,Concordance Between Different Amyloid Immunoas...
1,0,Importance Visual assessment of amyloid positr...
2,0,Several immunoassays have been developed to me...
3,0,The agreement between CSF Aβ42 measures from d...
4,0,Objective To determine the concordance between...
...,...,...
31999,2685,No statistically significant difference in con...
32000,2685,Latanoprost 0.005% once daily reduced IOP more...
32001,2685,Latanoprost had no statistically or clinically...
32002,2685,There was no difference in hyperemia between t...


In [7]:
#create the lexicon lists in order to be used for aurgumentation in each sentence
check_claim = ["provide", "reveal", "confirm", "suggest", "claim", "conclusion", "overall", "summary", "indicate", "proove", "finally"]
check_evidence = ["evidence", "result", "finding", "%"]

In [8]:
#create a column for the label
sentences["label"] =""
#set all letters to lowercase
sentences["sentence"] = sentences["sentence"].str.lower()
#produce a counter refering to each doc_id number of sentences
sentences['count'] = sentences['doc_id'].map(sentences['doc_id'].value_counts())

In [9]:
#print the data frame for testing reasons
sentences

Unnamed: 0,doc_id,sentence,label,count
0,0,concordance between different amyloid immunoas...,,17
1,0,importance visual assessment of amyloid positr...,,17
2,0,several immunoassays have been developed to me...,,17
3,0,the agreement between csf aβ42 measures from d...,,17
4,0,objective to determine the concordance between...,,17
...,...,...,...,...
31999,2685,no statistically significant difference in con...,,15
32000,2685,latanoprost 0.005% once daily reduced iop more...,,15
32001,2685,latanoprost had no statistically or clinically...,,15
32002,2685,there was no difference in hyperemia between t...,,15


In [10]:
#labeling each sentence acoording to the lexicon rules that were created above
for i in range(0, len(sentences)):
  if sentences["sentence"].str.contains('|'.join(check_claim))[i]:
    sentences["label"][i] = "CLAIM"
  elif sentences["sentence"].str.contains('|'.join(check_evidence))[i]:
    sentences["label"][i] = "EVIDENCE"
  else:
    sentences["label"][i] = "NEITHER"

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  sentences["label"][i] = "NEITHER"
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  sentences["label"][i] = "EVIDENCE"
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  sentences["label"][i] = "CLAIM"


In [11]:
#creating an extra rule in order to fill some extra lables
#below code checks for doc_ids that have both CLAIM and EVIDENCE as labels
#when this is the case this code fills all inbetween sentences with EVIDENCE label
#Also when CLAIM exists as label in a doc_id and it is not the final sentence of this abstract the next sentence 
#is also labeled as claim
j = 0
count = 0
for i in range(0,2685):
  evi = 0
  cla = 0
  count = sentences["count"][j] + count
  while sentences["doc_id"][j] == i:
    while j < count:
      if sentences["label"][j] == "EVIDENCE":
        evi = j
      elif sentences["label"][j] == "CLAIM":
        cla = j
      j = j + 1
    if (evi != 0 and cla != 0):
      for k in range ((evi+1),cla):
        sentences["label"][k] = "EVIDENCE"
    if (cla != 0) and (cla == count-2):
      sentences["label"][cla+1] = "CLAIM"

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  sentences["label"][k] = "EVIDENCE"
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  sentences["label"][cla+1] = "CLAIM"


In [12]:
#preview
sentences

Unnamed: 0,doc_id,sentence,label,count
0,0,concordance between different amyloid immunoas...,NEITHER,17
1,0,importance visual assessment of amyloid positr...,NEITHER,17
2,0,several immunoassays have been developed to me...,NEITHER,17
3,0,the agreement between csf aβ42 measures from d...,NEITHER,17
4,0,objective to determine the concordance between...,NEITHER,17
...,...,...,...,...
31999,2685,no statistically significant difference in con...,NEITHER,15
32000,2685,latanoprost 0.005% once daily reduced iop more...,EVIDENCE,15
32001,2685,latanoprost had no statistically or clinically...,NEITHER,15
32002,2685,there was no difference in hyperemia between t...,NEITHER,15


In [13]:
#explode the corresponding labels
labels = data_final['labels'].explode().reset_index().rename(
    columns={'index': 'doc_id', 'labels': 'label'})
labels

Unnamed: 0,doc_id,label
0,0,NEITHER
1,0,NEITHER
2,0,NEITHER
3,0,NEITHER
4,0,NEITHER
...,...,...
31999,2685,EVIDENCE
32000,2685,NONE
32001,2685,CLAIM
32002,2685,CLAIM


In [14]:
#we replace the "NONE" label with "NEITHER" in order to make the 2 data sets match.
labels['label'] = labels['label'].replace('NONE','NEITHER')
labels

Unnamed: 0,doc_id,label
0,0,NEITHER
1,0,NEITHER
2,0,NEITHER
3,0,NEITHER
4,0,NEITHER
...,...,...
31999,2685,EVIDENCE
32000,2685,NEITHER
32001,2685,CLAIM
32002,2685,CLAIM


In [15]:
#load functions required for evaluation
from sklearn.metrics import confusion_matrix
from sklearn.metrics import classification_report

In [16]:
y_true = labels["label"]
y_pred = sentences["label"]

In [17]:
target_names = ['CLAIM', 'EVIDENCE', 'NEITHER']

In [19]:
# splitting in train-validation-test sets
from sklearn.model_selection import train_test_split

In [20]:
#We split our data frame into a train-validation and a test data frame.
train_valid_true, test_true = train_test_split(y_true, test_size = 0.2 , random_state = 42)

In [21]:
#we split again our train-validation data frame into 2 separate data frames (train & validation)
train_true, valid_true = train_test_split(train_valid_true, test_size = 0.2 , random_state = 42)

In [22]:
#We split our data frame into a train-validation and a test data frame.
train_valid_pred, test_pred = train_test_split(y_pred, test_size = 0.2 , random_state = 42)

In [23]:
#we split again our train-validation data frame into 2 separate data frames (train & validation)
train_pred, valid_pred = train_test_split(train_valid_pred, test_size = 0.2 , random_state = 42)

In [24]:
#this is the evaluation of our baseline model for the test dataset that will be used in the fasttext approach
print(classification_report(test_true, test_pred, labels=target_names))

              precision    recall  f1-score   support

       CLAIM       0.35      0.36      0.35       669
    EVIDENCE       0.43      0.45      0.44      1209

   micro avg       0.40      0.42      0.41      1878
   macro avg       0.39      0.40      0.40      1878
weighted avg       0.40      0.42      0.41      1878

