In [1]:
#CODE FOR LOADING THE DATASET "dataset_aueb_structure_v2.json"
import json
import pandas as pd

label2id = {
    'NONE': 0,
    'EVIDENCE': 1,
    'CLAIM': 2}

def load_corpus(path, label_mapping=None):
    with open(path) as fp:
        corpus = json.load(fp)

    documents, texts, labels = [], [], []
    for abstract in corpus:
        documents.append(abstract)
        texts.append(corpus[abstract]['sentences'])
        if isinstance(label_mapping, dict):
            labels.append(
                [label_mapping[str(l).upper()]
                    for l in corpus[abstract]['labels']])
        else:
            labels.append([str(l).upper() for l in corpus[abstract]['labels']])

    assert len(texts) == len(labels)
    data = pd.DataFrame(
        zip(documents, texts, labels),
        columns=['document', 'sentences', 'labels'])

    return data

data = load_corpus('dataset_aueb_structure_v2.json') #, label_mapping=label2id)
print(f'Dataset length: {len(data)} abstracts')
data.sample(5)

Dataset length: 1014 abstracts


Unnamed: 0,document,sentences,labels
588,doi: 10.1093/ajh/hpx157,[ECG Voltage in Relation to Peripheral and Cen...,"[NEITHER, BACKGROUND, BACKGROUND, METHOD, RESU..."
346,doi: 10.1029/2017rg000593,[Remote Sensing of Droplet Number Concentratio...,"[NEITHER, OBJECTIVE, BACKGROUND, METHOD, RESUL..."
42,doi: 10.1002/chem.201901726,[Establishing Communication Between Artificial...,"[NEITHER, BACKGROUND, BACKGROUND, METHOD, OBJE..."
123,doi: 10.1007/s40120-019-00168-1,[Salivary Biomarkers for Alzheimer’s Disease a...,"[NEITHER, BACKGROUND, BACKGROUND, BACKGROUND, ..."
308,doi: 10.1021/acs.jpclett.8b03066,[The Fate of Water at the Electrochemical Inte...,"[NEITHER, BACKGROUND, BACKGROUND, BACKGROUND, ..."


In [2]:
# splitting in train-validation-test sets
from sklearn.model_selection import train_test_split

In [3]:
#Explode to sentences
sentences = data['sentences'].explode().reset_index().rename(
    columns={'index': 'doc_id', 'sentences': 'sentence'})
sentences

Unnamed: 0,doc_id,sentence
0,0,Concordance Between Different Amyloid Immunoas...
1,0,Importance Visual assessment of amyloid positr...
2,0,Several immunoassays have been developed to me...
3,0,The agreement between CSF Aβ42 measures from d...
4,0,Objective To determine the concordance between...
...,...,...
10543,1013,"Instead, SBPs sample a range of conformations ..."
10544,1013,Certain non-transported ligands leave the stru...
10545,1013,"Intriguingly, in some cases, similar SBP confo..."
10546,1013,"In this case, the inability for transport aris..."


In [4]:
#Explode to the corresponding labels
labels = data['labels'].explode().reset_index().rename(
    columns={'index': 'doc_id', 'labels': 'label'})
labels

Unnamed: 0,doc_id,label
0,0,NEITHER
1,0,BACKGROUND
2,0,BACKGROUND
3,0,BACKGROUND
4,0,OBJECTIVE
...,...,...
10543,1013,METHOD
10544,1013,RESULT
10545,1013,RESULT
10546,1013,RESULT


In [5]:
#we drop the doc_id column as it is not needed
labels = labels.drop(['doc_id'],axis =1)
sentences = sentences.drop(['doc_id'],axis =1)

In [6]:
#we concatenate the 2 data frames into one
df = pd.concat([sentences,labels], axis =1, sort = False)

In [7]:
import nltk

In [8]:
#Performing some very basic tokenization to extract STOPWORDS and COMMONWORDS from the train-validation data frame

In [9]:
# converting all sentences to lowercase and replacing the "." with " "
#removing ' ' at the beginning or the end of each sentence
df['sentence'] = df['sentence'].str.lower().str.replace('.', ' ', regex=False).str.strip()
df.head()

Unnamed: 0,sentence,label
0,concordance between different amyloid immunoas...,NEITHER
1,importance visual assessment of amyloid positr...,BACKGROUND
2,several immunoassays have been developed to me...,BACKGROUND
3,the agreement between csf aβ42 measures from d...,BACKGROUND
4,objective to determine the concordance between...,OBJECTIVE


In [10]:
# Concatenating all sentences into one text.
one_text = " ".join(df['sentence'])
print(one_text[:1000])

concordance between different amyloid immunoassays and visual amyloid positron emission tomographic assessment importance visual assessment of amyloid positron emission tomographic (pet) images has been approved by regulatory authorities for clinical use several immunoassays have been developed to measure β-amyloid (aβ) 42 in cerebrospinal fluid (csf) the agreement between csf aβ42 measures from different immunoassays and visual pet readings may influence the use of csf biomarkers and/or amyloid pet assessment in clinical practice and trials objective to determine the concordance between csf aβ42 levels measured using 5 different immunoassays and visual amyloid pet analysis design, setting, and participants the study included 262 patients with mild cognitive impairment or subjective cognitive decline from the swedish biofinder (biomarkers for identifying neurodegenerative disorders early and reliably) cohort (recruited from september 1, 2010, through december 31, 2014) who had undergon

In [11]:
from collections import Counter

In [12]:
#we find the most common words
top_words = Counter(one_text.split()).most_common()
top_words[:20]

[('the', 11384),
 ('of', 9273),
 ('and', 8461),
 ('in', 6171),
 ('to', 4484),
 ('a', 3782),
 ('with', 2555),
 ('for', 2401),
 ('is', 2033),
 ('we', 1677),
 ('that', 1635),
 ('by', 1501),
 ('on', 1305),
 ('as', 1264),
 ('are', 1206),
 ('from', 1123),
 ('this', 1059),
 ('were', 880),
 ('an', 844),
 ('was', 811)]

In [13]:
# Printing the top 100 most common words 
print(sorted([i[0].lower() for i in top_words[:100]]))

['0', '1', '2', '3', '=', 'a', 'after', 'all', 'also', 'an', 'analysis', 'and', 'are', 'as', 'associated', 'at', 'based', 'be', 'been', 'between', 'blood', 'both', 'but', 'by', 'can', 'cell', 'cells', 'change', 'changes', 'climate', 'clinical', 'compared', 'data', 'different', 'disease', 'during', 'effects', 'for', 'from', 'genetic', 'global', 'has', 'have', 'high', 'higher', 'however,', 'human', 'in', 'increased', 'into', 'is', 'it', 'its', 'large', 'levels', 'liver', 'may', 'methods', 'model', 'models', 'more', 'most', 'new', 'not', 'of', 'on', 'only', 'or', 'other', 'our', 'over', 'p', 'patients', 'potential', 'protein', 'response', 'results', 'risk', 'show', 'studies', 'study', 'such', 'than', 'that', 'the', 'their', 'these', 'this', 'to', 'treatment', 'two', 'use', 'used', 'using', 'was', 'we', 'well', 'were', 'which', 'with']


In [14]:
#we also use the english stop words from the nltk library
from nltk.corpus import stopwords
stop = stopwords.words('english')

In [15]:
#we combine stop words and most common used words into a list in order to later remove them from our data frame.
most_common = top_words[:100]
words_to_exclude = most_common+stop
words_to_exclude = list(dict.fromkeys(words_to_exclude))

In [16]:
#we remove them and form a new column which contains the "clean" text
df['sentence_clean'] = df['sentence'].apply(lambda x: ' '.join([word for word in x.split() if word not in (words_to_exclude)]))

In [17]:
#we remove the older column which contained raw text
df = df.drop(['sentence'] ,axis =1)
df

Unnamed: 0,label,sentence_clean
0,NEITHER,concordance different amyloid immunoassays vis...
1,BACKGROUND,importance visual assessment amyloid positron ...
2,BACKGROUND,several immunoassays developed measure β-amylo...
3,BACKGROUND,agreement csf aβ42 measures different immunoas...
4,OBJECTIVE,objective determine concordance csf aβ42 level...
...,...,...
10543,METHOD,"instead, sbps sample range conformations activ..."
10544,RESULT,certain non-transported ligands leave structur...
10545,RESULT,"intriguingly, cases, similar sbp conformations..."
10546,RESULT,"case, inability transport arises slow opening ..."


In [18]:
#We split our data frame into a train-validation and a test data frame.
train_valid, test = train_test_split(df, test_size = 0.2 , random_state = 42)

In [19]:
#keeping actual labels of test dataset
y_true_test = test['label']
y_true_test

1669    CONCLUSION
4901     OBJECTIVE
3229    CONCLUSION
6522    BACKGROUND
2513    BACKGROUND
           ...    
792         METHOD
233        NEITHER
6752    BACKGROUND
6369    CONCLUSION
9638     OBJECTIVE
Name: label, Length: 2110, dtype: object

In [20]:
train_valid

Unnamed: 0,label,sentence_clean
9366,RESULT,co-relation gradually emerging mutations motif...
6335,RESULT,approach allowed us distinguish effects salini...
4896,RESULT,demonstrate simple variations environmental pa...
7359,METHOD,"test hypothesis, seasonal ensemble reforecasts..."
2627,OBJECTIVE,report safety immunogenicity recombinant vesic...
...,...,...
5734,CONCLUSION,funding agencies|catch consortium (eu horizon ...
5191,OBJECTIVE,"here, explore global stability properties syst..."
5390,NEITHER,real-time assessment health-care requirements ...
860,NEITHER,large discrepancies summer climate change euro...


In [21]:
#we split again our train-validation data frame into 2 separate data frames (train & validation)
train, valid = train_test_split(train_valid, test_size = 0.2 , random_state = 42)

In [22]:
#keeping actual labels of train dataset
y_true_train = train['label']
y_true_train

4876        RESULT
2526    CONCLUSION
2147        RESULT
2154    CONCLUSION
2422     OBJECTIVE
           ...    
3740       NEITHER
2158    CONCLUSION
299     CONCLUSION
1934    BACKGROUND
9741     OBJECTIVE
Name: label, Length: 6750, dtype: object

In [23]:
#keeping actual labels of valid dataset
y_true_valid = valid['label']
y_true_valid

8203        RESULT
1362        METHOD
5806        METHOD
3379        METHOD
976         METHOD
           ...    
9899       NEITHER
5231       NEITHER
7360        METHOD
1673        METHOD
9147    CONCLUSION
Name: label, Length: 1688, dtype: object

In [24]:
#we fix the format of the label column in order to later match to the fasttext's required input format.
train.iloc[:, 0] = train.iloc[:, 0].apply(lambda x: '__label__' + x)
valid.iloc[:, 0] = valid.iloc[:, 0].apply(lambda x: '__label__' + x)
test.iloc[:, 0] = test.iloc[:, 0].apply(lambda x: '__label__' + x)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self.obj[item_labels[indexer[info_axis]]] = value


In [25]:
valid

Unnamed: 0,label,sentence_clean
8203,__label__RESULT,"found adjusting certain parameters, albeit wit..."
1362,__label__METHOD,"three different boundary conditions (linear, m..."
5806,__label__METHOD,large proportion expression quantitative trait...
3379,__label__METHOD,collect millennial‐length simulations coupled ...
976,__label__METHOD,two different boundary conditions defined repr...
...,...,...
9899,__label__NEITHER,potential elastic polarization lidars retrieve...
5231,__label__NEITHER,equilibrium forest demography explains distrib...
7360,__label__METHOD,"model systematic biases 2-m temperature, sea s..."
1673,__label__METHOD,"then, irradiation microwaves triggered simulta..."


In [26]:
#for the train dataset
#we combine the 2 columns into a new column which is required for the fasttext input format
train["fasttext"] = train["sentence_clean"] + ' ' + train["label"]
#we drop the 2 older columns
train = train.drop(['sentence_clean'],axis =1)
train = train.drop(['label'],axis =1)
train

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  train["fasttext"] = train["sentence_clean"] + ' ' + train["label"]


Unnamed: 0,fasttext
4876,also estimate minimum number virions produced ...
2526,ensemble-based computational methods based str...
2147,"64%, p <0 001) ohe development (35% vs __label..."
2154,results support clinical use total cross-secti...
2422,latest findings demonstrate cell cycle progres...
...,...
3740,"trends gpcr drug discovery: new agents, target..."
2158,825694) cellex foundation __label__CONCLUSION
299,calculations confirm active role bridge plays ...
1934,introduction: within-person trajectories cereb...


In [27]:
#for the validation dataset
#we combine the 2 columns into a new column which is required for the fasttext input format
valid["fasttext"] = valid["sentence_clean"] + ' ' + valid["label"]
#we drop the 2 older columns
valid = valid.drop(['sentence_clean'],axis =1)
valid = valid.drop(['label'],axis =1)
valid

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  valid["fasttext"] = valid["sentence_clean"] + ' ' + valid["label"]


Unnamed: 0,fasttext
8203,"found adjusting certain parameters, albeit wit..."
1362,"three different boundary conditions (linear, m..."
5806,large proportion expression quantitative trait...
3379,collect millennial‐length simulations coupled ...
976,two different boundary conditions defined repr...
...,...
9899,potential elastic polarization lidars retrieve...
5231,equilibrium forest demography explains distrib...
7360,"model systematic biases 2-m temperature, sea s..."
1673,"then, irradiation microwaves triggered simulta..."


In [28]:
#for the test dataset
#we combine the 2 columns into a new column which is required for the fasttext input format
test["fasttext"] = test["sentence_clean"] + ' ' + test["label"]
#we drop the 2 older columns
test = test.drop(['sentence_clean'],axis =1)
test = test.drop(['label'],axis =1)
test

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  test["fasttext"] = test["sentence_clean"] + ' ' + test["label"]


Unnamed: 0,fasttext
1669,"conclusion, data indicate metformin glycinate ..."
4901,previously identified increased autoantibody l...
3229,view high structural rigidity efficient isomer...
6522,intergovernmental panel climate change (ipcc) ...
2513,"transitions literature far, however, featured ..."
...,...
792,position storm’s track relative sst-front impo...
233,"chromophore activation α,β-unsaturated carbony..."
6752,classical structural biology provide static sn...
6369,novel therapeutic agents development expected ...


In [29]:
import numpy as np

In [30]:
#we produce a .txt file from the test dataset which will be later used as test input in our fasttext model
test.to_csv('test2.txt', index = False, header = False, quotechar = " ", escapechar = "")

In [31]:
#we produce a .txt file from the train dataset which will be later used as train input in our fasttext model
train.to_csv('train2.txt', index = False, header = False, quotechar = " ", escapechar = "")

In [32]:
#we produce a .txt file from the validation dataset which will be later used as validation input in our fasttext model
valid.to_csv('valid2.txt', index = False, header = False, quotechar = " ", escapechar = "")

In [33]:
import fasttext

In [34]:
### WE USED DIFFERENT PARAMETERS AND TRIED MANY OPTIONS WITH THE SUPERVISED TRAINING MODEL.

In [35]:
#We train our model
model = fasttext.train_supervised(input="train2.txt")

In [36]:
valid[['sentence','label']] = valid["fasttext"].str.split("__label__",expand=True)
valid = valid.drop(['fasttext'],axis =1)
valid

Unnamed: 0,sentence,label
8203,"found adjusting certain parameters, albeit wit...",RESULT
1362,"three different boundary conditions (linear, m...",METHOD
5806,large proportion expression quantitative trait...,METHOD
3379,collect millennial‐length simulations coupled ...,METHOD
976,two different boundary conditions defined repr...,METHOD
...,...,...
9899,potential elastic polarization lidars retrieve...,NEITHER
5231,equilibrium forest demography explains distrib...,NEITHER
7360,"model systematic biases 2-m temperature, sea s...",METHOD
1673,"then, irradiation microwaves triggered simulta...",METHOD


In [37]:
test[['sentence','label']] = test["fasttext"].str.split("__label__",expand=True)
test = test.drop(['fasttext'],axis =1)
test

Unnamed: 0,sentence,label
1669,"conclusion, data indicate metformin glycinate ...",CONCLUSION
4901,previously identified increased autoantibody l...,OBJECTIVE
3229,view high structural rigidity efficient isomer...,CONCLUSION
6522,intergovernmental panel climate change (ipcc) ...,BACKGROUND
2513,"transitions literature far, however, featured ...",BACKGROUND
...,...,...
792,position storm’s track relative sst-front impo...,METHOD
233,"chromophore activation α,β-unsaturated carbony...",NEITHER
6752,classical structural biology provide static sn...,BACKGROUND
6369,novel therapeutic agents development expected ...,CONCLUSION


In [38]:
valid_sentences = valid["sentence"].apply(lambda s: " ".join(s.split()))
valid_sentences = valid_sentences.apply(lambda s: model.predict(s)[0][0])
valid_sentences

8203        __label__RESULT
1362        __label__RESULT
5806        __label__RESULT
3379        __label__RESULT
976      __label__OBJECTIVE
               ...         
9899    __label__BACKGROUND
5231        __label__RESULT
7360        __label__RESULT
1673        __label__RESULT
9147        __label__RESULT
Name: sentence, Length: 1688, dtype: object

In [39]:
y_pred_valid = valid_sentences.str.replace('__label__','')
y_pred_valid

8203        RESULT
1362        RESULT
5806        RESULT
3379        RESULT
976      OBJECTIVE
           ...    
9899    BACKGROUND
5231        RESULT
7360        RESULT
1673        RESULT
9147        RESULT
Name: sentence, Length: 1688, dtype: object

In [40]:
test_sentences = test["sentence"].apply(lambda s: " ".join(s.split()))
test_sentences = test_sentences.apply(lambda s: model.predict(s)[0][0])
test_sentences

1669        __label__RESULT
4901        __label__RESULT
3229    __label__BACKGROUND
6522        __label__RESULT
2513    __label__BACKGROUND
               ...         
792     __label__BACKGROUND
233     __label__BACKGROUND
6752    __label__BACKGROUND
6369    __label__CONCLUSION
9638     __label__OBJECTIVE
Name: sentence, Length: 2110, dtype: object

In [41]:
y_pred_test = test_sentences.str.replace('__label__','')
y_pred_test

1669        RESULT
4901        RESULT
3229    BACKGROUND
6522        RESULT
2513    BACKGROUND
           ...    
792     BACKGROUND
233     BACKGROUND
6752    BACKGROUND
6369    CONCLUSION
9638     OBJECTIVE
Name: sentence, Length: 2110, dtype: object

In [42]:
target_names = ['RESULT', 'BACKGROUND', 'OBJECTIVE','CONCLUSION', 'METHOD', 'NEITHER']

In [43]:
from sklearn.metrics import confusion_matrix
from sklearn.metrics import classification_report

In [44]:
y_true_valid = y_true_valid.str.replace('__label__','')
y_true_valid

8203        RESULT
1362        METHOD
5806        METHOD
3379        METHOD
976         METHOD
           ...    
9899       NEITHER
5231       NEITHER
7360        METHOD
1673        METHOD
9147    CONCLUSION
Name: label, Length: 1688, dtype: object

In [45]:
y_true_test = y_true_test.str.replace('__label__','')
y_true_test

1669    CONCLUSION
4901     OBJECTIVE
3229    CONCLUSION
6522    BACKGROUND
2513    BACKGROUND
           ...    
792         METHOD
233        NEITHER
6752    BACKGROUND
6369    CONCLUSION
9638     OBJECTIVE
Name: label, Length: 2110, dtype: object

In [46]:
#FOR VALIDATION DATASET
print('validation dataset')
print(classification_report(y_true_valid, y_pred_valid, labels=target_names))

validation dataset
              precision    recall  f1-score   support

      RESULT       0.46      0.77      0.58       434
  BACKGROUND       0.39      0.51      0.44       343
   OBJECTIVE       0.45      0.59      0.51       303
  CONCLUSION       0.44      0.21      0.28       199
      METHOD       1.00      0.01      0.02       246
     NEITHER       0.30      0.06      0.10       163

    accuracy                           0.44      1688
   macro avg       0.51      0.36      0.32      1688
weighted avg       0.51      0.44      0.38      1688



In [47]:
#By default, fastText sees each training example only five times during training, which is pretty small, 
#given that our training set only have 20.5k training examples.
#The number of times each examples is seen (also known as the number of epochs), can be increased using the -epoch option:

In [48]:
#We train our model again implemeting hyper parameter tunning (epochs)
model_2 = fasttext.train_supervised(input="train2.txt", epoch=18)

In [49]:
valid_sentences = valid["sentence"].apply(lambda s: " ".join(s.split()))
valid_sentences = valid_sentences.apply(lambda s: model_2.predict(s)[0][0])
y_pred_valid = valid_sentences.str.replace('__label__','')
#FOR VALIDATION DATASET
print('validation dataset')
print(classification_report(y_true_valid, y_pred_valid, labels=target_names))


validation dataset
              precision    recall  f1-score   support

      RESULT       0.58      0.61      0.59       434
  BACKGROUND       0.59      0.60      0.59       343
   OBJECTIVE       0.53      0.54      0.54       303
  CONCLUSION       0.45      0.34      0.39       199
      METHOD       0.42      0.55      0.48       246
     NEITHER       0.58      0.34      0.43       163

    accuracy                           0.53      1688
   macro avg       0.52      0.50      0.50      1688
weighted avg       0.53      0.53      0.53      1688



In [50]:
#epoch's increment does seem to make our model better (f1 score is increased , accuracy 0.43 -> 0.52)
#thus we will keep epoch = 18

In [51]:
#Another way to change the learning speed of our model is to increase (or decrease) the learning rate of the algorithm.
#This corresponds to how much the model changes after processing each example. 
#A learning rate of 0 would mean that the model does not change at all, and thus, does not learn anything. 
#Good values of the learning rate are in the range 0.1 - 1.0.

In [52]:
#We train our model again implemeting hyper parameter tunning (learning rate)
model_3 = fasttext.train_supervised(input="train2.txt",epoch = 18,  lr=0.5)
valid_sentences = valid["sentence"].apply(lambda s: " ".join(s.split()))
valid_sentences = valid_sentences.apply(lambda s: model_3.predict(s)[0][0])
y_pred_valid = valid_sentences.str.replace('__label__','')
#FOR VALIDATION DATASET
print('validation dataset')
print(classification_report(y_true_valid, y_pred_valid, labels=target_names))

validation dataset
              precision    recall  f1-score   support

      RESULT       0.59      0.58      0.59       434
  BACKGROUND       0.57      0.59      0.58       343
   OBJECTIVE       0.50      0.53      0.52       303
  CONCLUSION       0.47      0.37      0.41       199
      METHOD       0.39      0.56      0.46       246
     NEITHER       0.57      0.33      0.41       163

    accuracy                           0.52      1688
   macro avg       0.52      0.49      0.50      1688
weighted avg       0.53      0.52      0.52      1688



In [53]:
#learning's rate (lr) increment does not seem to make our model better (accuracy drops)
#thus we will keep learning rate = 0.1 (by default)

In [54]:
#Finally, we can improve the performance of a model by using word bigrams, instead of just unigrams. 
#This is especially important for classification problems where word order is important, such as sentiment analysis.

In [55]:
#We train our model again implemeting hyper parameter tunning (wordNgrams)
model_4 = fasttext.train_supervised(input="train2.txt",wordNgrams=2, epoch = 18)
valid_sentences = valid["sentence"].apply(lambda s: " ".join(s.split()))
valid_sentences = valid_sentences.apply(lambda s: model_4.predict(s)[0][0])
y_pred_valid = valid_sentences.str.replace('__label__','')
#FOR VALIDATION DATASET
print('validation dataset')
print(classification_report(y_true_valid, y_pred_valid, labels=target_names))

validation dataset
              precision    recall  f1-score   support

      RESULT       0.61      0.61      0.61       434
  BACKGROUND       0.56      0.63      0.59       343
   OBJECTIVE       0.62      0.52      0.56       303
  CONCLUSION       0.47      0.27      0.34       199
      METHOD       0.37      0.62      0.47       246
     NEITHER       0.54      0.30      0.39       163

    accuracy                           0.53      1688
   macro avg       0.53      0.49      0.49      1688
weighted avg       0.54      0.53      0.52      1688



In [56]:
#wordNgrams increment does not make our model better, it produces similar results

In [57]:
#We train our model again implemeting hyper parameter tunning
model_5 = fasttext.train_supervised(input="train2.txt", loss='ova', lr=0.1, epoch=18)
valid_sentences = valid["sentence"].apply(lambda s: " ".join(s.split()))
valid_sentences = valid_sentences.apply(lambda s: model_5.predict(s)[0][0])
y_pred_valid = valid_sentences.str.replace('__label__','')
#FOR VALIDATION DATASET
print('validation dataset')
print(classification_report(y_true_valid, y_pred_valid, labels=target_names))

validation dataset
              precision    recall  f1-score   support

      RESULT       0.58      0.64      0.61       434
  BACKGROUND       0.57      0.60      0.59       343
   OBJECTIVE       0.53      0.55      0.54       303
  CONCLUSION       0.47      0.35      0.40       199
      METHOD       0.45      0.54      0.49       246
     NEITHER       0.64      0.36      0.46       163

    accuracy                           0.54      1688
   macro avg       0.54      0.51      0.52      1688
weighted avg       0.55      0.54      0.54      1688



In [58]:
#we will use model_5 in our test dataset since it produces the best accuracy and f1 scores.

In [59]:
test_sentences = test["sentence"].apply(lambda s: " ".join(s.split()))
test_sentences = test_sentences.apply(lambda s: model_5.predict(s)[0][0])
y_pred_test = test_sentences.str.replace('__label__','')
#FOR test DATASET
print('test dataset')
print(classification_report(y_true_test, y_pred_test, labels=target_names))

test dataset
              precision    recall  f1-score   support

      RESULT       0.60      0.69      0.64       511
  BACKGROUND       0.53      0.62      0.57       410
   OBJECTIVE       0.56      0.54      0.55       393
  CONCLUSION       0.43      0.36      0.39       247
      METHOD       0.49      0.49      0.49       338
     NEITHER       0.60      0.41      0.49       211

    accuracy                           0.55      2110
   macro avg       0.54      0.52      0.52      2110
weighted avg       0.55      0.55      0.54      2110



In [60]:
#saving our best model
model_5.save_model("fasttext_structure_final_model.bin")