In [1]:
#CODE FOR LOADING THE DATASET "dataset_aueb_argument_v3.json"
import json
import pandas as pd

label2id = {
    'NONE': 0,
    'EVIDENCE': 1,
    'CLAIM': 2}

def load_corpus(path, label_mapping=None):
    with open(path) as fp:
        corpus = json.load(fp)

    documents, texts, labels = [], [], []
    for abstract in corpus:
        documents.append(abstract)
        texts.append(corpus[abstract]['sentences'])
        if isinstance(label_mapping, dict):
            labels.append(
                [label_mapping[str(l).upper()]
                    for l in corpus[abstract]['labels']])
        else:
            labels.append([str(l).upper() for l in corpus[abstract]['labels']])

    assert len(texts) == len(labels)
    data = pd.DataFrame(
        zip(documents, texts, labels),
        columns=['document', 'sentences', 'labels'])

    return data

data = load_corpus('dataset_aueb_argument_v3.json') #, label_mapping=label2id)
print(f'Dataset length: {len(data)} abstracts')
data.sample(5)

Dataset length: 1017 abstracts


Unnamed: 0,document,sentences,labels
218,doi: 10.1016/j.jhep.2017.02.014,[Impaired hepatic lipid synthesis from polyuns...,"[NEITHER, NEITHER, NEITHER, NEITHER, NEITHER, ..."
369,doi: 10.1029/2019ms001628,[Ensembles of Global Climate Model Variants De...,"[NEITHER, NEITHER, NEITHER, NEITHER, NEITHER, ..."
25,doi: 10.1002/anie.201610837,[Miller-Urey Spark-Discharge Experiments in th...,"[NEITHER, NEITHER, NEITHER, EVIDENCE, NEITHER,..."
868,doi: 10.15252/emmm.201707809,[18\n F‐AV‐1451 and CSF T‐tau and P...,"[NEITHER, NEITHER, NEITHER, NEITHER, NEITHER, ..."
190,doi: 10.1016/j.freeradbiomed.2019.04.027,[Computational solutions in redox lipidomics –...,"[NEITHER, NEITHER, NEITHER, NEITHER, NEITHER, ..."


In [2]:
#CODE FOR LOADING THE DATASET "dataset.json"
import json
import pandas as pd

label2id = {
    'NONE': 0,
    'EVIDENCE': 1,
    'CLAIM': 2}

def load_corpus(path, label_mapping=None):
    with open(path) as fp:
        corpus = json.load(fp)

    documents, texts, labels = [], [], []
    for abstract in corpus:
        documents.append(abstract)
        texts.append(corpus[abstract]['sentences'])
        if isinstance(label_mapping, dict):
            labels.append(
                [label_mapping[str(l).upper()]
                    for l in corpus[abstract]['labels']])
        else:
            labels.append([str(l).upper() for l in corpus[abstract]['labels']])

    assert len(texts) == len(labels)
    data = pd.DataFrame(
        zip(documents, texts, labels),
        columns=['document', 'sentences', 'labels'])

    return data

data2 = load_corpus('dataset.json') #, label_mapping=label2id)
print(f'Dataset length: {len(data2)} abstracts')
data2.sample(5)

Dataset length: 1669 abstracts


Unnamed: 0,document,sentences,labels
706,LMN_G5B1_10.1016_j.jenvman.2015.11.043.txt,[Title: National climate policies across Europ...,"[NONE, NONE, NONE, NONE, NONE, NONE, NONE, EVI..."
649,ABC_G1B2_10.1016_j.scs.2018.12.041.txt,[A GIS-based model to assess electric energy c...,"[NONE, NONE, CLAIM, NONE, NONE, EVIDENCE, EVID..."
294,FGP_G3B3_PMID31021567.txt,[Title: A qualitative study on the oral health...,"[NONE, NONE, NONE, NONE, NONE, NONE, NONE, NON..."
1362,16476841,[Early intervention with epoetin alfa during p...,"[NONE, NONE, NONE, NONE, EVIDENCE, EVIDENCE, E..."
1651,11148810,[Long-term effects of timolol therapy in ocula...,"[NONE, CLAIM, NONE, NONE, NONE, NONE, NONE, EV..."


In [3]:
#concatenate the 2 datasets
data_final = pd.concat([data,data2], ignore_index=True)

In [4]:
# splitting in train-validation-test sets
from sklearn.model_selection import train_test_split

In [5]:
#Explode to sentences
sentences = data_final['sentences'].explode().reset_index().rename(
    columns={'index': 'doc_id', 'sentences': 'sentence'})
sentences

Unnamed: 0,doc_id,sentence
0,0,Concordance Between Different Amyloid Immunoas...
1,0,Importance Visual assessment of amyloid positr...
2,0,Several immunoassays have been developed to me...
3,0,The agreement between CSF Aβ42 measures from d...
4,0,Objective To determine the concordance between...
...,...,...
31999,2685,No statistically significant difference in con...
32000,2685,Latanoprost 0.005% once daily reduced IOP more...
32001,2685,Latanoprost had no statistically or clinically...
32002,2685,There was no difference in hyperemia between t...


In [6]:
#Explode to the corresponding labels
labels = data_final['labels'].explode().reset_index().rename(
    columns={'index': 'doc_id', 'labels': 'label'})
labels

Unnamed: 0,doc_id,label
0,0,NEITHER
1,0,NEITHER
2,0,NEITHER
3,0,NEITHER
4,0,NEITHER
...,...,...
31999,2685,EVIDENCE
32000,2685,NONE
32001,2685,CLAIM
32002,2685,CLAIM


In [7]:
#we drop the doc_id column as it is not needed
labels = labels.drop(['doc_id'],axis =1)
sentences = sentences.drop(['doc_id'],axis =1)

In [8]:
#we concatenate the 2 data frames into one
df = pd.concat([sentences,labels], axis =1, sort = False)

In [9]:
#we replace the "NONE" label with "NEITHER" in order to make the 2 data sets match.
df['label'] = df['label'].replace('NONE','NEITHER')
df

Unnamed: 0,sentence,label
0,Concordance Between Different Amyloid Immunoas...,NEITHER
1,Importance Visual assessment of amyloid positr...,NEITHER
2,Several immunoassays have been developed to me...,NEITHER
3,The agreement between CSF Aβ42 measures from d...,NEITHER
4,Objective To determine the concordance between...,NEITHER
...,...,...
31999,No statistically significant difference in con...,EVIDENCE
32000,Latanoprost 0.005% once daily reduced IOP more...,NEITHER
32001,Latanoprost had no statistically or clinically...,CLAIM
32002,There was no difference in hyperemia between t...,CLAIM


In [10]:
import nltk

In [11]:
# Performing some very basic tokenization to extract STOPWORDS and COMMONWORDS from the train-validation data frame

In [12]:
# converting all sentences to lowercase and replacing the "." with " "
#removing ' ' at the beginning or the end of each sentence
df['sentence'] = df['sentence'].str.lower().str.replace('.', ' ', regex=False).str.strip()
df.head()

Unnamed: 0,sentence,label
0,concordance between different amyloid immunoas...,NEITHER
1,importance visual assessment of amyloid positr...,NEITHER
2,several immunoassays have been developed to me...,NEITHER
3,the agreement between csf aβ42 measures from d...,NEITHER
4,objective to determine the concordance between...,NEITHER


In [13]:
# Concatenating all sentences into one text.
one_text = " ".join(df['sentence'])
print(one_text[:1000])

concordance between different amyloid immunoassays and visual amyloid positron emission tomographic assessment importance visual assessment of amyloid positron emission tomographic (pet) images has been approved by regulatory authorities for clinical use several immunoassays have been developed to measure β-amyloid (aβ) 42 in cerebrospinal fluid (csf) the agreement between csf aβ42 measures from different immunoassays and visual pet readings may influence the use of csf biomarkers and/or amyloid pet assessment in clinical practice and trials objective to determine the concordance between csf aβ42 levels measured using 5 different immunoassays and visual amyloid pet analysis design, setting, and participants the study included 262 patients with mild cognitive impairment or subjective cognitive decline from the swedish biofinder (biomarkers for identifying neurodegenerative disorders early and reliably) cohort (recruited from september 1, 2010, through december 31, 2014) who had undergon

In [14]:
from collections import Counter

In [15]:
#we find the most common words
top_words = Counter(one_text.split()).most_common()
top_words[:20]

[('the', 32281),
 ('of', 26693),
 ('and', 26293),
 ('in', 18512),
 ('to', 14256),
 ('a', 10400),
 ('with', 8458),
 ('for', 7671),
 ('were', 5245),
 ('was', 4824),
 ('is', 4493),
 ('on', 4078),
 ('patients', 3976),
 ('that', 3856),
 ('by', 3522),
 ('as', 3272),
 ('from', 3194),
 ('this', 3041),
 ('at', 2887),
 ('we', 2850)]

In [16]:
# Printing the top 100 most common words 
print(sorted([i[0].lower() for i in top_words[:100]]))

['(p', '0', '1', '2', '3', '4', '5', '6', '=', 'a', 'after', 'all', 'also', 'among', 'an', 'analysis', 'and', 'are', 'as', 'associated', 'at', 'be', 'been', 'between', 'both', 'but', 'by', 'can', 'cancer', 'change', 'climate', 'clinical', 'compared', 'data', 'different', 'during', 'effects', 'energy', 'for', 'from', 'global', 'group', 'groups', 'had', 'has', 'have', 'health', 'higher', 'in', 'increased', 'is', 'it', 'life', 'may', 'model', 'months', 'more', 'mortality', 'most', 'no', 'not', 'of', 'on', 'or', 'other', 'our', 'over', 'p', 'patients', 'quality', 'randomized', 'results', 'risk', 'significant', 'significantly', 'study', 'survival', 'than', 'that', 'the', 'their', 'there', 'these', 'this', 'time', 'to', 'treatment', 'trial', 'two', 'use', 'used', 'using', 'was', 'we', 'were', 'which', 'who', 'with', 'women', 'years']


In [17]:
#we also use the english stop words from the nltk library
from nltk.corpus import stopwords
stop = stopwords.words('english')

In [18]:
#we combine stop words and most common used words into a list in order to later remove them from our data frame.
most_common = top_words[:100]
words_to_exclude = most_common+stop
words_to_exclude = list(dict.fromkeys(words_to_exclude))

In [19]:
#we remove them and form a new column which contains the "clean" text
df['sentence_clean'] = df['sentence'].apply(lambda x: ' '.join([word for word in x.split() if word not in (words_to_exclude)]))

In [20]:
#we remove the older column which contained raw text
df = df.drop(['sentence'] ,axis =1)
df

Unnamed: 0,label,sentence_clean
0,NEITHER,concordance different amyloid immunoassays vis...
1,NEITHER,importance visual assessment amyloid positron ...
2,NEITHER,several immunoassays developed measure β-amylo...
3,NEITHER,agreement csf aβ42 measures different immunoas...
4,NEITHER,objective determine concordance csf aβ42 level...
...,...,...
31999,EVIDENCE,statistically significant difference conjuncti...
32000,NEITHER,latanoprost 0 005% daily reduced iop effective...
32001,CLAIM,latanoprost statistically clinically significa...
32002,CLAIM,difference hyperemia two regimens


In [21]:
#We split our data frame into a train-validation and a test data frame.
train_valid, test = train_test_split(df, test_size = 0.2 , random_state = 42)

In [22]:
#keeping actual labels of test dataset
y_true_test = test['label']
y_true_test

24522    EVIDENCE
14640     NEITHER
10299    EVIDENCE
24540     NEITHER
22577     NEITHER
           ...   
3274      NEITHER
1808      NEITHER
21524       CLAIM
20509     NEITHER
17967    EVIDENCE
Name: label, Length: 6401, dtype: object

In [23]:
train_valid

Unnamed: 0,label,sentence_clean
4814,NEITHER,nonalcoholic fatty liver disease (nafld) encom...
11976,NEITHER,"1-year postoperatively, first woman reported c..."
4416,NEITHER,"case zikv infection, role osteoblasts zikv pat..."
17730,NEITHER,study investigates association hap cooking fue...
25163,EVIDENCE,statistically significant qol difference treat...
...,...,...
29802,NEITHER,randomized controlled trial conducted patients...
5390,NEITHER,real-time assessment health-care requirements ...
860,NEITHER,large discrepancies summer climate change euro...
15795,NEITHER,distribution near normal scale show ceiling ef...


In [24]:
#we split again our train-validation data frame into 2 separate data frames (train & validation)
train, valid = train_test_split(train_valid, test_size = 0.2 , random_state = 42)

In [25]:
#keeping actual labels of train dataset
y_true_train = train['label']
y_true_train

482       NEITHER
17159       CLAIM
5773      NEITHER
31434    EVIDENCE
15817     NEITHER
           ...   
12109     NEITHER
29067     NEITHER
28590    EVIDENCE
3071      NEITHER
24070     NEITHER
Name: label, Length: 20482, dtype: object

In [26]:
#keeping actual labels of valid dataset
y_true_valid = valid['label']
y_true_valid

21239     NEITHER
25283     NEITHER
26157     NEITHER
20931     NEITHER
20767    EVIDENCE
           ...   
31907     NEITHER
24353       CLAIM
2198      NEITHER
17015     NEITHER
27883     NEITHER
Name: label, Length: 5121, dtype: object

In [27]:
#we fix the format of the label column in order to later match to the fasttext's required input format.
train.iloc[:, 0] = train.iloc[:, 0].apply(lambda x: '__label__' + x)
valid.iloc[:, 0] = valid.iloc[:, 0].apply(lambda x: '__label__' + x)
test.iloc[:, 0] = test.iloc[:, 0].apply(lambda x: '__label__' + x)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self.obj[item_labels[indexer[info_axis]]] = value


In [28]:
valid

Unnamed: 0,label,sentence_clean
21239,__label__NEITHER,wind velocity fields measured four cross-secti...
25283,__label__NEITHER,sedentary (engaging <60 min recreational activ...
26157,__label__NEITHER,response rate 88 4%
20931,__label__NEITHER,"increases frequency, duration, and/or severity..."
20767,__label__EVIDENCE,"show positive trends oc 474 streams, lakes, ri..."
...,...,...
31907,__label__NEITHER,randomized double-masked crossover study compa...
24353,__label__CLAIM,"patients cancer, high-fat diet may possibly su..."
2198,__label__NEITHER,pro-c3 collagen neo-epitope putative direct ma...
17015,__label__NEITHER,finding important implications improving mater...


In [29]:
#for the train dataset
#we combine the 2 columns into a new column which is required for the fasttext input format
train["fasttext"] = train["sentence_clean"] + ' ' + train["label"]
#we drop the 2 older columns
train = train.drop(['sentence_clean'],axis =1)
train = train.drop(['label'],axis =1)
train

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  train["fasttext"] = train["sentence_clean"] + ' ' + train["label"]


Unnamed: 0,fasttext
482,flexible polymers poly dimethyl siloxane (pdms...
17159,efforts tackle pedestrian safety focus five re...
5773,ventricular tachycardia also inducible remaini...
31434,comparison mean diurnal measurements latanopro...
15817,"response, integrated care programmes appearing..."
...,...
12109,retrospective case analysis __label__NEITHER
29067,primary efficacy endpoint diurnal iop (average...
28590,"overall, eyes slt travoprost groups achieved s..."
3071,"high, anisotropic, substrate-independent mobil..."


In [30]:
#for the validation dataset
#we combine the 2 columns into a new column which is required for the fasttext input format
valid["fasttext"] = valid["sentence_clean"] + ' ' + valid["label"]
#we drop the 2 older columns
valid = valid.drop(['sentence_clean'],axis =1)
valid = valid.drop(['label'],axis =1)
valid

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  valid["fasttext"] = valid["sentence_clean"] + ' ' + valid["label"]


Unnamed: 0,fasttext
21239,wind velocity fields measured four cross-secti...
25283,sedentary (engaging <60 min recreational activ...
26157,response rate 88 4% __label__NEITHER
20931,"increases frequency, duration, and/or severity..."
20767,"show positive trends oc 474 streams, lakes, ri..."
...,...
31907,randomized double-masked crossover study compa...
24353,"patients cancer, high-fat diet may possibly su..."
2198,pro-c3 collagen neo-epitope putative direct ma...
17015,finding important implications improving mater...


In [31]:
#for the test dataset
#we combine the 2 columns into a new column which is required for the fasttext input format
test["fasttext"] = test["sentence_clean"] + ' ' + test["label"]
#we drop the 2 older columns
test = test.drop(['sentence_clean'],axis =1)
test = test.drop(['label'],axis =1)
test

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  test["fasttext"] = test["sentence_clean"] + ' ' + test["label"]


Unnamed: 0,fasttext
24522,results showed overall significant improvement...
14640,item response analysis showed excellent sensit...
10299,find two key regions surface responsible devel...
24540,health-related quality life (hrqol; explorator...
22577,summarised current knowledge regarding risk fa...
...,...
3274,use radiative kernels understand influence rap...
1808,"compared experts, iam simulations projected gr..."
21524,"based contributions special issue, several rec..."
20509,abstract: study examines state local practice ...


In [32]:
import numpy as np

In [33]:
#we produce a .txt file from the test dataset which will be later used as test input in our fasttext model
test.to_csv('test.txt', index = False, header = False, quotechar = " ", escapechar = "")

In [34]:
#we produce a .txt file from the train dataset which will be later used as train input in our fasttext model
train.to_csv('train.txt', index = False, header = False, quotechar = " ", escapechar = "")

In [35]:
#we produce a .txt file from the validation dataset which will be later used as validation input in our fasttext model
valid.to_csv('valid.txt', index = False, header = False, quotechar = " ", escapechar = "")

In [36]:
import fasttext

In [37]:
### WE USED DIFFERENT PARAMETERS AND TRIED MANY OPTIONS WITH THE SUPERVISED TRAINING MODEL.

In [38]:
#We train our model
model = fasttext.train_supervised(input="train.txt")

In [39]:
#reforming the validation dataset without the prefix
valid[['sentence','label']] = valid["fasttext"].str.split("__label__",expand=True)
valid = valid.drop(['fasttext'],axis =1)
valid

Unnamed: 0,sentence,label
21239,wind velocity fields measured four cross-secti...,NEITHER
25283,sedentary (engaging <60 min recreational activ...,NEITHER
26157,response rate 88 4%,NEITHER
20931,"increases frequency, duration, and/or severity...",NEITHER
20767,"show positive trends oc 474 streams, lakes, ri...",EVIDENCE
...,...,...
31907,randomized double-masked crossover study compa...,NEITHER
24353,"patients cancer, high-fat diet may possibly su...",CLAIM
2198,pro-c3 collagen neo-epitope putative direct ma...,NEITHER
17015,finding important implications improving mater...,NEITHER


In [40]:
#reforming the test dataset without the prefix
test[['sentence','label']] = test["fasttext"].str.split("__label__",expand=True)
test = test.drop(['fasttext'],axis =1)
test

Unnamed: 0,sentence,label
24522,results showed overall significant improvement...,EVIDENCE
14640,item response analysis showed excellent sensit...,NEITHER
10299,find two key regions surface responsible devel...,EVIDENCE
24540,health-related quality life (hrqol; explorator...,NEITHER
22577,summarised current knowledge regarding risk fa...,NEITHER
...,...,...
3274,use radiative kernels understand influence rap...,NEITHER
1808,"compared experts, iam simulations projected gr...",NEITHER
21524,"based contributions special issue, several rec...",CLAIM
20509,abstract: study examines state local practice ...,NEITHER


In [41]:
#forming the validation dataset properly to be used as input
valid_sentences = valid["sentence"].apply(lambda s: " ".join(s.split()))
valid_sentences = valid_sentences.apply(lambda s: model.predict(s)[0][0])
valid_sentences

21239    __label__NEITHER
25283    __label__NEITHER
26157    __label__NEITHER
20931    __label__NEITHER
20767    __label__NEITHER
               ...       
31907    __label__NEITHER
24353    __label__NEITHER
2198     __label__NEITHER
17015    __label__NEITHER
27883      __label__CLAIM
Name: sentence, Length: 5121, dtype: object

In [42]:
#removing the prefix from the predicted outcome
y_pred_valid = valid_sentences.str.replace('__label__','')
y_pred_valid

21239    NEITHER
25283    NEITHER
26157    NEITHER
20931    NEITHER
20767    NEITHER
          ...   
31907    NEITHER
24353    NEITHER
2198     NEITHER
17015    NEITHER
27883      CLAIM
Name: sentence, Length: 5121, dtype: object

In [43]:
##forming the test dataset properly to be used as input
test_sentences = test["sentence"].apply(lambda s: " ".join(s.split()))
test_sentences = test_sentences.apply(lambda s: model.predict(s)[0][0])
test_sentences

24522    __label__EVIDENCE
14640     __label__NEITHER
10299     __label__NEITHER
24540     __label__NEITHER
22577     __label__NEITHER
               ...        
3274      __label__NEITHER
1808      __label__NEITHER
21524     __label__NEITHER
20509     __label__NEITHER
17967    __label__EVIDENCE
Name: sentence, Length: 6401, dtype: object

In [44]:
#removing the prefix from the predicted outcome of test dataset
y_pred_test = test_sentences.str.replace('__label__','')
y_pred_test

24522    EVIDENCE
14640     NEITHER
10299     NEITHER
24540     NEITHER
22577     NEITHER
           ...   
3274      NEITHER
1808      NEITHER
21524     NEITHER
20509     NEITHER
17967    EVIDENCE
Name: sentence, Length: 6401, dtype: object

In [45]:
#creating a list with labels
target_names = ['CLAIM', 'EVIDENCE', 'NEITHER']

In [46]:
from sklearn.metrics import confusion_matrix
from sklearn.metrics import classification_report

In [47]:
#removing the prefix from the predicted outcome
y_true_valid = y_true_valid.str.replace('__label__','')
y_true_valid

21239     NEITHER
25283     NEITHER
26157     NEITHER
20931     NEITHER
20767    EVIDENCE
           ...   
31907     NEITHER
24353       CLAIM
2198      NEITHER
17015     NEITHER
27883     NEITHER
Name: label, Length: 5121, dtype: object

In [48]:
y_true_test = y_true_test.str.replace('__label__','')
y_true_test

24522    EVIDENCE
14640     NEITHER
10299    EVIDENCE
24540     NEITHER
22577     NEITHER
           ...   
3274      NEITHER
1808      NEITHER
21524       CLAIM
20509     NEITHER
17967    EVIDENCE
Name: label, Length: 6401, dtype: object

In [49]:
#FOR VALIDATION DATASET
print('validation dataset')
print(classification_report(y_true_valid, y_pred_valid, labels=target_names))

validation dataset
              precision    recall  f1-score   support

       CLAIM       0.53      0.31      0.39       521
    EVIDENCE       0.64      0.52      0.57      1009
     NEITHER       0.82      0.91      0.86      3591

    accuracy                           0.77      5121
   macro avg       0.66      0.58      0.61      5121
weighted avg       0.75      0.77      0.76      5121



In [50]:
#By default, fastText sees each training example only five times during training, which is pretty small, 
#given that our training set only have 20.5k training examples.
#The number of times each examples is seen (also known as the number of epochs), can be increased using the -epoch option:

In [51]:
#We train our model again implemeting hyper parameter tunning (epochs)
model_2 = fasttext.train_supervised(input="train.txt", epoch=10)

In [52]:
valid_sentences = valid["sentence"].apply(lambda s: " ".join(s.split()))
valid_sentences = valid_sentences.apply(lambda s: model_2.predict(s)[0][0])
y_pred_valid = valid_sentences.str.replace('__label__','')
#FOR VALIDATION DATASET
print('validation dataset')
print(classification_report(y_true_valid, y_pred_valid, labels=target_names))


validation dataset
              precision    recall  f1-score   support

       CLAIM       0.47      0.37      0.41       521
    EVIDENCE       0.60      0.55      0.57      1009
     NEITHER       0.83      0.87      0.85      3591

    accuracy                           0.76      5121
   macro avg       0.63      0.60      0.61      5121
weighted avg       0.75      0.76      0.75      5121



In [53]:
#epoch's increment does not seem to make our model better (f1 score is not affected that much neither accuracy)
#thus we will keep epoch = 5 (by default)

In [54]:
#Another way to change the learning speed of our model is to increase (or decrease) the learning rate of the algorithm.
#This corresponds to how much the model changes after processing each example. 
#A learning rate of 0 would mean that the model does not change at all, and thus, does not learn anything. 
#Good values of the learning rate are in the range 0.1 - 1.0.

In [55]:
#We train our model again implemeting hyper parameter tunning (learning rate)
model_3 = fasttext.train_supervised(input="train.txt", lr=0.5)
valid_sentences = valid["sentence"].apply(lambda s: " ".join(s.split()))
valid_sentences = valid_sentences.apply(lambda s: model_3.predict(s)[0][0])
y_pred_valid = valid_sentences.str.replace('__label__','')
#FOR VALIDATION DATASET
print('validation dataset')
print(classification_report(y_true_valid, y_pred_valid, labels=target_names))

validation dataset
              precision    recall  f1-score   support

       CLAIM       0.45      0.38      0.41       521
    EVIDENCE       0.60      0.56      0.58      1009
     NEITHER       0.84      0.87      0.85      3591

    accuracy                           0.76      5121
   macro avg       0.63      0.60      0.61      5121
weighted avg       0.75      0.76      0.75      5121



In [56]:
#learning's rate (lr) increment does not seem to make our model better 
#thus we will keep learning rate = 0.1 (by default)

In [57]:
#Finally, we can improve the performance of a model by using word bigrams, instead of just unigrams. 
#This is especially important for classification problems where word order is important, such as sentiment analysis.

In [58]:
#We train our model again implemeting hyper parameter tunning (wordNgrams)
model_4 = fasttext.train_supervised(input="train.txt",wordNgrams=2)
valid_sentences = valid["sentence"].apply(lambda s: " ".join(s.split()))
valid_sentences = valid_sentences.apply(lambda s: model_4.predict(s)[0][0])
y_pred_valid = valid_sentences.str.replace('__label__','')
#FOR VALIDATION DATASET
print('validation dataset')
print(classification_report(y_true_valid, y_pred_valid, labels=target_names))

validation dataset
              precision    recall  f1-score   support

       CLAIM       0.59      0.27      0.37       521
    EVIDENCE       0.67      0.52      0.58      1009
     NEITHER       0.81      0.93      0.87      3591

    accuracy                           0.78      5121
   macro avg       0.69      0.57      0.61      5121
weighted avg       0.76      0.78      0.76      5121



In [59]:
#wordNgrams increment decreases f1 score as accuracy remains the same

In [60]:
#We train our model again implemeting hyper parameter tunning
model_5 = fasttext.train_supervised(input="train.txt", loss='hs', lr=0.7, epoch=18,wordNgrams=2)
valid_sentences = valid["sentence"].apply(lambda s: " ".join(s.split()))
valid_sentences = valid_sentences.apply(lambda s: model_5.predict(s)[0][0])
y_pred_valid = valid_sentences.str.replace('__label__','')
#FOR VALIDATION DATASET
print('validation dataset')
print(classification_report(y_true_valid, y_pred_valid, labels=target_names))

validation dataset
              precision    recall  f1-score   support

       CLAIM       0.46      0.36      0.41       521
    EVIDENCE       0.59      0.56      0.58      1009
     NEITHER       0.84      0.87      0.85      3591

    accuracy                           0.76      5121
   macro avg       0.63      0.60      0.61      5121
weighted avg       0.75      0.76      0.75      5121



In [61]:
#we will use model_5 in our test dataset

In [62]:
test_sentences = test["sentence"].apply(lambda s: " ".join(s.split()))
test_sentences = test_sentences.apply(lambda s: model_5.predict(s)[0][0])
y_pred_test = test_sentences.str.replace('__label__','')
#FOR test DATASET
print('test dataset')
print(classification_report(y_true_test, y_pred_test, labels=target_names))

test dataset
              precision    recall  f1-score   support

       CLAIM       0.52      0.40      0.45       669
    EVIDENCE       0.59      0.59      0.59      1209
     NEITHER       0.85      0.88      0.86      4523

    accuracy                           0.77      6401
   macro avg       0.65      0.62      0.63      6401
weighted avg       0.76      0.77      0.77      6401



In [63]:
#saving our best model
model_5.save_model("fasttext_arguments_final_model.bin")