In [1]:
import pandas as pd
import pickle as pkl
import re
from collections import Counter
import nltk
nltk.download('stopwords')
from nltk.corpus import stopwords 

import stanza
# stanza.download('nl')
nlp = stanza.Pipeline(lang='nl', processors='tokenize,pos,lemma')

def count_freq(preds, gts):
    preds_len = [len(x.split(' ')) for x in preds]
    gts_len = [len(x.split(' ')) for x in gts]
    print(Counter(preds_len))
    print(Counter(gts_len))

def evaluation_metrics(pred, gt):
    TP = len(set(pred) & set(gt)) 
    FP = len(set(pred)-set(gt))
    FN = len(set(gt)-set(pred))
    precision = round((TP/(TP+FP))*100, 2)
    recall = round((TP/(TP+FN))*100,2)
    f1_score = round((2 * precision * recall) / (precision + recall),2)
    return precision, recall, f1_score 

def lemma(li):
    new_list = []
    for t in li:
        doc = nlp(str(t))
        doc1 = ' '.join([word.lemma for sent in doc.sentences for word in sent.words])
        doc1 = re.sub('-',' ',doc1)
        doc1 = re.sub(' +', ' ',doc1)
        new_list.append(doc1)
    new_list = [s for s in new_list if len(s) >= 2]
    return new_list

def get_term_(predictions):
    all_term = []
    for sentence in predictions:
        tokens = []
        labels = []
        for d in sentence:
            tokens.extend(d.keys())
            labels.extend(d.values())

        for i, label in enumerate(labels):
            if labels[i] == 'I' and (i == 0 or labels[i - 1] == 'O'):
                labels[i] = 'O'

        terms = []
        term = []
        for token, label in zip(tokens, labels):
            if label == 'B':
                #Lưu vị trí B
                b_pos = i
                term = [token]
            elif label == 'I':
                term.append(token)
            elif len(term) > 0:
                terms.append(' '.join(term))
                term = []
        if len(term) > 0:
            terms.append(' '.join(term))
            # Check b_pos = 0 không
        all_term.append(terms)
    
    final_terms = []
    for i in all_term:
        final_terms.extend(i)

    final_terms = [x.lower().strip() for x in final_terms]
    return final_terms  

[nltk_data] Downloading package stopwords to /Users/hanh/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
2021-09-14 13:58:17 INFO: Loading these models for language: nl (Dutch):
| Processor | Package |
-----------------------
| tokenize  | alpino  |
| pos       | alpino  |
| lemma     | alpino  |

2021-09-14 13:58:17 INFO: Use device: cpu
2021-09-14 13:58:17 INFO: Loading: tokenize
2021-09-14 13:58:17 INFO: Loading: pos
2021-09-14 13:58:17 INFO: Loading: lemma
2021-09-14 13:58:17 INFO: Done loading processors!


In [4]:
domain_path ='/Users/hanh/Documents/Github/terminology-extraction/ACTER/nl/'
# preds_path = '/Users/hanh/Documents/Github/terminology-extraction/patterns/nl_domains/corp_wind_equi/'
preds_path = '/Users/hanh/Documents/Github/terminology-extraction/results/weighted_results/nl/'
def term_evaluation(domain_path, preds_path, rule=None):
    groundtruth = pd.read_csv(domain_path, sep='	', engine='python',header=None)
    gt = list(groundtruth[0])
    predictions = pkl.load(open(preds_path, 'rb'))
    preds =  get_term_(predictions)
    pred_terms =  set(preds) #- set(stop_words)
    pred_terms = [x for x in pred_terms if len(x)>1]
    pred_terms = [x.lower().strip() for x in pred_terms]
    pred_terms = [re.sub(' -','-', x) for x in pred_terms]
    pred_terms = [re.sub('- ','-', x) for x in pred_terms]
    pred_terms = [re.sub('\(','', x) for x in pred_terms]
    pred_terms = [re.sub('\/','', x) for x in pred_terms]
    print(evaluation_metrics(pred_terms, gt))
    return set(pred_terms), set(gt)

In [5]:
predictions, groundtruth =term_evaluation(domain_path+'htfl/annotations/htfl_nl_terms.ann', preds_path+'ann_weighted_bert_multilingual_uncased.pkl')

(63.04, 59.21, 61.07)


In [6]:
predictions_, groundtruth_ = term_evaluation(domain_path+'htfl/annotations/htfl_nl_terms_nes.ann',
                                             preds_path+'nes_weighted_bert_multilingual_uncased.pkl')

(63.29, 58.83, 60.98)


## Remove the terms consisting only of stop words

In [7]:
stop_words = set(stopwords.words('dutch'))
predictions1 =  set(predictions) - set(stop_words)
predictions2 =  set(predictions_) - set(stop_words)
evaluation_metrics(set(predictions1), groundtruth), evaluation_metrics(set(predictions2), groundtruth)

((63.23, 59.21, 61.15), (60.18, 60.56, 60.37))

## Remove only stopwords at the end of terms consisting of several words

In [8]:
predictions2 = [' '.join(x.split()[:-1]) if x.split()[-1] in stop_words else  x for x in predictions]
predictions2_ = [' '.join(x.split()[:-1]) if x.split()[-1] in stop_words else  x for x in predictions_]
evaluation_metrics(set(predictions2), groundtruth),evaluation_metrics(set(predictions2_), groundtruth_)

((63.2, 59.21, 61.14), (63.51, 58.83, 61.08))

## Remove terms consisting of several words but finishing with a stop word

In [9]:
predictions3 = [x for x in predictions if x.split()[-1] not in stop_words]
predictions3_ = [x for x in predictions_ if x.split()[-1] not in stop_words]
evaluation_metrics(set(predictions3), groundtruth), evaluation_metrics(set(predictions3_), groundtruth)

((63.23, 59.21, 61.15), (60.18, 60.56, 60.37))

## Export Wrong Prediction Lists

In [12]:
htfl = list(set(predictions2) - set(groundtruth))
pd.DataFrame(htfl).rename(columns={0:'htfl'}).to_excel('./wrong_prediction/htfl-wrongly-prediction.xlsx', index=False)