In [1]:
import pandas as pd
import pickle as pkl
import re
from collections import Counter
import nltk
nltk.download('stopwords')
from nltk.corpus import stopwords 

import stanza
# stanza.download('en')
nlp = stanza.Pipeline(lang='en', processors='tokenize,mwt,pos,lemma')

def evaluation_metrics(pred, gt):
    TP = len(set(pred) & set(gt)) 
    FP = len(set(pred)-set(gt))
    FN = len(set(gt)-set(pred))
    precision = round((TP/(TP+FP))*100, 2)
    recall = round((TP/(TP+FN))*100,2)
    f1_score = round((2 * precision * recall) / (precision + recall),2)
    return precision, recall, f1_score 

def get_term_(predictions):
    all_term = []
    for sentence in predictions:
        tokens = []
        labels = []
        for d in sentence:
            tokens.extend(d.keys())
            labels.extend(d.values())

        for i, label in enumerate(labels):
            if labels[i] == 'I' and (i == 0 or labels[i - 1] == 'O'):
                labels[i] = 'O'

        terms = []
        term = []
        for token, label in zip(tokens, labels):
            if label == 'B':
                #Lưu vị trí B
                b_pos = i
                term = [token]
            elif label == 'I':
                term.append(token)
            elif len(term) > 0:
                terms.append(' '.join(term))
                term = []
        if len(term) > 0:
            terms.append(' '.join(term))
            # Check b_pos = 0 không
        all_term.append(terms)
    
    final_terms = []
    for i in all_term:
        final_terms.extend(i)

    final_terms = [x.lower().strip() for x in final_terms]
    return final_terms  

[nltk_data] Downloading package stopwords to /Users/hanh/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
2021-09-14 14:58:41 INFO: Loading these models for language: en (English):
| Processor | Package  |
------------------------
| tokenize  | combined |
| pos       | combined |
| lemma     | combined |

2021-09-14 14:58:41 INFO: Use device: cpu
2021-09-14 14:58:41 INFO: Loading: tokenize
2021-09-14 14:58:41 INFO: Loading: pos
2021-09-14 14:58:41 INFO: Loading: lemma
2021-09-14 14:58:41 INFO: Done loading processors!


In [2]:
def get_term_noun_noun(predictions):
    all_term = []
    for sentence in predictions:
        tokens = []
        labels = []
        for d in sentence:
            tokens.extend(d.keys())
            labels.extend(d.values())

        for i, label in enumerate(labels):
            if labels[i] == 'I' and (i == 0 or labels[i - 1] == 'O'):
                labels[i] = 'O'

        terms = []
        term = []
        for i, (token, label) in enumerate(zip(tokens, labels)):
            if label == 'B': 
                #Lưu vị trí B
                b_pos = i
                term = [token]
            elif label == 'I':
                term.append(token)
            elif len(term) > 0:
                terms.append(' '.join(term))
                if b_pos != 0:
                    if (tokens[b_pos - 1] != '') and (tokens[b_pos - 1] != ' ') and (len(nlp(str(tokens[b_pos - 1])).sentences) > 0) and (len(nlp(str(tokens[b_pos])).sentences) > 0):
                        b_word = nlp(str(tokens[b_pos - 1])).sentences[0].words[0]
                        c_word = nlp(str(tokens[b_pos])).sentences[0].words[0]
                        if (c_word.upos == 'NOUN') and (c_word.text != 'None') and (b_word.text != 'None') and (b_word.upos == 'NOUN'):
                            terms.append(' '.join([b_word.text] + term))

                term = []
        if len(term) > 0:
            terms.append(' '.join(term))
        all_term.append(terms)
    
    final_terms = []
    for i in all_term:
        final_terms.extend(i)

    final_terms = [x.lower().strip() for x in final_terms]
    return final_terms    

In [3]:
def get_term_adj_noun(predictions):
    all_term = []
    for sentence in predictions:
        tokens = []
        labels = []
        for d in sentence:
            tokens.extend(d.keys())
            labels.extend(d.values())

        for i, label in enumerate(labels):
            if labels[i] == 'I' and (i == 0 or labels[i - 1] == 'O'):
                labels[i] = 'O'
        terms = []
        term = []
        for i, (token, label) in enumerate(zip(tokens, labels)):
            if label == 'B': 
                #Lưu vị trí B
                b_pos = i
                term = [token]
            elif label == 'I':
                term.append(token)
            elif len(term) > 0:
                terms.append(' '.join(term))
                if (b_pos != 0) and (tokens[b_pos - 1] != '') and (tokens[b_pos - 1] != ' ') and (len(nlp(str(tokens[b_pos - 1])).sentences) > 0) and (len(nlp(str(tokens[b_pos])).sentences) > 0):
                    b_word = nlp(str(tokens[b_pos - 1])).sentences[0].words[0]
                    c_word = nlp(str(tokens[b_pos])).sentences[0].words[0]
                    if (c_word.upos == 'NOUN') and (c_word.text != 'None') and (b_word.text != 'None') and (b_word.upos == 'ADJ'):
                        terms.append(' '.join([b_word.text] + term))
                term = []
        if len(term) > 0:
            terms.append(' '.join(term))
        all_term.append(terms)
    
    final_terms = []
    for i in all_term:
        final_terms.extend(i)

    final_terms = [x.lower().strip() for x in final_terms]
    return final_terms    

In [4]:
domain_path ='/Users/hanh/Documents/Github/terminology-extraction/ACTER/en/'
preds_path = '/Users/hanh/Documents/Github/terminology-extraction/patterns/en_domains/corp_equi_wind/'
def term_evaluation(domain_path, preds_path, rule=None):
    groundtruth = pd.read_csv(domain_path, sep='	', engine='python',header=None)
    gt = list(groundtruth[0])
    predictions = pkl.load(open(preds_path, 'rb'))
    preds =  get_term_(predictions)
    preds1 =  get_term_noun_noun(predictions)
    preds2 =  get_term_adj_noun(predictions)
    preds = preds + preds1 + preds2 
    stop_words = set(stopwords.words('english'))
    pred_terms =  set(preds) # - set(stop_words)
    pred_terms = [x for x in pred_terms if len(x)>1]
#     pred_terms = [x.lower().strip() for x in pred_terms]
    pred_terms = [re.sub(' -','-', x) for x in pred_terms]
    pred_terms = [re.sub('- ','-', x) for x in pred_terms]
    pred_terms = [re.sub('\(','', x) for x in pred_terms]
    pred_terms = [re.sub('\/','', x) for x in pred_terms]
    print(evaluation_metrics(pred_terms, gt))
    return set(pred_terms), set(gt)

In [5]:
predictions, groundtruth = term_evaluation(domain_path+'htfl/annotations/htfl_en_terms.ann', preds_path+'ann_xlnet_htfl.pkl')
predictions_, groundtruth_ = term_evaluation(domain_path+'htfl/annotations/htfl_en_terms_nes.ann', preds_path+'nes_xlnet_htfl.pkl')

(38.78, 37.99, 38.38)
(39.91, 43.52, 41.64)


In [6]:
predictions, groundtruth = term_evaluation(domain_path+'htfl/annotations/htfl_en_terms.ann', preds_path+'ann_xlnet_htfl.pkl')
predictions_, groundtruth_ = term_evaluation(domain_path+'htfl/annotations/htfl_en_terms_nes.ann', preds_path+'nes_xlnet_htfl.pkl')

(38.78, 37.99, 38.38)
(39.91, 43.52, 41.64)


## Remove the terms consisting only of stop words

In [7]:
stop_words = set(stopwords.words('english'))
predictions1 =  set(predictions) - set(stop_words)
evaluation_metrics(set(predictions), groundtruth)

(38.78, 37.99, 38.38)

In [8]:
evaluation_metrics(set(predictions1), groundtruth)

(38.9, 37.91, 38.4)

In [9]:
predictions1_ =  set(predictions_) - set(stop_words)
evaluation_metrics(set(predictions_), groundtruth_)

(39.91, 43.52, 41.64)

In [10]:
evaluation_metrics(set(predictions1_), groundtruth_)

(40.03, 43.4, 41.65)

## Remove only stopwords at the end of terms consisting of several words

In [11]:
predictions2 = [' '.join(x.split()[:-1]) if x.split()[-1] in stop_words else  x for x in predictions]
evaluation_metrics(set(predictions2), groundtruth)

(38.96, 37.95, 38.45)

In [12]:
len(set(predictions2)), len(set(predictions))

(2300, 2313)

In [13]:
predictions2_ = [' '.join(x.split()[:-1]) if x.split()[-1] in stop_words else  x for x in predictions_]
evaluation_metrics(set(predictions2_), groundtruth_)

(40.04, 43.4, 41.65)

## Remove terms consisting of several words but finishing with a stop word

In [14]:
predictions3 = [x for x in predictions if x.split()[-1] not in stop_words]
evaluation_metrics(set(predictions3), groundtruth)

(39.08, 37.91, 38.49)

In [15]:
predictions3_ = [x for x in predictions_ if x.split()[-1] not in stop_words]
evaluation_metrics(set(predictions3_), groundtruth)

(37.41, 44.3, 40.56)

In [16]:
len(predictions2), len(predictions3)

(2313, 2290)

## Export Wrong Prediction Lists

In [17]:
# predictions, groundtruth = term_evaluation(domain_path+'htfl/annotations/htfl_en_terms.ann', preds_path+'ann_xlnet_htfl.pkl')
# htfl = list(predictions - groundtruth)
# pd.DataFrame(htfl).rename(columns={0:'htfl'}).to_csv('./wrong_prediction/htfl.csv', index=False)

In [18]:
# predictions, groundtruth = term_evaluation(domain_path+'equi/annotations/equi_en_terms.ann', preds_path+'ann_xlnet_equi.pkl')
# equi = list(predictions - groundtruth)
# pd.DataFrame(equi).rename(columns={0:'equi'}).to_csv('./wrong_prediction/equi.csv', index=False)

In [19]:
# predictions, groundtruth = term_evaluation(domain_path+'wind/annotations/wind_en_terms.ann', preds_path+'ann_xlnet_wind.pkl')
# wind = list(predictions - groundtruth)
# pd.DataFrame(wind).rename(columns={0:'wind'}).to_csv('./wrong_prediction/wind.csv', index=False)

In [20]:
# predictions, groundtruth = term_evaluation(domain_path+'corp/annotations/corp_en_terms.ann', preds_path+'ann_xlnet_corp.pkl')
# corp = list(predictions - groundtruth)
# pd.DataFrame(corp).rename(columns={0:'corp'}).to_csv('./wrong_prediction/corp.csv', index=False)

In [21]:
# final = htfl + equi + wind + corp
# len(final), len(set(final))

In [22]:
# wrong_predict = pd.read_excel('./wrong_prediction/wrongly-predicted.xlsx')
# wrong_predict