In [1]:
import pandas as pd
import pickle as pkl
import re
from collections import Counter
import nltk
nltk.download('stopwords')
from nltk.corpus import stopwords 

import stanza
# stanza.download('en')
nlp = stanza.Pipeline(lang='en', processors='tokenize,mwt,pos,lemma')

def count_freq(preds, gts):
    preds_len = [len(x.split(' ')) for x in preds]
    gts_len = [len(x.split(' ')) for x in gts]
    print(Counter(preds_len))
    print(Counter(gts_len))

def evaluation_metrics(pred, gt):
    TP = len(set(pred) & set(gt)) 
    FP = len(set(pred)-set(gt))
    FN = len(set(gt)-set(pred))
    precision = round((TP/(TP+FP))*100, 2)
    recall = round((TP/(TP+FN))*100,2)
    f1_score = round((2 * precision * recall) / (precision + recall),2)
    return precision, recall, f1_score 

def lemma(li):
    new_list = []
    for t in li:
        doc = nlp(str(t))
        doc1 = ' '.join([word.lemma for sent in doc.sentences for word in sent.words])
        doc1 = re.sub('-',' ',doc1)
        doc1 = re.sub(' +', ' ',doc1)
        new_list.append(doc1)
    new_list = [s for s in new_list if len(s) >= 2]
    return new_list

def get_term_(predictions):
    all_term = []
    for sentence in predictions:
        tokens = []
        labels = []
        for d in sentence:
            tokens.extend(d.keys())
            labels.extend(d.values())

        for i, label in enumerate(labels):
            if labels[i] == 'I' and (i == 0 or labels[i - 1] == 'O'):
                labels[i] = 'O'

        terms = []
        term = []
        for token, label in zip(tokens, labels):
            if label == 'B':
                #Lưu vị trí B
                b_pos = i
                term = [token]
            elif label == 'I':
                term.append(token)
            elif len(term) > 0:
                terms.append(' '.join(term))
                term = []
        if len(term) > 0:
            terms.append(' '.join(term))
            # Check b_pos = 0 không
        all_term.append(terms)
    
    final_terms = []
    for i in all_term:
        final_terms.extend(i)

    final_terms = [x.lower().strip() for x in final_terms]
    return final_terms  

[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/hanhtran/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
2021-08-10 16:57:04 INFO: Loading these models for language: en (English):
| Processor | Package  |
------------------------
| tokenize  | combined |
| pos       | combined |
| lemma     | combined |

2021-08-10 16:57:04 INFO: Use device: cpu
2021-08-10 16:57:04 INFO: Loading: tokenize
2021-08-10 16:57:04 INFO: Loading: pos
2021-08-10 16:57:04 INFO: Loading: lemma
2021-08-10 16:57:04 INFO: Done loading processors!


|Patterns|%IATE|Head|
|-|-|-|
|NOUN NOUN	        |0.15	|1| 
|ADJ NOUN	        |0.14	|1| 
|ADJ NOUN NOUN	    |0.04	|2|
|NOUN NOUN NOUN	    |0.03	|2|
|NOUN ADP NOUN	    |0.02	|0| 
|ADJ ADJ NOUN	    |0.01	|2|
|NOUN ADP ADJ NOUN	|0.01	|0|
|ADJ NOUN NOUN NOUN	|0.01	|3|

1. NOUN NOUN - 1

In [2]:
def get_term_noun_noun(predictions):
    all_term = []
    for sentence in predictions:
        tokens = []
        labels = []
        for d in sentence:
            tokens.extend(d.keys())
            labels.extend(d.values())

        for i, label in enumerate(labels):
            if labels[i] == 'I' and (i == 0 or labels[i - 1] == 'O'):
                labels[i] = 'O'

        terms = []
        term = []
        for i, (token, label) in enumerate(zip(tokens, labels)):
            if label == 'B': 
                #Lưu vị trí B
                b_pos = i
                term = [token]
            elif label == 'I':
                term.append(token)
            elif len(term) > 0:
                terms.append(' '.join(term))
                if b_pos != 0:
                    if (tokens[b_pos - 1] != '') and (tokens[b_pos - 1] != ' ') and (len(nlp(str(tokens[b_pos - 1])).sentences) > 0) and (len(nlp(str(tokens[b_pos])).sentences) > 0):
                        b_word = nlp(str(tokens[b_pos - 1])).sentences[0].words[0]
                        c_word = nlp(str(tokens[b_pos])).sentences[0].words[0]
                        if (c_word.upos == 'NOUN') and (c_word.text != 'None') and (b_word.text != 'None') and (b_word.upos == 'NOUN'):
                            terms.append(' '.join([b_word.text] + term))
#                     if (tokens[i] != '') and (tokens[i] != ' ') and (len(nlp(str(tokens[i])).sentences) > 0):
#                         a_word = nlp(str(tokens[i])).sentences[0].words[0]
#                         if (a_word.text != 'None') and (a_word.upos == 'NOUN'):
#                             terms.append(' '.join(term + [a_word.text]))
                term = []
        if len(term) > 0:
            terms.append(' '.join(term))
        all_term.append(terms)
    
    final_terms = []
    for i in all_term:
        final_terms.extend(i)

    final_terms = [x.lower().strip() for x in final_terms]
    return final_terms    

2. ADJ NOUN - 1

In [3]:
def get_term_adj_noun(predictions):
    all_term = []
    for sentence in predictions:
        tokens = []
        labels = []
        for d in sentence:
            tokens.extend(d.keys())
            labels.extend(d.values())

        for i, label in enumerate(labels):
            if labels[i] == 'I' and (i == 0 or labels[i - 1] == 'O'):
                labels[i] = 'O'
        terms = []
        term = []
        for i, (token, label) in enumerate(zip(tokens, labels)):
            if label == 'B': 
                #Lưu vị trí B
                b_pos = i
                term = [token]
            elif label == 'I':
                term.append(token)
            elif len(term) > 0:
                terms.append(' '.join(term))
                if (b_pos != 0) and (tokens[b_pos - 1] != '') and (tokens[b_pos - 1] != ' ') and (len(nlp(str(tokens[b_pos - 1])).sentences) > 0) and (len(nlp(str(tokens[b_pos])).sentences) > 0):
                    b_word = nlp(str(tokens[b_pos - 1])).sentences[0].words[0]
                    c_word = nlp(str(tokens[b_pos])).sentences[0].words[0]
                    if (c_word.upos == 'NOUN') and (c_word.text != 'None') and (b_word.text != 'None') and (b_word.upos == 'ADJ'):
                        terms.append(' '.join([b_word.text] + term))
                term = []
        if len(term) > 0:
            terms.append(' '.join(term))
        all_term.append(terms)
    
    final_terms = []
    for i in all_term:
        final_terms.extend(i)

    final_terms = [x.lower().strip() for x in final_terms]
    return final_terms    

3. ADJ NOUN NOUN - 2

In [4]:
def get_term_adj_noun_noun(predictions):
    all_term = []
    for sentence in predictions:
        tokens = []
        labels = []
        for d in sentence:
            tokens.extend(d.keys())
            labels.extend(d.values())

        for i, label in enumerate(labels):
            if labels[i] == 'I' and (i == 0 or labels[i - 1] == 'O'):
                labels[i] = 'O'

        terms = []
        term = []
        for i, (token, label) in enumerate(zip(tokens, labels)):
            if label == 'B': 
                #Lưu vị trí B
                b_pos = i
                term = [token]
            elif label == 'I':
                term.append(token)
            elif len(term) > 0:
                terms.append(' '.join(term))
                # Check b_pos = 0 không
                if (b_pos != 0) and (tokens[b_pos] != '') and (tokens[b_pos] != ' ') and (tokens[b_pos - 1] != '') and (tokens[b_pos - 1] != ' ')and (tokens[b_pos - 2] != '') and (tokens[b_pos - 2] != ' ') and (len(nlp(str(tokens[b_pos - 2])).sentences) > 0) and (len(nlp(str(tokens[b_pos - 1])).sentences) > 0) and (len(nlp(str(tokens[b_pos])).sentences) > 0):
                    b_word = nlp(str(tokens[b_pos - 1])).sentences[0].words[0]
                    b1_word = nlp(str(tokens[b_pos - 2])).sentences[0].words[0]
                    c_word = nlp(str(tokens[b_pos])).sentences[0].words[0]
                    if (c_word.upos == 'NOUN') and (c_word.text != 'None') and (b_word.text != 'None') and (b_word.upos == 'NOUN')and (b1_word.text != 'None') and (b1_word.upos == 'ADJ'):
                        terms.append(' '.join([b1_word.text] +[b_word.text] + term))
                term = []
        if len(term) > 0:
            terms.append(' '.join(term))
            # check b_pos - 1
        all_term.append(terms)
    
    final_terms = []
    for i in all_term:
        final_terms.extend(i)

    final_terms = [x.lower().strip() for x in final_terms]
    return final_terms    

4. NOUN NOUN NOUN - 2

In [5]:
def get_term_noun_noun_noun(predictions):
    all_term = []
    for sentence in predictions:
        tokens = []
        labels = []
        for d in sentence:
            tokens.extend(d.keys())
            labels.extend(d.values())

        for i, label in enumerate(labels):
            if labels[i] == 'I' and (i == 0 or labels[i - 1] == 'O'):
                labels[i] = 'O'

        terms = []
        term = []
        for i, (token, label) in enumerate(zip(tokens, labels)):
            if label == 'B': 
                b_pos = i
                term = [token]
            elif label == 'I':
                term.append(token)
            elif len(term) > 0:
                terms.append(' '.join(term))
                if b_pos != 0:
                    #ADJ ADJ NOUN              
                    if (tokens[b_pos - 2] != '') and (tokens[b_pos - 2] != ' ') and (tokens[b_pos - 1] != '') and (tokens[b_pos - 1] != ' ') and (len(nlp(str(tokens[b_pos - 2])).sentences) > 0) and (len(nlp(str(tokens[b_pos - 1])).sentences) > 0) and (len(nlp(str(tokens[b_pos])).sentences) > 0):
                        b1_word = nlp(str(tokens[b_pos - 2])).sentences[0].words[0] 
                        b_word = nlp(str(tokens[b_pos - 1])).sentences[0].words[0]
                        c_word = nlp(str(tokens[b_pos])).sentences[0].words[0]
                        if (c_word.upos == 'NOUN') and (c_word.text != 'None') and (b_word.text != 'None') and (b1_word.text != 'None') and (b_word.upos == 'NOUN') and (b1_word.upos == 'NOUN'):
                            terms.append(' '.join([b1_word.text] +[b_word.text] + term))
                term = []
        if len(term) > 0:
            terms.append(' '.join(term))
        all_term.append(terms)
    
    final_terms = []
    for i in all_term:
        final_terms.extend(i)

    final_terms = [x.lower().strip() for x in final_terms]
    return final_terms    

5. NOUN ADP NOUN - 0

In [6]:
def get_term_noun_adp_noun(predictions):
    all_term = []
    for sentence in predictions:
        tokens = []
        labels = []
        for d in sentence:
            tokens.extend(d.keys())
            labels.extend(d.values())

        for i, label in enumerate(labels):
            if labels[i] == 'I' and (i == 0 or labels[i - 1] == 'O'):
                labels[i] = 'O'

        terms = []
        term = []
        for i, (token, label) in enumerate(zip(tokens, labels)):
            if label == 'B': 
                b_pos = i
                term = [token]
            elif label == 'I':
                term.append(token)
            elif len(term) > 0:
                terms.append(' '.join(term))
                if b_pos != 0 and i+1 < len(tokens):
                    if (tokens[i] != '') and (tokens[i] != ' ') and (tokens[i+1] != '') and (tokens[i+1] != ' ') and (len(nlp(str(tokens[i])).sentences) > 0) and (len(nlp(str(tokens[i+1])).sentences) > 0) and (len(nlp(str(tokens[b_pos])).sentences) > 0):
                        a1_word = nlp(str(tokens[i+1])).sentences[0].words[0] 
                        a_word = nlp(str(tokens[i])).sentences[0].words[0]
                        c_word = nlp(str(tokens[b_pos])).sentences[0].words[0]
                        if (c_word.upos == 'NOUN')and (c_word.text != 'None') and (a_word.text != 'None') and (a1_word.text != 'None') and (a_word.upos == 'ADP') and (a1_word.upos == 'NOUN'):
                            terms.append(' '.join(term + [a_word.text] + [a1_word.text]))                               
                term = []
        if len(term) > 0:
            terms.append(' '.join(term))
        
        all_term.append(terms)
    
    final_terms = []
    for i in all_term:
        final_terms.extend(i)

    final_terms = [x.lower() for x in final_terms]
    return final_terms

6. ADJ ADJ NOUN - 2

In [7]:
def get_term_adj_adj_noun(predictions):
    all_term = []
    for sentence in predictions:
        tokens = []
        labels = []
        for d in sentence:
            tokens.extend(d.keys())
            labels.extend(d.values())

        for i, label in enumerate(labels):
            if labels[i] == 'I' and (i == 0 or labels[i - 1] == 'O'):
                labels[i] = 'O'

        terms = []
        term = []
        for i, (token, label) in enumerate(zip(tokens, labels)):
            if label == 'B': 
                b_pos = i
                term = [token]
            elif label == 'I':
                term.append(token)
            elif len(term) > 0:
                terms.append(' '.join(term))
                if b_pos != 0:
                    #ADJ ADJ NOUN              
                    if (tokens[b_pos - 2] != '') and (tokens[b_pos - 2] != ' ') and (tokens[b_pos - 1] != '') and (tokens[b_pos - 1] != ' ') and (len(nlp(str(tokens[b_pos - 2])).sentences) > 0) and (len(nlp(str(tokens[b_pos - 1])).sentences) > 0) and (len(nlp(str(tokens[b_pos])).sentences) > 0):
                        b1_word = nlp(str(tokens[b_pos - 2])).sentences[0].words[0] 
                        b_word = nlp(str(tokens[b_pos - 1])).sentences[0].words[0]
                        c_word = nlp(str(tokens[b_pos])).sentences[0].words[0]
                        if (c_word.upos == 'NOUN') and (c_word.text != 'None') and (b_word.text != 'None') and (b1_word.text != 'None') and (b_word.upos == 'ADJ') and (b1_word.upos == 'ADJ'):
                            terms.append(' '.join([b1_word.text] +[b_word.text] + term))
                term = []
        if len(term) > 0:
            terms.append(' '.join(term))
        all_term.append(terms)
    
    final_terms = []
    for i in all_term:
        final_terms.extend(i)

    final_terms = [x.lower().strip() for x in final_terms]
    return final_terms    

7. NOUN ADP ADJ NOUN - 0

In [8]:
def get_term_noun_adp_adj_noun(predictions):
    all_term = []
    for sentence in predictions:
        tokens = []
        labels = []
        for d in sentence:
            tokens.extend(d.keys())
            labels.extend(d.values())
        for i, label in enumerate(labels):
            if labels[i] == 'I' and (i == 0 or labels[i - 1] == 'O'):
                labels[i] = 'O'
        terms = []
        term = []
        for i, (token, label) in enumerate(zip(tokens, labels)):
            if label == 'B': 
                #Lưu vị trí B
                b_pos = i
                term = [token]
            elif label == 'I':
                term.append(token)
            elif len(term) > 0:
                terms.append(' '.join(term))
                if b_pos != 0 and i + 2 < len(tokens):
                    if (tokens[i] != '') and (tokens[i] != ' ') and (tokens[i+1] != '') and (tokens[i+1] != ' ') and (tokens[i+2] != '') and (tokens[i+2] != ' ') and (tokens[b_pos] != '') and (tokens[b_pos] != ' ') and (len(nlp(str(tokens[i])).sentences) > 0) and (len(nlp(str(tokens[i+1])).sentences) > 0) and (len(nlp(str(tokens[i+2])).sentences) > 0) and (len(nlp(str(tokens[b_pos])).sentences) > 0):
                        a2_word = nlp(str(tokens[i+2])).sentences[0].words[0] 
                        a1_word = nlp(str(tokens[i+1])).sentences[0].words[0] 
                        a_word = nlp(str(tokens[i])).sentences[0].words[0]
                        c_word = nlp(str(tokens[b_pos])).sentences[0].words[0]
                        if (c_word.upos == 'NOUN') and (c_word.text != 'None') and (a_word.text != 'None') and (a1_word.text != 'None') and (a2_word.text != 'None') and (a_word.upos == 'ADP') and (a1_word.upos == 'ADJ') and (a2_word.upos == 'NOUN'):
                            terms.append(' '.join(term + [a_word.text] + [a1_word.text] +[a2_word.text]))   
                term = []
        if len(term) > 0:
            terms.append(' '.join(term))
        all_term.append(terms)
    
    final_terms = []
    for i in all_term:
        final_terms.extend(i)

    final_terms = [x.lower() for x in final_terms]
    return final_terms    

8. ADJ NOUN NOUN NOUN - 3

In [9]:
def get_term_adj_noun_noun_noun(predictions):
    all_term = []
    for sentence in predictions:
        tokens = []
        labels = []
        for d in sentence:
            tokens.extend(d.keys())
            labels.extend(d.values())

        for i, label in enumerate(labels):
            if labels[i] == 'I' and (i == 0 or labels[i - 1] == 'O'):
                labels[i] = 'O'

        terms = []
        term = []
        for i, (token, label) in enumerate(zip(tokens, labels)):
            if label == 'B': 
                b_pos = i
                term = [token]
            elif label == 'I':
                term.append(token)
            elif len(term) > 0:
                terms.append(' '.join(term))
                if b_pos != 0:         
                    if (tokens[b_pos - 3] != '') and (tokens[b_pos - 3] != ' ') and  (tokens[b_pos - 2] != '') and (tokens[b_pos - 2] != ' ') and (tokens[b_pos - 3] != '') and (tokens[b_pos - 3] != ' ') and (tokens[b_pos - 1] != '') and (tokens[b_pos - 1] != ' ') and (len(nlp(str(tokens[b_pos - 2])).sentences) > 0) and (len(nlp(str(tokens[b_pos - 3])).sentences) > 0) and (len(nlp(str(tokens[b_pos - 1])).sentences) > 0) and (len(nlp(str(tokens[b_pos])).sentences) > 0):
                        b2_word = nlp(str(tokens[b_pos - 3])).sentences[0].words[0] 
                        b1_word = nlp(str(tokens[b_pos - 2])).sentences[0].words[0] 
                        b_word = nlp(str(tokens[b_pos - 1])).sentences[0].words[0]
                        c_word = nlp(str(tokens[b_pos])).sentences[0].words[0]
                        if (c_word.upos == 'NOUN') and (c_word.text != 'None') and (b_word.text != 'None') and (b1_word.text != 'None') and (b2_word.text != 'None') and (b_word.upos == 'NOUN') and (b1_word.upos == 'NOUN') and (b2_word.upos == 'ADJ'):
                            terms.append(' '.join([b2_word.text] +[b1_word.text] +[b_word.text] + term))
                term = []
        if len(term) > 0:
            terms.append(' '.join(term))
        all_term.append(terms)
    
    final_terms = []
    for i in all_term:
        final_terms.extend(i)

    final_terms = [x.lower().strip() for x in final_terms]
    return final_terms    

|Patterns|%IATE|Head|
|-|-|-|
|NOUN NOUN	        |0.15	|1| 
|ADJ NOUN	        |0.14	|1| 
|ADJ NOUN NOUN	    |0.04	|2|
|NOUN NOUN NOUN	    |0.03	|2|
|NOUN ADP NOUN	    |0.02	|0| 
|ADJ ADJ NOUN	    |0.01	|2|
|NOUN ADP ADJ NOUN	|0.01	|0|
|ADJ NOUN NOUN NOUN	|0.01	|3|

In [10]:
def term_evaluation(domain_path, preds_path, rule=None):
    groundtruth = pd.read_csv(domain_path, sep='	', engine='python',header=None)
    gt = list(groundtruth[0])
    predictions = pkl.load(open(preds_path, 'rb'))
    if rule == 'adj_noun':
        preds =  get_term_adj_noun(predictions)
    elif rule == 'noun_noun':
        preds =  get_term_noun_noun(predictions)
    elif rule == 'adj_noun_noun':
        preds = get_term_adj_noun_noun(predictions)
    elif rule == 'noun_noun_noun':
        preds = get_term_noun_noun_noun(predictions)
    elif rule == 'noun_adp_noun':
        preds = get_term_noun_adp_noun(predictions)
    elif rule == 'adj_adj_noun':
        preds = get_term_adj_adj_noun(predictions)
    elif rule == 'noun_adp_adj_noun':
        preds = get_term_noun_adp_adj_noun(predictions)
    elif rule == 'adj_noun_noun_noun':
        preds = get_term_adj_noun_noun_noun(predictions)
    else:
        preds =  get_term_(predictions)
    stop_words = set(stopwords.words('english'))
    pred_terms =  set(preds) - set(stop_words)
    pred_terms = [x for x in pred_terms if len(x)>1]
    pred_terms = [x.lower().strip() for x in pred_terms]
    pred_terms = [re.sub(' -','-', x) for x in pred_terms]
    pred_terms = [re.sub('- ','-', x) for x in pred_terms]
    pred_terms = [re.sub('\(','', x) for x in pred_terms]
    pred_terms = [re.sub('\/','', x) for x in pred_terms]
    precision, recall, f1 = evaluation_metrics(pred_terms, gt)
    return f1
    

In [11]:
path = '/Users/hanhtran/Documents/terminology-extraction/ACTER/en/'
path1= '/Users/hanhtran/Documents/terminology-extraction/patterns/en_domains/corp_wind/'
path2= '/Users/hanhtran/Documents/terminology-extraction/patterns/en_domains/equi_wind/'
path3= '/Users/hanhtran/Documents/terminology-extraction/patterns/en_domains/corp_equi/'

domain_paths = [path+'equi/annotations/equi_en_terms.ann', path+'equi/annotations/equi_en_terms_nes.ann',
               path+'corp/annotations/corp_en_terms.ann', path+'corp/annotations/corp_en_terms_nes.ann',
               path+'wind/annotations/wind_en_terms.ann', path+'wind/annotations/wind_en_terms_nes.ann']
preds_paths = [path1 + 'ann_xlnet_equi.pkl', path1 + 'nes_xlnet_equi.pkl',
              path2 + 'ann_xlnet_corp.pkl', path2 + 'nes_xlnet_corp.pkl',
              path3 + 'ann_xlnet_wind.pkl', path3 + 'nes_xlnet_wind.pkl']
                
results = []
for d, p in zip(domain_paths, preds_paths):
    f1 = term_evaluation(d, p)
    results.append([p, f1])
raw_res = pd.DataFrame(results,columns=['DOMAIN','NON_PATTERN'])
raw_res

Unnamed: 0,DOMAIN,NON_PATTERN
0,/Users/hanhtran/Documents/terminology-extracti...,34.81
1,/Users/hanhtran/Documents/terminology-extracti...,40.62
2,/Users/hanhtran/Documents/terminology-extracti...,10.69
3,/Users/hanhtran/Documents/terminology-extracti...,20.2
4,/Users/hanhtran/Documents/terminology-extracti...,23.04
5,/Users/hanhtran/Documents/terminology-extracti...,29.25


In [12]:
results = []
for d, p in zip(domain_paths, preds_paths):
    f1  = term_evaluation(d, p, 'noun_noun')
    results.append([p, f1])
noun_noun = pd.DataFrame(results,columns=['DOMAIN','NOUN_NOUN'])
noun_noun

Unnamed: 0,DOMAIN,NOUN_NOUN
0,/Users/hanhtran/Documents/terminology-extracti...,35.87
1,/Users/hanhtran/Documents/terminology-extracti...,41.35
2,/Users/hanhtran/Documents/terminology-extracti...,11.07
3,/Users/hanhtran/Documents/terminology-extracti...,20.56
4,/Users/hanhtran/Documents/terminology-extracti...,26.1
5,/Users/hanhtran/Documents/terminology-extracti...,30.9


In [13]:
results = []
for d, p in zip(domain_paths, preds_paths):
    f1 = term_evaluation(d,  p, 'adj_noun')
    results.append([p, f1])
adj_noun = pd.DataFrame(results,columns=['DOMAIN','ADJ_NOUN'])
adj_noun

Unnamed: 0,DOMAIN,ADJ_NOUN
0,/Users/hanhtran/Documents/terminology-extracti...,33.63
1,/Users/hanhtran/Documents/terminology-extracti...,39.64
2,/Users/hanhtran/Documents/terminology-extracti...,12.15
3,/Users/hanhtran/Documents/terminology-extracti...,20.27
4,/Users/hanhtran/Documents/terminology-extracti...,23.69
5,/Users/hanhtran/Documents/terminology-extracti...,29.44


In [14]:
results = []
for d, p in zip(domain_paths, preds_paths):
    f1 = term_evaluation(d, p, 'adj_noun_noun')
    results.append([p, f1])
adj_noun_noun = pd.DataFrame(results,columns=['DOMAIN','ADJ_NOUN_NOUN'])
adj_noun_noun

Unnamed: 0,DOMAIN,ADJ_NOUN_NOUN
0,/Users/hanhtran/Documents/terminology-extracti...,34.69
1,/Users/hanhtran/Documents/terminology-extracti...,40.53
2,/Users/hanhtran/Documents/terminology-extracti...,10.67
3,/Users/hanhtran/Documents/terminology-extracti...,20.29
4,/Users/hanhtran/Documents/terminology-extracti...,23.3
5,/Users/hanhtran/Documents/terminology-extracti...,29.17


In [15]:
results = []
for d, p in zip(domain_paths, preds_paths):
    f1 = term_evaluation(d, p, 'noun_noun_noun')
    results.append([p, f1])
noun_noun_noun = pd.DataFrame(results,columns=['DOMAIN','NOUN_NOUN_NOUN'])
noun_noun_noun

Unnamed: 0,DOMAIN,NOUN_NOUN_NOUN
0,/Users/hanhtran/Documents/terminology-extracti...,34.56
1,/Users/hanhtran/Documents/terminology-extracti...,40.39
2,/Users/hanhtran/Documents/terminology-extracti...,10.65
3,/Users/hanhtran/Documents/terminology-extracti...,20.15
4,/Users/hanhtran/Documents/terminology-extracti...,23.49
5,/Users/hanhtran/Documents/terminology-extracti...,29.07


In [16]:
results = []
for d, p in zip(domain_paths, preds_paths):
    f1 = term_evaluation(d, p, 'noun_adp_noun')
    results.append([p,f1])
noun_adp_noun = pd.DataFrame(results,columns=['DOMAIN','NOUN_ADP_NOUN'])
noun_adp_noun

Unnamed: 0,DOMAIN,NOUN_ADP_NOUN
0,/Users/hanhtran/Documents/terminology-extracti...,34.05
1,/Users/hanhtran/Documents/terminology-extracti...,39.81
2,/Users/hanhtran/Documents/terminology-extracti...,10.68
3,/Users/hanhtran/Documents/terminology-extracti...,19.93
4,/Users/hanhtran/Documents/terminology-extracti...,22.53
5,/Users/hanhtran/Documents/terminology-extracti...,28.63


In [17]:
results = []
for d, p in zip(domain_paths, preds_paths):
    f1 = term_evaluation(d,  p, 'adj_adj_noun')
    results.append([p,f1])
adj_adj_noun = pd.DataFrame(results,columns=['DOMAIN','ADJ_ADJ_NOUN'])
adj_adj_noun

Unnamed: 0,DOMAIN,ADJ_ADJ_NOUN
0,/Users/hanhtran/Documents/terminology-extracti...,34.68
1,/Users/hanhtran/Documents/terminology-extracti...,40.52
2,/Users/hanhtran/Documents/terminology-extracti...,10.68
3,/Users/hanhtran/Documents/terminology-extracti...,20.2
4,/Users/hanhtran/Documents/terminology-extracti...,22.95
5,/Users/hanhtran/Documents/terminology-extracti...,29.18


In [18]:
results = []
for d, p in zip(domain_paths, preds_paths):
    f1 = term_evaluation(d, p, 'noun_adp_adj_noun')
    results.append([p,f1])
noun_adp_adj_noun = pd.DataFrame(results,columns=['DOMAIN','NOUN_ADP_ADJ_NOUN'])
noun_adp_adj_noun

Unnamed: 0,DOMAIN,NOUN_ADP_ADJ_NOUN
0,/Users/hanhtran/Documents/terminology-extracti...,34.61
1,/Users/hanhtran/Documents/terminology-extracti...,40.4
2,/Users/hanhtran/Documents/terminology-extracti...,10.59
3,/Users/hanhtran/Documents/terminology-extracti...,20.08
4,/Users/hanhtran/Documents/terminology-extracti...,22.83
5,/Users/hanhtran/Documents/terminology-extracti...,29.02


In [19]:
results = []
for d, p in zip(domain_paths, preds_paths):
    f1 = term_evaluation(d, p, 'adj_noun_noun_noun')
    results.append([p,f1])
adj_noun_noun_noun = pd.DataFrame(results,columns=['DOMAIN','ADJ_NOUN_NOUN_NOUN'])
adj_noun_noun_noun

Unnamed: 0,DOMAIN,ADJ_NOUN_NOUN_NOUN
0,/Users/hanhtran/Documents/terminology-extracti...,34.84
1,/Users/hanhtran/Documents/terminology-extracti...,40.58
2,/Users/hanhtran/Documents/terminology-extracti...,10.68
3,/Users/hanhtran/Documents/terminology-extracti...,20.19
4,/Users/hanhtran/Documents/terminology-extracti...,23.06
5,/Users/hanhtran/Documents/terminology-extracti...,29.29


In [20]:
df_ls = [noun_noun, adj_noun, adj_noun_noun, noun_noun_noun ,
         noun_adp_noun, adj_adj_noun,noun_adp_adj_noun,adj_noun_noun_noun]

In [21]:
for d in df_ls:
    raw_res = raw_res.merge(d, on='DOMAIN', how='left')

In [22]:
raw_res['DOMAIN'] = [x.split('/')[-1] for x in raw_res['DOMAIN']]
raw_res                     

Unnamed: 0,DOMAIN,NON_PATTERN,NOUN_NOUN,ADJ_NOUN,ADJ_NOUN_NOUN,NOUN_NOUN_NOUN,NOUN_ADP_NOUN,ADJ_ADJ_NOUN,NOUN_ADP_ADJ_NOUN,ADJ_NOUN_NOUN_NOUN
0,ann_xlnet_equi.pkl,34.81,35.87,33.63,34.69,34.56,34.05,34.68,34.61,34.84
1,nes_xlnet_equi.pkl,40.62,41.35,39.64,40.53,40.39,39.81,40.52,40.4,40.58
2,ann_xlnet_corp.pkl,10.69,11.07,12.15,10.67,10.65,10.68,10.68,10.59,10.68
3,nes_xlnet_corp.pkl,20.2,20.56,20.27,20.29,20.15,19.93,20.2,20.08,20.19
4,ann_xlnet_wind.pkl,23.04,26.1,23.69,23.3,23.49,22.53,22.95,22.83,23.06
5,nes_xlnet_wind.pkl,29.25,30.9,29.44,29.17,29.07,28.63,29.18,29.02,29.29


In [23]:
raw_res.to_csv('en_iate_comparison.csv')

In [24]:
final = raw_res.T.rename(columns=raw_res.T.iloc[0]).drop(raw_res.T.index[0])
final

Unnamed: 0,ann_xlnet_equi.pkl,nes_xlnet_equi.pkl,ann_xlnet_corp.pkl,nes_xlnet_corp.pkl,ann_xlnet_wind.pkl,nes_xlnet_wind.pkl
NON_PATTERN,34.81,40.62,10.69,20.2,23.04,29.25
NOUN_NOUN,35.87,41.35,11.07,20.56,26.1,30.9
ADJ_NOUN,33.63,39.64,12.15,20.27,23.69,29.44
ADJ_NOUN_NOUN,34.69,40.53,10.67,20.29,23.3,29.17
NOUN_NOUN_NOUN,34.56,40.39,10.65,20.15,23.49,29.07
NOUN_ADP_NOUN,34.05,39.81,10.68,19.93,22.53,28.63
ADJ_ADJ_NOUN,34.68,40.52,10.68,20.2,22.95,29.18
NOUN_ADP_ADJ_NOUN,34.61,40.4,10.59,20.08,22.83,29.02
ADJ_NOUN_NOUN_NOUN,34.84,40.58,10.68,20.19,23.06,29.29


In [25]:
for col in list(final.columns):
    final[col] = [float(x) for x in final[col]]
    final['percent_'+ col] = round(final[col]*100/final[col].iloc[0],2) - 100

final.filter(regex='percent_')

Unnamed: 0,percent_ann_xlnet_equi.pkl,percent_nes_xlnet_equi.pkl,percent_ann_xlnet_corp.pkl,percent_nes_xlnet_corp.pkl,percent_ann_xlnet_wind.pkl,percent_nes_xlnet_wind.pkl
NON_PATTERN,0.0,0.0,0.0,0.0,0.0,0.0
NOUN_NOUN,3.05,1.8,3.55,1.78,13.28,5.64
ADJ_NOUN,-3.39,-2.41,13.66,0.35,2.82,0.65
ADJ_NOUN_NOUN,-0.34,-0.22,-0.19,0.45,1.13,-0.27
NOUN_NOUN_NOUN,-0.72,-0.57,-0.37,-0.25,1.95,-0.62
NOUN_ADP_NOUN,-2.18,-1.99,-0.09,-1.34,-2.21,-2.12
ADJ_ADJ_NOUN,-0.37,-0.25,-0.09,0.0,-0.39,-0.24
NOUN_ADP_ADJ_NOUN,-0.57,-0.54,-0.94,-0.59,-0.91,-0.79
ADJ_NOUN_NOUN_NOUN,0.09,-0.1,-0.09,-0.05,0.09,0.14


In [26]:
final.filter(regex='percent_')[['percent_ann_xlnet_corp.pkl', 'percent_ann_xlnet_equi.pkl', 'percent_ann_xlnet_wind.pkl',
                                'percent_nes_xlnet_corp.pkl','percent_nes_xlnet_equi.pkl', 'percent_nes_xlnet_wind.pkl']]

Unnamed: 0,percent_ann_xlnet_corp.pkl,percent_ann_xlnet_equi.pkl,percent_ann_xlnet_wind.pkl,percent_nes_xlnet_corp.pkl,percent_nes_xlnet_equi.pkl,percent_nes_xlnet_wind.pkl
NON_PATTERN,0.0,0.0,0.0,0.0,0.0,0.0
NOUN_NOUN,3.55,3.05,13.28,1.78,1.8,5.64
ADJ_NOUN,13.66,-3.39,2.82,0.35,-2.41,0.65
ADJ_NOUN_NOUN,-0.19,-0.34,1.13,0.45,-0.22,-0.27
NOUN_NOUN_NOUN,-0.37,-0.72,1.95,-0.25,-0.57,-0.62
NOUN_ADP_NOUN,-0.09,-2.18,-2.21,-1.34,-1.99,-2.12
ADJ_ADJ_NOUN,-0.09,-0.37,-0.39,0.0,-0.25,-0.24
NOUN_ADP_ADJ_NOUN,-0.94,-0.57,-0.91,-0.59,-0.54,-0.79
ADJ_NOUN_NOUN_NOUN,-0.09,0.09,0.09,-0.05,-0.1,0.14


In [27]:
final.filter(regex='percent_').to_csv('percent_comparison.csv')