In [1]:
import pandas as pd
pattern_n = pd.read_csv('patterns_nl.txt', delimiter = '\t')
pattern_n

Unnamed: 0,pattern,iate_%,headword,quality
0,NOUN,0.34,0,1
1,ADJ NOUN,0.1,1,1
2,NOUN ADP NOUN,0.03,0,1
3,ADV,0.03,0,0
4,NOUN ADP DET NOUN,0.03,1,1
5,ADJ,0.02,0,1
6,NOUN ADP ADJ NOUN,0.01,0,1
7,ADP NOUN,0.01,1,1
8,ADJ ADJ NOUN,0.01,2,1
9,ADP,0.01,0,1


In [2]:
import pandas as pd
import pickle as pkl
import re
from collections import Counter
import nltk
nltk.download('stopwords')
from nltk.corpus import stopwords 

import stanza
stanza.download('nl')
nlp = stanza.Pipeline(lang='nl', processors='tokenize,pos,lemma')

def count_freq(preds, gts):
    preds_len = [len(x.split(' ')) for x in preds]
    gts_len = [len(x.split(' ')) for x in gts]
    print(Counter(preds_len))
    print(Counter(gts_len))

def evaluation_metrics(pred, gt):
    TP = len(set(pred) & set(gt)) 
    FP = len(set(pred)-set(gt))
    FN = len(set(gt)-set(pred))
    precision = round((TP/(TP+FP))*100, 2)
    recall = round((TP/(TP+FN))*100,2)
    f1_score = round((2 * precision * recall) / (precision + recall),2)
    return precision, recall, f1_score 

def lemma(li):
    new_list = []
    for t in li:
        doc = nlp(str(t))
        doc1 = ' '.join([word.lemma for sent in doc.sentences for word in sent.words])
        doc1 = re.sub('-',' ',doc1)
        doc1 = re.sub(' +', ' ',doc1)
        new_list.append(doc1)
    new_list = [s for s in new_list if len(s) >= 2]
    return new_list

def get_term_(predictions):
    all_term = []
    for sentence in predictions:
        tokens = []
        labels = []
        for d in sentence:
            tokens.extend(d.keys())
            labels.extend(d.values())

        for i, label in enumerate(labels):
            if labels[i] == 'I' and (i == 0 or labels[i - 1] == 'O'):
                labels[i] = 'O'

        terms = []
        term = []
        for token, label in zip(tokens, labels):
            if label == 'B':
                #Lưu vị trí B
                b_pos = i
                term = [token]
            elif label == 'I':
                term.append(token)
            elif len(term) > 0:
                terms.append(' '.join(term))
                term = []
        if len(term) > 0:
            terms.append(' '.join(term))
            # Check b_pos = 0 không
        all_term.append(terms)
    
    final_terms = []
    for i in all_term:
        final_terms.extend(i)

    final_terms = [x.lower().strip() for x in final_terms]
    return final_terms  

[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/hanhtran/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
Downloading https://raw.githubusercontent.com/stanfordnlp/stanza-resources/master/resources_1.2.0.json: 128kB [00:00, 39.6MB/s]                    
2021-06-10 11:55:20 INFO: Downloading default packages for language: nl (Dutch)...
2021-06-10 11:55:22 INFO: File exists: /Users/hanhtran/stanza_resources/nl/default.zip.
2021-06-10 11:55:26 INFO: Finished downloading models and saved to /Users/hanhtran/stanza_resources.
2021-06-10 11:55:26 INFO: Loading these models for language: nl (Dutch):
| Processor | Package |
-----------------------
| tokenize  | alpino  |
| pos       | alpino  |
| lemma     | alpino  |

2021-06-10 11:55:26 INFO: Use device: cpu
2021-06-10 11:55:26 INFO: Loading: tokenize
2021-06-10 11:55:26 INFO: Loading: pos
2021-06-10 11:55:26 INFO: Loading: lemma
2021-06-10 11:55:26 INFO: Done loading processors!


1. ADJ NOUN -1

In [3]:
def get_term_adj_noun(predictions):
    all_term = []
    for sentence in predictions:
        tokens = []
        labels = []
        for d in sentence:
            tokens.extend(d.keys())
            labels.extend(d.values())

        for i, label in enumerate(labels):
            if labels[i] == 'I' and (i == 0 or labels[i - 1] == 'O'):
                labels[i] = 'O'
        terms = []
        term = []
        for i, (token, label) in enumerate(zip(tokens, labels)):
            if label == 'B': 
                #Lưu vị trí B
                b_pos = i
                term = [token]
            elif label == 'I':
                term.append(token)
            elif len(term) > 0:
                terms.append(' '.join(term))
                if (b_pos != 0) and (tokens[b_pos - 1] != '') and (tokens[b_pos - 1] != ' ') and (len(nlp(str(tokens[b_pos - 1])).sentences) > 0) and (len(nlp(str(tokens[b_pos])).sentences) > 0):
                    b_word = nlp(str(tokens[b_pos - 1])).sentences[0].words[0]
                    c_word = nlp(str(tokens[b_pos])).sentences[0].words[0]
                    if (c_word.upos == 'NOUN') and (c_word.text != 'None') and (b_word.text != 'None') and (b_word.upos == 'ADJ'):
                        terms.append(' '.join([b_word.text] + term))
                term = []
        if len(term) > 0:
            terms.append(' '.join(term))
        all_term.append(terms)
    
    final_terms = []
    for i in all_term:
        final_terms.extend(i)

    final_terms = [x.lower().strip() for x in final_terms]
    return final_terms    

2. NOUN ADP NOUN - 0

In [4]:
def get_term_noun_adp_noun(predictions):
    all_term = []
    for sentence in predictions:
        tokens = []
        labels = []
        for d in sentence:
            tokens.extend(d.keys())
            labels.extend(d.values())

        for i, label in enumerate(labels):
            if labels[i] == 'I' and (i == 0 or labels[i - 1] == 'O'):
                labels[i] = 'O'

        terms = []
        term = []
        for i, (token, label) in enumerate(zip(tokens, labels)):
            if label == 'B': 
                b_pos = i
                term = [token]
            elif label == 'I':
                term.append(token)
            elif len(term) > 0:
                terms.append(' '.join(term))
                if b_pos != 0 and i+1 < len(tokens):
                    if (tokens[i] != '') and (tokens[i] != ' ') and (tokens[i+1] != '') and (tokens[i+1] != ' ') and (len(nlp(str(tokens[i])).sentences) > 0 and len(nlp(str(tokens[i+1])).sentences) > 0 and len(nlp(str(tokens[b_pos])).sentences) > 0):
                        a1_word = nlp(str(tokens[i+1])).sentences[0].words[0] 
                        a_word = nlp(str(tokens[i])).sentences[0].words[0]
                        c_word = nlp(str(tokens[b_pos])).sentences[0].words[0]
                        if (c_word.upos == 'NOUN')and (c_word.text != 'None') and (a_word.text != 'None') and (a1_word.text != 'None') and ((a_word.upos == 'ADP') and (a1_word.upos == 'NOUN')):
                            terms.append(' '.join(term + [a_word.text] + [a1_word.text]))                               
                term = []
        if len(term) > 0:
            terms.append(' '.join(term))
        
        all_term.append(terms)
    
    final_terms = []
    for i in all_term:
        final_terms.extend(i)

    final_terms = [x.lower() for x in final_terms]
    return final_terms

3. NOUN ADP DET NOUN - 1

In [5]:
def get_term_noun_adp_det_noun(predictions):
    all_term = []
    for sentence in predictions:
        tokens = []
        labels = []
        for d in sentence:
            tokens.extend(d.keys())
            labels.extend(d.values())

        for i, label in enumerate(labels):
            if labels[i] == 'I' and (i == 0 or labels[i - 1] == 'O'):
                labels[i] = 'O'

        terms = []
        term = []
        for i, (token, label) in enumerate(zip(tokens, labels)):
            if label == 'B': 
                b_pos = i
                term = [token]
            elif label == 'I':
                term.append(token)
            elif len(term) > 0:
                terms.append(' '.join(term))
                #adj_noun_adj_noun
                if (b_pos != 0) and (i + 1 < len(tokens)) and (tokens[b_pos - 1] != '') and (tokens[b_pos - 1] != ' ') and (tokens[b_pos] != '') and (tokens[b_pos] != ' ') and (tokens[i] != '') and (tokens[i] != ' ') and (tokens[i+1] != '') and (tokens[i+1] != ' ') and (len(nlp(str(tokens[b_pos - 1])).sentences) > 0) and (len(nlp(str(tokens[b_pos])).sentences) > 0) and (len(nlp(str(tokens[i])).sentences) > 0) and  (len(nlp(str(tokens[i+1])).sentences) > 0): 
                    a1_word = nlp(str(tokens[i+1])).sentences[0].words[0]
                    a_word = nlp(str(tokens[i])).sentences[0].words[0]
                    b_word = nlp(str(tokens[b_pos - 1])).sentences[0].words[0]
                    c_word = nlp(str(tokens[b_pos])).sentences[0].words[0]
                    # Check vị trí b_pos - 1: terms.append()
                    if (c_word.text != 'None') and (b_word.text != 'None') and (a_word.text != 'None') and (a1_word.text != 'None') and (c_word.upos == 'ADP') and ((b_word.upos == 'NOUN') and (a_word.upos == 'DET') and (a1_word.upos == 'NOUN')):
                        terms.append(' '.join([b_word.text] + term + [a_word.text] + [a1_word.text]))
                                
                term = []
        if len(term) > 0:
            terms.append(' '.join(term))
            # check b_pos - 1
        
        all_term.append(terms)
    
    final_terms = []
    for i in all_term:
        final_terms.extend(i)

    final_terms = [x.lower() for x in final_terms]
    return final_terms

4. NOUN ADP ADJ NOUN - 0

In [6]:
def get_term_noun_adp_adj_noun(predictions):
    all_term = []
    for sentence in predictions:
        tokens = []
        labels = []
        for d in sentence:
            tokens.extend(d.keys())
            labels.extend(d.values())
        for i, label in enumerate(labels):
            if labels[i] == 'I' and (i == 0 or labels[i - 1] == 'O'):
                labels[i] = 'O'
        terms = []
        term = []
        for i, (token, label) in enumerate(zip(tokens, labels)):
            if label == 'B': 
                #Lưu vị trí B
                b_pos = i
                term = [token]
            elif label == 'I':
                term.append(token)
            elif len(term) > 0:
                terms.append(' '.join(term))
                if b_pos != 0 and i + 2 < len(tokens):
                    if (tokens[i] != '') and (tokens[i] != ' ') and (tokens[i+1] != '') and (tokens[i+1] != ' ') and (tokens[i+2] != '') and (tokens[i+2] != ' ') and (tokens[b_pos] != '') and (tokens[b_pos] != ' ') and (len(nlp(str(tokens[i])).sentences) > 0 and len(nlp(str(tokens[i+1])).sentences) > 0 and len(nlp(str(tokens[i+2])).sentences) > 0 and len(nlp(str(tokens[b_pos])).sentences) > 0):
                        a2_word = nlp(str(tokens[i+2])).sentences[0].words[0] 
                        a1_word = nlp(str(tokens[i+1])).sentences[0].words[0] 
                        a_word = nlp(str(tokens[i])).sentences[0].words[0]
                        c_word = nlp(str(tokens[b_pos])).sentences[0].words[0]
                        if (c_word.upos == 'NOUN') and (c_word.text != 'None') and (a_word.text != 'None') and (a1_word.text != 'None') and (a2_word.text != 'None') and (a_word.upos == 'ADP') and (a1_word.upos == 'ADJ') and (a2_word.upos == 'NOUN'):
                            terms.append(' '.join(term + [a_word.text] + [a1_word.text] +[a2_word.text]))   
                term = []
        if len(term) > 0:
            terms.append(' '.join(term))
        all_term.append(terms)
    
    final_terms = []
    for i in all_term:
        final_terms.extend(i)

    final_terms = [x.lower() for x in final_terms]
    return final_terms    

5. ADP NOUN - 1

In [7]:
def get_term_adp_noun(predictions):
    all_term = []
    for sentence in predictions:
        tokens = []
        labels = []
        for d in sentence:
            tokens.extend(d.keys())
            labels.extend(d.values())

        for i, label in enumerate(labels):
            if labels[i] == 'I' and (i == 0 or labels[i - 1] == 'O'):
                labels[i] = 'O'
        terms = []
        term = []
        for i, (token, label) in enumerate(zip(tokens, labels)):
            if label == 'B': 
                #Lưu vị trí B
                b_pos = i
                term = [token]
            elif label == 'I':
                term.append(token)
            elif len(term) > 0:
                terms.append(' '.join(term))
                if (b_pos != 0) and (tokens[b_pos - 1] != '') and (tokens[b_pos - 1] != ' ') and (len(nlp(str(tokens[b_pos - 1])).sentences) > 0) and (len(nlp(str(tokens[b_pos])).sentences) > 0):
                    b_word = nlp(str(tokens[b_pos - 1])).sentences[0].words[0]
                    c_word = nlp(str(tokens[b_pos])).sentences[0].words[0]
                    if (c_word.upos == 'NOUN') and (c_word.text != 'None') and (b_word.text != 'None') and (b_word.upos == 'ADP'):
                        terms.append(' '.join([b_word.text] + term))
                term = []
        if len(term) > 0:
            terms.append(' '.join(term))
        all_term.append(terms)
    
    final_terms = []
    for i in all_term:
        final_terms.extend(i)

    final_terms = [x.lower().strip() for x in final_terms]
    return final_terms    

6. ADJ ADJ NOUN - 2

In [8]:
def get_term_adj_adj_noun(predictions):
    all_term = []
    for sentence in predictions:
        tokens = []
        labels = []
        for d in sentence:
            tokens.extend(d.keys())
            labels.extend(d.values())

        for i, label in enumerate(labels):
            if labels[i] == 'I' and (i == 0 or labels[i - 1] == 'O'):
                labels[i] = 'O'

        terms = []
        term = []
        for i, (token, label) in enumerate(zip(tokens, labels)):
            if label == 'B': 
                b_pos = i
                term = [token]
            elif label == 'I':
                term.append(token)
            elif len(term) > 0:
                terms.append(' '.join(term))
                if b_pos != 0:
                    #ADJ ADJ NOUN              
                    if (tokens[b_pos - 2] != '') and (tokens[b_pos - 2] != ' ') and (tokens[b_pos - 1] != '') and (tokens[b_pos - 1] != ' ') and (len(nlp(str(tokens[b_pos - 2])).sentences) > 0 and len(nlp(str(tokens[b_pos - 1])).sentences) > 0 and len(nlp(str(tokens[b_pos])).sentences) > 0):
                        b1_word = nlp(str(tokens[b_pos - 2])).sentences[0].words[0] 
                        b_word = nlp(str(tokens[b_pos - 1])).sentences[0].words[0]
                        c_word = nlp(str(tokens[b_pos])).sentences[0].words[0]
                        if (c_word.upos == 'NOUN') and (c_word.text != 'None') and (b_word.text != 'None') and (b1_word.text != 'None') and ((b_word.upos == 'ADJ') and (b1_word.upos == 'ADJ')):
                            terms.append(' '.join([b1_word.text] +[b_word.text] + term))
                term = []
        if len(term) > 0:
            terms.append(' '.join(term))
        all_term.append(terms)
    
    final_terms = []
    for i in all_term:
        final_terms.extend(i)

    final_terms = [x.lower().strip() for x in final_terms]
    return final_terms    

In [11]:
def term_evaluation(domain_path, preds_path, rule=None):
    groundtruth = pd.read_csv(domain_path, sep='	', engine='python',header=None)
    gt = list(groundtruth[0])
    predictions = pkl.load(open(preds_path, 'rb'))
    if rule == 'adj_noun':
        preds =  get_term_adj_noun(predictions)
    elif rule == 'noun_adp_noun':
        preds =  get_term_noun_adp_noun(predictions)
    elif rule == 'noun_adp_det_noun':
        preds = get_term_noun_adp_det_noun(predictions)
    elif rule == 'noun_adp_adj_noun':
        preds = get_term_noun_adp_adj_noun(predictions)
    elif rule == 'adp_noun':
        preds = get_term_adp_noun(predictions)
    elif rule == 'adj_adj_noun':
        preds = get_term_adj_adj_noun(predictions)
    else:
        preds =  get_term_(predictions)
    stop_words = set(stopwords.words('dutch'))
    pred_terms =  set(preds) - set(stop_words)
    pred_terms = [x for x in pred_terms if len(x)>1]
    pred_terms = [x.lower().strip() for x in pred_terms]
    pred_terms = [re.sub(' -','-', x) for x in pred_terms]
    pred_terms = [re.sub('- ','-', x) for x in pred_terms]
    pred_terms = [re.sub('\(','', x) for x in pred_terms]
    pred_terms = [re.sub('\/','', x) for x in pred_terms]
    precision, recall, f1 = evaluation_metrics(pred_terms, gt)
    return precision, recall, f1

In [12]:
path = '/Users/hanhtran/Documents/terminology-extraction/ACTER/'
domain_paths = [path+'nl/htfl/annotations/htfl_nl_terms.ann', path+'nl/wind/annotations/wind_nl_terms.ann',
                path+'nl/equi/annotations/equi_nl_terms.ann',path+'nl/corp/annotations/corp_nl_terms.ann',
                path+'nl/htfl/annotations/htfl_nl_terms_nes.ann', path+'nl/wind/annotations/wind_nl_terms_nes.ann',
                path+'nl/equi/annotations/equi_nl_terms_nes.ann',path+'nl/corp/annotations/corp_nl_terms_nes.ann',
               ]
preds_paths = ['ann_bert_htfl.pkl','ann_bert_wind.pkl','ann_bert_equi.pkl', 'ann_bert_corp.pkl',
              'nes_bert_htfl.pkl','nes_bert_wind.pkl','nes_bert_equi.pkl', 'nes_bert_corp.pkl']
results = []
for d, p in zip(domain_paths, preds_paths):
    pre, rec, f1 = term_evaluation(d,p)
    results.append([p,pre, rec, f1])
raw_res = pd.DataFrame(results,columns=['DOMAIN','NON_PATTERN_P','NON_PATTERN_R','NON_PATTERN_F1'])
raw_res

Unnamed: 0,DOMAIN,NON_PATTERN_P,NON_PATTERN_R,NON_PATTERN_F1
0,ann_bert_htfl.pkl,60.4,59.5,59.95
1,ann_bert_wind.pkl,56.45,75.43,64.57
2,ann_bert_equi.pkl,65.92,75.95,70.58
3,ann_bert_corp.pkl,61.21,69.63,65.15
4,nes_bert_htfl.pkl,63.54,58.83,61.09
5,nes_bert_wind.pkl,59.8,68.35,63.79
6,nes_bert_equi.pkl,67.7,72.47,70.0
7,nes_bert_corp.pkl,61.03,66.87,63.82


In [14]:
results = []
for d, p in zip(domain_paths, preds_paths):
    pre, rec, f1 = term_evaluation(d,p, 'adj_noun')
    results.append([p,pre, rec, f1])
adj_noun = pd.DataFrame(results,columns=['DOMAIN','ADJ_NOUN_P','ADJ_NOUN_R','ADJ_NOUN_F1'])
adj_noun

In [None]:
results = []
for d, p in zip(domain_paths, preds_paths):
    pre, rec, f1 = term_evaluation(d,p, 'noun_adp_noun')
    results.append([p,pre, rec, f1])
noun_adp_noun = pd.DataFrame(results,columns=['DOMAIN','NOUN_ADP_NOUN_P','NOUN_ADP_NOUN_R','NOUN_ADP_NOUN_F1'])
noun_adp_noun

In [None]:
results = []
for d, p in zip(domain_paths, preds_paths):
    pre, rec, f1 = term_evaluation(d,p, 'noun_adp_det_noun')
    results.append([p,pre, rec, f1])
noun_adp_det_noun = pd.DataFrame(results,columns=['DOMAIN','NOUN_ADP_DET_NOUN_P','NOUN_ADP_DET_NOUN_R','NOUN_ADP_DET_NOUN_F1'])
noun_adp_det_noun

In [None]:
results = []
for d, p in zip(domain_paths, preds_paths):
    pre, rec, f1 = term_evaluation(d,p, 'noun_adp_adj_noun')
    results.append([p,pre, rec, f1])
noun_adp_adj_noun = pd.DataFrame(results,columns=['DOMAIN','NOUN_ADP_ADJ_NOUN_P','NOUN_ADP_ADJ_NOUN_R','NOUN_ADP_ADJ_NOUN_F1'])
noun_adp_adj_noun

In [None]:
results = []
for d, p in zip(domain_paths, preds_paths):
    pre, rec, f1 = term_evaluation(d,p, 'adp_noun')
    results.append([p,pre, rec, f1])
adp_noun = pd.DataFrame(results,columns=['DOMAIN','ADP_NOUN_P','ADP_NOUN_R','ADP_NOUN_F1'])
adp_noun

In [None]:
results = []
for d, p in zip(domain_paths, preds_paths):
    pre, rec, f1 = term_evaluation(d,p, 'adj_adj_noun')
    results.append([p,pre, rec, f1])
adj_adj_noun = pd.DataFrame(results,columns=['DOMAIN','ADJ_ADJ_NOUN_P','ADJ_ADJ_NOUN_R','ADJ_ADJ_NOUN_F1'])
adj_adj_noun

In [None]:
df_ls = [adj_noun, noun_adp_noun, noun_adp_det_noun, noun_adp_adj_noun, adp_noun, adj_adj_noun]

for d in df_ls:
    raw_res = raw_res.merge(d, on='DOMAIN', how='left')
raw_res

Integrate all version

In [18]:
pwd

'/Users/hanhtran/Documents/terminology-extraction/patterns/nl_domains'

In [15]:
import pandas as pd

In [22]:
path = '/Users/hanhtran/Documents/terminology-extraction/patterns/nl_domains/nl_filtering/'
non_pattern = pd.read_csv(path + 'non_partern.csv', index_col=0)
adj_noun = pd.read_csv(path + 'adj_noun.csv', index_col=0)
adj_adj_noun = pd.read_csv(path + 'adj_adj_noun.csv', index_col=0)
adp_noun = pd.read_csv(path + 'adp_noun.csv', index_col=0)
noun_adp_adj_noun = pd.read_csv(path + 'noun_adp_adj_noun.csv', index_col=0)
noun_adp_det_noun = pd.read_csv(path + 'noun_adp_det_noun.csv', index_col=0)
noun_adp_noun = pd.read_csv(path + 'noun_adp_noun.csv', index_col=0)

In [23]:
df = non_pattern.merge(adj_noun, on = 'DOMAIN', how='left')
df = df.merge(adj_adj_noun, on = 'DOMAIN', how='left')
df = df.merge(adp_noun, on = 'DOMAIN', how='left')
df = df.merge(noun_adp_adj_noun, on = 'DOMAIN', how='left')
df = df.merge(noun_adp_det_noun, on = 'DOMAIN', how='left')
df = df.merge(noun_adp_noun, on = 'DOMAIN', how='left')
df.head(5)

Unnamed: 0,DOMAIN,NON_PATTERN_P,NON_PATTERN_R,NON_PATTERN_F1,ADJ_NOUN_P,ADJ_NOUN_R,ADJ_NOUN_F1,ADJ_ADJ_NOUN_P,ADJ_ADJ_NOUN_R,ADJ_ADJ_NOUN_F1,...,ADP_NOUN_F1,NOUN_ADP_ADJ_NOUN_P,NOUN_ADP_ADJ_NOUN_R,NOUN_ADP_ADJ_NOUN_F1,NOUN_ADP_DET_NOUN_P,NOUN_ADP_DET_NOUN_R,NOUN_ADP_DET_NOUN_F1,NOUN_ADP_NOUN_P,NOUN_ADP_NOUN_R,NOUN_ADP_NOUN_F1
0,/gdrive/MyDrive/TermEval/nl_domains/ann_bert_h...,60.4,59.5,59.95,55.74,64.8,59.93,59.87,59.98,59.92,...,53.6,59.29,59.55,59.42,60.34,59.5,59.92,54.29,59.5,56.78
1,/gdrive/MyDrive/TermEval/nl_domains/ann_bert_w...,56.45,75.43,64.57,47.1,79.57,59.17,54.79,75.43,63.47,...,57.57,54.62,75.43,63.36,56.45,75.43,64.57,50.53,75.53,60.55
2,/gdrive/MyDrive/TermEval/nl_domains/ann_bert_e...,65.92,75.95,70.58,53.29,78.97,63.64,64.55,75.95,69.79,...,65.55,65.39,75.95,70.28,65.31,75.95,70.23,60.34,76.02,67.28
3,/gdrive/MyDrive/TermEval/nl_domains/ann_bert_c...,61.21,69.63,65.15,52.82,76.12,62.36,59.54,69.72,64.23,...,57.38,59.37,70.2,64.33,61.16,69.63,65.12,53.13,72.21,61.22
4,/gdrive/MyDrive/TermEval/nl_domains/nes_bert_h...,63.54,58.83,61.09,58.29,64.11,61.06,63.08,59.27,61.12,...,54.8,62.51,58.87,60.64,63.48,58.83,61.07,57.66,58.92,58.28


In [26]:
df['DOMAIN'] = [x.split('/')[-1] for x in df['DOMAIN']]
cols = [col for col in df.columns if 'DOMAIN' in col or '_F1' in col]
df[cols]

Unnamed: 0,DOMAIN,NON_PATTERN_F1,ADJ_NOUN_F1,ADJ_ADJ_NOUN_F1,ADP_NOUN_F1,NOUN_ADP_ADJ_NOUN_F1,NOUN_ADP_DET_NOUN_F1,NOUN_ADP_NOUN_F1
0,ann_bert_htfl.pkl,59.95,59.93,59.92,53.6,59.42,59.92,56.78
1,ann_bert_wind.pkl,64.57,59.17,63.47,57.57,63.36,64.57,60.55
2,ann_bert_equi.pkl,70.58,63.64,69.79,65.55,70.28,70.23,67.28
3,ann_bert_corp.pkl,65.15,62.36,64.23,57.38,64.33,65.12,61.22
4,nes_bert_htfl.pkl,61.09,61.06,61.12,54.8,60.64,61.07,58.28
5,nes_bert_wind.pkl,63.79,59.93,62.92,58.34,62.9,63.79,60.76
6,nes_bert_equi.pkl,70.0,63.78,69.35,65.53,69.7,69.46,67.07
7,nes_bert_corp.pkl,63.82,61.32,63.31,57.47,63.14,63.79,60.48


In [27]:
f1_df = df[cols].set_index('DOMAIN').T
f1_df.to_csv('nl_iate_f1.csv')

In [28]:
f1_df

DOMAIN,ann_bert_htfl.pkl,ann_bert_wind.pkl,ann_bert_equi.pkl,ann_bert_corp.pkl,nes_bert_htfl.pkl,nes_bert_wind.pkl,nes_bert_equi.pkl,nes_bert_corp.pkl
NON_PATTERN_F1,59.95,64.57,70.58,65.15,61.09,63.79,70.0,63.82
ADJ_NOUN_F1,59.93,59.17,63.64,62.36,61.06,59.93,63.78,61.32
ADJ_ADJ_NOUN_F1,59.92,63.47,69.79,64.23,61.12,62.92,69.35,63.31
ADP_NOUN_F1,53.6,57.57,65.55,57.38,54.8,58.34,65.53,57.47
NOUN_ADP_ADJ_NOUN_F1,59.42,63.36,70.28,64.33,60.64,62.9,69.7,63.14
NOUN_ADP_DET_NOUN_F1,59.92,64.57,70.23,65.12,61.07,63.79,69.46,63.79
NOUN_ADP_NOUN_F1,56.78,60.55,67.28,61.22,58.28,60.76,67.07,60.48


In [29]:
df[[col for col in df.columns if 'DOMAIN' in col or 'NON_PATTERN' in col]]

Unnamed: 0,DOMAIN,NON_PATTERN_P,NON_PATTERN_R,NON_PATTERN_F1
0,ann_bert_htfl.pkl,60.4,59.5,59.95
1,ann_bert_wind.pkl,56.45,75.43,64.57
2,ann_bert_equi.pkl,65.92,75.95,70.58
3,ann_bert_corp.pkl,61.21,69.63,65.15
4,nes_bert_htfl.pkl,63.54,58.83,61.09
5,nes_bert_wind.pkl,59.8,68.35,63.79
6,nes_bert_equi.pkl,67.7,72.47,70.0
7,nes_bert_corp.pkl,61.03,66.87,63.82
