In [1]:
import pandas as pd

In [4]:
pattern_f = pd.read_csv('patterns_fr.txt', delimiter = '\t')
pattern_f

Unnamed: 0,pattern,iate_%,headword,quality
0,NOUN ADP NOUN,0.14,0,1
1,NOUN ADJ,0.14,0,1
2,NOUN,0.11,0,1
3,NOUN ADP DET NOUN,0.04,0,1
4,NOUN ADP NOUN ADJ,0.03,0,1
5,NOUN ADP NOUN ADP NOUN,0.02,0,1
6,ADJ,0.02,0,1
7,ADV,0.02,0,0
8,NOUN ADJ ADJ,0.01,0,1
9,NOUN ADP NOUN ADP DET NOUN,0.01,0,1


In [1]:
import pandas as pd
import pickle as pkl
import re
from collections import Counter
import nltk
nltk.download('stopwords')
from nltk.corpus import stopwords 

import stanza
stanza.download('fr')
nlp = stanza.Pipeline(lang='fr', processors='tokenize,pos,lemma')

def count_freq(preds, gts):
    preds_len = [len(x.split(' ')) for x in preds]
    gts_len = [len(x.split(' ')) for x in gts]
    print(Counter(preds_len))
    print(Counter(gts_len))

def evaluation_metrics(pred, gt):
    TP = len(set(pred) & set(gt)) 
    FP = len(set(pred)-set(gt))
    FN = len(set(gt)-set(pred))
    precision = round((TP/(TP+FP))*100, 2)
    recall = round((TP/(TP+FN))*100,2)
    f1_score = round((2 * precision * recall) / (precision + recall),2)
    return precision, recall, f1_score 

def lemma(li):
    new_list = []
    for t in li:
        doc = nlp(str(t))
        doc1 = ' '.join([word.lemma for sent in doc.sentences for word in sent.words])
        doc1 = re.sub('-',' ',doc1)
        doc1 = re.sub(' +', ' ',doc1)
        new_list.append(doc1)
    new_list = [s for s in new_list if len(s) >= 2]
    return new_list

def get_term_(predictions):
    all_term = []
    for sentence in predictions:
        tokens = []
        labels = []
        for d in sentence:
            tokens.extend(d.keys())
            labels.extend(d.values())

        for i, label in enumerate(labels):
            if labels[i] == 'I' and (i == 0 or labels[i - 1] == 'O'):
                labels[i] = 'O'

        terms = []
        term = []
        for token, label in zip(tokens, labels):
            if label == 'B':
                #Lưu vị trí B
                b_pos = i
                term = [token]
            elif label == 'I':
                term.append(token)
            elif len(term) > 0:
                terms.append(' '.join(term))
                term = []
        if len(term) > 0:
            terms.append(' '.join(term))
            # Check b_pos = 0 không
        all_term.append(terms)
    
    final_terms = []
    for i in all_term:
        final_terms.extend(i)

    final_terms = [x.lower().strip() for x in final_terms]
    return final_terms  

[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/hanhtran/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
Downloading https://raw.githubusercontent.com/stanfordnlp/stanza-resources/master/resources_1.2.0.json: 128kB [00:00, 30.0MB/s]                    
2021-06-10 12:08:07 INFO: Downloading default packages for language: fr (French)...
2021-06-10 12:08:09 INFO: File exists: /Users/hanhtran/stanza_resources/fr/default.zip.
2021-06-10 12:08:13 INFO: Finished downloading models and saved to /Users/hanhtran/stanza_resources.
2021-06-10 12:08:13 INFO: Loading these models for language: fr (French):
| Processor | Package |
-----------------------
| tokenize  | gsd     |
| mwt       | gsd     |
| pos       | gsd     |
| lemma     | gsd     |

2021-06-10 12:08:13 INFO: Use device: cpu
2021-06-10 12:08:13 INFO: Loading: tokenize
2021-06-10 12:08:13 INFO: Loading: mwt
2021-06-10 12:08:13 INFO: Loading: pos
2021-06-10 12:08:13 INFO: Loading: lemma
2021-06-10

1. NOUN APD NOUN - 0

2. NOUN ADJ - 0

3. NOUN ADP DET NOUN - 0

4. NOUN ADP NOUN ADJ - 0

5. NOUN ADP NOUN ADP NOUN - 0

6. NOUN ADJ ADJ - 0

7. NOUN ADP NOUN ADP DET NOUN - 0

8. NOUN ADJ ADP NOUN - 0

9. NOUN ADP DET NOUN ADJ - 0

10. NOUN ADJ ADP DET NOUN - 0

11. ADJ NOUN - 1

12. NOUN NOUN - 0

13. NOUN ADP DET NOUN ADP NOUN - 0

14. NOUN ADP DET NOUN ADP DET NOUN - 0

In [4]:
def term_evaluation(domain_path, preds_path, rule=None):
    groundtruth = pd.read_csv(domain_path, sep='	', engine='python',header=None)
    gt = list(groundtruth[0])
    predictions = pkl.load(open(preds_path, 'rb'))
    if rule == 'adj_noun':
        preds =  get_term_adj_noun(predictions)
    elif rule == 'noun_adp_noun':
        preds =  get_term_noun_adp_noun(predictions)
    elif rule == 'noun_adp_det_noun':
        preds = get_term_noun_adp_det_noun(predictions)
    elif rule == 'noun_adp_adj_noun':
        preds = get_term_noun_adp_adj_noun(predictions)
    elif rule == 'adp_noun':
        preds = get_term_adp_noun(predictions)
    elif rule == 'adj_adj_noun':
        preds = get_term_adj_adj_noun(predictions)
    else:
        preds =  get_term_(predictions)
    stop_words = set(stopwords.words('french'))
    pred_terms =  set(preds) - set(stop_words)
    pred_terms = [x for x in pred_terms if len(x)>1]
    pred_terms = [x.lower().strip() for x in pred_terms]
    pred_terms = [re.sub(' -','-', x) for x in pred_terms]
    pred_terms = [re.sub('- ','-', x) for x in pred_terms]
    pred_terms = [re.sub('\(','', x) for x in pred_terms]
    pred_terms = [re.sub('\/','', x) for x in pred_terms]
    precision, recall, f1 = evaluation_metrics(pred_terms, gt)
    return precision, recall, f1

In [6]:
path = '/Users/hanhtran/Documents/terminology-extraction/ACTER/'
domain_paths = [path+'fr/htfl/annotations/htfl_fr_terms.ann', path+'fr/wind/annotations/wind_fr_terms.ann',
                path+'fr/equi/annotations/equi_fr_terms.ann',path+'fr/corp/annotations/corp_fr_terms.ann',
                path+'fr/htfl/annotations/htfl_fr_terms_nes.ann', path+'fr/wind/annotations/wind_fr_terms_nes.ann',
                path+'fr/equi/annotations/equi_fr_terms_nes.ann',path+'fr/corp/annotations/corp_fr_terms_nes.ann',
               ]
preds_paths = ['ann_camembert_htfl.pkl','ann_camembert_wind.pkl','ann_camembert_equi.pkl', 'ann_camembert_corp.pkl',
              'nes_camembert_htfl.pkl','nes_camembert_wind.pkl','nes_camembert_equi.pkl', 'nes_camembert_corp.pkl']
results = []
for d, p in zip(domain_paths, preds_paths):
    pre, rec, f1 = term_evaluation(d,p)
    results.append([p,pre, rec, f1])
raw_res = pd.DataFrame(results,columns=['DOMAIN','NON_PATTERN_P','NON_PATTERN_R','NON_PATTERN_F1'])
raw_res

Unnamed: 0,DOMAIN,NON_PATTERN_P,NON_PATTERN_R,NON_PATTERN_F1
0,ann_camembert_htfl.pkl,53.17,41.74,46.77
1,ann_camembert_wind.pkl,29.69,58.34,39.35
2,ann_camembert_equi.pkl,4.52,6.76,5.42
3,ann_camembert_corp.pkl,34.56,52.6,41.71
4,nes_camembert_htfl.pkl,54.66,35.8,43.26
5,nes_camembert_wind.pkl,32.74,49.79,39.5
6,nes_camembert_equi.pkl,4.07,6.69,5.06
7,nes_camembert_corp.pkl,36.02,54.52,43.38
