In [1]:
import pandas as pd
import csv
import os
import re
import pandas as pd
import spacy
import pickle
import stanza

In [2]:
wind = pd.read_csv('acter_wind_relative-all.csv', delimiter = '\t', header = None, quoting=csv.QUOTE_NONE, error_bad_lines=False)
equi = pd.read_csv('acter_equi_relative-all.csv', delimiter = '\t', header = None, quoting=csv.QUOTE_NONE, error_bad_lines=False)
corp = pd.read_csv('acter_corp_relative-all.csv', delimiter = '\t', header = None, quoting=csv.QUOTE_NONE, error_bad_lines=False)
len(wind), len(equi),len(corp)
train = pd.concat([wind, equi, corp]).rename(columns={0:'words',1:'term_score',2:'id'})
# train.to_csv('acter_train_relative-all.csv')

In [3]:
train_dict = pd.Series(train['term_score'].values,index=train['words']).to_dict()
train_dict

{'mw': 0.0,
 's': 0.0,
 'j.': 0.0,
 'hawt': 0.7707936317531038,
 'e.g.': 0.8138496937298163,
 'pscad': 0.7668327978049295,
 'm.': 0.0,
 'wecs': 0.7629247164037941,
 '/s': 0.7625158380886778,
 'deis': 0.759551899379512,
 'no.': 0.6895200709429711,
 'means': 0.8177060827043381,
 'a.': 0.6783913108127341,
 'pp.': 0.6783913108127341,
 'm/': 0.7482823655432105,
 'w': 0.0,
 'c.': 0.6783913108127341,
 'ϕ': 0.7460211052591118,
 'states': 1.0,
 's.': 0.7454387598012496,
 'f1': 0.7454387598012496,
 'r': 0.0,
 'p.': 0.7856895340809044,
 'vawt': 0.7424161964209408,
 'd.': 0.0,
 'turbines': 0.7405069553071858,
 'jepirachi': 0.7405069553071858,
 'dowec': 0.7405069553071858,
 'mwh': 0.7398533001622212,
 'ρ': 0.739190628800276,
 'co2': 0.7385186890073114,
 'r.': 0.7357328122885667,
 'habitats': 0.7350103488690716,
 'dfig': 0.7342768550484168,
 'u∞': 0.7342768550484168,
 'lietuvos': 0.7335319887784154,
 'tco2': 0.7288026410190231,
 'hyperlink': 0.7279670910536921,
 'tip-speed': 0.7279670910536921,
 'fi

In [10]:
import json

with open('term_score_dict.json', 'w') as fp:
    json.dump(train_dict, fp)

In [25]:
class KeyTerm():
    def __init__(self, train_dict_dir = 'term_score_dict.json', data_dir = "../ACTER", language = 'en', term = "equi", nes=False):
        data_file = os.path.join(data_dir, language, term, 'annotations')
#         if nes:
#             data_file = os.path.join(data_file, '{0}_{1}_terms_nes.ann'.format(term, language))
#         else:
#             data_file = os.path.join(data_file, '{0}_{1}_terms.ann'.format(term, language))
#         self.df = pd.read_csv(data_file, sep='\t', names=['word', 'class'], header=None)
        self.nlp = stanza.Pipeline(lang='en')
#         self.keys = self.df['word'].to_list()
#         self.keys = [str(x) for x in self.keys]
#         self.keys_lemma = list(set([self.lemma(x) for x in self.keys]))
        with open(train_dict_dir, 'r') as fp: 
            self.dicts = json.load(fp)
    
    def lemma(self, doc):
        lemma_word = self.nlp(str(doc))
        lemma_word = ' '.join([w.lemma for sent in lemma_word.sentences for w in sent.words])
        lemma_word = re.sub(' -','-',lemma_word)
        lemma_word = re.sub('- ','-',lemma_word)
        lemma_word = re.sub(' \)', ' ',lemma_word)
        lemma_word = re.sub('\( ', ' ',lemma_word)
        lemma_word = re.sub(' +', ' ',lemma_word)
        return lemma_word

    def extract_doc(self, doc, use_lemma=True):
        doc = self.nlp(doc)
        results = []
        for sent in doc.sentences:
            if use_lemma:
                lemma_word = ' '.join([w.lemma for w in sent.words])
                lemma_word = re.sub(' -','-',lemma_word)
                lemma_word = re.sub('- ','-',lemma_word)
                lemma_word = re.sub(' \)', ' ',lemma_word)
                lemma_word = re.sub('\( ', ' ',lemma_word)
                lemma_word = re.sub(' +', ' ',lemma_word)

                tokens = lemma_word.split()
                text = lemma_word
#                 keys = self.keys_lemma
            else:
                tokens = [token.text for token in sent.tokens]
                text = sent.text
#                 keys = self.keys

            label = self.extract(tokens, text=text)

            
            results.append({
                "tokens": tokens,
                "sent": sent.text,
                "labels": label
            })
        return results

    def extract(self, tokens, text = None):
        if text == None:
            text = ' '.join(tokens)
        z = [0] * len(tokens)
        for i, token in enumerate(tokens):
            z[i] = self.dicts.get(self.lemma(token),0)
        return z

class ActerDataset():
    def __init__(self, data_dir = "../ACTER", language = 'en', nes=False):
        if language == 'en':
            nlp = spacy.load("en_core_web_sm")
        elif language == 'fr':
            nlp = spacy.load("fr_core_news_sm")
        elif language == 'nl':
            nlp = spacy.load("nl_core_news_sm")
        self.sentences = []
        self.labels = []
        self.tokens = []
        self.terms = []

        language_dir = os.path.join(data_dir, language)
        for term in ['corp']:#,'equi','wind']:
            keyterm = KeyTerm(data_dir = data_dir, language=language, term = term, nes=nes)

            sentences, labels, tokens = self.extract_term(language_dir, term, keyterm, nlp)

            self.sentences.extend(sentences)
            self.labels.extend(labels)
            self.tokens.extend(tokens)
#             self.terms.extend(terms)

    def extract_term(self, data_dir, term, keyterm, nlp):
        data_dir = os.path.join(data_dir, term, 'texts', "annotated")
        sentences = []
        labels = []
        all_token = []
        terms = []
        for file in os.listdir(data_dir):
            if file.endswith('.txt') and file.startswith(term):
                data_file = os.path.join(data_dir, file)
                print(data_file)
                with open(data_file) as f:
                    for line in f:
                        results = keyterm.extract_doc(line.strip().lower(), use_lemma=True)
                        for result in results:
#                             if set(result['labels']) != {'O'}:
                            sentences.append(result['sent'])
                            labels.append(result['labels'])
                            all_token.append(result['tokens'])
                            #terms.append(result['terms'])

        return sentences, labels, all_token #, terms

In [26]:
dataset = ActerDataset()
path = "../processed_data/en/"
if not os.path.exists(path):
        os.mkdir(path) 
with open(path + "ann_train_inl.pkl", "wb") as output_file:
    pickle.dump((dataset.sentences, dataset.labels, dataset.tokens), output_file)

2021-05-28 09:32:39 INFO: Loading these models for language: en (English):
| Processor | Package   |
-------------------------
| tokenize  | combined  |
| pos       | combined  |
| lemma     | combined  |
| depparse  | combined  |
| sentiment | sstplus   |
| ner       | ontonotes |

2021-05-28 09:32:39 INFO: Use device: cpu
2021-05-28 09:32:39 INFO: Loading: tokenize
2021-05-28 09:32:39 INFO: Loading: pos
2021-05-28 09:32:40 INFO: Loading: lemma
2021-05-28 09:32:40 INFO: Loading: depparse
2021-05-28 09:32:41 INFO: Loading: sentiment
2021-05-28 09:32:42 INFO: Loading: ner
2021-05-28 09:32:43 INFO: Done loading processors!


../ACTER/en/corp/texts/annotated/corp_en_19.txt
../ACTER/en/corp/texts/annotated/corp_en_08.txt
../ACTER/en/corp/texts/annotated/corp_en_09.txt
../ACTER/en/corp/texts/annotated/corp_en_07.txt


KeyboardInterrupt: 

In [None]:
input_file = path + "ann_train_inl.pkl"
final_train_df = pd.DataFrame()

for i in train:
    with open(i, "rb") as input_file:
        sentences, labels, tokens = pickle.load(input_file)
    sentence_id = []
    words = []
    targets = []

    for index, (token, label) in tqdm(enumerate(zip(tokens, labels))):
        for t, l in zip(token, label):
            sentence_id.append(index)
            words.append(t)
            targets.append(l)
    train_df['sentence_id'] = sentence_id
    train_df['words'] = words
    train_df['labels'] = targets
    final_train_df = final_train_df.append(train_df, ignore_index=True)

In [None]:
final_train_df

In [None]:
import logging

logging.basicConfig(format = '%(asctime)s - %(levelname)s - %(name)s -   %(message)s',
                    datefmt = '%m/%d/%Y %H:%M:%S',
                    level = logging.INFO)
logger = logging.getLogger("regressor")

FP16 = False
BATCH_SIZE = 32
SEED = 42
WARMUP_PROPORTION = 0.1
PYTORCH_PRETRAINED_BERT_CACHE = "/mnt/Intel/bert_tmp"
LOSS_SCALE = 0. # Loss scaling to improve fp16 numeric stability. Only used when fp16 set to True.
MAX_SEQ_LENGTH = 100

DATA_PATH = "train.csv"