In [1]:
import os
import pickle
import pathlib
import nltk
import numpy as np
import pandas as pd
from tqdm import tqdm
from joblib import parallel_backend, Parallel, delayed
from nltk.stem import WordNetLemmatizer

# prepare environment
os.environ['TOKENIZERS_PARALLELISM'] = 'true'
os.environ['TF_CPP_MIN_LOG_LEVEL'] = '3'
# NLTK resources used by this notebook: the `stopwords` corpus for the stop-word list
# and `wordnet` for WordNetLemmatizer (without it the lemmatizer raises LookupError
# on a fresh machine).
nltk.download('punkt')
nltk.download('stopwords')
nltk.download('wordnet')
nltk.download('omw-1.4')    # some NLTK releases also require this alongside wordnet
DATA = pathlib.Path('data')

[nltk_data] Downloading package punkt to /home/avagadro/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     /home/avagadro/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


У меня Fedora и GTX 1070; TensorFlow «из коробки» с CUDA не работает — видимо, его нужно собирать из исходников.
Чтобы этого не делать, взял предобученную сетку с Hugging Face.

In [2]:
from flair.data import Sentence
from flair.models import SequenceTagger

# load tagger
# Pre-trained flair sequence tagger for English NER (OntoNotes label set, "fast" variant).
# Its entity labels (PERSON, ORG, GPE, FAC, ...) are the ones the token-replacement
# step later in this notebook keys on.
tagger = SequenceTagger.load("flair/ner-english-ontonotes-fast")



2022-10-25 22:22:37,526 loading file /home/avagadro/.flair/models/ner-english-ontonotes-fast/0d55dd3b912da9cf26e003035a0c269a0e9ab222f0be1e48a3bbba3a58c0fed0.c9907cd5fde3ce84b71a4172e7ca03841cd81ab71d13eb68aa08b259f57c00b6
2022-10-25 22:22:42,906 SequenceTagger predicts: Dictionary with 76 tags: <unk>, O, B-CARDINAL, E-CARDINAL, S-PERSON, S-CARDINAL, S-PRODUCT, B-PRODUCT, I-PRODUCT, E-PRODUCT, B-WORK_OF_ART, I-WORK_OF_ART, E-WORK_OF_ART, B-PERSON, E-PERSON, S-GPE, B-DATE, I-DATE, E-DATE, S-ORDINAL, S-LANGUAGE, I-PERSON, S-EVENT, S-DATE, B-QUANTITY, E-QUANTITY, S-TIME, B-TIME, I-TIME, E-TIME, B-GPE, E-GPE, S-ORG, I-GPE, S-NORP, B-FAC, I-FAC, E-FAC, B-NORP, E-NORP, S-PERCENT, B-ORG, E-ORG, B-LANGUAGE, E-LANGUAGE, I-CARDINAL, I-ORG, S-WORK_OF_ART, I-QUANTITY, B-MONEY


__load data__

In [7]:
# Read both tab-separated splits with identical options; note that the
# validation frame comes from 'test.tsv'.
train_raw, valid_raw = (
    pd.read_csv(DATA / file_name, sep='\t')
    for file_name in ('train.tsv', 'test.tsv')
)

# Stack train on top of validation (original row indices are kept) so the
# whole corpus goes through the same preprocessing.
raw = pd.concat((train_raw, valid_raw))

### tagging

__predict NER-tags__

In [4]:
# prepare corpus
# Strip HTML tags, substituting a space rather than an empty string: tags such as
# "<br /><br />" separate sentences, and deleting them outright would glue the
# neighbouring words together ("end.<br /><br />Next" -> "end.Next") before tokenization.
cleaned = raw['review'].str.replace(r'<.*?>', ' ', regex=True)
# Wrap every cleaned review into a flair Sentence, using the loky backend.
with parallel_backend('loky'):
    corpus = Parallel()(delayed(Sentence)(elem) for elem in cleaned)

In [5]:
BATCH_SIZE = 25
# Number of batches needed to cover the corpus (the last one may be shorter).
rng = range(np.ceil(len(corpus) / BATCH_SIZE).astype(int))

# Each slice is a new list but holds the same Sentence objects as `corpus`,
# so the predicted labels end up on `corpus` itself, which is what gets saved below.
for i in tqdm(rng, total=len(rng), desc='tagging'):
    tagger.predict(corpus[i*BATCH_SIZE:(i+1)*BATCH_SIZE])

# save
# The context manager guarantees the file is flushed and closed
# (the bare open() used before leaked the handle).
with open(DATA / 'corpus.pkl', 'wb') as fh:
    pickle.dump(corpus, fh)

tagging: 100%|██████████| 2000/2000 [1:14:40<00:00,  2.24s/it]  


__replace tokens__

In [2]:
# load
# NOTE: unpickling can execute arbitrary code; only load files written by this notebook.
# The context manager closes the file handle that a bare open() would leak.
with open(DATA / 'corpus.pkl', 'rb') as fh:
    corpus = pickle.load(fh)

In [3]:
REPLACE_NER = ['ORG', 'PERSON', 'FAC', 'GPE', 'LOC']    # entity types that are replaced with their corresponding tags
# https://huggingface.co/flair/ner-english-ontonotes-fast?text=On+September+1st+George+Washington+won+1+dollar.

def replace_tokens(sentence, as_list=True):
    """Replace every token that belongs to a selected named entity with its tag.

    A token covered by an 'ner' span whose label is listed in REPLACE_NER is
    emitted as '[LABEL]' (one tag per token, so a two-token PERSON span yields
    '[PERSON]' twice); every other token is emitted as its original text.

    Args:
        sentence: object exposing `.tokens` (items with a `.text` attribute) and
            `.get_spans('ner')` (items with `.tokens` and `.get_label().value`).
        as_list: if True return a list of strings, otherwise a single
            space-joined string.

    Returns:
        list[str] or str, depending on `as_list`.
    """
    # Resolve the spans and their labels once per sentence instead of once per
    # token: both are loop-invariant, and only spans with a replaceable label
    # can ever match.
    replaceable = []
    for span in sentence.get_spans('ner'):
        label = span.get_label().value
        if label in REPLACE_NER:
            replaceable.append((span.tokens, f'[{label}]'))

    sn_tokens = []
    for token in sentence.tokens:
        tk = token.text
        for span_tokens, tag in replaceable:
            if token in span_tokens:
                tk = tag
                break   # first matching span wins
        sn_tokens.append(tk)
    return sn_tokens if as_list else ' '.join(sn_tokens)


# Apply replace_tokens to every tagged sentence using the loky backend;
# tqdm wraps the input generator, so the bar tracks task dispatch.
with parallel_backend('loky'):
    replaced = Parallel()(delayed(replace_tokens)(sentence) for sentence in tqdm(corpus, desc='replacing NERs'))

# save
# The context manager guarantees the file is flushed and closed
# (the bare open() used before leaked the handle).
with open(DATA / 'corpus_replaced.pkl', 'wb') as fh:
    pickle.dump(replaced, fh)

replacing NERs: 100%|██████████| 50000/50000 [05:05<00:00, 163.70it/s]


### prepare dataset

In [4]:
# load
# NOTE: unpickling can execute arbitrary code; only load files written by this notebook.
# The context manager closes the file handle that a bare open() would leak.
with open(DATA / 'corpus_replaced.pkl', 'rb') as fh:
    replaced = pickle.load(fh)

In [5]:
# English stop-word list from the NLTK `stopwords` corpus.
STOPWORDS = nltk.corpus.stopwords.words('english')
# Lemmatizer wrapped with np.vectorize so it can be applied to a whole token list in one call
# (the original note says this is meant to speed up work with lists).
# NOTE(review): np.vectorize is documented as a convenience wrapper around a Python-level loop,
# so no real speed-up should be expected; it also rejects empty inputs unless `otypes` is set - confirm.
lemfunc = np.vectorize(WordNetLemmatizer().lemmatize)

def drop_stopwords(tokens, min_length=3):
    """Lower-case tokens and drop stop words and too-short tokens.

    Args:
        tokens: iterable of strings.
        min_length: minimum number of characters a token must have to be kept.

    Returns:
        list[str]: the surviving tokens, lower-cased, in their original order.
    """
    # Set membership is O(1); STOPWORDS itself is left untouched.
    stopwords = set(STOPWORDS)
    kept = []
    for w in tokens:
        # Compare the lower-cased form: the stop-word list is lower-case, so testing the
        # raw token let capitalised stop words ("The", "And") slip through the filter.
        lowered = w.lower()
        if lowered not in stopwords and len(w) >= min_length:
            kept.append(lowered)
    return kept


# Two passes over the corpus under a single loky backend context:
# first lemmatize every token list, then filter out the stop words.
with parallel_backend('loky'):
    lemmatized = Parallel()(delayed(lemfunc)(tokens) for tokens in replaced)
    prepared = Parallel()(delayed(drop_stopwords)(tokens) for tokens in lemmatized)


In [10]:
# back split and save
# Pair each prepared token list with its label; the label index is reset so that
# it lines up positionally with the freshly built token Series.
n_train = train_raw.shape[0]
data = pd.DataFrame({
    'tokens': pd.Series(prepared),
    'is_positive': raw['is_positive'].reset_index(drop=True),
})

# The first n_train rows came from the train file, the remainder from the validation file.
data.iloc[:n_train].to_csv(DATA / 'prepared_train.csv', index=False)
data.iloc[n_train:].to_csv(DATA / 'prepared_valid.csv', index=False)

In [None]:
#