In [1]:
import datasets
from datasets import Dataset
from transformers import pipeline
from transformers.pipelines.pt_utils import KeyDataset
import re
import pandas as pd
from tqdm.notebook import tqdm
from itertools import combinations
import ftfy
import spacy

# Register `progress_apply` on pandas objects so row-wise cleaning shows a progress bar.
tqdm.pandas()

# Token-classification pipeline used to find named entities in each sentence.
pipe = pipeline("ner", model="dslim/bert-large-NER", device="cuda")
# spaCy is only used for sentence splitting (`doc.sents`), so NER and
# lemmatization are switched off. The component name is "lemmatizer"; the
# previous misspelling ("lematizer") matched nothing and left it enabled.
nlp = spacy.load('en_core_web_sm', disable=["ner", "lemmatizer"])

Some weights of the model checkpoint at dslim/bert-large-NER were not used when initializing BertForTokenClassification: ['bert.pooler.dense.bias', 'bert.pooler.dense.weight']
- This IS expected if you are initializing BertForTokenClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForTokenClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


In [2]:
def strip_parenthesis(row):
    """Clean ``row['body']`` in place and return the row.

    Steps, in order:
      1. Repair the text with ``ftfy.fix_text``.
      2. Turn non-breaking spaces and newlines into regular spaces and
         em dashes into hyphens.
      3. Remove every ``(...)``, ``[...]``, ``<...>`` and ``{...}`` span
         (non-greedy, so each opener is paired with the nearest closer).
      4. Collapse runs of two or more whitespace characters into one space
         and trim the ends.

    Parameters
    ----------
    row : mapping with a string ``"body"`` entry (e.g. a DataFrame row).

    Returns
    -------
    The same ``row`` object with ``"body"`` overwritten by the cleaned text.
    """
    body = ftfy.fix_text(row['body'])
    # A non-breaking space separates words just like a normal space, so map it
    # to " " rather than deleting it; deleting would glue neighbouring words
    # together. Any doubled spaces this creates are collapsed below.
    body = body.replace("\xa0", " ").replace("\n", " ").replace("—", "-")
    body = re.sub(r'\(.*?\)|\[.*?\]|<.*?>|\{.*?\}', '', body)
    row["body"] = re.sub(r'\s{2,}', ' ', body).strip()
    return row

In [3]:
# Read the raw records and clean each row's `body` text, showing a progress bar.
df = pd.read_json("mining.json").progress_apply(strip_parenthesis, axis=1)

  0%|          | 0/16671 [00:00<?, ?it/s]

In [4]:
# Feed every cleaned body through the spaCy pipeline; the result is iterated
# by the sentence-splitting loop that follows.
docs = nlp.pipe(
    df['body'].tolist(),
    batch_size=1000,
    n_process=2,
)

In [5]:
# For each parsed document, collect its sentences as whitespace-trimmed strings.
res = [
    [sentence.text.strip() for sentence in parsed.sents]
    for parsed in tqdm(docs, total=len(df))
]

  0%|          | 0/16671 [00:00<?, ?it/s]

In [6]:
# Attach the per-document sentence lists, expand to one row per sentence,
# then drop the original body text and any row that contains a missing value.
df = (
    df.assign(sentences=res)
    .explode("sentences")
    .reset_index(drop=True)
    .drop(columns=["body"])
    .rename(columns={"sentences": "sentence"})
    .dropna()
)

In [7]:
def _mask_entity_pair(sentence, entities, from_idx, to_idx):
    """Return `sentence` with every entity span swapped for a role placeholder.

    The entity at `to_idx` becomes ``__NE_TO__``, the one at `from_idx` becomes
    ``__NE_FROM__`` and every other entity becomes ``__NE_OTHER__``.
    """
    masked = sentence
    # Replace from the last entity to the first so the `start`/`end` offsets of
    # entities not yet handled still point at the right characters
    # (this relies on `entities` being ordered by position in the sentence).
    for position in range(len(entities) - 1, -1, -1):
        entity = entities[position]
        if position == to_idx:
            placeholder = "__NE_TO__"
        elif position == from_idx:
            placeholder = "__NE_FROM__"
        else:
            placeholder = "__NE_OTHER__"
        masked = masked[:entity['start']] + placeholder + masked[entity['end']:]
    return masked


def process_ner(example):
    """Batched mapper that builds masked sentences for pairs of ORG entities.

    Parameters
    ----------
    example : mapping
        A batch whose ``"sentence"`` entry is a list of sentence strings.

    Returns
    -------
    dict
        Keys ``"from"``, ``"to"`` and ``"masked_sentence"``, each a list with
        one entry per input sentence. An entry is ``None`` when the sentence
        has fewer than two ORG entities. Otherwise it is a list with one item
        per entity pair ``(i, j)`` with ``i < j``: the ``word`` of entity
        ``i``, the ``word`` of entity ``j``, and the sentence with all ORG
        spans replaced by ``__NE_FROM__`` / ``__NE_TO__`` / ``__NE_OTHER__``.
    """
    sentences = example["sentence"]
    predictions = pipe(sentences, aggregation_strategy="first")

    batch_from, batch_to, batch_masked_sentences = [], [], []

    for row_idx, entities in enumerate(predictions):
        # Only organisations take part in pairing and masking.
        orgs = [entity for entity in entities if entity['entity_group'] == 'ORG']

        if len(orgs) < 2:
            # No pair can be formed; keep the row aligned with a None marker.
            batch_from.append(None)
            batch_to.append(None)
            batch_masked_sentences.append(None)
            continue

        sentence = sentences[row_idx]
        pairs = list(combinations(range(len(orgs)), r=2))

        batch_from.append([orgs[first]['word'] for first, _ in pairs])
        batch_to.append([orgs[second]['word'] for _, second in pairs])
        batch_masked_sentences.append(
            [_mask_entity_pair(sentence, orgs, first, second) for first, second in pairs]
        )

    return {
        "from": batch_from,
        "to": batch_to,
        "masked_sentence": batch_masked_sentences
    }
# Wrap the sentence-level DataFrame in a Hugging Face `Dataset` so that
# `process_ner` can be applied with a batched `map`.
# NOTE(review): the frame's index is not reset after the preceding `dropna()`,
# which is presumably why an `__index_level_0__` column shows up and is dropped
# after mapping — confirm before changing either side.
ds = Dataset.from_pandas(df)

In [8]:
# Apply `process_ner` to the whole dataset, 128 sentences per call.
ds = ds.map(
    process_ner,
    batch_size=128,
    batched=True,
)

Map:   0%|          | 0/314184 [00:00<?, ? examples/s]

You seem to be using the pipelines sequentially on GPU. In order to maximize efficiency please use a dataset


In [9]:
# Back to pandas: discard the serialized index column, keep only sentences that
# produced entity pairs, and give each (from, to, masked_sentence) triple its
# own row.
df = (
    ds.to_pandas()
    .drop(columns=["__index_level_0__"])
    .dropna(subset=["from", "to", "masked_sentence"])
    .reset_index(drop=True)
    .explode(["from", "to", "masked_sentence"])
    .reset_index(drop=True)
)

In [11]:
def replace_tokens(text):
    """Rewrite bracket-style markers in `text` to their ``__NE_*__`` form.

    ``[FROM]``, ``[TO]`` and ``[OTHER]`` are replaced (case-sensitively, every
    occurrence) by ``__NE_FROM__``, ``__NE_TO__`` and ``__NE_OTHER__``.
    Text without those markers is returned unchanged.
    """
    substitutions = (
        ("[FROM]", "__NE_FROM__"),
        ("[TO]", "__NE_TO__"),
        ("[OTHER]", "__NE_OTHER__"),
    )
    for bracketed, placeholder in substitutions:
        text = text.replace(bracketed, placeholder)
    return text

def check_adjacent_tokens(text):
    """Tell whether `text` has two placeholders separated only by whitespace.

    Returns True if any of ``__NE_FROM__``, ``__NE_TO__`` or ``__NE_OTHER__``
    is followed, after one or more whitespace characters, by another of them;
    False otherwise.
    """
    adjacent_placeholders = r'(__NE_FROM__|__NE_TO__|__NE_OTHER__)\s+(__NE_FROM__|__NE_TO__|__NE_OTHER__)'
    # A match object is always truthy, so bool() maps match/None to True/False.
    return bool(re.search(adjacent_placeholders, text))

# Normalise placeholder spelling, flag sentences where two placeholders sit
# back to back, and write the unflagged rows to disk.
normalized_sentences = df['masked_sentence'].apply(replace_tokens)
df['masked_sentence'] = normalized_sentences
df["check_tokens"] = normalized_sentences.apply(check_adjacent_tokens)

ddf = (
    df.loc[df['check_tokens'] != True]
    .drop(columns="check_tokens")
    .reset_index(drop=True)
)
ddf.to_json("mining_processed.json", orient="records", force_ascii=False)
