In [1]:
import os
import django

# Point Django at the project settings (kept if the variable is already set);
# this has to happen before `django.setup()` is called.
os.environ.setdefault('DJANGO_SETTINGS_MODULE', 'config.settings')
# Allow synchronous ORM calls from the notebook; without this flag they raise
# `SynchronousOnlyOperation`.
os.environ["DJANGO_ALLOW_ASYNC_UNSAFE"] = "true"
django.setup()

In [7]:
from articles.models import Article

# Number of Article rows currently in the database; the bare name on the last
# line makes the value the cell's output.
cnt = Article.objects.count()
cnt

16826

In [2]:
## Proof of concept class

class ArticleNERWriter:
    """Proof of concept: dump sentence-level NER results of one article to a CSV file.

    The class relies on two notebook-level globals that must exist before it is
    used: ``spacy_nlp`` (called on the article text; its result must expose
    ``.sents``) and ``hf_nlp`` (called on every non-blank sentence string).

    Attributes:
        file_name: Output CSV name, derived from the last path segment of the
            article URL.
        docs: Whatever ``spacy_nlp(article.text)`` returned.
    """

    def __init__(self, article):
        """
        article: object exposing ``article_url`` and ``text`` attributes.
        """
        self.file_name = self.__get_file_name_from_url(article.article_url)
        self.docs = spacy_nlp(article.text)

    def __get_file_name_from_url(self, url):
        """
        Build the CSV file name from the last non-empty segment of the URL.

        url: https://ky.kloop.asia/2011/05/03/osh-shaarynda-birinchi-zholu-mektep-okuuchularynyn-arasynda-mektep-perisi-2011-synak-tk-z-ld/

        Raises ValueError when the URL contains no usable segment (for example
        an empty string), instead of silently producing a file called ".csv".
        """
        # Dropping trailing slashes first handles URLs with, without, or with
        # several trailing "/" in the same way.
        slug = url.rstrip('/').rsplit('/', 1)[-1]
        if not slug:
            raise ValueError(f'Cannot derive a file name from URL: {url!r}')
        return f'{slug}.csv'

    def write_sentence_level_ner_to_file(self):
        """Write one CSV row per sentence for which ``hf_nlp`` returned results.

        A row consists of the stripped sentence text followed by one column per
        item returned by ``hf_nlp`` for that sentence. Blank sentences and
        sentences with an empty result are skipped.
        """
        # Local import: no visible notebook cell imports `csv`.
        import csv

        # newline='' is required by the csv module so it controls line endings
        # itself; the explicit encoding keeps non-ASCII text writable regardless
        # of the platform's default encoding.
        with open(self.file_name, mode='w', newline='', encoding='utf-8') as ner_file:
            writer = csv.writer(ner_file, delimiter=',', quotechar='"', quoting=csv.QUOTE_MINIMAL)
            for sent in self.docs.sents:
                text = sent.text.strip()
                if not text:
                    continue
                ner_results = hf_nlp(text)
                if ner_results:
                    writer.writerow([text, *ner_results])

In [3]:
from transformers import AutoTokenizer, AutoModelForTokenClassification
from transformers import pipeline
from spacy.lang.ky import Kyrgyz

# Single place to change the checkpoint used for both the tokenizer and the model.
NER_MODEL_NAME = "Davlan/xlm-roberta-base-ner-hrl"

tokenizer = AutoTokenizer.from_pretrained(NER_MODEL_NAME)
model = AutoModelForTokenClassification.from_pretrained(NER_MODEL_NAME)
# `aggregation_strategy="simple"` is the documented replacement for the
# deprecated `grouped_entities=True`; it yields the same grouped output
# (`entity_group`, `score`, `word`, `start`, `end`).
hf_nlp = pipeline("ner", model=model, tokenizer=tokenizer, aggregation_strategy="simple")

# Blank Kyrgyz pipeline with only a rule-based sentence splitter attached.
spacy_nlp = Kyrgyz()
spacy_nlp.add_pipe("sentencizer")



<spacy.pipeline.sentencizer.Sentencizer at 0x11c8942c0>

In [14]:
# Run sentence-level NER over the first article and keep, for every sentence
# with at least one entity, the sentence offsets plus the raw pipeline output.
art = Article.objects.first()
doc = spacy_nlp(art.text)

ner_raw = []

for sent in doc.sents:
    raw = sent.text
    text = raw.strip()
    if not text:
        continue
    # Take the offset from the span itself rather than `doc.text.find(text)`:
    # `find` returns the FIRST occurrence, which is wrong for a sentence that
    # appears more than once, and rescans the document for every sentence.
    # The second term skips the leading whitespace removed by `strip()`.
    st = sent.start_char + (len(raw) - len(raw.lstrip()))
    # One past the stripped sentence, so `doc.text[st:en]` also includes the
    # character following the sentence (when there is one).
    en = st + len(text) + 1
    ner_results = hf_nlp(text)
    if ner_results:
        ner_raw.append({
            'sentence_start': st,
            'sentence_end': en,
            'NER': ner_results
        })

In [15]:
# Slice the document with the offsets stored in the last NER record instead of
# hard-coded numbers, so the check stays valid when the article text changes.
last_ner = ner_raw[-1]
doc.text[last_ner['sentence_start']:last_ner['sentence_end']]

'Авторлор: Бекжан Джусупов, Ариет Туратбеков\nСүрөттүн автору: Аскатбек уулу Ибрагим\nРедактор: Айзат Шакиева\n'

In [16]:
# Show the record of the last sentence for which the NER pipeline returned entities.
ner_raw[-1]

{'sentence_start': 3028,
 'sentence_end': 3135,
 'NER': [{'entity_group': 'PER',
   'score': 0.9997777,
   'word': 'Бекжан Джусупов',
   'start': 9,
   'end': 25},
  {'entity_group': 'PER',
   'score': 0.99878484,
   'word': 'Ариет Туратбеков',
   'start': 26,
   'end': 43},
  {'entity_group': 'PER',
   'score': 0.9997801,
   'word': 'Аскатбек уулу Ибрагим',
   'start': 60,
   'end': 82},
  {'entity_group': 'PER',
   'score': 0.9997498,
   'word': 'Айзат Шакиева',
   'start': 92,
   'end': 106}]}

In [31]:
import json
import numpy as np

class NpEncoder(json.JSONEncoder):
    """JSON encoder that turns NumPy scalars and arrays into plain Python values."""

    def default(self, obj):
        """Return a JSON-serialisable stand-in for ``obj``.

        NumPy integers become ``int``, NumPy floats become ``float`` and arrays
        become (nested) lists. Everything else is handed to the base class,
        which raises ``TypeError``.
        """
        # The three NumPy categories are disjoint, so the order of the checks
        # does not matter.
        if isinstance(obj, np.ndarray):
            return obj.tolist()
        if isinstance(obj, np.floating):
            # float() keeps the value numeric in the JSON output
            return float(obj)
        if isinstance(obj, np.integer):
            return int(obj)
        return super().default(obj)

## Loop through all articles and write their NER results to ArticleNER

In [32]:
# For now, we'll only try to write the first article's contents
# NOTE(review): `ArticleNER` is not imported in any cell visible here — presumably
# it lives in `articles.models` next to `Article`; confirm and import it first.
# NOTE(review): every run of this cell calls `create()` again for the same article.

for art in [Article.objects.first()]:
    doc = spacy_nlp(art.text)
    ner_raw = []
    for sent in doc.sents:
        raw = sent.text
        text = raw.strip()
        if not text:
            continue
        # Take the offset from the span itself rather than `doc.text.find(text)`:
        # `find` returns the FIRST occurrence, which is wrong for a sentence
        # that appears more than once in the article. The second term skips
        # the leading whitespace removed by `strip()`.
        st = sent.start_char + (len(raw) - len(raw.lstrip()))
        # One past the stripped sentence, so `doc.text[st:en]` also includes
        # the character following the sentence (when there is one).
        en = st + len(text) + 1
        ner_results = hf_nlp(text)
        if ner_results:
            ner_raw.append({
                'sentence_start': st,
                'sentence_end': en,
                'NER': ner_results
            })
    # NpEncoder converts the NumPy scalars in the pipeline output to plain numbers.
    ArticleNER.objects.create(article=art, ner_raw=json.dumps(ner_raw, cls=NpEncoder))