In [1]:
import spacy
from spacy.training import Example
import pandas as pd
from sklearn.model_selection import train_test_split

# Public split: provides the 'text'/'label' columns used to build training data.
# Private split: only its 'text' column is read, when generating predictions.
PUBLIC_CSV = '/kaggle/input/xddddd-yasha/anti-punto-switcher-public.csv'
PRIVATE_CSV = '/kaggle/input/xddddd-yasha/anti-punto-switcher-private.csv'

data = pd.read_csv(PUBLIC_CSV)
test_data = pd.read_csv(PRIVATE_CSV)

In [2]:
def create_training_data(df):
    """Turn (text, label) rows into spaCy-style NER training tuples.

    Both strings are split on whitespace and compared word by word. Every
    word of ``text`` that has a counterpart in ``label`` becomes one entity
    span: ``"CORRECT"`` when the two words are equal, ``"INCORRECT"``
    otherwise. Pairing stops at the shorter of the two word lists.

    Args:
        df: Mapping-like object whose ``'text'`` and ``'label'`` entries are
            equally ordered iterables of strings.

    Returns:
        A list of ``(text, {"entities": [(start, end, tag), ...]})`` tuples,
        where ``start``/``end`` are character offsets into ``text``.
    """
    samples = []
    for raw_text, target_text in zip(df['text'], df['label']):
        spans = []
        # Search cursor: advancing it past each match keeps repeated words
        # from resolving to an earlier occurrence.
        cursor = 0
        for typed, expected in zip(raw_text.split(), target_text.split()):
            begin = raw_text.find(typed, cursor)
            cursor = begin + len(typed)
            tag = "CORRECT" if typed == expected else "INCORRECT"
            spans.append((begin, cursor, tag))
        samples.append((raw_text, {"entities": spans}))
    return samples

# Build the (text, {"entities": [...]}) pairs from the public CSV; this list
# is what gets serialised to train.spacy further down.
train_data = create_training_data(data)

In [4]:
!pip install spacy-transformers
!python -m spacy download en_core_web_trf

Collecting spacy-transformers
  Downloading spacy_transformers-1.3.5-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (7.0 kB)
Collecting transformers<4.37.0,>=3.4.0 (from spacy-transformers)
  Downloading transformers-4.36.2-py3-none-any.whl.metadata (126 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m126.8/126.8 kB[0m [31m2.7 MB/s[0m eta [36m0:00:00[0ma [36m0:00:01[0m
Collecting spacy-alignments<1.0.0,>=0.7.2 (from spacy-transformers)
  Downloading spacy_alignments-0.9.1-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (2.7 kB)
Collecting tokenizers<0.19,>=0.14 (from transformers<4.37.0,>=3.4.0->spacy-transformers)
  Downloading tokenizers-0.15.2-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (6.7 kB)
Downloading spacy_transformers-1.3.5-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (197 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m197.8/197.8 kB[0m [31m7.9 MB/s[

In [5]:
import spacy
from spacy.tokens import DocBin
from spacy.training.example import Example

In [6]:
# Serialise the training pairs into spaCy's binary DocBin format.
nlp = spacy.blank("ru")
db = DocBin()

for sentence, gold in train_data:
    doc = nlp.make_doc(sentence)
    aligned_spans = []
    for char_start, char_end, tag in gold['entities']:
        # char_span returns None when the offsets do not line up with the
        # tokenizer's token boundaries; such entities are reported and dropped.
        candidate = doc.char_span(char_start, char_end, label=tag)
        if candidate is not None:
            aligned_spans.append(candidate)
            continue
        print(f"Skipping entity ({char_start}, {char_end}, {tag}) in sentence: {sentence}")
    doc.ents = aligned_spans
    db.add(doc)

db.to_disk("./train.spacy")

In [17]:
import os

# spaCy training config: a "ru" pipeline made of a `transformer` component
# (bert-base-multilingual-uncased, strided spans) feeding a transition-based
# `ner` component through a TransformerListener.
# NOTE(review): [paths] dev points at the same file as train, so the dev
# scores reported during training are computed on the training data.
config_content = """
[paths]
train = "/kaggle/working/train.spacy"
dev = "/kaggle/working/train.spacy"

[system]
gpu_allocator = "pytorch"

[nlp]
lang = "ru"
pipeline = ["transformer","ner"]
batch_size = 128

[components]

[components.transformer]
factory = "transformer"

[components.transformer.model]
@architectures = "spacy-transformers.TransformerModel.v3"
name = "bert-base-multilingual-uncased"
tokenizer_config = {"use_fast": true}

[components.transformer.model.get_spans]
@span_getters = "spacy-transformers.strided_spans.v1"
window = 128
stride = 96

[components.ner]
factory = "ner"

[components.ner.model]
@architectures = "spacy.TransitionBasedParser.v2"
state_type = "ner"
extra_state_tokens = false
hidden_width = 64
maxout_pieces = 2
use_upper = false
nO = null

[components.ner.model.tok2vec]
@architectures = "spacy-transformers.TransformerListener.v1"
grad_factor = 1.0

[components.ner.model.tok2vec.pooling]
@layers = "reduce_mean.v1"

[corpora]

[corpora.train]
@readers = "spacy.Corpus.v1"
path = ${paths.train}
max_length = 0

[corpora.dev]
@readers = "spacy.Corpus.v1"
path = ${paths.dev}
max_length = 0

[training]
accumulate_gradient = 3
dev_corpus = "corpora.dev"
train_corpus = "corpora.train"
seed = 52
gpu_allocator = "pytorch"

[training.optimizer]
@optimizers = "Adam.v1"

[training.optimizer.learn_rate]
@schedules = "warmup_linear.v1"
warmup_steps = 250
total_steps = 20000
initial_rate = 5e-5

[training.batcher]
@batchers = "spacy.batch_by_padded.v1"
discard_oversize = true
size = 2000
buffer = 256
"""

# Write the config to disk so the `spacy train` command can consume it.
with open("/kaggle/working/config.cfg", "w") as f:
    f.write(config_content)

In [20]:
# Train the pipeline described by config.cfg on GPU 0 and save it under ./output.
# NOTE(review): --paths.dev is the same file as --paths.train, so the ENTS_*
# scores (and therefore the choice of model-best) are measured on training data.
# NOTE(review): the recorded output of this cell ends with a glibc assertion
# failure (`__pthread_tpp_change_priority`) before any training row was logged.
!python -m spacy train config.cfg --output ./output --paths.train ./train.spacy --paths.dev ./train.spacy --gpu-id=0

[38;5;4mℹ Saving to output directory: output[0m
[38;5;4mℹ Using GPU: 0[0m
[1m
[2024-08-18 12:04:10,958] [INFO] Set up nlp object from config
[2024-08-18 12:04:10,998] [INFO] Pipeline: ['transformer', 'ner']
[2024-08-18 12:04:11,004] [INFO] Created vocabulary
[2024-08-18 12:04:11,004] [INFO] Finished initializing nlp object
  jitify._init_module()
[2024-08-18 12:05:42,708] [INFO] Initialized pipeline components: ['transformer', 'ner']
[38;5;2m✔ Initialized pipeline[0m
[1m
[38;5;4mℹ Pipeline: ['transformer', 'ner'][0m
[38;5;4mℹ Initial learn rate: 0.0[0m
E    #       LOSS TRANS...  LOSS NER  ENTS_F  ENTS_P  ENTS_R  SCORE 
---  ------  -------------  --------  ------  ------  ------  ------
python: tpp.c:82: __pthread_tpp_change_priority: Assertion `new_prio == -1 || (new_prio >= fifo_min_prio && new_prio <= fifo_max_prio)' failed.


In [None]:
import spacy

# Load the pipeline saved at ./output/model-best and run it on one sentence
# that contains a single wrong-layout word as a quick sanity check.
nlp = spacy.load("./output/model-best")

test_text = "и старательности полкового rjvfylbhf был"
doc = nlp(test_text)
found_entities = [(span.text, span.label_) for span in doc.ents]
print("Entities", found_entities)

In [None]:
# Key-for-key correspondence between the US QWERTY layout and the standard
# Russian ЙЦУКЕН layout: each pair lists the characters produced by the same
# physical keys. Built once at definition time instead of on every call.
_LAYOUT_PAIRS = (
    ("qwertyuiop[]", "йцукенгшщзхъ"),
    ("asdfghjkl;'", "фывапролджэ"),
    ("zxcvbnm,.", "ячсмитьбю"),
    ("`", "ё"),
    # Shifted keys, so capitalised words are converted too.
    ("QWERTYUIOP{}", "ЙЦУКЕНГШЩЗХЪ"),
    ('ASDFGHJKL:"', "ФЫВАПРОЛДЖЭ"),
    ("ZXCVBNM", "ЯЧСМИТЬ"),
    ("<", "Б"),
    (">", "Ю"),
    ("~", "Ё"),
    # The key next to the right Shift types "." / "," in the Russian layout
    # (it is the backtick key, not this one, that types "ё").
    ("/?", ".,"),
)
_LAYOUT_MAPPING = str.maketrans(
    "".join(en for en, _ in _LAYOUT_PAIRS),
    "".join(ru for _, ru in _LAYOUT_PAIRS),
)


def switch_layout(word):
    """Re-type ``word`` as if the same keys had been pressed in the Russian layout.

    Characters that have no entry in the QWERTY -> ЙЦУКЕН table (Cyrillic
    letters, digits, whitespace, ...) are returned unchanged.

    Args:
        word: Text typed with the US QWERTY layout active.

    Returns:
        The string with every mapped character replaced by its ЙЦУКЕН counterpart.
    """
    return word.translate(_LAYOUT_MAPPING)

def correct_text(text):
    """Return ``text`` with every token tagged ``INCORRECT`` converted to the Russian layout.

    The text is run through the module-level ``nlp`` pipeline; tokens whose
    entity type is ``"INCORRECT"`` are passed through ``switch_layout`` and
    all other tokens are kept verbatim.

    Args:
        text: Input sentence, possibly containing wrong-layout words.

    Returns:
        The corrected sentence with the original spacing preserved.
    """
    doc = nlp(text)
    pieces = []
    for token in doc:
        if token.ent_type_ == "INCORRECT":
            fixed = switch_layout(token.text)
        else:
            fixed = token.text
        # Re-attach the token's own trailing whitespace rather than joining with
        # " ": the tokenizer splits punctuation into separate tokens, so a plain
        # space-join would turn "слово," into "слово ," and alter the spacing.
        pieces.append(fixed + token.whitespace_)
    return "".join(pieces)

# Correct every sentence of the private split, keeping the input order.
results = [correct_text(text) for text in test_data['text']]

# One corrected sentence per line, no header. The encoding is explicit because
# the output is Cyrillic and the default encoding is platform-dependent.
with open('output.csv', 'w', encoding='utf-8') as f:
    f.writelines(line + '\n' for line in results)