In [9]:
from transformers import AutoTokenizer, AutoModelForCausalLM
import torch
import pandas as pd
import random
from tqdm import tqdm

In [19]:
model_name = "sdadas/polish-gpt2-medium"
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForCausalLM.from_pretrained(model_name)

In [11]:
def generate(text: str, args, num_return_sequences=1) -> str:
    tokens = tokenizer(text, return_tensors="pt")

    output = model.generate(
        tokens.input_ids,
        pad_token_id=tokenizer.eos_token_id,
        attention_mask=tokens.attention_mask,
        do_sample=True,
        num_return_sequences=num_return_sequences,
        **args
    )
    output = [tokenizer.decode(out, skip_special_tokens=True) for out in output]

    return output[0] if len(output) == 1 else output

In [12]:
df = pd.read_json('../task_1/data/full_text_classification.jsonl', lines=True)
df

Unnamed: 0,text,label
0,"@USER Nic, nic,nic niewa≈ºne, jutro albo w najb...",neutralny wyd≈∫wiƒôk
1,@USER Kibic @USER odpowiada @USER i @USER na k...,neutralny wyd≈∫wiƒôk
2,M√≥wi ≈ºe stare rapsy sƒÖ ca≈Çkiem niezle,neutralny wyd≈∫wiƒôk
3,"@USER @USER Zaleg≈Ço≈õci by≈Çy, ale wa≈ºne czy by≈Ç...",neutralny wyd≈∫wiƒôk
4,@USER @USER Oby nie spierdolil na p√≥≈Çnoc,negatywny wyd≈∫wiƒôk
...,...,...
4436,"@USER Noc? To wtedy, gdy jest ciemno? Bo ≈ºadne...",neutralny wyd≈∫wiƒôk
4437,"wszƒôdzie dobrze, ale w grobie najlepiej",mowa nienawi≈õci
4438,@USER a ile zagra≈Ç tam minut ?,neutralny wyd≈∫wiƒôk
4439,@USER #FinalSix: Mamy to !!! Puchar Polski jes...,pozytywny wyd≈∫wiƒôk


In [84]:
df['label'].value_counts()

label
neutralny wyd≈∫wiƒôk    2022
negatywny wyd≈∫wiƒôk     920
pozytywny wyd≈∫wiƒôk     870
mowa nienawi≈õci        629
Name: count, dtype: int64

In [5]:
df2 = pd.read_json('../task_1/data/fragments_classification.jsonl', lines=True)
df2

Unnamed: 0,text,label
0,Nie uzna gola. Robben by≈Ç kilka metr√≥w w polu ...,"[[0, 8, odwr√≥cenie]]"
1,@USER No w≈Ça≈õnie o tym jest ten tweet üòÑ,[]
2,@USER @USER Widaƒá chcƒÖ wiecej polskich mord go...,"[[23, 38, wzmocnienie]]"
3,"Idƒô spaƒá bo padam na twarz, w ko≈Ñcu w domuuuu",[]
4,@USER Tak siƒô pozna≈Çam z moim ch≈Çopakiem üòÇ cza...,[]
...,...,...
795,@USER Wszystkiego najlepszego z okazji urodzin...,"[[5, 29, wzmocnienie]]"
796,"@USER widzƒô, ≈ºe pewne tweety dzia≈ÇajƒÖ jak magn...",[]
797,"@USER @USER Chocia≈º futro ma z jenota,\nTo nie...","[[43, 52, odwr√≥cenie], [55, 67, wzmocnienie], ..."
798,@USER Ty aby nie zaczƒÖle≈õ ƒápaƒá przez wydumane ...,"[[13, 25, odwr√≥cenie], [37, 54, wzmocnienie]]"


In [66]:
labels2 = {'wzmocnienie': 0, 'odwr√≥cenie': 0, 'os≈Çabienie': 0}
for _, row in df2.iterrows():
    for _, _, label in row['label']:
        labels2[label] += 1
labels2

{'wzmocnienie': 414, 'odwr√≥cenie': 277, 'os≈Çabienie': 97}

In [82]:
def generate_n_new(df, label, n, args):
    texts = df[df['label'] == label]['text'].tolist()
    samples = random.choices(texts, k=n)
    return [generate(sample[:max(len(sample)//2, 30)], args) for sample in tqdm(samples)]

In [23]:
args = {'max_new_tokens': 50, 'temperature': 0.7, 'top_k': 50, 'top_p': 0.9, 'no_repeat_ngram_size': 9}

In [87]:
new_positive = generate_n_new(df, 'pozytywny wyd≈∫wiƒôk', 500, args)

100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 500/500 [23:16<00:00,  2.79s/it]


In [88]:
import json

with open('new_samples.jsonl', 'w', encoding='utf-8') as jsonl_file:
    for entry in new_positive:
        jsonl_file.write(json.dumps({'text': entry, 'label': 'pozytywny wyd≈∫wiƒôk'}, ensure_ascii=False) + '\n')

In [89]:
new_negative = generate_n_new(df, 'negatywny wyd≈∫wiƒôk', 500, args)

100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 500/500 [34:05<00:00,  4.09s/it]


In [90]:
import json

with open('new_samples.jsonl', 'a', encoding='utf-8') as jsonl_file:
    for entry in new_negative:
        jsonl_file.write(json.dumps({'text': entry, 'label': 'negatywny wyd≈∫wiƒôk'}, ensure_ascii=False) + '\n')

In [91]:
new_hate_speech = generate_n_new(df, 'mowa nienawi≈õci', 500, args)

100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 500/500 [26:26<00:00,  3.17s/it]


In [92]:
import json

with open('new_samples.jsonl', 'a', encoding='utf-8') as jsonl_file:
    for entry in new_hate_speech:
        jsonl_file.write(json.dumps({'text': entry, 'label': 'mowa nienawi≈õci'}, ensure_ascii=False) + '\n')

In [8]:
def generate_fragments(df, label_name, n, args):
    
    samples_candidats = []
    for index, row in df.iterrows():
        text = row['text']
        labels = row['label']

        previous_labels = []
        for label in labels:
            if label[2] == label_name and label[0] > len(text) / 2:
                samples_candidats.append((text[:label[0]], previous_labels, label[0], label[2]))
                break
            previous_labels.append(label)

    samples_to_generate = random.choices(samples_candidats, k=n)

    new_samples = []
    for text, prev_labels, start_idx, label in tqdm(samples_to_generate):
        new_sample = generate(text, args)
        new_fragment = new_sample[start_idx:]
        new_words = new_fragment.split()
        if len(new_words) >= 1:
            end_idx = start_idx + len(new_words[0])
            if len(new_words) >= 2:
                end_idx += len(new_words[1]) + 1
            new_samples.append({'text': new_sample, 'label': prev_labels + [[start_idx, end_idx, label]]})

    return new_samples

In [None]:
args2 = {'max_new_tokens': 30, 'temperature': 0.6, 'top_k': 50, 'top_p': 0.9, 'no_repeat_ngram_size': 5}

In [64]:
generate_fragments(df2, 'os≈Çabienie', 5, args2)

[{'text': '@USER My≈õla≈Çam do dzi≈õ, ≈ºe to cyjanek potasu. A to by≈Ça czysta heroina. - To nie by≈Ço cyjankowe - powiedzia≈Ç. - To by≈Ça czysta heroina,',
  'label': [[30, 45, 'os≈Çabienie']]},
 {'text': '@USER Tyle gotujesz tego ≈ºarcia a .....nie ma co robiƒá z tymi pieniƒôdzmi. No to siƒô zastan√≥w.',
  'label': [[6, 19, 'wzmocnienie'], [34, 45, 'os≈Çabienie']]},
 {'text': '@USER @USER A niby z jakiego powodu? ≈ªeby mog≈Ça sprawowaƒá sw√≥j urzƒÖd? ___ _ - Nie, ≈ºeby mog≈Ça pe≈Çniƒá sw√≥j urzƒÖd. Ale po prostu chce mieƒá co≈õ do powiedzenia w tej sprawie. - A co ona mo≈ºe?',
  'label': [[70, 75, 'os≈Çabienie']]},
 {'text': 'Kiedy≈õ Patryk Jaki m√≥wi≈Ç\\"peda≈Ç√≥w trzeba wyciƒÖƒá\\"teraz na potrzeby kampanii !!".',
  'label': [[34, 47, 'wzmocnienie'], [76, 80, 'os≈Çabienie']]},
 {'text': 'Ostatnio siƒô zrobi≈Çam polityczna owca. Ale w sumie to nie. Ja to bym chcia≈Ça, ≈ºeby to by≈Ça prawdziwa Polska. ≈ªeby nie by≈Ço podzia≈Ç√≥w. ≈ªeby ludzie byli w sobie szcze',
  'label': [[33, 42,

In [75]:
wzmocnienia = generate_fragments(df2, 'wzmocnienie', 50, args2)

100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 50/50 [01:35<00:00,  1.91s/it]


In [88]:
oslabienia = generate_fragments(df2, 'os≈Çabienie', 300, args2)

  0%|          | 0/300 [00:00<?, ?it/s]

100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 300/300 [08:53<00:00,  1.78s/it]


In [89]:
odwrocenia = generate_fragments(df2, 'odwr√≥cenie', 150, args2)

100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 150/150 [03:45<00:00,  1.50s/it]


In [90]:
new_samples = wzmocnienia + oslabienia + odwrocenia


In [91]:
with open('new_samples_fragments.jsonl', 'w', encoding='utf-8') as jsonl_file:
    for entry in new_samples:
        jsonl_file.write(json.dumps(entry, ensure_ascii=False) + '\n')

In [None]:
import pandas as pd
import spacy
from collections import Counter

nlp = spacy.load("pl_core_news_sm")

In [32]:
def get_label_pos(doc, start_idx, end_idx):
    comb = []
    for i in range(len(doc)):
        if doc[i].idx > end_idx:
            break
        if doc[i].idx + len(doc[i]) > start_idx:
            comb.append(doc[i].pos_)
    return comb

In [33]:
def profile_pos_combinations(df):
    profiles = {'wzmocnienie': {}, 'os≈Çabienie': {}, 'odwr√≥cenie': {}}

    for _, row in df.iterrows():
        text = row['text']
        labels = row['label']
        doc = nlp(text)

        for start_idx, end_idx, label in labels:
            comb = '+'.join(get_label_pos(doc, start_idx, end_idx))
            
            if comb not in profiles[label]:
                profiles[label][comb] = 1
            else:
                profiles[label][comb] += 1
    
    return profiles


In [34]:
profiles = profile_pos_combinations(df2)
profiles

{'wzmocnienie': {'ADJ+ADJ': 3,
  'VERB+PUNCT': 12,
  'DET+NOUN': 9,
  'ADJ+NOUN+PUNCT': 21,
  'ADV+ADJ': 9,
  'ADJ': 23,
  'ADV+PUNCT': 7,
  'ADJ+NOUN': 39,
  'ADP+NOUN': 7,
  'VERB': 19,
  'NOUN+VERB': 2,
  'PRON': 8,
  'VERB+VERB': 4,
  'ADV': 21,
  'VERB+AUX': 1,
  'PART': 6,
  'DET': 4,
  'NOUN+NOUN': 7,
  'ADP+NOUN+PUNCT': 2,
  'ADV+ADP': 2,
  'VERB+ADP': 2,
  'NOUN+ADP': 2,
  'NOUN': 44,
  'ADV+ADV': 10,
  'PART+PROPN': 1,
  'PRON+VERB': 4,
  'PRON+PART': 1,
  'VERB+ADJ': 2,
  'NOUN+PUNCT': 11,
  'INTJ+PUNCT': 2,
  'PRON+NOUN': 3,
  'ADV+NOUN': 4,
  'ADJ+SCONJ': 1,
  'ADV+SPACE+NOUN': 1,
  'PART+PART': 3,
  'ADV+VERB': 9,
  'DET+ADP': 1,
  'VERB+ADV': 3,
  'PRON+PROPN': 1,
  'PRON+ADJ+PUNCT': 2,
  'ADJ+PUNCT': 8,
  'PROPN+PUNCT': 1,
  'SCONJ+VERB+ADV': 1,
  'DET+NOUN+PUNCT': 2,
  'ADV+ADV+PUNCT': 3,
  'INTJ': 6,
  'ADP+ADV+PUNCT': 1,
  'VERB+NOUN+PUNCT': 3,
  'ADV+VERB+PUNCT': 2,
  'NOUN+ADJ': 5,
  'ADJ+VERB': 1,
  'ADP+PRON': 2,
  'ADV+PRON': 3,
  'PART+ADP': 1,
  'DET+ADJ': 4,


In [48]:
def generate_fragments_with_profile_check(df, label_name, n, args):
    
    samples_candidats = []
    for _, row in df.iterrows():
        text = row['text']
        labels = row['label']

        previous_labels = []
        for start_idx, end_idx, label in labels:
            if label == label_name and start_idx > len(text) / 2:
                samples_candidats.append((text[:start_idx], previous_labels, get_label_pos(nlp(text), start_idx, end_idx), start_idx, end_idx))

            previous_labels.append((start_idx, end_idx, label))

    samples_to_generate = random.choices(samples_candidats, k=n)

    new_samples = []
    for text, prev_labels, label_pos, start_idx, end_idx in tqdm(samples_to_generate):
        new_n_samples = generate(text, args, 10)
        found = False
        for new_sample in new_n_samples:
            doc = nlp(new_sample)
            for i in range(len(doc)):
                if start_idx <= doc[i].idx:
                    valid = True
                    for j, pos in enumerate(label_pos):
                        if i+j >= len(doc) or pos != doc[i+j].pos_:
                            valid = False
                            break
                    if valid:
                        new_samples.append({'text': new_sample, 'label': prev_labels + [[doc[i].idx, doc[i+j].idx + len(doc[i+j]), label_name]]})
                        found = True
                        break
            if found:
                break
    return new_samples

In [59]:
wzmocnienia = generate_fragments_with_profile_check(df2, 'wzmocnienie', 50, args2)

100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 50/50 [03:40<00:00,  4.40s/it]


In [60]:
oslabienia = generate_fragments_with_profile_check(df2, 'os≈Çabienie', 50, args2)

100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 50/50 [03:32<00:00,  4.25s/it]


In [61]:
odwrocenia = generate_fragments_with_profile_check(df2, 'odwr√≥cenie', 150, args2)

100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 150/150 [10:55<00:00,  4.37s/it]


In [62]:
new_samples = wzmocnienia + oslabienia + odwrocenia

In [64]:
import json

with open('new_samples_fragments.jsonl', 'w', encoding='utf-8') as jsonl_file:
    for entry in new_samples:
        jsonl_file.write(json.dumps(entry, ensure_ascii=False) + '\n')

In [20]:
from transformers import AutoTokenizer, AutoModel
import torch
from sklearn.metrics.pairwise import cosine_similarity


tokenizer2 = AutoTokenizer.from_pretrained("allegro/herbert-base-cased")
model2 = AutoModel.from_pretrained("allegro/herbert-base-cased")

tokenizer2.add_special_tokens({'pad_token': '[PAD]'})

Some weights of the model checkpoint at allegro/herbert-base-cased were not used when initializing BertModel: ['cls.predictions.bias', 'cls.predictions.decoder.bias', 'cls.predictions.decoder.weight', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.dense.bias', 'cls.predictions.transform.dense.weight', 'cls.sso.sso_relationship.bias', 'cls.sso.sso_relationship.weight']
- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


1

In [21]:
def get_sentence_embedding(sentence):
    inputs = tokenizer2(sentence, return_tensors="pt", padding=True, truncation=True)
    with torch.no_grad():
        outputs = model2(**inputs)
    embeddings = outputs.last_hidden_state.mean(dim=1)
    return embeddings.numpy()

In [14]:
def generate_sentences_with_sim_check(df, label, n, args):
    texts = df[df['label'] == label]['text'].tolist()
    samples = random.choices(texts, k=n)
    new_samples = []

    for sample in tqdm(samples):
        sample_embeddings = get_sentence_embedding(sample)
        input = sample[:max(len(sample)//2, 30)]
        output = generate(input, args, 10)
        best_sample = ''
        best_sim = 0
        for new_sample in output:
            sim = cosine_similarity(sample_embeddings, get_sentence_embedding(new_sample))
            if sim > best_sim:
                best_sim = sim
                best_sample = new_sample
        new_samples.append(best_sample)
    return new_samples

In [24]:
new_positive = generate_sentences_with_sim_check(df, 'pozytywny wyd≈∫wiƒôk', 500, args)

100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 500/500 [54:37<00:00,  6.56s/it] 


In [25]:
new_negative = generate_sentences_with_sim_check(df, 'negatywny wyd≈∫wiƒôk', 500, args)

100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 500/500 [55:37<00:00,  6.68s/it]


In [26]:
new_hate_speech = generate_sentences_with_sim_check(df, 'mowa nienawi≈õci', 500, args)

100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 500/500 [53:26<00:00,  6.41s/it]


In [27]:
import json

with open('new_samples.jsonl', 'w', encoding='utf-8') as jsonl_file:
    for entry in new_positive:
        jsonl_file.write(json.dumps({'text': entry, 'label': 'pozytywny wyd≈∫wiƒôk'}, ensure_ascii=False) + '\n')

with open('new_samples.jsonl', 'a', encoding='utf-8') as jsonl_file:
    for entry in new_negative:
        jsonl_file.write(json.dumps({'text': entry, 'label': 'negatywny wyd≈∫wiƒôk'}, ensure_ascii=False) + '\n')

with open('new_samples.jsonl', 'a', encoding='utf-8') as jsonl_file:
    for entry in new_hate_speech:
        jsonl_file.write(json.dumps({'text': entry, 'label': 'mowa nienawi≈õci'}, ensure_ascii=False) + '\n')