In [None]:
#python ./spert.py train --config configs/train_reduced.conf

In [16]:
import transformers
import torch
import pandas as pd

### Getting the texts that are not in the labeled data and separating them into 2 parts

In [17]:
# Load the final annotation labels; the first CSV column becomes the DataFrame index.
df_labels = pd.read_csv('/Users/katerynaburovova/PycharmProjects/dehumanization/annotation/final_labels.csv', index_col=[0])

In [18]:
# Drop the rows whose 'Dehumanization' answer is the "cannot decide on the right answer" option.
undecided_answer = 'не можу визначитись з правильною відповіддю'
is_decided = df_labels['Dehumanization'] != undecided_answer
df_labels = df_labels[is_decided].copy()

In [19]:
# Binary target: 0 when the 'Dehumanization' answer is 'ні' ("no"), 1 for any other value.
df_labels['label'] = df_labels['Dehumanization'].map(lambda answer: int(answer != 'ні'))

In [20]:
df_labels

Unnamed: 0,Emotion,Dehumanization,Mention,External ID,Created By,text,rating,label
1219,"так, присутня негативна",так,так,row_0.txt,snizannabotvin@gmail.com,"Всвязи с этим немного поправлю коллег ⤵️ ""Они...",4,1
1218,"ні, оцінка не присутня",ні,ні,row_1.txt,snizannabotvin@gmail.com,Литературный критик Галина Юзефович о новом ро...,4,0
1591,"так, присутня негативна",так,так,row_10.txt,snizannabotvin@gmail.com,Почему на базах неонацистов стоят языческие ис...,4,1
1198,"так, присутня негативна",так,так,row_100.txt,snizannabotvin@gmail.com,Группа добровольцев-медиков из Чеченской Респу...,4,1
3247,"ні, оцінка не присутня",ні,так,row_1000.txt,tutovadesign@gmail.com,"ВСУшники, переходите на сторону добра, у нас т...",5,0
...,...,...,...,...,...,...,...,...
3613,"ні, оцінка не присутня",ні,так,row_995.txt,tutovadesign@gmail.com,Утренний брифинг Минобороны России: ▪️ россий...,5,0
3612,"так, присутня негативна",ні,так,row_996.txt,tutovadesign@gmail.com,И понеслась мазепинщино-петлюровщино-бандеровщ...,5,0
4121,"так, присутня негативна",ні,так,row_997.txt,yevhen.marchenko91@gmail.com,Наш соратник по русскому движению Алексей Сели...,3,0
4120,"ні, оцінка не присутня",так,так,row_998.txt,yevhen.marchenko91@gmail.com,Хорошее видео от 4 бригады НМ ЛНР https://t.me...,3,1


In [21]:
# Patterns (matched case-insensitively by contains_target_sequence) used to split
# the test texts into a "pos" subset (some pattern matches) and a "neg" subset.
# Each pattern is listed once; a repeated entry would only cause a redundant search.
dehumanizing_target_sequences = ['рейх', 'нацист', 'питек', 'бандерло', 'свино', 'вермахт']

In [22]:
import re

def contains_target_sequence(text, target_sequences):
    """Return True if any target sequence matches somewhere in ``text``.

    Each entry of ``target_sequences`` is passed to ``re.search`` as a regular
    expression pattern and matched case-insensitively.

    Args:
        text: The string to search. Non-string values (e.g. ``None`` or the
            ``NaN`` that pandas uses for missing cells) are treated as
            containing no target sequence instead of raising ``TypeError``.
        target_sequences: Iterable of pattern strings.

    Returns:
        bool: True if at least one pattern matches, otherwise False.
    """
    # re.search raises TypeError on non-string input, so missing cells are
    # answered up front.
    if not isinstance(text, str):
        return False
    return any(re.search(seq, text, re.IGNORECASE) for seq in target_sequences)

In [23]:
def split_test_data_by_target_sequences(X, y, text_columns, target_sequences):
    """Split ``X`` and ``y`` by whether a row's text matches a target sequence.

    A row counts as a match when ``contains_target_sequence`` returns True for
    at least one of its ``text_columns`` values.

    Args:
        X: DataFrame holding the text columns.
        y: Labels that can be indexed with a boolean Series built on
            ``X.index`` (e.g. a Series sharing that index).
        text_columns: Names of the columns of ``X`` to search.
        target_sequences: Patterns forwarded to ``contains_target_sequence``.

    Returns:
        tuple: ``(X_pos, y_pos, X_neg, y_neg)`` where the ``*_pos`` parts hold
        the matching rows and the ``*_neg`` parts hold the remaining rows.
    """
    # The mask is built explicitly rather than with DataFrame.apply(axis=1):
    # on a frame with no rows, apply returns a DataFrame instead of a boolean
    # Series, which breaks the indexing below.
    columns = [X[col].tolist() for col in text_columns]
    mask_values = [
        any(contains_target_sequence(column[i], target_sequences) for column in columns)
        for i in range(len(X))
    ]
    contains_seq = pd.Series(mask_values, index=X.index, dtype=bool)

    X_pos, y_pos = X[contains_seq], y[contains_seq]
    X_neg, y_neg = X[~contains_seq], y[~contains_seq]

    return X_pos, y_pos, X_neg, y_neg

In [24]:
# Exclude the rows that were labeled for SpERT training

In [25]:
# Load the JSON export (export_400samples.json) of the samples annotated for SpERT.
df_labels_CoNLL04 = pd.read_json('/Users/katerynaburovova/PycharmProjects/dehumanization/annotation/CoNLL04_annotation/SpERT_dataset/export_400samples.json')

In [26]:
# 'External ID' values of the SpERT-annotated samples, as a plain Python list.
exclusion_list = list(df_labels_CoNLL04['External ID'])

In [28]:
exclusion_list[:20]

['row_1566.txt',
 'row_1565.txt',
 'row_1564.txt',
 'row_1563.txt',
 'row_1562.txt',
 'row_1561.txt',
 'row_1560.txt',
 'row_1559.txt',
 'row_1558.txt',
 'row_1557.txt',
 'row_1556.txt',
 'row_1555.txt',
 'row_1554.txt',
 'row_1553.txt',
 'row_1552.txt',
 'row_1551.txt',
 'row_1550.txt',
 'row_1549.txt',
 'row_1548.txt',
 'row_1547.txt']

In [29]:
# Keep only the labeled rows whose 'External ID' is absent from the exclusion list.
is_excluded = df_labels['External ID'].isin(exclusion_list)
df_clean_labels = df_labels[~is_excluded].copy()

In [30]:
# Split the remaining texts and their labels by whether the text matches a target sequence.
text_columns = ['text']
X_test_pos, y_test_pos, X_test_neg, y_test_neg = split_test_data_by_target_sequences(
    df_clean_labels[text_columns],
    df_clean_labels['label'],
    text_columns,
    dehumanizing_target_sequences,
)

In [31]:
import spacy

# Load the spaCy pipeline "ru_core_news_sm"; tokenize_text uses it to split texts into tokens.
nlp = spacy.load("ru_core_news_sm")

In [32]:
def tokenize_text(text):
    """Run ``text`` through the spaCy pipeline ``nlp`` and return its token strings in order."""
    token_strings = []
    for token in nlp(text):
        token_strings.append(token.text)
    return token_strings

In [33]:
def get_SpERT_formatted_input(X):
    """Tokenize every text in ``X.text`` and package it for SpERT.

    Args:
        X: Object exposing an iterable ``text`` attribute (e.g. a DataFrame
            with a ``text`` column).

    Returns:
        list: One ``[{"tokens": tokens}, tokens, text]`` triple per text, in
        input order. The dict and the second item share the same token list.
    """
    def _as_triple(raw_text):
        token_list = tokenize_text(raw_text)
        return [{"tokens": token_list}, token_list, raw_text]

    return [_as_triple(raw_text) for raw_text in X.text]

In [34]:
%%time
# Tokenize the "pos" subset into [{"tokens": ...}, tokens, text] triples.
X_test_pos_formatted = get_SpERT_formatted_input(X_test_pos)

CPU times: user 8.75 s, sys: 91.7 ms, total: 8.85 s
Wall time: 8.93 s


In [35]:
%%time
# Tokenize the "neg" subset into [{"tokens": ...}, tokens, text] triples.
X_test_neg_formatted = get_SpERT_formatted_input(X_test_neg)

CPU times: user 16.3 s, sys: 264 ms, total: 16.6 s
Wall time: 17.1 s


In [36]:
# Keep only the first item of each formatted triple: the {"tokens": [...]} dict.
X_test_neg_list_of_dicts = [formatted_doc[0] for formatted_doc in X_test_neg_formatted]

In [37]:
# Keep only the first item of each formatted triple: the {"tokens": [...]} dict.
X_test_pos_list_of_dicts = [formatted_doc[0] for formatted_doc in X_test_pos_formatted]

In [38]:
import json

# Destination of the "neg" test set in the SpERT datasets folder.
file_path_neg_one_sample = '/Users/katerynaburovova/PycharmProjects/dehumanization/spert/data/datasets/test_datasets/test_neg.json'

# Write UTF-8 JSON; ensure_ascii=False keeps the Cyrillic tokens readable instead of \u-escaped.
with open(file_path_neg_one_sample, 'w', encoding='utf-8') as f:
    json.dump(X_test_neg_list_of_dicts, f, ensure_ascii=False, indent=4)

In [39]:
# Destination of the "pos" test set in the SpERT datasets folder.
file_path_pos_one_sample = '/Users/katerynaburovova/PycharmProjects/dehumanization/spert/data/datasets/test_datasets/test_pos.json'

# Write UTF-8 JSON; ensure_ascii=False keeps the Cyrillic tokens readable instead of \u-escaped.
with open(file_path_pos_one_sample, 'w', encoding='utf-8') as f:
    json.dump(X_test_pos_list_of_dicts, f, ensure_ascii=False, indent=4)

In [40]:
# Predictions stored for the "neg" subset (predictions_neg.json).
file_n_path = '/Users/katerynaburovova/PycharmProjects/dehumanization/spert/data/predictions_neg.json'

# Read explicitly as UTF-8 so that decoding the Cyrillic tokens does not depend
# on the platform's default locale encoding.
with open(file_n_path, 'r', encoding='utf-8') as file:
    data_neg = json.load(file)

In [41]:
len(data_neg[0]['entities'])

0

In [42]:
# Predictions stored for the "pos" subset (predictions_pos.json).
file_p_path = '/Users/katerynaburovova/PycharmProjects/dehumanization/spert/data/predictions_pos.json'

# Read explicitly as UTF-8 so that decoding the Cyrillic tokens does not depend
# on the platform's default locale encoding.
with open(file_p_path, 'r', encoding='utf-8') as file:
    data_pos = json.load(file)

In [43]:
# A sentence gets pred=1 when at least one entity was extracted from it, otherwise pred=0.
for sentence in data_neg:
    sentence['pred'] = 0 if len(sentence['entities']) == 0 else 1

In [44]:
# A sentence gets pred=1 when at least one entity was extracted from it, otherwise pred=0.
for sentence in data_pos:
    sentence['pred'] = 0 if len(sentence['entities']) == 0 else 1

In [45]:
# Renumber the rows 0..n-1 so positions in X_test_pos can be compared with positions in data_pos.
# Rebinding (instead of inplace=True) avoids mutating a frame that was obtained by slicing another one.
X_test_pos = X_test_pos.reset_index(drop=True)

In [46]:
X_test_pos.iloc[13]

text    Мобики 2-го батальона 56-й бригады ВСУ жалуютс...
Name: 13, dtype: object

In [47]:
data_pos[13]

{'tokens': ['Мобики',
  '2-го',
  'батальона',
  '56-й',
  'бригады',
  'ВСУ',
  'жалуются',
  ',',
  'что',
  'их',
  'без',
  'подготовки',
  'и',
  'соответствующей',
  'экипировки',
  'кинули',
  'на',
  'Пески',
  ',',
  'где',
  'у',
  'хохловермахта',
  'посыпался',
  'фронт'],
 'entities': [{'type': 'HIGH_UH_LOW_NH', 'start': 21, 'end': 22}],
 'relations': [],
 'pred': 1}

In [48]:
# Flat list of the 0/1 'pred' flags of the "pos" predictions.
# NOTE(review): assumes data_pos is in the same row order as y_test_pos — TODO confirm.
pos_predictions = [sentence['pred'] for sentence in data_pos]

In [49]:
# Flat list of the 0/1 'pred' flags of the "neg" predictions.
# NOTE(review): assumes data_neg is in the same row order as y_test_neg — TODO confirm.
neg_predictions = [sentence['pred'] for sentence in data_neg]

In [50]:
from sklearn.metrics import classification_report

In [51]:
# Per-class precision/recall/F1 on the "pos" subset: annotated labels vs. entity-based predictions.
report = classification_report(y_test_pos, pos_predictions, target_names=['class_0', 'class_1'])
print(report)

              precision    recall  f1-score   support

     class_0       0.29      0.48      0.36        60
     class_1       0.97      0.93      0.95       967

    accuracy                           0.90      1027
   macro avg       0.63      0.70      0.65      1027
weighted avg       0.93      0.90      0.91      1027



In [54]:
from sklearn.metrics import precision_score

# Micro-averaged precision on the "pos" subset (for single-label predictions this coincides with accuracy).
micro_precision = precision_score(y_test_pos, pos_predictions, average='micro')
micro_precision

0.9006815968841285

In [52]:
# Per-class precision/recall/F1 on the "neg" subset: annotated labels vs. entity-based predictions.
report = classification_report(y_test_neg, neg_predictions, target_names=['class_0', 'class_1'])
print(report)

              precision    recall  f1-score   support

     class_0       0.87      0.94      0.90      1621
     class_1       0.59      0.39      0.47       386

    accuracy                           0.83      2007
   macro avg       0.73      0.66      0.68      2007
weighted avg       0.81      0.83      0.82      2007



In [55]:
from sklearn.metrics import precision_score

# Micro-averaged precision on the "neg" subset (for single-label predictions this coincides with accuracy).
micro_precision = precision_score(y_test_neg, neg_predictions, average='micro')
micro_precision

0.8310911808669657

Reading the log files

In [4]:
# Load the ';'-separated eval_valid.csv logged for run '2023-05-02_S_10_ep' and display it.
df = pd.read_csv('/Users/katerynaburovova/PycharmProjects/dehumanization/SPERT_experimentation/spert/data/log/conll04_train/2023-05-02_S_10_ep/eval_valid.csv', delimiter=";", header=[0])
df

Unnamed: 0,ner_prec_micro,ner_rec_micro,ner_f1_micro,ner_prec_macro,ner_rec_macro,ner_f1_macro,rel_prec_micro,rel_rec_micro,rel_f1_micro,rel_prec_macro,...,rel_f1_macro,rel_nec_prec_micro,rel_nec_rec_micro,rel_nec_f1_micro,rel_nec_prec_macro,rel_nec_rec_macro,rel_nec_f1_macro,epoch,iteration,global_iteration
0,71.698113,57.86802,64.044944,75.513001,58.619075,64.89844,0.0,0.0,0.0,,...,,0.0,0.0,0.0,,,,10,0,590


In [5]:
# Load the ';'-separated eval_valid.csv logged for run '2023-05-03_M_10_ep' and display it.
df = pd.read_csv('/Users/katerynaburovova/PycharmProjects/dehumanization/SPERT_experimentation/spert/data/log/conll04_train/2023-05-03_M_10_ep/eval_valid.csv', delimiter=";", header=[0])
df

Unnamed: 0,ner_prec_micro,ner_rec_micro,ner_f1_micro,ner_prec_macro,ner_rec_macro,ner_f1_macro,rel_prec_micro,rel_rec_micro,rel_f1_micro,rel_prec_macro,...,rel_f1_macro,rel_nec_prec_micro,rel_nec_rec_micro,rel_nec_f1_micro,rel_nec_prec_macro,rel_nec_rec_macro,rel_nec_f1_macro,epoch,iteration,global_iteration
0,71.538462,66.0746,68.698061,71.223723,66.139262,68.309966,0.0,0.0,0.0,,...,,0.0,0.0,0.0,,,,10,0,1860


In [6]:
# Load the ';'-separated eval_valid.csv logged for run '2023-05-03_M_20_ep' and display it.
df = pd.read_csv('/Users/katerynaburovova/PycharmProjects/dehumanization/SPERT_experimentation/spert/data/log/conll04_train/2023-05-03_M_20_ep/eval_valid.csv', delimiter=";", header=[0])
df

Unnamed: 0,ner_prec_micro,ner_rec_micro,ner_f1_micro,ner_prec_macro,ner_rec_macro,ner_f1_macro,rel_prec_micro,rel_rec_micro,rel_f1_micro,rel_prec_macro,...,rel_f1_macro,rel_nec_prec_micro,rel_nec_rec_micro,rel_nec_f1_micro,rel_nec_prec_macro,rel_nec_rec_macro,rel_nec_f1_macro,epoch,iteration,global_iteration
0,87.5,68.383659,76.769691,85.049838,71.572636,77.022961,0.0,0.0,0.0,,...,,0.0,0.0,0.0,,,,20,0,3720


In [7]:
# Load the ';'-separated eval_valid.csv logged for run '2023-05-04_M_20_ep_200_neg' and display it.
df = pd.read_csv('/Users/katerynaburovova/PycharmProjects/dehumanization/SPERT_experimentation/spert/data/log/conll04_train/2023-05-04_M_20_ep_200_neg/eval_valid.csv', delimiter=";", header=[0])
df

Unnamed: 0,ner_prec_micro,ner_rec_micro,ner_f1_micro,ner_prec_macro,ner_rec_macro,ner_f1_macro,rel_prec_micro,rel_rec_micro,rel_f1_micro,rel_prec_macro,...,rel_f1_macro,rel_nec_prec_micro,rel_nec_rec_micro,rel_nec_f1_micro,rel_nec_prec_macro,rel_nec_rec_macro,rel_nec_f1_macro,epoch,iteration,global_iteration
0,92.926829,67.673179,78.314491,96.259428,76.450183,84.966392,0.0,0.0,0.0,,...,,0.0,0.0,0.0,,,,20,0,3720


In [8]:
# Load the ';'-separated eval_valid.csv logged for run '2023-05-04_M_20_ep_300_neg' and display it.
df = pd.read_csv('/Users/katerynaburovova/PycharmProjects/dehumanization/SPERT_experimentation/spert/data/log/conll04_train/2023-05-04_M_20_ep_300_neg/eval_valid.csv', delimiter=";", header=[0])
df

Unnamed: 0,ner_prec_micro,ner_rec_micro,ner_f1_micro,ner_prec_macro,ner_rec_macro,ner_f1_macro,rel_prec_micro,rel_rec_micro,rel_f1_micro,rel_prec_macro,...,rel_f1_macro,rel_nec_prec_micro,rel_nec_rec_micro,rel_nec_f1_micro,rel_nec_prec_macro,rel_nec_rec_macro,rel_nec_f1_macro,epoch,iteration,global_iteration
0,93.564356,67.14032,78.179938,95.184075,68.541072,79.479634,0.0,0.0,0.0,,...,,0.0,0.0,0.0,,,,20,0,3720


In [9]:
# Load the ';'-separated eval_valid.csv logged for run '2023-05-04_M_25_ep_200_neg' and display it.
df = pd.read_csv('/Users/katerynaburovova/PycharmProjects/dehumanization/SPERT_experimentation/spert/data/log/conll04_train/2023-05-04_M_25_ep_200_neg/eval_valid.csv', delimiter=";", header=[0])
df

Unnamed: 0,ner_prec_micro,ner_rec_micro,ner_f1_micro,ner_prec_macro,ner_rec_macro,ner_f1_macro,rel_prec_micro,rel_rec_micro,rel_f1_micro,rel_prec_macro,...,rel_f1_macro,rel_nec_prec_micro,rel_nec_rec_micro,rel_nec_f1_micro,rel_nec_prec_macro,rel_nec_rec_macro,rel_nec_f1_macro,epoch,iteration,global_iteration
0,95.098039,68.916519,79.917611,96.142177,71.325648,80.790436,0.0,0.0,0.0,,...,,0.0,0.0,0.0,,,,25,0,4650


In [11]:
# Load the ';'-separated eval_valid.csv logged for run '2023-05-05_14:45:19.041428_Mred_25ep_200neg' and display it.
df = pd.read_csv('/Users/katerynaburovova/PycharmProjects/dehumanization/SPERT_experimentation/spert/data/log/conll04_train/2023-05-05_14:45:19.041428_Mred_25ep_200neg/eval_valid.csv', delimiter=";", header=[0])
df

Unnamed: 0,ner_prec_micro,ner_rec_micro,ner_f1_micro,ner_prec_macro,ner_rec_macro,ner_f1_macro,rel_prec_micro,rel_rec_micro,rel_f1_micro,rel_prec_macro,...,rel_f1_macro,rel_nec_prec_micro,rel_nec_rec_micro,rel_nec_f1_micro,rel_nec_prec_macro,rel_nec_rec_macro,rel_nec_f1_macro,epoch,iteration,global_iteration
0,98.237885,96.536797,97.379913,98.346697,95.305684,96.769559,0.0,0.0,0.0,,...,,0.0,0.0,0.0,,,,25,0,4650


In [12]:
# Load the ';'-separated eval_valid.csv logged for run '2023-05-05_15:49:38.240660' and display it.
df = pd.read_csv('/Users/katerynaburovova/PycharmProjects/dehumanization/SPERT_experimentation/spert/data/log/conll04_train/2023-05-05_15:49:38.240660/eval_valid.csv', delimiter=";", header=[0])
df

Unnamed: 0,ner_prec_micro,ner_rec_micro,ner_f1_micro,ner_prec_macro,ner_rec_macro,ner_f1_macro,rel_prec_micro,rel_rec_micro,rel_f1_micro,rel_prec_macro,...,rel_f1_macro,rel_nec_prec_micro,rel_nec_rec_micro,rel_nec_f1_micro,rel_nec_prec_macro,rel_nec_rec_macro,rel_nec_f1_macro,epoch,iteration,global_iteration
0,97.333333,94.805195,96.052632,97.887184,92.774194,95.239464,0.0,0.0,0.0,,...,,0.0,0.0,0.0,,,,20,0,3720


In [15]:
import json

# Load the args.json stored in the log folder of run '2023-05-05_14:45:19.041428_Mred_25ep_200neg'.
# The encoding is given explicitly so decoding does not depend on the platform's default locale encoding.
with open('/Users/katerynaburovova/PycharmProjects/dehumanization/SPERT_experimentation/spert/data/log/conll04_train/2023-05-05_14:45:19.041428_Mred_25ep_200neg/args.json', 'r', encoding='utf-8') as f:
    data = json.load(f)
data

{'train_path': 'data/datasets/dehumanization_spert_reduced_M/conll04_train.json',
 'valid_path': 'data/datasets/dehumanization_spert_reduced_M/conll04_dev.json',
 'save_path': 'data/save/',
 'init_eval': False,
 'save_optimizer': False,
 'train_log_iter': 100,
 'final_eval': True,
 'train_batch_size': 2,
 'epochs': 25,
 'neg_entity_count': 200,
 'neg_relation_count': 100,
 'lr': 5e-05,
 'lr_warmup': 0.1,
 'weight_decay': 0.01,
 'max_grad_norm': 1.0,
 'config': 'configs/train_reduced.conf',
 'types_path': 'data/datasets/dehumanization_spert_reduced_M/conll04_types.json',
 'tokenizer_path': 'DeepPavlov/rubert-base-cased',
 'max_span_size': 10,
 'lowercase': False,
 'sampling_processes': 4,
 'model_path': 'DeepPavlov/rubert-base-cased',
 'model_type': 'spert',
 'cpu': False,
 'eval_batch_size': 1,
 'max_pairs': 1000,
 'rel_filter_threshold': 0.4,
 'size_embedding': 25,
 'prop_drop': 0.1,
 'freeze_transformer': False,
 'no_overlapping': False,
 'seed': None,
 'cache_path': None,
 'debug': 

In [14]:
import json

# Load the args.json stored in the log folder of run '2023-05-05_15:49:38.240660'.
# The encoding is given explicitly so decoding does not depend on the platform's default locale encoding.
with open('/Users/katerynaburovova/PycharmProjects/dehumanization/SPERT_experimentation/spert/data/log/conll04_train/2023-05-05_15:49:38.240660/args.json', 'r', encoding='utf-8') as f:
    data = json.load(f)
data

{'train_path': 'data/datasets/dehumanization_spert_reduced_M/conll04_train.json',
 'valid_path': 'data/datasets/dehumanization_spert_reduced_M/conll04_dev.json',
 'save_path': 'data/save/',
 'init_eval': False,
 'save_optimizer': False,
 'train_log_iter': 100,
 'final_eval': True,
 'train_batch_size': 2,
 'epochs': 20,
 'neg_entity_count': 200,
 'neg_relation_count': 100,
 'lr': 5e-05,
 'lr_warmup': 0.1,
 'weight_decay': 0.01,
 'max_grad_norm': 1.0,
 'config': 'configs/train_reduced.conf',
 'types_path': 'data/datasets/dehumanization_spert_reduced_M/conll04_types.json',
 'tokenizer_path': 'DeepPavlov/rubert-base-cased',
 'max_span_size': 10,
 'lowercase': False,
 'sampling_processes': 4,
 'model_path': 'DeepPavlov/rubert-base-cased',
 'model_type': 'spert',
 'cpu': False,
 'eval_batch_size': 1,
 'max_pairs': 1000,
 'rel_filter_threshold': 0.4,
 'size_embedding': 25,
 'prop_drop': 0.1,
 'freeze_transformer': False,
 'no_overlapping': False,
 'seed': None,
 'cache_path': None,
 'debug': 