In [1]:
import pandas as pd
import numpy as np
import os
import re
import tqdm
from typing import List, Dict
import json
import shutil

In [2]:
import stanza
import pymorphy3
import pymorphy2

nlp = stanza.Pipeline('uk', processors='tokenize,mwt,pos,lemma,depparse')
morph = pymorphy3.MorphAnalyzer()


# # stanza.download('uk')

2025-04-22 10:29:14 INFO: Checking for updates to resources.json in case models have been updated.  Note: this behavior can be turned off with download_method=None or download_method=DownloadMethod.REUSE_RESOURCES


Downloading https://raw.githubusercontent.com/stanfordnlp/stanza-resources/main/resources_1.10.0.json:   0%|  …

2025-04-22 10:29:14 INFO: Downloaded file to /Users/linndfors/stanza_resources/resources.json
2025-04-22 10:29:15 INFO: Loading these models for language: uk (Ukrainian):
| Processor | Package     |
---------------------------
| tokenize  | iu          |
| mwt       | iu          |
| pos       | iu_charlm   |
| lemma     | iu_nocharlm |
| depparse  | iu_charlm   |

2025-04-22 10:29:15 INFO: Using device: cpu
2025-04-22 10:29:15 INFO: Loading: tokenize
2025-04-22 10:29:17 INFO: Loading: mwt
2025-04-22 10:29:17 INFO: Loading: pos
2025-04-22 10:29:21 INFO: Loading: lemma
2025-04-22 10:29:23 INFO: Loading: depparse
2025-04-22 10:29:23 INFO: Done loading processors!


In [3]:
parallel_dataset_ng_dataset = pd.read_csv("/Users/linndfors/study/diploma/ner_for_fem/data/csv_files_with_par_sentences/ng_parallel.csv")
parallel_dataset_bruk_dataset = pd.read_csv("/Users/linndfors/study/diploma/ner_for_fem/data/csv_files_with_par_sentences/bruk_parallel.csv")

In [5]:
parallel_dataset_bruk_dataset.head()

Unnamed: 0,original_sentence,orig_sent_id,changed_sentence,changed_sent_id,original_file_name,orig_ann,changed_ann
0,Його редактором був поет-символіст Яків Савчен...,46,Його редакторкою була поетеса-символістка Анже...,46,e5e76a8efa0f.txt,"{'T30': ('поет-символіст', 'JOB')}","{'T30': ('поетеса-символістка', 'JOB')}"
1,Міжпредметні паралелі . Маніфест футуристів ск...,62,Міжпредметні паралелі . Маніфест футуристок ск...,62,e5e76a8efa0f.txt,"{'T49': ('поет', 'JOB')}","{'T49': ('поетеса', 'JOB')}"
2,Помітною була організація « Гарт » ( 1923 — 19...,96,Помітною була організація « Гарт » ( 1923 — 19...,96,e5e76a8efa0f.txt,"{'T104': ('поет', 'JOB')}","{'T104': ('поетка', 'JOB')}"
3,Її очолив байкар і прозаїк Сергій Пилипенко .,127,Її очолила байкарка і прозаїкиня Марія Пилипен...,127,e5e76a8efa0f.txt,"{'T143': ('байкар', 'JOB'), 'T144': ('прозаїк'...","{'T143': ('байкарка', 'JOB'), 'T144': ('прозаї..."
4,У Галицько-Волинському літописі згадується спі...,171,У Галицько-Волинському літописі згадується спі...,171,e5e76a8efa0f.txt,"{'T218': ('співець', 'JOB')}","{'T218': ('співчиня', 'JOB')}"


# Split: Female, Male, Common genders

In [3]:
with open('/Users/linndfors/study/diploma/uk-gender-word-mapper/common_gender_words_list.txt') as file:
    common_gender_words_list = [line.strip() for line in file if line.strip()]

with open('/Users/linndfors/study/diploma/uk-gender-word-mapper/male_words_list.txt') as file:
    male_list = [line.strip() for line in file if line.strip()]

with open('/Users/linndfors/study/diploma/uk-gender-word-mapper/female_words_list.txt') as file:
    female_list = [line.strip() for line in file if line.strip()]

gender_dict_df = pd.read_csv("/Users/linndfors/study/diploma/uk-gender-word-mapper/gender_pairs_dictionary.csv")

gender_dict = {'male': [], 'female': []}

for _, row in gender_dict_df.iterrows():
    gender_dict['male'].append(row['male'])
    female_values = [f.strip() for f in row['female'].split(',')]
    gender_dict['female'].extend(female_values)

gender_dict['female'] = list(set(gender_dict['female']))

exceptions_common = ["судді",  "глава", "голова", "керівництво", "в. о.", "головою"]
exceptions_male = ['ієромонах', 'прокурор', 'віце-премʼєр', 'премʼєр', 'начальник', 'міністр', 'директор', 'підрядник', 'генпідрядник', 'керівник', 'головнокомандувач']
exceptions_female = ['премʼєрка', 'докторка', 'докторантка', 'директорка', 'міністерка', 'керівниця', 'начальниця', 'начальницю', 'генпідрядниця', 'прокурорка', 'ректорка']

In [4]:
def parse_output(ent):
    noun_dict = {
        "хліборобок": "хліборобка",
    "культурологині": "культурологиня",
    "директоркою": "директорка",
    "слідчій": "слідча",
    "прокурорка": "прокурорка",
    "прокурорки": "прокурорка",
    "Прокурорки": "прокурорка",
    "прокуроркою": "прокурорка",
    "прокурорці": "прокурорка",
    "адміністраторок": "адміністраторка",
    "слідчу": "слідча",
    "слідча": "слідча",
    "начальницею": "начальниця",
    "податківка": "податківка",
    "монопольниці": "монопольниця",
    "начальницю": "начальниця",
    "психіатрині": "психіатриня",
    "наркологині": "наркологиня",
    "міністерки": "міністерка",
    "рітейлерками": "рітейлерка",
    "начальниці": "начальниця",
    "екологині": "екологиня",
    "податківчині": "податківчиня",
    "журналісткам": "журналістка",
    "підрядницею": "підрядниця",
    "директорка": "директорка",
    "інженерка": "інженерка",
    "службовиці": "службовиця",
    "інженерці": "інженерка",
    "податківниць": "податківниця",
    "депутатку": "депутатка",
    "керівниця": "керівниця",
    "керівницею": "керівниця",
    "юристок": "юристка",
    "інспекторки": "інспекторка",
    "інженерок": "інженерка",
    "ревізорок": "ревізорка",
    "нардепок": "нардепка",
    "бізнесменок": "бізнесменка",
    "генпідрядниці": "генпідрядниця",
    "прокурор": "прокурор",
    "віце-премʼєра": "віце-прем'єр",
    "премʼєр": "прем'єр",
    "начальника": "начальник",
    "начальником": "начальник",
    "міністра": "міністр",
    "директором": "директор",
    "підрядником": "підрядник",
    "аграрія": "аграрій",
    "депутата": "депутат",
    "генпідрядника": "генпідрядник",
    "генпідрядником": "генпідрядник",
    "начальник": "начальник",
    "керівником": "керівник",
    "керівник": "керівник",
    "судді": "суддя",
    "ченці": "чернець",
    "ченців": "чернець",
    "глави": "глава",
    "голови": "голова",
    "керівництво": "керівництво",
    "головою": "голова",
    "князів": "князь",
    "водія": "водій",
    "хіміка": "хімік",
    "хіміки": "хімік",
    "ведучій": "ведуча",
    "механіка": "механік",
    "інокам": "інок",
    "головнокомандувача": "головнокомандувач",
    "наркологині": "наркологиня",
    "членкині": "членкиня",
    "інженерці": "інженерка",
    "мисливствознавиці": "мисливствознавиця",
    "логопедині": "логопединя",
    "математикині": "математикиня",
    "психологині": "психологиня", 
    "філологині": "філологиня",
    "голопедині": "голопединя",
    "урядовицs": "урядовиця",
    "філософині": "філософиня",
    "педагогині": "педагогиня",
    "мера": "мер"
    }
    if ent in noun_dict.keys():
        return noun_dict[ent]
    return

In [5]:
def extract_main_word(text):
    if "-" in text:
        parts = text.split("-")
        main_word = parts[-1]
        doc = nlp(main_word)
        for sentence in doc.sentences:
            for word in sentence.words:
                return word.lemma

    doc = nlp(text)

    for sentence in doc.sentences:
        for word in sentence.words:
            if word.head == 0:
                main_word = word
                if main_word.text in {"рок", "анти", "псевдо", "віце", "топ"}:
                    continue
                return main_word.lemma
    for sentence in doc.sentences:
        for word in sentence.words:
            return word.lemma
        
def extract_job_gender(entity):
    entity = entity.lower()
    job = nlp(entity)
    join_sign = " " if " " in entity else ("" if "-" in entity else "")
    job_ent_list = [word.lemma for sent in job.sentences for word in sent.words]
    job_lemma = join_sign.join(job_ent_list)

    if len(job_ent_list) > 1:
        job_lemma = extract_main_word(job_lemma)

    words = entity.split()
    lemmatized_words = []
    for word in words:
        parsed_word = morph.parse(word)[0]
        if 'plur' in parsed_word.tag:
            singular_form = parsed_word.inflect({'sing'}).word if parsed_word.inflect({'sing'}) else parsed_word.normal_form
            lemmatized_words.append(singular_form)
        else:
            lemmatized_words.append(parsed_word.normal_form)
            
    job_lemma_pymorphy = join_sign.join(lemmatized_words)

    if len(job_ent_list) > 1:
        job_lemma_pymorphy = extract_main_word(job_lemma_pymorphy)

    ent_forms = [entity, job_lemma, job_lemma_pymorphy]

    custom_dict_ent_value = parse_output(entity)
    if custom_dict_ent_value:
        ent_forms.append(custom_dict_ent_value)

    for word in words:
        for common_word in exceptions_common:
            if common_word == word or common_word == parse_output(word):
                return "common", job_lemma
            
        for female_word in exceptions_female:
            if female_word == word or female_word == parse_output(word):
                return "female", job_lemma
            
        for male_word in exceptions_male:
            if male_word == word or male_word == parse_output(word):
                return "male", job_lemma
        
    for x in ent_forms:
        if (x in gender_dict['female']) or (x in female_list) or ("знавиця" in x):
            return "female", job_lemma
        elif (x in gender_dict['male']) or (x in male_list) or ("знавець" in x) or (x=="мера"):
            return "male", job_lemma
        elif x in common_gender_words_list:
            return "common", job_lemma
        
    # print("unkown for:", entity, " - ", job_lemma, " - ", job_lemma_pymorphy)
    return "unknown_gender", job_lemma

In [6]:
import ast

def return_gendered_dict(parallel_dataset, annotation_col_name, swapped=0):
    file_gender_dict = {"male": {}, "female": {}, "common": {}, "unknown_gender": {}}

    total_job_counter = 0
    job_list = []

    for x, row in tqdm.tqdm(parallel_dataset.iterrows()):
        filename = row['original_file_name']
        if swapped:
            filename = filename.replace(".txt", "_1.txt")
        orig_annotation = row[annotation_col_name]

        ann_str = orig_annotation.replace("'", '"')
        ann_str = ann_str.replace("–", '-')
        json_ann = ast.literal_eval(ann_str)
        
        try:
            for ent, feat in json_ann.items():
                if feat[1] == 'JOB':
                    total_job_counter += 1
                    job_list.append(feat[0])
                    gender_value, lemma_word = extract_job_gender(feat[0])
                    
                    if filename not in file_gender_dict[gender_value]:
                        file_gender_dict[gender_value][filename] = [(ent, lemma_word)]
                    else:
                        file_gender_dict[gender_value][filename].append((ent, lemma_word))
                    
        except Exception as e:
            print(f"Issue with row: {row} - Error: {e}")
    return file_gender_dict, total_job_counter, job_list

In [7]:
from collections import Counter

def return_gender_stat(total_job_counter, file_gender_dict):
    print("total size:", total_job_counter)

    for gender_class, val in file_gender_dict.items():
        print("================")
        
        print("gender:", gender_class)

        gender_entities = []

        print("number of files for the gender:", len(file_gender_dict[gender_class]))
        number_of_ents = 0
        for files, ents in file_gender_dict[gender_class].items():
            number_of_ents += len(ents)
            gender_entities += [pair[1] for pair in ents]
            
        print("number of entities for the gender:", number_of_ents)
        print("percentage:", number_of_ents/total_job_counter)

        counter = Counter(gender_entities)

        print("The most popular entity:", counter.most_common(1)[0][0])

        print("================")

In [None]:
def split_files(file_gender_dict, source_dir):
    female_files = []
    male_files = []
    common_files = []

    for filename, gen_stat in file_gender_dict.items():
        if len(list(gen_stat.keys())) == 1:
            full_path = os.path.join(source_dir, filename)
            gen = list(gen_stat.keys())[0]
            if gen == "male":
                male_files.append(full_path)
            elif gen == "female":
                female_files.append(full_path)
            elif gen == "common":
                common_files.append(full_path)


    return female_files, male_files, common_files

In [None]:
def return_gender_file_stat(file_gender_dict):
    file_gen_stat_dict = {}

    for gen, file_stat in file_gender_dict.items():
        for filename, ents in file_stat.items():
            if filename not in file_gen_stat_dict:
                file_gen_stat_dict[filename] = {}
            file_gen_stat_dict[filename][gen] = len(ents)
    return file_gen_stat_dict

## Original entitites

### Ng

In [15]:
orig_ng_file_gender_dict, orig_ng_total_job_counter, orig_ng_job_list = return_gendered_dict(parallel_dataset_ng_dataset, "orig_ann")

178it [00:58,  3.98it/s]

unkown for: податківці  -  податок  -  податківці


234it [01:16,  2.98it/s]

unkown for: _керівником  -  ткерівник  -  _керівник


311it [01:45,  2.76it/s]

unkown for: в. о. керівника  -  в  -  в.


317it [01:48,  1.77it/s]

unkown for: народного депутата-бютівця  -  бютівець  -  бютівець


328it [01:52,  1.65it/s]

unkown for: віце-премʼєра  -  прем’єрний  -  прем’єрый


506it [02:47,  4.70it/s]

unkown for: головні мисливствознавці держлісгоспів  -  мисливствітнавка  -  мисливствітнавка
unkown for: аграрія  -  аграрія  -  аграріть


563it [03:00,  5.47it/s]

unkown for: смотрящим  -  смотрящий  -  смотреть


671it [03:27,  9.13it/s]

unkown for: службовці  -  службовка  -  службовці


742it [03:50,  2.37it/s]

unkown for: в. о. генерального директора  -  в  -  в.


784it [04:06,  5.70it/s]

unkown for: аграрії  -  аграрія  -  аграрії


865it [04:25,  6.39it/s]

unkown for: податківців  -  податокець  -  податківціть


908it [04:37,  4.97it/s]

unkown for: в. о. гендиректора держкомпанії  -  в  -  в.


930it [04:43,  4.40it/s]

unkown for: т. в. о. директора  -  в.  -  в.


953it [04:49,  4.08it/s]

unkown for: податківка  -  податківка  -  податківка


954it [04:49,  4.83it/s]

unkown for: податківка  -  податківка  -  податківка


972it [04:53,  4.92it/s]

unkown for: в. о. управління  -  в  -  в.


990it [04:57,  4.49it/s]

unkown for: віце-премʼєра  -  прем’єрний  -  прем’єрый


1021it [05:04,  3.35it/s]


In [13]:
return_gender_stat(orig_ng_total_job_counter, orig_ng_file_gender_dict)

total size: 1338
gender: male
number of files for the gender: 269
number of entities for the gender: 1134
The most popular entity: директор
gender: female
number of files for the gender: 16
number of entities for the gender: 25
The most popular entity: підприємиця
gender: common
number of files for the gender: 85
number of entities for the gender: 161
The most popular entity: голова
gender: unknown_gender
number of files for the gender: 16
number of entities for the gender: 18
The most popular entity: в


In [None]:
# orig_ng_female_files, orig_ng_male_files, orig_ng_common_files = split_files(orig_ng_file_gender_dict, "/Users/linndfors/study/diploma/ner_for_fem/data/data_for_ner/data_for_ner_balanced/ng")

In [40]:
orig_ng_file_gen_stat_dict = return_gender_file_stat(orig_ng_file_gender_dict)

In [41]:
orig_ng_female_files, orig_ng_male_files, orig_ng_common_files = split_files(orig_ng_file_gen_stat_dict, "/Users/linndfors/study/diploma/ner_for_fem/data/data_for_ner/data_for_ner_balanced/ng")

In [42]:
print(f"Total size: {len(orig_ng_file_gen_stat_dict.keys())}")
print(f"Number of male only files: {len(orig_ng_male_files)}")
print(f"Number of female only files: {len(orig_ng_female_files)}")
print(f"Number of common only files: {len(orig_ng_common_files)}")

Total size: 274
Number of male only files: 166
Number of female only files: 0
Number of common only files: 4


### BRUK

In [43]:
orig_bruk_file_gender_dict, orig_bruk_total_job_counter, orig_bruk_job_list = return_gendered_dict(parallel_dataset_bruk_dataset, "orig_ann")
return_gender_stat(orig_bruk_total_job_counter, orig_bruk_file_gender_dict)
# orig_bruk_female_files, orig_bruk_male_files, orig_bruk_common_files = split_files(orig_bruk_file_gender_dict, "/Users/linndfors/study/diploma/ner_for_fem/data/data_for_ner/data_for_ner_balanced/bruk")

0it [00:00, ?it/s]

2it [00:00,  3.47it/s]

unkown for: поет-символіст  -  символіст  -  символіст


85it [00:19,  3.49it/s]

unkown for: підмайстра  -  підмайстр  -  підмайстр


88it [00:20,  5.13it/s]

unkown for: підмайстра  -  підмайстр  -  підмайстр


188it [00:57,  5.19it/s]

unkown for: гармаші  -  гармаша  -  гармаші


213it [01:01,  5.03it/s]

unkown for: спортсменок  -  спортсменок  -  спортсменки


221it [01:04,  2.18it/s]

unkown for: командувачів видів  -  вид  -  командувати


250it [01:10,  4.73it/s]

unkown for: вчені  -  вчень  -  вчені


255it [01:11,  7.02it/s]

unkown for: богословів  -  богослів  -  богословіть


335it [01:24,  4.00it/s]

unkown for: канд . істор . наук  -  наука  -  наука


336it [01:26,  1.74it/s]

unkown for: канд . істор . наук  -  наука  -  наука


337it [01:27,  1.51it/s]

unkown for: канд . істор . наук  -  наука  -  наука


340it [01:29,  1.42it/s]

unkown for: канд . істор . наук  -  наука  -  наука


342it [01:30,  1.97it/s]

unkown for: канд . істор . наук  -  наука  -  наука


368it [01:36,  8.86it/s]

unkown for: учителі-ченці  -  ченка  -  ченка


372it [01:38,  4.18it/s]

unkown for: учителів-ченців  -  ченц  -  ченцати


379it [01:42,  1.51it/s]

unkown for: учителі-ченці  -  ченка  -  ченка


381it [01:43,  2.05it/s]

unkown for: ктиторів  -  ктитор  -  ктиторіть


386it [01:44,  3.46it/s]

unkown for: ченців-законників  -  законник  -  законнити


393it [01:45,  7.63it/s]

unkown for: ченцях  -  ченці  -  ченци
unkown for: ченцях  -  ченці  -  ченци


400it [01:46,  7.29it/s]

unkown for: ченцеві  -  ченцева  -  ченцеві


405it [01:47,  6.93it/s]

unkown for: келарів  -  келар  -  келаріть


413it [01:48,  6.46it/s]

unkown for: єпископів  -  єписокп  -  єпископіть


414it [01:49,  6.10it/s]

unkown for: біскупи  -  біскупа  -  біскупить


452it [01:55,  9.75it/s]

unkown for: нквд-исти  -  нквд-исти  -  нквд-исть


454it [01:56,  8.12it/s]

unkown for: податківцям  -  податокець  -  податківци


460it [01:58,  2.89it/s]

unkown for: медичної сестри  -  сестра  -  сестереть


462it [01:59,  2.68it/s]

unkown for: медсестри  -  медсестр  -  медсестрить


473it [02:01,  4.72it/s]

unkown for: поетів-авангардистів  -  авангардист  -  авангардисти


476it [02:02,  6.52it/s]

unkown for: лінгвісти  -  лінгвість  -  лінгвість


480it [02:03,  3.22it/s]

unkown for: урядовці  -  урядовка  -  урядовці


492it [02:05,  3.92it/s]

total size: 638
gender: male
number of files for the gender: 122
number of entities for the gender: 508
The most popular entity: поет
gender: female
number of files for the gender: 23
number of entities for the gender: 46
The most popular entity: вчителька
gender: common
number of files for the gender: 24
number of entities for the gender: 53
The most popular entity: голова
gender: unknown_gender
number of files for the gender: 14
number of entities for the gender: 31
The most popular entity: наука





In [47]:
orig_bruk_file_gen_stat_dict = return_gender_file_stat(orig_bruk_file_gender_dict)
orig_bruk_female_files, orig_bruk_male_files, orig_bruk_common_files = split_files(orig_bruk_file_gen_stat_dict, "/Users/linndfors/study/diploma/ner_for_fem/data/data_for_ner/data_for_ner_balanced/bruk")

In [48]:
print(f"Total size: {len(orig_bruk_file_gen_stat_dict.keys())}")
print(f"Number of male only files: {len(orig_bruk_male_files)}")
print(f"Number of female only files: {len(orig_bruk_female_files)}")
print(f"Number of common only files: {len(orig_bruk_common_files)}")

Total size: 131
Number of male only files: 84
Number of female only files: 6
Number of common only files: 3


## Swapped entitites

### Ng

In [49]:
changed_ng_file_gender_dict, changed_ng_total_job_counter, changed_ng_job_list = return_gendered_dict(parallel_dataset_ng_dataset, "changed_ann", 1)
return_gender_stat(changed_ng_total_job_counter, changed_ng_file_gender_dict)
# changed_ng_female_files, changed_ng_male_files, changed_ng_common_files = split_files(changed_ng_file_gender_dict, "/Users/linndfors/study/diploma/ner_for_fem/data/data_for_ner/data_for_ner_balanced/ng_changed")

84it [00:26,  3.94it/s]

unkown for: академічки  -  академічка  -  академічка


151it [01:01,  4.26it/s]

unkown for: бойовички  -  бойовичка  -  бойовичка


234it [01:42,  3.67it/s]

unkown for: _керівницею  -  хкерівниця  -  _керівница


255it [01:49,  3.61it/s]

unkown for: де  -  де  -  де


277it [01:54,  3.17it/s]

unkown for: членкині наглядової ради  -  рада  -  членкин


311it [02:04,  3.72it/s]

unkown for: в. о. керівниці  -  в.  -  в.


317it [02:06,  2.52it/s]

unkown for: народну депутатку-бютівку  -  бютівка  -  бютівка


328it [02:11,  1.34it/s]

unkown for: віце-премʼєра  -  прем’єрний  -  прем’єрый


359it [02:21,  2.76it/s]

unkown for: освітя́нок  -  освітя́нка  -  освітя́нки


428it [02:39,  4.96it/s]

unkown for: митниці  -  митниця  -  митниці


468it [02:48,  3.98it/s]

unkown for: головній інженерці  -  інженерець  -  головнити


505it [02:58,  3.65it/s]

unkown for: головні мисливствознавиці держлісгоспів  -  мисливствітнацець  -  держлісгоспіть


563it [03:12,  5.38it/s]

unkown for: смотрящою  -  смотрящий  -  смотрящий


599it [03:20,  5.24it/s]

unkown for: компанії  -  компанія  -  компанії


635it [03:30,  2.59it/s]

unkown for: завідувачку виробничого відділу  -  відділ  -  відділ


673it [03:42,  9.18it/s]

unkown for: нардепками  -  нардепок  -  нардепкой


680it [03:45,  2.96it/s]

unkown for: родина екс-презид  -  презид  -  презид


731it [03:56,  4.44it/s]

unkown for: інженерка лісового господарства  -  господарство  -  господарство


742it [04:05,  2.13it/s]

unkown for: в. о. генеральної директорки  -  в  -  в.


745it [04:07,  1.53it/s]

unkown for: сбушниці  -  сбушниця  -  сбушниці


774it [04:18,  2.69it/s]

unkown for: віце-премʼєрка  -  премʼєрка  -  премʼєркий


784it [04:20,  4.82it/s]

unkown for: аграрії  -  аграрія  -  аграрії


832it [04:33,  4.67it/s]

unkown for: прокуророк  -  прокуророк  -  прокуророк


851it [04:39,  3.04it/s]

unkown for: підрядницею департаменту  -  департамент  -  департамент


865it [04:43,  5.32it/s]

unkown for: податківниць  -  податківниця  -  податківниць


903it [04:56,  4.24it/s]

unkown for: прокуророк  -  прокуророк  -  прокуророк


908it [04:57,  3.14it/s]

unkown for: в. о. гендиректорки держкомпанії  -  в  -  в.


930it [05:05,  3.06it/s]

unkown for: т. в. о. директорки  -  т.  -  в.


963it [05:17,  3.75it/s]

unkown for: світлана зубачик  -  зубачик  -  зубачик


973it [05:19,  4.96it/s]

unkown for: в. о. управління  -  в  -  в.


1021it [05:30,  3.09it/s]

total size: 1338
gender: male
number of files for the gender: 88
number of entities for the gender: 146
The most popular entity: слідчий
gender: female
number of files for the gender: 263
number of entities for the gender: 1002
The most popular entity: директорка
gender: common
number of files for the gender: 85
number of entities for the gender: 160
The most popular entity: голова
gender: unknown_gender
number of files for the gender: 30
number of entities for the gender: 30
The most popular entity: в





In [70]:
changed_ng_file_gen_stat_dict = return_gender_file_stat(changed_ng_file_gender_dict)
changed_ng_female_files, changed_ng_male_files, changed_ng_common_files = split_files(changed_ng_file_gen_stat_dict, "/Users/linndfors/study/diploma/ner_for_fem/data/data_for_ner/data_for_ner_balanced/ng_changed")

In [71]:
print(f"Total size: {len(changed_ng_file_gen_stat_dict.keys())}")
print(f"Number of male only files: {len(changed_ng_male_files)}")
print(f"Number of female only files: {len(changed_ng_female_files)}")
print(f"Number of common only files: {len(changed_ng_common_files)}")

Total size: 274
Number of male only files: 5
Number of female only files: 114
Number of common only files: 4


### BRUK

In [68]:
changed_bruk_file_gender_dict, changed_bruk_total_job_counter, changed_bruk_job_list = return_gendered_dict(parallel_dataset_bruk_dataset, "changed_ann", 1)
return_gender_stat(changed_bruk_total_job_counter, changed_bruk_file_gender_dict)
# changed_bruk_female_files, changed_bruk_male_files, changed_bruk_common_files = split_files(changed_bruk_file_gender_dict, "/Users/linndfors/study/diploma/ner_for_fem/data/data_for_ner/data_for_ner_balanced/bruk_changed")

1it [00:04,  4.56s/it]

unkown for: поетеса-символістка  -  символістка  -  символістка


7it [00:05,  2.98it/s]

unkown for: поетес  -  поетес  -  поетёс


26it [00:11,  4.25it/s]

unkown for: поетес  -  поетес  -  поетёс
unkown for: поетес  -  поетес  -  поетёс


50it [00:16,  6.59it/s]

unkown for: орнітологині  -  орнітологин  -  орнітологині
unkown for: орнітологині  -  орнітологин  -  орнітологині


63it [00:19,  2.44it/s]

unkown for: мол . наук . співробітниці  -  наука  -  наука


66it [00:19,  4.55it/s]

unkown for: пе  -  пе  -  пе


86it [00:26,  4.99it/s]

unkown for: підмайстрині  -  підмайстрина  -  підмайстрині
unkown for: підмайстрині  -  підмайстрина  -  підмайстрині


108it [00:36,  4.55it/s]

unkown for: найманок  -  найманок  -  найманки


112it [00:36,  6.17it/s]

unkown for: солдаток  -  солдатко  -  солдатки


134it [00:39,  7.85it/s]

unkown for: хімікині  -  хімікина  -  хімікині
unkown for: хімікині  -  хімікина  -  хімікині


137it [00:39,  7.94it/s]

unkown for: хімікині  -  хімікина  -  хімікині


146it [00:42,  4.12it/s]

unkown for: м президен  -  м  -  м


151it [00:42,  6.87it/s]

unkown for: сторожинь  -  сторожиня  -  сторожинить


184it [00:49,  3.76it/s]

unkown for: шоколадник  -  шоколадник  -  шоколадник
unkown for: шоколадника  -  шоколадник  -  шоколадник


187it [00:49,  5.50it/s]

unkown for: гармашки  -  гармашка  -  гармашка


190it [00:50,  6.37it/s]

unkown for: отаманко  -  отаманко  -  отаманко


203it [00:51,  9.30it/s]

unkown for: старшинкам  -  старшинка  -  старшинку


219it [00:53,  8.65it/s]

unkown for: головнокомандувачки збройних сил україни  -  головнокомандувачка  -  українити


221it [00:55,  2.67it/s]

unkown for: командувачок видів  -  вид  -  видіти


245it [01:01,  5.45it/s]

unkown for: силовиць  -  силовиця  -  силовиць


250it [01:02,  5.27it/s]

unkown for: мікологині  -  мікологина  -  мікологині
unkown for: вчені  -  вчень  -  вчені


255it [01:02,  7.78it/s]

unkown for: богословинь  -  богословиня  -  богословинить


290it [01:10,  4.27it/s]

unkown for: стрільчині  -  стрільчина  -  стрільчині


303it [01:12,  7.16it/s]

unkown for: няні  -  нян  -  няні


314it [01:14,  7.95it/s]

unkown for: механікині  -  механікина  -  механікині
unkown for: механікині  -  механікина  -  механікині


319it [01:14,  7.37it/s]

unkown for: покоївця  -  покоївець  -  покоївцть


334it [01:19,  4.16it/s]

unkown for: авторок  -  авторок  -  авторок


335it [01:19,  3.83it/s]

unkown for: канд . істор . наук  -  наука  -  наука


336it [01:21,  1.32it/s]

unkown for: докторант відділу соціальної антропології  -  відділ  -  відділ
unkown for: канд . істор . наук  -  наука  -  наука


337it [01:22,  1.17it/s]

unkown for: канд . істор . наук  -  наука  -  наука


340it [01:25,  1.37it/s]

unkown for: канд . істор . наук  -  наука  -  наука


341it [01:25,  1.49it/s]

unkown for: канд . істор . наук  -  наука  -  наука


357it [01:31,  5.52it/s]

unkown for: інокині  -  інокина  -  інокині


368it [01:32,  9.31it/s]

unkown for: ігумені  -  ігумень  -  ігумені


374it [01:34,  4.02it/s]

unkown for: ректорку  -  ректорк  -  ректорк


381it [01:36,  4.29it/s]

unkown for: ктиторок  -  ктиторок  -  ктиторок


385it [01:37,  4.15it/s]

unkown for: черниць-законниць  -  законниця  -  законниця


399it [01:39,  7.71it/s]

unkown for: голяку-ченці  -  ченка  -  ченка


403it [01:39,  7.52it/s]

unkown for: шафаркам  -  шафарок  -  шафарке
unkown for: управителькам  -  управителький  -  управительке


404it [01:39,  6.39it/s]

unkown for: келарок  -  келарка  -  келарки


409it [01:40,  5.02it/s]

unkown for: єпископки  -  єпископка  -  єпископка


413it [01:41,  6.97it/s]

unkown for: єпискинь  -  єпискиня  -  єпискинуть
unkown for: єпискиня  -  єпискиня  -  єпискиня


418it [01:42,  6.01it/s]

unkown for: хліборобок  -  хліборобок  -  хліборобкий


421it [01:42,  7.70it/s]

unkown for: хліборобок  -  хліборобок  -  хліборобкий


434it [01:44,  4.09it/s]

unkown for: коронні гетьманші  -  гетьманешь  -  гетьманешь


452it [01:48,  9.40it/s]

unkown for: нквд-истки  -  нквд-истка  -  нквд-исткать


453it [01:48,  7.92it/s]

unkown for: юристки  -  юристок  -  юристкать
unkown for: податківцям  -  податокець  -  податківци


461it [01:51,  2.47it/s]

unkown for: медичного брата  -  брат  -  брат


467it [01:52,  5.33it/s]

unkown for: тренеркам  -  тренерк  -  тренерке


476it [01:54,  7.69it/s]

unkown for: академікині  -  академікина  -  академікині


492it [01:57,  4.19it/s]

total size: 638
gender: male
number of files for the gender: 44
number of entities for the gender: 80
The most popular entity: вчитель
gender: female
number of files for the gender: 112
number of entities for the gender: 445
The most popular entity: черниця
gender: common
number of files for the gender: 24
number of entities for the gender: 52
The most popular entity: голова
gender: unknown_gender
number of files for the gender: 29
number of entities for the gender: 61
The most popular entity: наука





In [72]:
changed_bruk_file_gender_stat_dict = return_gender_file_stat(changed_bruk_file_gender_dict)

changed_bruk_female_files, changed_bruk_male_files, changed_bruk_common_files = split_files(changed_bruk_file_gender_stat_dict, "/Users/linndfors/study/diploma/ner_for_fem/data/data_for_ner/data_for_ner_balanced/bruk_changed")

In [73]:
print(f"Total size: {len(changed_bruk_file_gender_stat_dict.keys())}")
print(f"Number of male only files: {len(changed_bruk_male_files)}")
print(f"Number of female only files: {len(changed_bruk_female_files)}")
print(f"Number of common only files: {len(changed_bruk_common_files)}")

Total size: 131
Number of male only files: 10
Number of female only files: 60
Number of common only files: 3


## Match pairs

In [84]:
changed_bruk_file_gender_dict

{'male': {'66395d7c394b_1.txt': [('T17', 'військовий')],
  '1ed3fef56c8f_1.txt': [('T13', 'портьє')],
  'aad4dce361f1_1.txt': [('T13', 'панотець')],
  'bf4968831cc7_1.txt': [('T2', 'актор')],
  '15238b87db03_1.txt': [('T5', 'методист')],
  'efc2665e4bed_1.txt': [('T2', 'співробітник')],
  '624df1293f3d_1.txt': [('T4', 'педагог'),
   ('T14', 'вчитель'),
   ('T23', 'вчитель')],
  '0d902430768c_1.txt': [('T49', 'лікар'), ('T65', 'інженер')],
  '326978d2c58c_1.txt': [('T7', 'лікар'),
   ('T8', 'лікар'),
   ('T9', 'секретар'),
   ('T10', 'секретар'),
   ('T11', 'психолог'),
   ('T12', 'психолог'),
   ('T17', 'мовознавець'),
   ('T18', 'кандидат'),
   ('T19', 'викладач')],
  '6b5d4229b548_1.txt': [('T9', 'викладач')],
  '9494fddb0caa_1.txt': [('T3', 'монах')],
  'fabc18a7d8fd_1.txt': [('T18', 'вихователь'), ('T19', 'продавець')],
  '4b1efc986bb8_1.txt': [('T20', 'учений')],
  'e023d6d6f1a3_1.txt': [('T5', 'вчитель'),
   ('T6', 'дільничний'),
   ('T22', 'дільничний'),
   ('T32', 'дільничний')

In [97]:
import re

aligned_pairs = {}

def normalize_filename(name):
    """Remove '_1.txt' suffix to align with original filename"""
    return re.sub(r'_1(?=\.txt$)', '', name)

def flatten_all_gender_dicts(gender_dict):
    """Flatten a nested gender-based dict into a flat filename -> items dict"""
    flat = {}
    for gender in ['male', 'female', 'common']:
        if gender in gender_dict:
            for fname, items in gender_dict[gender].items():
                flat[fname] = items
    return flat

# Flatten both dicts without assuming gender order
# original_flat = flatten_all_gender_dicts(orig_bruk_file_gender_dict)
# swapped_flat = flatten_all_gender_dicts(changed_bruk_file_gender_dict)

original_flat = flatten_all_gender_dicts(orig_ng_file_gender_dict)
swapped_flat = flatten_all_gender_dicts(changed_ng_file_gender_dict)

# Build aligned pairs: always (original, swapped)
for swapped_fname, swapped_ents in swapped_flat.items():
    orig_fname = normalize_filename(swapped_fname)
    if orig_fname in original_flat:
        orig_ents = dict(original_flat[orig_fname])
        swapped_ents_dict = dict(swapped_ents)
        paired = {
            tid: (orig_ents[tid], swapped_ents_dict[tid])
            for tid in orig_ents
            if tid in swapped_ents_dict
        }
        if paired:
            aligned_pairs[orig_fname] = paired

print(aligned_pairs)


{'104263660695.txt': {'T25': ('директор', 'директорка'), 'T26': ('засновник', 'засновниця')}, 'bbc5330abbdc.txt': {'T25': ('міністр', 'міністерка')}, '989b35bbc2b6.txt': {'T25': ('засновник', 'засновниця'), 'T27': ('директор', 'директорка'), 'T29': ('керівник', 'керівниця')}, '8098d69d861f.txt': {'T49': ('засновник', 'засновниця'), 'T50': ('директор', 'директорка')}, '9c264bd5a100.txt': {'T12': ('голова', 'голова')}, '3e21f62b9a2f.txt': {'T18': ('засновник', 'засновниця'), 'T28': ('керівник', 'керівниця'), 'T29': ('засновник', 'засновниця'), 'T34': ('директор', 'директорка')}, '037bb6843538.txt': {'T15': ('депутат', 'депутатка'), 'T17': ('депутат', 'депутатка'), 'T35': ('співзасновник', 'співзасновниця'), 'T38': ('нардеп', 'нардепка'), 'T39': ('директор', 'директорка')}, '36aa8022709f.txt': {'T27': ('в', 'в')}, '4671f9337340.txt': {'T29': ('заступник', 'заступниця')}, 'db4996034aa6.txt': {'T12': ('засновник', 'засновниця')}, '7faf035614c1.txt': {'T34': ('голова', 'голова')}, '8806c823d

In [98]:
import pandas as pd

rows = []
for filename, entity_dict in aligned_pairs.items():
    for tid, (orig_text, swapped_text) in entity_dict.items():
        rows.append({
            "filename": filename,
            "entity_id": tid,
            "original": orig_text,
            "swapped": swapped_text
        })

df = pd.DataFrame(rows)

In [99]:
df.head()

Unnamed: 0,filename,entity_id,original,swapped
0,104263660695.txt,T25,директор,директорка
1,104263660695.txt,T26,засновник,засновниця
2,bbc5330abbdc.txt,T25,міністр,міністерка
3,989b35bbc2b6.txt,T25,засновник,засновниця
4,989b35bbc2b6.txt,T27,директор,директорка


In [100]:
df.shape

(805, 4)

In [None]:
df.to_csv("/Users/linndfors/study/diploma/ner_for_fem/data/utils_files/bruk_gender_pairs_from_swapping.csv")
# df.to_csv("/Users/linndfors/study/diploma/ner_for_fem/data/utils_files/ng_gender_pairs_from_swapping.csv")

## Collect dataset

In [60]:
def move_file(filename, destination_dir):
    file_name = os.path.basename(filename)

    destination_path = os.path.join(destination_dir, file_name)

    shutil.copy(filename, destination_path)

def move_gender_files(gender_files_list, dest_dir):
    
    for gender_file in gender_files_list:
        move_file(gender_file, dest_dir)

        ann_gender_file = gender_file.replace(".txt", ".ann")
        move_file(ann_gender_file, dest_dir)
        

### Original

Ng

In [89]:
move_gender_files(orig_ng_male_files, '/Users/linndfors/study/diploma/ner_for_fem/data/data_for_ner/data_for_ner_gender/data_for_ner_male/ng')
move_gender_files(orig_ng_female_files, '/Users/linndfors/study/diploma/ner_for_fem/data/data_for_ner/data_for_ner_gender/data_for_ner_female/ng')
move_gender_files(orig_ng_common_files, '/Users/linndfors/study/diploma/ner_for_fem/data/data_for_ner/data_for_ner_gender/data_for_ner_common/ng')

In [62]:
import os

def count_files_recursive(directory):
    count = 0
    for root, dirs, files in os.walk(directory):
        count += len(files)
    return count

directory_path = "/Users/linndfors/study/diploma/ner_for_fem/data/data_for_ner/data_for_ner_gender/data_for_ner_male"
print("male Total files:", count_files_recursive(directory_path))

directory_path = "/Users/linndfors/study/diploma/ner_for_fem/data/data_for_ner/data_for_ner_gender/data_for_ner_female"
print("female Total files:", count_files_recursive(directory_path))

directory_path = "/Users/linndfors/study/diploma/ner_for_fem/data/data_for_ner/data_for_ner_gender/data_for_ner_common"
print("common Total files:", count_files_recursive(directory_path))

male Total files: 1046
female Total files: 828
common Total files: 436


In [93]:
import os

def count_files_recursive(directory):
    count = 0
    for root, dirs, files in os.walk(directory):
        count += len(files)
    return count

directory_path = "/Users/linndfors/study/diploma/ner_for_fem/data/data_for_ner/data_for_ner_gender/data_for_ner_male"
print("male Total files:", count_files_recursive(directory_path))

directory_path = "/Users/linndfors/study/diploma/ner_for_fem/data/data_for_ner/data_for_ner_gender/data_for_ner_female"
print("female Total files:", count_files_recursive(directory_path))

directory_path = "/Users/linndfors/study/diploma/ner_for_fem/data/data_for_ner/data_for_ner_gender/data_for_ner_common"
print("common Total files:", count_files_recursive(directory_path))

male Total files: 530
female Total files: 360
common Total files: 28


BRUK

In [90]:
move_gender_files(orig_bruk_male_files, '/Users/linndfors/study/diploma/ner_for_fem/data/data_for_ner/data_for_ner_gender/data_for_ner_male/bruk')
move_gender_files(orig_bruk_female_files, '/Users/linndfors/study/diploma/ner_for_fem/data/data_for_ner/data_for_ner_gender/data_for_ner_female/bruk')
move_gender_files(orig_bruk_common_files, '/Users/linndfors/study/diploma/ner_for_fem/data/data_for_ner/data_for_ner_gender/data_for_ner_common/bruk')

### Swapped

Ng

In [91]:
move_gender_files(changed_ng_male_files, '/Users/linndfors/study/diploma/ner_for_fem/data/data_for_ner/data_for_ner_gender/data_for_ner_male/ng_changed')
move_gender_files(changed_ng_female_files, '/Users/linndfors/study/diploma/ner_for_fem/data/data_for_ner/data_for_ner_gender/data_for_ner_female/ng_changed')
move_gender_files(changed_ng_common_files, '/Users/linndfors/study/diploma/ner_for_fem/data/data_for_ner/data_for_ner_gender/data_for_ner_common/ng_changed')

BRUK

In [92]:
move_gender_files(changed_bruk_male_files, '/Users/linndfors/study/diploma/ner_for_fem/data/data_for_ner/data_for_ner_gender/data_for_ner_male/bruk_changed')
move_gender_files(changed_bruk_female_files, '/Users/linndfors/study/diploma/ner_for_fem/data/data_for_ner/data_for_ner_gender/data_for_ner_female/bruk_changed')
move_gender_files(changed_bruk_common_files, '/Users/linndfors/study/diploma/ner_for_fem/data/data_for_ner/data_for_ner_gender/data_for_ner_common/bruk_changed')

## For filtered parallel sentence dataset

In [74]:
import os
import glob

def extract_job_entities(directory):
    pers_entities = {}
    
    ann_files = glob.glob(os.path.join(directory, "*.ann"))
    print(ann_files)
    
    for ann_file in ann_files:
        with open(ann_file, "r", encoding="utf-8") as f:
            for line in f:
                parts = line.strip().split("\t")
                if len(parts) == 5:
                    
                    entity_type = parts[1]
                    
                    if entity_type == "JOB":
                        entity_text = parts[4]
                        if ann_file in pers_entities:
                            pers_entities[ann_file].append(entity_text)
                        else:
                            pers_entities[ann_file] = [entity_text]
    
    return pers_entities

def return_gendered_PERS_dict(pers_dict):
    file_gender_dict = {"male": {}, "female": {}, "common": {}, "unknown_gender": {}}

    total_pers_counter = 0
    pers_list = []

    for filename, values in tqdm.tqdm(pers_dict.items()):
        filename = filename.replace(".ann", ".txt")
        
        try:
            for ent in values:
                total_pers_counter += 1
                pers_list.append(ent)
                # if len(ent.split(' ')) > 1:
                #     ent = ent.split(' ')[0]
                gender_value, lemma_word = extract_job_gender(ent)
                # print(gender_value, lemma_word)
                
                if filename not in file_gender_dict[gender_value]:
                    file_gender_dict[gender_value][filename] = [(ent, lemma_word)]
                else:
                    file_gender_dict[gender_value][filename].append((ent, lemma_word))
                    
        except Exception as e:
            print(f"Issue with row: {row} - Error: {e}")
    return file_gender_dict, total_pers_counter, pers_list

In [None]:
bruk_job_dict = extract_job_entities("/Users/linndfors/study/diploma/ner_for_fem/data/v2.0-swapped_filtering/data/bruk")
bruk_job_file_gender_dict, bruk_job_total_counter, bruk_job_list = return_gendered_PERS_dict(bruk_job_dict)

In [73]:
return_gender_stat(bruk_job_total_counter, bruk_job_file_gender_dict)

total size: 481
gender: male
number of files for the gender: 39
number of entities for the gender: 67
percentage: 0.1392931392931393
The most popular entity: вчитель
gender: female
number of files for the gender: 107
number of entities for the gender: 343
percentage: 0.7130977130977131
The most popular entity: журналістка
gender: common
number of files for the gender: 21
number of entities for the gender: 33
percentage: 0.06860706860706861
The most popular entity: голова
gender: unknown_gender
number of files for the gender: 23
number of entities for the gender: 38
percentage: 0.079002079002079
The most popular entity: наука


In [75]:
ng_job_dict = extract_job_entities("/Users/linndfors/study/diploma/ner_for_fem/data/v2.0-swapped_filtering/data/ng")
ng_job_file_gender_dict, ng_job_total_counter, ng_job_list = return_gendered_PERS_dict(ng_job_dict)

['/Users/linndfors/study/diploma/ner_for_fem/data/v2.0-swapped_filtering/data/ng/989b35bbc2b6-swapped.ann', '/Users/linndfors/study/diploma/ner_for_fem/data/v2.0-swapped_filtering/data/ng/5e33850771e3-swapped.ann', '/Users/linndfors/study/diploma/ner_for_fem/data/v2.0-swapped_filtering/data/ng/1c48b2f37af3-swapped.ann', '/Users/linndfors/study/diploma/ner_for_fem/data/v2.0-swapped_filtering/data/ng/0050229d8534-swapped.ann', '/Users/linndfors/study/diploma/ner_for_fem/data/v2.0-swapped_filtering/data/ng/150446f83aa2-swapped.ann', '/Users/linndfors/study/diploma/ner_for_fem/data/v2.0-swapped_filtering/data/ng/00edded01d7f-swapped.ann', '/Users/linndfors/study/diploma/ner_for_fem/data/v2.0-swapped_filtering/data/ng/f50037706d0a-swapped.ann', '/Users/linndfors/study/diploma/ner_for_fem/data/v2.0-swapped_filtering/data/ng/6d47a8c4d755-swapped.ann', '/Users/linndfors/study/diploma/ner_for_fem/data/v2.0-swapped_filtering/data/ng/5e407e3ddb68-swapped.ann', '/Users/linndfors/study/diploma/ner_

100%|██████████| 269/269 [06:00<00:00,  1.34s/it]


In [76]:
return_gender_stat(ng_job_total_counter, ng_job_file_gender_dict)

total size: 1248
gender: male
number of files for the gender: 71
number of entities for the gender: 114
percentage: 0.09134615384615384
The most popular entity: засновник
gender: female
number of files for the gender: 257
number of entities for the gender: 972
percentage: 0.7788461538461539
The most popular entity: директорка
gender: common
number of files for the gender: 78
number of entities for the gender: 135
percentage: 0.10817307692307693
The most popular entity: голова
gender: unknown_gender
number of files for the gender: 26
number of entities for the gender: 27
percentage: 0.021634615384615384
The most popular entity: в


In [79]:
print("total:", 483 + 1250)
print("male:", 67 + 115)
print("female:", 345 + 971)
print("common:", 35 + 135)
print("unknown:", 38 + 27)

total: 1733
male: 182
female: 1316
common: 170
unknown: 65


----------

In [25]:
all_sentences = pd.read_csv("all_parallel_sentence_both_datasets_with_filenames_and_labels.csv")

In [36]:
orig_all_file_gender_dict, orig_all_total_job_counter, orig_all_job_list = return_gendered_dict(all_sentences, "orig_ann")

1403it [07:46,  3.01it/s]


In [39]:
return_gender_stat(orig_all_total_job_counter, orig_all_file_gender_dict)

total size: 1836
gender: male
number of files for the gender: 385
number of entities for the gender: 1559
percentage: 0.849128540305011
The most popular entity: директор
gender: female
number of files for the gender: 34
number of entities for the gender: 61
percentage: 0.0332244008714597
The most popular entity: підприємиця
gender: common
number of files for the gender: 103
number of entities for the gender: 174
percentage: 0.09477124183006536
The most popular entity: голова
gender: unknown_gender
number of files for the gender: 25
number of entities for the gender: 42
percentage: 0.02287581699346405
The most popular entity: наука


In [40]:
changed_all_file_gender_dict, changed_all_total_job_counter, changed_all_job_list = return_gendered_dict(all_sentences, "changed_ann", 1)
return_gender_stat(changed_all_total_job_counter, changed_all_file_gender_dict)

1403it [08:08,  2.87it/s]

total size: 1836
gender: male
number of files for the gender: 112
number of entities for the gender: 183
percentage: 0.09967320261437909
The most popular entity: засновник
gender: female
number of files for the gender: 369
number of entities for the gender: 1405
percentage: 0.7652505446623094
The most popular entity: директорка
gender: common
number of files for the gender: 102
number of entities for the gender: 173
percentage: 0.09422657952069717
The most popular entity: голова
gender: unknown_gender
number of files for the gender: 53
number of entities for the gender: 75
percentage: 0.04084967320261438
The most popular entity: наука





# For PERS entities

In [45]:
female_names = pd.read_csv("/Users/linndfors/study/diploma/ner_for_fem/src/female_fname_freq_dict.csv")
male_names = pd.read_csv("/Users/linndfors/study/diploma/ner_for_fem/src/male_fname_freq_dict.csv")

In [46]:
female_names_list = female_names['name'].values
male_names_list = male_names['name'].values

In [47]:
dict_df = pd.read_csv('/Users/linndfors/study/diploma/dict_uk/out/dict_corp_lt.txt', delimiter=' ', header=None, names=['word', 'lemma', 'grammar'])

In [48]:
def define_sex(name):
    try:
        filtered_df = dict_df[(dict_df['lemma'] == name) & (dict_df['word'] == name)]
        if not filtered_df.empty:
            grammar_pers = filtered_df['grammar'].iloc[0]

            if re.search(r'fname', grammar_pers):
                if re.search(r'\bf\b', grammar_pers):
                    return "F"
                elif re.search(r'\bm\b', grammar_pers):
                    return "M"
                else:
                    return "U"
            elif re.search(r'lname', grammar_pers):
                return "U"
            else:
                return "U"
        else:
            # print("No matches found for:", name)
            return "U"
    except Exception as e:
        # print("error", e)
        return "U"

In [None]:
import os
import glob

def extract_gender(entity):

    pers_parts = entity.split(" ")
    for i in pers_parts:
        if define_sex(i) == "F":
            return "female", i
        elif define_sex(i) == "M":
            return "male", i
                
    doc = nlp(entity)
    
    words = {word.lemma for sentence in doc.sentences for word in sentence.words}
    words.update(entity.split(" "))

    for x in words:
        if x in female_names_list:
            return "female", x
        if x in male_names_list:
            return "male", x
        
    # print("unkown for:", entity)
    return "unknown_gender", entity

In [62]:
def extract_pers_entities(directory):
    pers_entities = {}
    
    ann_files = glob.glob(os.path.join(directory, "*.ann"))
    print(ann_files)
    
    for ann_file in ann_files:
        with open(ann_file, "r", encoding="utf-8") as f:
            for line in f:
                parts = line.strip().split("\t")
                if len(parts) == 5:
                    
                    entity_type = parts[1]
                    
                    if entity_type == "PERS":
                        entity_text = parts[4]
                        if ann_file in pers_entities:
                            pers_entities[ann_file].append(entity_text)
                        else:
                            pers_entities[ann_file] = [entity_text]
    
    return pers_entities

def return_gendered_PERS_dict(pers_dict):
    file_gender_dict = {"male": {}, "female": {}, "unknown_gender": {}}

    total_pers_counter = 0
    pers_list = []

    for filename, values in tqdm.tqdm(pers_dict.items()):
        filename = filename.replace(".ann", ".txt")
        
        try:
            for ent in values:
                total_pers_counter += 1
                pers_list.append(ent)
                if len(ent.split(' ')) > 1:
                    ent = ent.split(' ')[0]
                gender_value, lemma_word = extract_gender(ent)
                # print(gender_value, lemma_word)
                
                if filename not in file_gender_dict[gender_value]:
                    file_gender_dict[gender_value][filename] = [(ent, lemma_word)]
                else:
                    file_gender_dict[gender_value][filename].append((ent, lemma_word))
                    
        except Exception as e:
            print(f"Issue with row: {row} - Error: {e}")
    return file_gender_dict, total_pers_counter, pers_list

In [51]:
with open('ng_pers_orig_only_jobs_sents.txt', 'r', encoding='utf-8') as f:
    loaded_list_ng = [line.strip() for line in f]

with open('bruk_pers_orig_only_jobs_sents.txt', 'r', encoding='utf-8') as f:
    loaded_list_bruk = [line.strip() for line in f]

In [14]:
file_gender_dict

{'male': 556, 'female': 197, 'unknown_gender': 432}

In [16]:
556 -30 + 197 -17 + 432 - 80

1185

In [19]:
file_gender_dict_bruk

{'male': 146, 'female': 54, 'unknown_gender': 169}

In [20]:
146 -5 + 54-3 + 169 - 35

369

In [None]:
1058 - ng
326 - bruk
1384 - total

In [18]:
file_gender_dict_bruk = {"male": 0, "female": 0, "unknown_gender": 0}

for ent in tqdm.tqdm(loaded_list_bruk):
    
    if len(ent.split(' ')) > 1:
        ent = ent.split(' ')[0]
    gender_value, lemma_word = extract_gender(ent)
    # print(ent, gender_value)
    file_gender_dict_bruk[gender_value] += 1
    # print(file_gender_dict)

100%|██████████| 369/369 [14:57<00:00,  2.43s/it]


In [None]:
file_gender_dict = {"male": 0, "female": 0, "unknown_gender": 0}

for ent in tqdm.tqdm(loaded_list_ng):
    
    if len(ent.split(' ')) > 1:
        ent = ent.split(' ')[0]
    gender_value, lemma_word = extract_gender(ent)
    # print(ent, gender_value)
    file_gender_dict[gender_value] += 1
    # print(file_gender_dict)

In [None]:
changed_bruk_pers_dict = extract_pers_entities("/Users/linndfors/study/diploma/ner_for_fem/data/DATA_FOR_BALANCED_NER/bruk_changed")
changed_bruk_file_gender_pers_dict, changed_bruk_total_pers_counter, changed_bruk_pers_list = return_gendered_PERS_dict(changed_bruk_pers_dict)

In [288]:
return_gender_stat(changed_bruk_total_pers_counter, changed_bruk_file_gender_pers_dict)
changed_bruk_pers_female_files, changed_bruk_pers_male_files, changed_bruk_pers_common_files = split_files(changed_bruk_file_gender_pers_dict, "/Users/linndfors/study/diploma/ner_for_fem/data/DATA_FOR_BALANCED_NER/bruk_changed")

total size: 326
gender: male
number of files for the gender: 32
number of entities for the gender: 52
The most popular entity: Сергій
gender: female
number of files for the gender: 67
number of entities for the gender: 132
The most popular entity: Олександра
gender: unknown_gender
number of files for the gender: 48
number of entities for the gender: 142
The most popular entity: А


In [None]:
# changed_bruk_female_files, changed_bruk_male_files, changed_bruk_common_files = split_files(changed_bruk_file_gender_dict, "/Users/linndfors/study/diploma/ner_for_fem/data/DATA_FOR_BALANCED_NER/bruk_changed")

In [244]:
changed_ng_pers_dict = extract_pers_entities("/Users/linndfors/study/diploma/ner_for_fem/data/DATA_FOR_BALANCED_NER/ng_changed")
changed_ng_file_gender_pers_dict, changed_ng_total_pers_counter, changed_ng_pers_list = return_gendered_PERS_dict(changed_ng_pers_dict)

100%|██████████| 253/253 [39:13<00:00,  9.30s/it]   


In [286]:
return_gender_stat(changed_ng_total_pers_counter, changed_ng_file_gender_pers_dict)
changed_ng_pers_female_files, changed_ng_pers_male_files, changed_ng_pers_common_files = split_files(changed_ng_file_gender_pers_dict, "/Users/linndfors/study/diploma/ner_for_fem/data/DATA_FOR_BALANCED_NER/ng_changed")

total size: 1058
gender: male
number of files for the gender: 106
number of entities for the gender: 170
The most popular entity: Валентин
gender: female
number of files for the gender: 225
number of entities for the gender: 602
The most popular entity: Олена
gender: unknown_gender
number of files for the gender: 145
number of entities for the gender: 286
The most popular entity: Світлани


In [276]:
all_names = []

for filaname, ents in changed_ng_file_gender_pers_dict['unknown_gender'].items():
    for ent in ents:
        all_names.append(ent[0])

In [None]:
name_counts = Counter(all_names)
print(name_counts)

Counter({'Світлани': 7, 'Юлії': 7, 'Людмили': 5, 'Любові': 5, 'Павленко': 4, 'Олені': 4, 'Козименко': 4, 'Кузнєцова': 3, 'Непийвода': 3, 'Машненкова': 3, 'Лілії': 3, 'Ірині': 3, 'Кобринчук': 3, 'Янукович': 3, 'Андреїшин': 3, 'Кушнір': 3, 'Сміяненко': 3, 'Тарпан': 2, 'Філіпчук': 2, 'Ніколєнко': 2, 'Тютюнник': 2, 'Ганну': 2, 'Ріната': 2, 'Ярич': 2, 'Пелих': 2, 'Ніні': 2, 'Осипенко': 2, 'Кокседж': 2, 'Машненкову': 2, 'Наталію': 2, 'Борзих': 2, 'Корнієць': 2, 'Чепурненко': 2, 'Владислави': 2, 'Фірташа': 2, 'Ігоря': 2, 'Василенко': 2, 'Смик': 2, 'Любач': 2, 'Біловол': 2, 'Юлію': 2, 'Ковач': 2, 'Підвисоцька': 2, 'Шишацька': 2, 'Голяшкіної': 2, 'Григорія': 2, 'Тертиця': 2, 'Колуги': 2, 'Чернікова': 2, 'Васильєвою': 1, 'Кисельовою': 1, 'Котельникову': 1, 'Довгополова': 1, 'Ватерина': 1, 'Демішкан': 1, 'Сагайдак': 1, 'Остапенко': 1, 'Анісімових': 1, 'Котляров': 1, 'Анісімова': 1, 'Ахметова': 1, 'Анжеліці': 1, 'Кусок': 1, 'Медведчук': 1, 'Овчаренко': 1, 'Аблова': 1, 'Червачова': 1, 'Качаненку': 

In [261]:
bruk_pers_dict = extract_pers_entities("/Users/linndfors/study/diploma/ner_for_fem/data/DATA_FOR_BALANCED_NER/bruk")
bruk_file_gender_pers_dict, bruk_total_pers_counter, bruk_pers_list = return_gendered_PERS_dict(bruk_pers_dict)

['/Users/linndfors/study/diploma/ner_for_fem/data/DATA_FOR_BALANCED_NER/bruk/68f6d1cfc486.ann', '/Users/linndfors/study/diploma/ner_for_fem/data/DATA_FOR_BALANCED_NER/bruk/dc25535e13d7.ann', '/Users/linndfors/study/diploma/ner_for_fem/data/DATA_FOR_BALANCED_NER/bruk/fad2422a7bf3.ann', '/Users/linndfors/study/diploma/ner_for_fem/data/DATA_FOR_BALANCED_NER/bruk/7e1defe0dda2.ann', '/Users/linndfors/study/diploma/ner_for_fem/data/DATA_FOR_BALANCED_NER/bruk/87ae82667e2a.ann', '/Users/linndfors/study/diploma/ner_for_fem/data/DATA_FOR_BALANCED_NER/bruk/b972a141f2be.ann', '/Users/linndfors/study/diploma/ner_for_fem/data/DATA_FOR_BALANCED_NER/bruk/85622bc925dc.ann', '/Users/linndfors/study/diploma/ner_for_fem/data/DATA_FOR_BALANCED_NER/bruk/36ac3bc5ed9c.ann', '/Users/linndfors/study/diploma/ner_for_fem/data/DATA_FOR_BALANCED_NER/bruk/3d0ab8ae38cf.ann', '/Users/linndfors/study/diploma/ner_for_fem/data/DATA_FOR_BALANCED_NER/bruk/ead06d34c49d.ann', '/Users/linndfors/study/diploma/ner_for_fem/data/

100%|██████████| 231/231 [2:59:33<00:00, 46.64s/it]  


In [284]:
return_gender_stat(bruk_total_pers_counter, bruk_file_gender_pers_dict)
orig_bruk_pers_female_files, orig_bruk_pers_male_files, orig_bruk_pers_common_files = split_files(bruk_file_gender_pers_dict, "/Users/linndfors/study/diploma/ner_for_fem/data/DATA_FOR_BALANCED_NER/bruk")

total size: 4415
gender: male
number of files for the gender: 176
number of entities for the gender: 1274
The most popular entity: Андрій
gender: female
number of files for the gender: 135
number of entities for the gender: 970
The most popular entity: Аліна
gender: unknown_gender
number of files for the gender: 200
number of entities for the gender: 2171
The most popular entity: Бог


In [1]:
import pandas as pd

In [263]:
ng_pers_dict = extract_pers_entities("/Users/linndfors/study/diploma/ner_for_fem/data/DATA_FOR_BALANCED_NER/ng")
ng_file_gender_pers_dict, ng_total_pers_counter, ng_pers_list = return_gendered_PERS_dict(ng_pers_dict)

['/Users/linndfors/study/diploma/ner_for_fem/data/DATA_FOR_BALANCED_NER/ng/b4fe41ad2268.ann', '/Users/linndfors/study/diploma/ner_for_fem/data/DATA_FOR_BALANCED_NER/ng/150446f83aa2.ann', '/Users/linndfors/study/diploma/ner_for_fem/data/DATA_FOR_BALANCED_NER/ng/ee7cde9751a7.ann', '/Users/linndfors/study/diploma/ner_for_fem/data/DATA_FOR_BALANCED_NER/ng/3fdf22393022.ann', '/Users/linndfors/study/diploma/ner_for_fem/data/DATA_FOR_BALANCED_NER/ng/76b9b07172fb.ann', '/Users/linndfors/study/diploma/ner_for_fem/data/DATA_FOR_BALANCED_NER/ng/7982679365fb.ann', '/Users/linndfors/study/diploma/ner_for_fem/data/DATA_FOR_BALANCED_NER/ng/003d28360166.ann', '/Users/linndfors/study/diploma/ner_for_fem/data/DATA_FOR_BALANCED_NER/ng/02634352df22.ann', '/Users/linndfors/study/diploma/ner_for_fem/data/DATA_FOR_BALANCED_NER/ng/a9d2c035032d.ann', '/Users/linndfors/study/diploma/ner_for_fem/data/DATA_FOR_BALANCED_NER/ng/3eef4bd7fc75.ann', '/Users/linndfors/study/diploma/ner_for_fem/data/DATA_FOR_BALANCED_NE

100%|██████████| 279/279 [1:09:37<00:00, 14.97s/it]


In [278]:
return_gender_stat(ng_total_pers_counter, ng_file_gender_pers_dict)
orig_ng_pers_female_files, orig_ng_pers_male_files, orig_ng_pers_common_files = split_files(ng_file_gender_pers_dict, "/Users/linndfors/study/diploma/ner_for_fem/data/DATA_FOR_BALANCED_NER/ng")

total size: 1820
gender: male
number of files for the gender: 252
number of entities for the gender: 846
The most popular entity: Сергій
gender: female
number of files for the gender: 158
number of entities for the gender: 316
The most popular entity: Олена
gender: unknown_gender
number of files for the gender: 205
number of entities for the gender: 658
The most popular entity: Ігоря


In [281]:
len(orig_ng_pers_male_files)

252

### Move files

orig ng

In [None]:
move_gender_files(orig_ng_pers_male_files, '/Users/linndfors/study/diploma/ner_for_fem/data/data_for_ner/data_for_ner_pers/data_for_ner_male/ng')
move_gender_files(orig_ng_pers_female_files, '/Users/linndfors/study/diploma/ner_for_fem/data/data_for_ner/data_for_ner_pers/data_for_ner_female/ng')
# move_gender_files(orig_ng_pers_common_files, '/Users/linndfors/study/diploma/ner_for_fem/data/data_for_ner/data_for_ner_pers/data_for_ner_common/ng')

orig bruk

In [None]:
move_gender_files(orig_bruk_pers_male_files, '/Users/linndfors/study/diploma/ner_for_fem/data/data_for_ner/data_for_ner_pers/data_for_ner_male/bruk')
move_gender_files(orig_bruk_pers_female_files, '/Users/linndfors/study/diploma/ner_for_fem/data/data_for_ner/data_for_ner_pers/data_for_ner_female/bruk')
# move_gender_files(orig_bruk_pers_common_files, '/Users/linndfors/study/diploma/ner_for_fem/data/data_for_ner/data_for_ner_pers/data_for_ner_common/bruk')

swapped ng

In [287]:
move_gender_files(changed_ng_pers_male_files, '/Users/linndfors/study/diploma/ner_for_fem/data/data_for_ner/data_for_ner_pers/data_for_ner_male/ng_changed')
move_gender_files(changed_ng_pers_female_files, '/Users/linndfors/study/diploma/ner_for_fem/data/data_for_ner/data_for_ner_pers/data_for_ner_female/ng_changed')
# move_gender_files(changed_ng_pers_common_files, '/Users/linndfors/study/diploma/ner_for_fem/data/data_for_ner/data_for_ner_pers/data_for_ner_common/ng')

swapped bruk

In [290]:
move_gender_files(changed_bruk_pers_male_files, '/Users/linndfors/study/diploma/ner_for_fem/data/data_for_ner/data_for_ner_pers/data_for_ner_male/bruk_changed')
move_gender_files(changed_bruk_pers_female_files, '/Users/linndfors/study/diploma/ner_for_fem/data/data_for_ner/data_for_ner_pers/data_for_ner_female/bruk_changed')
# move_gender_files(orig_bruk_pers_common_files, '/Users/linndfors/study/diploma/ner_for_fem/data/data_for_ner/data_for_ner_pers/data_for_ner_common/bruk')

## PERS for filtered dataset

In [None]:
ng_pers_dict = extract_pers_entities("/Users/linndfors/study/diploma/ner_for_fem/data/v2.0-swapped_filtering/data/ng")
ng_file_gender_pers_dict, ng_total_pers_counter, ng_pers_list = return_gendered_PERS_dict(ng_pers_dict)

In [60]:
return_gender_stat(ng_total_pers_counter, ng_file_gender_pers_dict)

total size: 1003
gender: male
number of files for the gender: 80
number of entities for the gender: 126
percentage: 0.12562313060817548
The most popular entity: Олександр
gender: female
number of files for the gender: 223
number of entities for the gender: 604
percentage: 0.6021934197407777
The most popular entity: Олена
gender: unknown_gender
number of files for the gender: 139
number of entities for the gender: 273
percentage: 0.27218344965104685
The most popular entity: Юлії


In [64]:
bruk_pers_dict = extract_pers_entities("/Users/linndfors/study/diploma/ner_for_fem/data/v2.0-swapped_filtering/data/bruk")
bruk_file_gender_pers_dict, bruk_total_pers_counter, bruk_pers_list = return_gendered_PERS_dict(bruk_pers_dict)

['/Users/linndfors/study/diploma/ner_for_fem/data/v2.0-swapped_filtering/data/bruk/17d3d678df81-swapped.ann', '/Users/linndfors/study/diploma/ner_for_fem/data/v2.0-swapped_filtering/data/bruk/fabc18a7d8fd-swapped.ann', '/Users/linndfors/study/diploma/ner_for_fem/data/v2.0-swapped_filtering/data/bruk/85622bc925dc-swapped.ann', '/Users/linndfors/study/diploma/ner_for_fem/data/v2.0-swapped_filtering/data/bruk/cee02d4c030f-swapped.ann', '/Users/linndfors/study/diploma/ner_for_fem/data/v2.0-swapped_filtering/data/bruk/0b962bee49bc-swapped.ann', '/Users/linndfors/study/diploma/ner_for_fem/data/v2.0-swapped_filtering/data/bruk/aad4dce361f1-swapped.ann', '/Users/linndfors/study/diploma/ner_for_fem/data/v2.0-swapped_filtering/data/bruk/ea3fab248fbe-swapped.ann', '/Users/linndfors/study/diploma/ner_for_fem/data/v2.0-swapped_filtering/data/bruk/326978d2c58c-swapped.ann', '/Users/linndfors/study/diploma/ner_for_fem/data/v2.0-swapped_filtering/data/bruk/0d902430768c-swapped.ann', '/Users/linndfors/

100%|██████████| 91/91 [09:49<00:00,  6.48s/it]


In [65]:
return_gender_stat(bruk_total_pers_counter, bruk_file_gender_pers_dict)

total size: 279
gender: male
number of files for the gender: 25
number of entities for the gender: 30
percentage: 0.10752688172043011
The most popular entity: Василь
gender: female
number of files for the gender: 65
number of entities for the gender: 126
percentage: 0.45161290322580644
The most popular entity: Олександра
gender: unknown_gender
number of files for the gender: 46
number of entities for the gender: 123
percentage: 0.44086021505376344
The most popular entity: Т


In [71]:
print("total:", 1282)
print("male:", 126 + 30)
print("female:", 604 + 126)
print("unknown:", 123 + 273)

total: 1282
male: 156
female: 730
unknown: 396
