This script is created for splitting JOB and PERS entities by Gender.

In [1]:
import pandas as pd
import numpy as np
import os
import re
import tqdm
from typing import List, Dict
import json
import shutil

In [2]:
import stanza
import pymorphy3
import pymorphy2

nlp = stanza.Pipeline('uk', processors='tokenize,mwt,pos,lemma,depparse')
morph = pymorphy3.MorphAnalyzer()


# # stanza.download('uk')

2025-04-22 10:29:14 INFO: Checking for updates to resources.json in case models have been updated.  Note: this behavior can be turned off with download_method=None or download_method=DownloadMethod.REUSE_RESOURCES


Downloading https://raw.githubusercontent.com/stanfordnlp/stanza-resources/main/resources_1.10.0.json:   0%|  …

2025-04-22 10:29:14 INFO: Downloaded file to /Users/linndfors/stanza_resources/resources.json
2025-04-22 10:29:15 INFO: Loading these models for language: uk (Ukrainian):
| Processor | Package     |
---------------------------
| tokenize  | iu          |
| mwt       | iu          |
| pos       | iu_charlm   |
| lemma     | iu_nocharlm |
| depparse  | iu_charlm   |

2025-04-22 10:29:15 INFO: Using device: cpu
2025-04-22 10:29:15 INFO: Loading: tokenize
2025-04-22 10:29:17 INFO: Loading: mwt
2025-04-22 10:29:17 INFO: Loading: pos
2025-04-22 10:29:21 INFO: Loading: lemma
2025-04-22 10:29:23 INFO: Loading: depparse
2025-04-22 10:29:23 INFO: Done loading processors!


In [3]:
parallel_dataset_ng_dataset = pd.read_csv("/Users/linndfors/study/diploma/ner_for_fem/data/csv_files_with_par_sentences/ng_parallel.csv")
parallel_dataset_bruk_dataset = pd.read_csv("/Users/linndfors/study/diploma/ner_for_fem/data/csv_files_with_par_sentences/bruk_parallel.csv")

In [5]:
parallel_dataset_bruk_dataset.head()

Unnamed: 0,original_sentence,orig_sent_id,changed_sentence,changed_sent_id,original_file_name,orig_ann,changed_ann
0,Його редактором був поет-символіст Яків Савчен...,46,Його редакторкою була поетеса-символістка Анже...,46,e5e76a8efa0f.txt,"{'T30': ('поет-символіст', 'JOB')}","{'T30': ('поетеса-символістка', 'JOB')}"
1,Міжпредметні паралелі . Маніфест футуристів ск...,62,Міжпредметні паралелі . Маніфест футуристок ск...,62,e5e76a8efa0f.txt,"{'T49': ('поет', 'JOB')}","{'T49': ('поетеса', 'JOB')}"
2,Помітною була організація « Гарт » ( 1923 — 19...,96,Помітною була організація « Гарт » ( 1923 — 19...,96,e5e76a8efa0f.txt,"{'T104': ('поет', 'JOB')}","{'T104': ('поетка', 'JOB')}"
3,Її очолив байкар і прозаїк Сергій Пилипенко .,127,Її очолила байкарка і прозаїкиня Марія Пилипен...,127,e5e76a8efa0f.txt,"{'T143': ('байкар', 'JOB'), 'T144': ('прозаїк'...","{'T143': ('байкарка', 'JOB'), 'T144': ('прозаї..."
4,У Галицько-Волинському літописі згадується спі...,171,У Галицько-Волинському літописі згадується спі...,171,e5e76a8efa0f.txt,"{'T218': ('співець', 'JOB')}","{'T218': ('співчиня', 'JOB')}"


# Split: Female, Male, Common genders

In [3]:
with open('/Users/linndfors/study/diploma/uk-gender-word-mapper/common_gender_words_list.txt') as file:
    common_gender_words_list = [line.strip() for line in file if line.strip()]

with open('/Users/linndfors/study/diploma/uk-gender-word-mapper/male_words_list.txt') as file:
    male_list = [line.strip() for line in file if line.strip()]

with open('/Users/linndfors/study/diploma/uk-gender-word-mapper/female_words_list.txt') as file:
    female_list = [line.strip() for line in file if line.strip()]

gender_dict_df = pd.read_csv("/Users/linndfors/study/diploma/uk-gender-word-mapper/gender_pairs_dictionary.csv")

gender_dict = {'male': [], 'female': []}

for _, row in gender_dict_df.iterrows():
    gender_dict['male'].append(row['male'])
    female_values = [f.strip() for f in row['female'].split(',')]
    gender_dict['female'].extend(female_values)

gender_dict['female'] = list(set(gender_dict['female']))

exceptions_common = ["судді",  "глава", "голова", "керівництво", "в. о.", "головою"]
exceptions_male = ['ієромонах', 'прокурор', 'віце-премʼєр', 'премʼєр', 'начальник', 'міністр', 'директор', 'підрядник', 'генпідрядник', 'керівник', 'головнокомандувач']
exceptions_female = ['премʼєрка', 'докторка', 'докторантка', 'директорка', 'міністерка', 'керівниця', 'начальниця', 'начальницю', 'генпідрядниця', 'прокурорка', 'ректорка']

In [4]:
def parse_output(ent):
    noun_dict = {
        "хліборобок": "хліборобка",
    "культурологині": "культурологиня",
    "директоркою": "директорка",
    "слідчій": "слідча",
    "прокурорка": "прокурорка",
    "прокурорки": "прокурорка",
    "Прокурорки": "прокурорка",
    "прокуроркою": "прокурорка",
    "прокурорці": "прокурорка",
    "адміністраторок": "адміністраторка",
    "слідчу": "слідча",
    "слідча": "слідча",
    "начальницею": "начальниця",
    "податківка": "податківка",
    "монопольниці": "монопольниця",
    "начальницю": "начальниця",
    "психіатрині": "психіатриня",
    "наркологині": "наркологиня",
    "міністерки": "міністерка",
    "рітейлерками": "рітейлерка",
    "начальниці": "начальниця",
    "екологині": "екологиня",
    "податківчині": "податківчиня",
    "журналісткам": "журналістка",
    "підрядницею": "підрядниця",
    "директорка": "директорка",
    "інженерка": "інженерка",
    "службовиці": "службовиця",
    "інженерці": "інженерка",
    "податківниць": "податківниця",
    "депутатку": "депутатка",
    "керівниця": "керівниця",
    "керівницею": "керівниця",
    "юристок": "юристка",
    "інспекторки": "інспекторка",
    "інженерок": "інженерка",
    "ревізорок": "ревізорка",
    "нардепок": "нардепка",
    "бізнесменок": "бізнесменка",
    "генпідрядниці": "генпідрядниця",
    "прокурор": "прокурор",
    "віце-премʼєра": "віце-прем'єр",
    "премʼєр": "прем'єр",
    "начальника": "начальник",
    "начальником": "начальник",
    "міністра": "міністр",
    "директором": "директор",
    "підрядником": "підрядник",
    "аграрія": "аграрій",
    "депутата": "депутат",
    "генпідрядника": "генпідрядник",
    "генпідрядником": "генпідрядник",
    "начальник": "начальник",
    "керівником": "керівник",
    "керівник": "керівник",
    "судді": "суддя",
    "ченці": "чернець",
    "ченців": "чернець",
    "глави": "глава",
    "голови": "голова",
    "керівництво": "керівництво",
    "головою": "голова",
    "князів": "князь",
    "водія": "водій",
    "хіміка": "хімік",
    "хіміки": "хімік",
    "ведучій": "ведуча",
    "механіка": "механік",
    "інокам": "інок",
    "головнокомандувача": "головнокомандувач",
    "наркологині": "наркологиня",
    "членкині": "членкиня",
    "інженерці": "інженерка",
    "мисливствознавиці": "мисливствознавиця",
    "логопедині": "логопединя",
    "математикині": "математикиня",
    "психологині": "психологиня", 
    "філологині": "філологиня",
    "голопедині": "голопединя",
    "урядовицs": "урядовиця",
    "філософині": "філософиня",
    "педагогині": "педагогиня",
    "мера": "мер"
    }
    if ent in noun_dict.keys():
        return noun_dict[ent]
    return

In [5]:
def extract_main_word(text):
    if "-" in text:
        parts = text.split("-")
        main_word = parts[-1]
        doc = nlp(main_word)
        for sentence in doc.sentences:
            for word in sentence.words:
                return word.lemma

    doc = nlp(text)

    for sentence in doc.sentences:
        for word in sentence.words:
            if word.head == 0:
                main_word = word
                if main_word.text in {"рок", "анти", "псевдо", "віце", "топ"}:
                    continue
                return main_word.lemma
    for sentence in doc.sentences:
        for word in sentence.words:
            return word.lemma
        
def extract_job_gender(entity):
    entity = entity.lower()
    job = nlp(entity)
    join_sign = " " if " " in entity else ("" if "-" in entity else "")
    job_ent_list = [word.lemma for sent in job.sentences for word in sent.words]
    job_lemma = join_sign.join(job_ent_list)

    if len(job_ent_list) > 1:
        job_lemma = extract_main_word(job_lemma)

    words = entity.split()
    lemmatized_words = []
    for word in words:
        parsed_word = morph.parse(word)[0]
        if 'plur' in parsed_word.tag:
            singular_form = parsed_word.inflect({'sing'}).word if parsed_word.inflect({'sing'}) else parsed_word.normal_form
            lemmatized_words.append(singular_form)
        else:
            lemmatized_words.append(parsed_word.normal_form)
            
    job_lemma_pymorphy = join_sign.join(lemmatized_words)

    if len(job_ent_list) > 1:
        job_lemma_pymorphy = extract_main_word(job_lemma_pymorphy)

    ent_forms = [entity, job_lemma, job_lemma_pymorphy]

    custom_dict_ent_value = parse_output(entity)
    if custom_dict_ent_value:
        ent_forms.append(custom_dict_ent_value)

    for word in words:
        for common_word in exceptions_common:
            if common_word == word or common_word == parse_output(word):
                return "common", job_lemma
            
        for female_word in exceptions_female:
            if female_word == word or female_word == parse_output(word):
                return "female", job_lemma
            
        for male_word in exceptions_male:
            if male_word == word or male_word == parse_output(word):
                return "male", job_lemma
        
    for x in ent_forms:
        if (x in gender_dict['female']) or (x in female_list) or ("знавиця" in x):
            return "female", job_lemma
        elif (x in gender_dict['male']) or (x in male_list) or ("знавець" in x) or (x=="мера"):
            return "male", job_lemma
        elif x in common_gender_words_list:
            return "common", job_lemma
        
    # print("unkown for:", entity, " - ", job_lemma, " - ", job_lemma_pymorphy)
    return "unknown_gender", job_lemma

In [6]:
import ast

def return_gendered_dict(parallel_dataset, annotation_col_name, swapped=0):
    file_gender_dict = {"male": {}, "female": {}, "common": {}, "unknown_gender": {}}

    total_job_counter = 0
    job_list = []

    for x, row in tqdm.tqdm(parallel_dataset.iterrows()):
        filename = row['original_file_name']
        if swapped:
            filename = filename.replace(".txt", "_1.txt")
        orig_annotation = row[annotation_col_name]

        ann_str = orig_annotation.replace("'", '"')
        ann_str = ann_str.replace("–", '-')
        json_ann = ast.literal_eval(ann_str)
        
        try:
            for ent, feat in json_ann.items():
                if feat[1] == 'JOB':
                    total_job_counter += 1
                    job_list.append(feat[0])
                    gender_value, lemma_word = extract_job_gender(feat[0])
                    
                    if filename not in file_gender_dict[gender_value]:
                        file_gender_dict[gender_value][filename] = [(ent, lemma_word)]
                    else:
                        file_gender_dict[gender_value][filename].append((ent, lemma_word))
                    
        except Exception as e:
            print(f"Issue with row: {row} - Error: {e}")
    return file_gender_dict, total_job_counter, job_list

In [7]:
from collections import Counter

def return_gender_stat(total_job_counter, file_gender_dict):
    print("total size:", total_job_counter)

    for gender_class, val in file_gender_dict.items():
        print("================")
        
        print("gender:", gender_class)

        gender_entities = []

        print("number of files for the gender:", len(file_gender_dict[gender_class]))
        number_of_ents = 0
        for files, ents in file_gender_dict[gender_class].items():
            number_of_ents += len(ents)
            gender_entities += [pair[1] for pair in ents]
            
        print("number of entities for the gender:", number_of_ents)
        print("percentage:", number_of_ents/total_job_counter)

        counter = Counter(gender_entities)

        print("The most popular entity:", counter.most_common(1)[0][0])

        print("================")

In [None]:
def split_files(file_gender_dict, source_dir):
    female_files = []
    male_files = []
    common_files = []

    for filename, gen_stat in file_gender_dict.items():
        if len(list(gen_stat.keys())) == 1:
            full_path = os.path.join(source_dir, filename)
            gen = list(gen_stat.keys())[0]
            # print("gen stat:", gen_stat)
            # print("gen:", gen)
            if gen == "male":
                male_files.append(full_path)
                # print(f"add to {gen}")
            elif gen == "female":
                female_files.append(full_path)
                # print(f"add to {gen}")
            elif gen == "common":
                common_files.append(full_path)
                # print(f"add to {gen}")

    return female_files, male_files, common_files

In [None]:
def return_gender_file_stat(file_gender_dict):
    file_gen_stat_dict = {}

    for gen, file_stat in file_gender_dict.items():
        for filename, ents in file_stat.items():
            if filename not in file_gen_stat_dict:
                file_gen_stat_dict[filename] = {}
            file_gen_stat_dict[filename][gen] = len(ents)
    return file_gen_stat_dict

## Classify gender for entities from NER evaluation

### Balanced NER model

In [None]:
def read_entities(file_path):
    """
    Reads a text file where each non‐empty line is an entity,
    and returns a list of those entities (stripped of whitespace).
    """
    with open(file_path, "r", encoding="utf-8") as f:
        return [line.strip() for line in f if line.strip()]

In [None]:
fn_list_balanced = read_entities("/Users/linndfors/study/diploma/ner_for_fem/data/gender_classification_for_ner/balanced_fn.txt")
tp_list_balanced = read_entities("/Users/linndfors/study/diploma/ner_for_fem/data/gender_classification_for_ner/balanced_tp.txt")

In [38]:
female_tp, male_tp, common_tp, unknown_tp = [], [], [], []

for ent in tqdm.tqdm(tp_list_balanced):
    res = extract_job_gender(ent)
    job_gender = res[0]
    # print(job_gender)
    if job_gender == "female":
        female_tp.append(ent)
    elif job_gender == "male":
        male_tp.append(ent)
    elif job_gender == "common":
        common_tp.append(ent)
    else:
        unknown_tp.append(ent)

  0%|          | 0/809 [00:00<?, ?it/s]

100%|██████████| 809/809 [03:05<00:00,  4.37it/s]


In [39]:
female_fn, male_fn, common_fn, unknown_fn = [], [], [], []

for ent in tqdm.tqdm(fn_list_balanced):
    res = extract_job_gender(ent)
    job_gender = res[0]
    # print(job_gender)
    if job_gender == "female":
        female_fn.append(ent)
    elif job_gender == "male":
        male_fn.append(ent)
    elif job_gender == "common":
        common_fn.append(ent)
    else:
        unknown_fn.append(ent)


100%|██████████| 380/380 [01:02<00:00,  6.11it/s]


In [48]:
female_recall = len(female_tp) / (len(female_tp) + len(female_fn)) if (len(female_tp) + len(female_fn)) > 0 else 0
print("Female recall:", female_recall)
male_recall = len(male_tp) / (len(male_tp) + len(male_fn)) if (len(male_tp) + len(male_fn)) > 0 else 0
print("Male recall:", male_recall)
common_recall = len(common_tp) / (len(common_tp) + len(common_fn)) if (len(common_tp) + len(common_fn)) > 0 else 0
print("Comon recall:", common_recall)
unknown_recall = len(unknown_tp) / (len(unknown_tp) + len(unknown_fn)) if (len(unknown_tp) + len(unknown_fn)) > 0 else 0
print("Unknown recall:", unknown_recall)

Female recall: 0.802439024390244
Male recall: 0.5867768595041323
Comon recall: 0.873015873015873
Unknown recall: 0.3125


### Original NER Model

In [None]:
fn_list_orig = read_entities("/Users/linndfors/study/diploma/ner_for_fem/data/gender_classification_for_ner/orig_fn.txt")
tp_list_orig = read_entities("/Users/linndfors/study/diploma/ner_for_fem/data/gender_classification_for_ner/orig_tp.txt")

In [44]:
orig_female_tp, orig_male_tp, orig_common_tp, orig_unknown_tp = [], [], [], []

for ent in tqdm.tqdm(tp_list_orig):
    res = extract_job_gender(ent)
    job_gender = res[0]
    # print(job_gender)
    if job_gender == "female":
        orig_female_tp.append(ent)
    elif job_gender == "male":
        orig_male_tp.append(ent)
    elif job_gender == "common":
        orig_common_tp.append(ent)
    else:
        orig_unknown_tp.append(ent)

100%|██████████| 817/817 [02:47<00:00,  4.87it/s]


In [45]:
orig_female_fn, orig_male_fn, orig_common_fn, orig_unknown_fn = [], [], [], []

for ent in tqdm.tqdm(fn_list_orig):
    res = extract_job_gender(ent)
    job_gender = res[0]
    # print(job_gender)
    if job_gender == "female":
        orig_female_fn.append(ent)
    elif job_gender == "male":
        orig_male_fn.append(ent)
    elif job_gender == "common":
        orig_common_fn.append(ent)
    else:
        orig_unknown_fn.append(ent)


100%|██████████| 421/421 [01:43<00:00,  4.09it/s]


In [46]:
orig_female_recall = len(orig_female_tp) / (len(orig_female_tp) + len(orig_female_fn)) if (len(orig_female_tp) + len(orig_female_fn)) > 0 else 0
print("Female recall:", orig_female_recall)
orig_male_recall = len(orig_male_tp) / (len(orig_male_tp) + len(orig_male_fn)) if (len(orig_male_tp) + len(orig_male_fn)) > 0 else 0
print("Male recall:", orig_male_recall)
orig_common_recall = len(orig_common_tp) / (len(orig_common_tp) + len(orig_common_fn)) if (len(orig_common_tp) + len(orig_common_fn)) > 0 else 0
print("Common recall:", orig_common_recall)
orig_unknown_recall = len(orig_unknown_tp) / (len(orig_unknown_tp) + len(orig_unknown_fn)) if (len(orig_unknown_tp) + len(orig_unknown_fn)) > 0 else 0
print("Common recall:", orig_unknown_recall)

Female recall: 0.6899766899766899
Male recall: 0.6406995230524642
Common recall: 0.8492063492063492
Common recall: 0.2037037037037037


## Gender distribution of JOB entities for Filtered Swapped corpora

In [None]:
import os
import glob

def extract_job_entities(directory):
    pers_entities = {}
    
    ann_files = glob.glob(os.path.join(directory, "*.ann"))
    print(ann_files)
    
    for ann_file in ann_files:
        with open(ann_file, "r", encoding="utf-8") as f:
            for line in f:
                parts = line.strip().split("\t")
                if len(parts) == 5:
                    
                    entity_type = parts[1]
                    
                    if entity_type == "JOB":
                        entity_text = parts[4]
                        if ann_file in pers_entities:
                            pers_entities[ann_file].append(entity_text)
                        else:
                            pers_entities[ann_file] = [entity_text]
    
    return pers_entities

def return_gendered_PERS_dict(pers_dict):
    file_gender_dict = {"male": {}, "female": {}, "common": {}, "unknown_gender": {}}

    total_pers_counter = 0
    pers_list = []

    for filename, values in tqdm.tqdm(pers_dict.items()):
        filename = filename.replace(".ann", ".txt")
        
        try:
            for ent in values:
                total_pers_counter += 1
                pers_list.append(ent)

                gender_value, lemma_word = extract_job_gender(ent)
                
                if filename not in file_gender_dict[gender_value]:
                    file_gender_dict[gender_value][filename] = [(ent, lemma_word)]
                else:
                    file_gender_dict[gender_value][filename].append((ent, lemma_word))
                    
        except Exception as e:
            print(f"Issue with row: {row} - Error: {e}")
    return file_gender_dict, total_pers_counter, pers_list

### Bruk

In [None]:
bruk_job_dict = extract_job_entities("/Users/linndfors/study/diploma/ner_for_fem/data/v2.0-swapped_filtering/data/bruk")
bruk_job_file_gender_dict, bruk_job_total_counter, bruk_job_list = return_gendered_PERS_dict(bruk_job_dict)

In [73]:
return_gender_stat(bruk_job_total_counter, bruk_job_file_gender_dict)

total size: 481
gender: male
number of files for the gender: 39
number of entities for the gender: 67
percentage: 0.1392931392931393
The most popular entity: вчитель
gender: female
number of files for the gender: 107
number of entities for the gender: 343
percentage: 0.7130977130977131
The most popular entity: журналістка
gender: common
number of files for the gender: 21
number of entities for the gender: 33
percentage: 0.06860706860706861
The most popular entity: голова
gender: unknown_gender
number of files for the gender: 23
number of entities for the gender: 38
percentage: 0.079002079002079
The most popular entity: наука


### Ng

In [75]:
ng_job_dict = extract_job_entities("/Users/linndfors/study/diploma/ner_for_fem/data/v2.0-swapped_filtering/data/ng")
ng_job_file_gender_dict, ng_job_total_counter, ng_job_list = return_gendered_PERS_dict(ng_job_dict)

['/Users/linndfors/study/diploma/ner_for_fem/data/v2.0-swapped_filtering/data/ng/989b35bbc2b6-swapped.ann', '/Users/linndfors/study/diploma/ner_for_fem/data/v2.0-swapped_filtering/data/ng/5e33850771e3-swapped.ann', '/Users/linndfors/study/diploma/ner_for_fem/data/v2.0-swapped_filtering/data/ng/1c48b2f37af3-swapped.ann', '/Users/linndfors/study/diploma/ner_for_fem/data/v2.0-swapped_filtering/data/ng/0050229d8534-swapped.ann', '/Users/linndfors/study/diploma/ner_for_fem/data/v2.0-swapped_filtering/data/ng/150446f83aa2-swapped.ann', '/Users/linndfors/study/diploma/ner_for_fem/data/v2.0-swapped_filtering/data/ng/00edded01d7f-swapped.ann', '/Users/linndfors/study/diploma/ner_for_fem/data/v2.0-swapped_filtering/data/ng/f50037706d0a-swapped.ann', '/Users/linndfors/study/diploma/ner_for_fem/data/v2.0-swapped_filtering/data/ng/6d47a8c4d755-swapped.ann', '/Users/linndfors/study/diploma/ner_for_fem/data/v2.0-swapped_filtering/data/ng/5e407e3ddb68-swapped.ann', '/Users/linndfors/study/diploma/ner_

100%|██████████| 269/269 [06:00<00:00,  1.34s/it]


In [76]:
return_gender_stat(ng_job_total_counter, ng_job_file_gender_dict)

total size: 1248
gender: male
number of files for the gender: 71
number of entities for the gender: 114
percentage: 0.09134615384615384
The most popular entity: засновник
gender: female
number of files for the gender: 257
number of entities for the gender: 972
percentage: 0.7788461538461539
The most popular entity: директорка
gender: common
number of files for the gender: 78
number of entities for the gender: 135
percentage: 0.10817307692307693
The most popular entity: голова
gender: unknown_gender
number of files for the gender: 26
number of entities for the gender: 27
percentage: 0.021634615384615384
The most popular entity: в


### Total

In [79]:
print("total:", 483 + 1250)
print("male:", 67 + 115)
print("female:", 345 + 971)
print("common:", 35 + 135)
print("unknown:", 38 + 27)

total: 1733
male: 182
female: 1316
common: 170
unknown: 65


## Gender distribution of PERS entities for Filtered Swapped corpora

In [45]:
female_names = pd.read_csv("/Users/linndfors/study/diploma/ner_for_fem/src/female_fname_freq_dict.csv")
male_names = pd.read_csv("/Users/linndfors/study/diploma/ner_for_fem/src/male_fname_freq_dict.csv")

In [46]:
female_names_list = female_names['name'].values
male_names_list = male_names['name'].values

In [47]:
dict_df = pd.read_csv('/Users/linndfors/study/diploma/dict_uk/out/dict_corp_lt.txt', delimiter=' ', header=None, names=['word', 'lemma', 'grammar'])

In [48]:
def define_sex(name):
    try:
        filtered_df = dict_df[(dict_df['lemma'] == name) & (dict_df['word'] == name)]
        if not filtered_df.empty:
            grammar_pers = filtered_df['grammar'].iloc[0]

            if re.search(r'fname', grammar_pers):
                if re.search(r'\bf\b', grammar_pers):
                    return "F"
                elif re.search(r'\bm\b', grammar_pers):
                    return "M"
                else:
                    return "U"
            elif re.search(r'lname', grammar_pers):
                return "U"
            else:
                return "U"
        else:
            # print("No matches found for:", name)
            return "U"
    except Exception as e:
        # print("error", e)
        return "U"

In [None]:
import os
import glob

def extract_gender(entity):

    pers_parts = entity.split(" ")
    for i in pers_parts:
        if define_sex(i) == "F":
            return "female", i
        elif define_sex(i) == "M":
            return "male", i
                
    doc = nlp(entity)
    
    words = {word.lemma for sentence in doc.sentences for word in sentence.words}
    words.update(entity.split(" "))

    for x in words:
        if x in female_names_list:
            return "female", x
        if x in male_names_list:
            return "male", x
        
    # print("unkown for:", entity)
    return "unknown_gender", entity

In [62]:
def extract_pers_entities(directory):
    pers_entities = {}
    
    ann_files = glob.glob(os.path.join(directory, "*.ann"))
    print(ann_files)
    
    for ann_file in ann_files:
        with open(ann_file, "r", encoding="utf-8") as f:
            for line in f:
                parts = line.strip().split("\t")
                if len(parts) == 5:
                    
                    entity_type = parts[1]
                    
                    if entity_type == "PERS":
                        entity_text = parts[4]
                        if ann_file in pers_entities:
                            pers_entities[ann_file].append(entity_text)
                        else:
                            pers_entities[ann_file] = [entity_text]
    
    return pers_entities

def return_gendered_PERS_dict(pers_dict):
    file_gender_dict = {"male": {}, "female": {}, "unknown_gender": {}}

    total_pers_counter = 0
    pers_list = []

    for filename, values in tqdm.tqdm(pers_dict.items()):
        filename = filename.replace(".ann", ".txt")
        
        try:
            for ent in values:
                total_pers_counter += 1
                pers_list.append(ent)
                if len(ent.split(' ')) > 1:
                    ent = ent.split(' ')[0]
                gender_value, lemma_word = extract_gender(ent)
                # print(gender_value, lemma_word)
                
                if filename not in file_gender_dict[gender_value]:
                    file_gender_dict[gender_value][filename] = [(ent, lemma_word)]
                else:
                    file_gender_dict[gender_value][filename].append((ent, lemma_word))
                    
        except Exception as e:
            print(f"Issue with row: {row} - Error: {e}")
    return file_gender_dict, total_pers_counter, pers_list

### Ng

In [None]:
ng_pers_dict = extract_pers_entities("/Users/linndfors/study/diploma/ner_for_fem/data/v2.0-swapped_filtering/data/ng")
ng_file_gender_pers_dict, ng_total_pers_counter, ng_pers_list = return_gendered_PERS_dict(ng_pers_dict)

In [60]:
return_gender_stat(ng_total_pers_counter, ng_file_gender_pers_dict)

total size: 1003
gender: male
number of files for the gender: 80
number of entities for the gender: 126
percentage: 0.12562313060817548
The most popular entity: Олександр
gender: female
number of files for the gender: 223
number of entities for the gender: 604
percentage: 0.6021934197407777
The most popular entity: Олена
gender: unknown_gender
number of files for the gender: 139
number of entities for the gender: 273
percentage: 0.27218344965104685
The most popular entity: Юлії


### Bruk

In [64]:
bruk_pers_dict = extract_pers_entities("/Users/linndfors/study/diploma/ner_for_fem/data/v2.0-swapped_filtering/data/bruk")
bruk_file_gender_pers_dict, bruk_total_pers_counter, bruk_pers_list = return_gendered_PERS_dict(bruk_pers_dict)

['/Users/linndfors/study/diploma/ner_for_fem/data/v2.0-swapped_filtering/data/bruk/17d3d678df81-swapped.ann', '/Users/linndfors/study/diploma/ner_for_fem/data/v2.0-swapped_filtering/data/bruk/fabc18a7d8fd-swapped.ann', '/Users/linndfors/study/diploma/ner_for_fem/data/v2.0-swapped_filtering/data/bruk/85622bc925dc-swapped.ann', '/Users/linndfors/study/diploma/ner_for_fem/data/v2.0-swapped_filtering/data/bruk/cee02d4c030f-swapped.ann', '/Users/linndfors/study/diploma/ner_for_fem/data/v2.0-swapped_filtering/data/bruk/0b962bee49bc-swapped.ann', '/Users/linndfors/study/diploma/ner_for_fem/data/v2.0-swapped_filtering/data/bruk/aad4dce361f1-swapped.ann', '/Users/linndfors/study/diploma/ner_for_fem/data/v2.0-swapped_filtering/data/bruk/ea3fab248fbe-swapped.ann', '/Users/linndfors/study/diploma/ner_for_fem/data/v2.0-swapped_filtering/data/bruk/326978d2c58c-swapped.ann', '/Users/linndfors/study/diploma/ner_for_fem/data/v2.0-swapped_filtering/data/bruk/0d902430768c-swapped.ann', '/Users/linndfors/

100%|██████████| 91/91 [09:49<00:00,  6.48s/it]


In [65]:
return_gender_stat(bruk_total_pers_counter, bruk_file_gender_pers_dict)

total size: 279
gender: male
number of files for the gender: 25
number of entities for the gender: 30
percentage: 0.10752688172043011
The most popular entity: Василь
gender: female
number of files for the gender: 65
number of entities for the gender: 126
percentage: 0.45161290322580644
The most popular entity: Олександра
gender: unknown_gender
number of files for the gender: 46
number of entities for the gender: 123
percentage: 0.44086021505376344
The most popular entity: Т


### Total

In [71]:
print("total:", 1282)
print("male:", 126 + 30)
print("female:", 604 + 126)
print("unknown:", 123 + 273)

total: 1282
male: 156
female: 730
unknown: 396
