In [1]:
import os
import torch
import pandas as pd
from pathlib import Path
import json

In [2]:
# Corpus to process: folder containing one raw .txt file per book.
# Alternative corpus: "bamman2014_similarity_triplets/txt_files".
input_files_directory = "CharaSim-fr/txt_files"

# Processed outputs are written to a sibling folder of the raw text folder.
output_files_directory = input_files_directory.replace("txt_files", "propp_processed_files")
os.makedirs(output_files_directory, exist_ok=True)

In [3]:
# The alias dictionary is a JSON file stored next to the "txt_files" folder;
# its path is derived by swapping the folder name for the JSON file name.
characters_aliases_dict_path = input_files_directory.replace("txt_files", "characters_aliases_dict.json")

with Path(characters_aliases_dict_path).open(encoding="utf-8") as f:
    characters_aliases_dict = json.load(f)

# Sanity check: number of top-level entries in the alias dictionary.
print(len(characters_aliases_dict))

163


In [4]:
# Stems (file names without extension) of the raw inputs, in sorted order.
txt_files = [entry.stem for entry in Path(input_files_directory).iterdir() if entry.suffix == ".txt"]
txt_files.sort()

# Stems of the .entities files already present in the output folder.
entities_files = [entry.stem for entry in Path(output_files_directory).iterdir() if entry.suffix == ".entities"]
entities_files.sort()

print(f"Text files: {len(txt_files):,}")
print(f"Entities files: {len(entities_files):,}")

Text files: 163
Book files: 163


In [5]:
from propp_fr import load_models, load_text_file, generate_tokens_df, load_tokenizer_and_embedding_model, get_embedding_tensor_from_tokens_df, generate_entities_df, add_features_to_entities, perform_coreference, extract_attributes, save_tokens_df, save_entities_df

from pathlib import Path
from tqdm.auto import tqdm


# Only books whose stem has no matching .entities stem still need processing,
# which lets an interrupted run be resumed without redoing finished books.
already_processed = set(entities_files)
unprocessed_files = [stem for stem in txt_files if stem not in already_processed]
print(f"Unprocessed Files: {len(unprocessed_files):,}")

# Load the spaCy pipeline, the mention-detection model and the coreference model once.
spacy_model, mentions_detection_model, coreference_resolution_model = load_models(
    spacy_model_name='fr_dep_news_trf',
    mentions_detection_model_name='AntoineBourgois/propp-fr_NER_camembert-large_PER',
    coreference_resolution_model_name='AntoineBourgois/propp-fr_coreference-resolution_camembert-large_PER',
)

# The encoder is chosen from the base model name recorded in the mention-detection model.
tokenizer, embedding_model = load_tokenizer_and_embedding_model(
    mentions_detection_model["base_model_name"]
)

for file_name in tqdm(unprocessed_files, desc="Processing .txt Files"):
    print(f"Processing: {file_name}...")

    # 1. Read the raw text and tokenize it.
    text_content = load_text_file(file_name, input_files_directory)
    tokens_df = generate_tokens_df(text_content, spacy_model, max_char_sentence_length=25000)

    # 2. Compute the token embeddings and persist them next to the other outputs.
    tokens_embedding_tensor = get_embedding_tensor_from_tokens_df(
        text_content, tokens_df, tokenizer, embedding_model
    )
    embedding_tensor_path = os.path.join(output_files_directory, file_name + ".tokens_embedding_tensor")
    torch.save(tokens_embedding_tensor, embedding_tensor_path)

    # 3. Detect mentions and enrich them with features.
    entities_df = generate_entities_df(tokens_df, tokens_embedding_tensor, mentions_detection_model)
    entities_df = add_features_to_entities(entities_df, tokens_df)

    # 4. Look up the known aliases for this book (None when the book has no entry).
    characters_alias_list = characters_aliases_dict.get(file_name)
    print(characters_alias_list)

    # 5. Resolve coreference, passing the known aliases to the resolver.
    entities_df = perform_coreference(
        entities_df,
        tokens_embedding_tensor,
        coreference_resolution_model,
        propagate_coref=True,
        rule_based_postprocess=False,
        characters_alias_list=characters_alias_list,
    )

    # 6. Extract attributes, then save tokens first and entities last.
    tokens_df = extract_attributes(entities_df, tokens_df)

    save_tokens_df(tokens_df, file_name, output_files_directory)
    save_entities_df(entities_df, file_name, output_files_directory)

propp_fr package loaded successfully.
Unprocessed Files: 0
Loading models...
CUDA is required, Spacy model should run on GPU.
Model Loaded Successfully from local path: /home/antoine/Bureau/character_attributes_classification/AntoineBourgois/propp-fr_NER_camembert-large_PER/final_model.pkl
Model Loaded Successfully from local path: /home/antoine/Bureau/character_attributes_classification/AntoineBourgois/propp-fr_coreference-resolution_camembert-large_PER/final_model

Models Loaded Successfully:
Spacy: fr_dep_news_trf
Mentions Detection: AntoineBourgois/propp-fr_NER_camembert-large_PER
Coreference Resolution: AntoineBourgois/propp-fr_coreference-resolution_camembert-large_PER


Some weights of CamembertModel were not initialized from the model checkpoint at almanach/camembert-large and are newly initialized: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Loaded encoder model: almanach/camembert-large


Processing .txt Files: 0it [00:00, ?it/s]

## Assign Character Name to COREF chain

In [6]:
# Folder holding the processed outputs (.entities files) of the active corpus.
# Alternative corpus: "bamman2014_similarity_triplets/propp_processed_files".
files_directory = "CharaSim-fr/propp_processed_files"

# The alias dictionary is a JSON file stored next to the processed-files folder.
characters_aliases_dict_path = files_directory.replace("propp_processed_files", "characters_aliases_dict.json")

with open(characters_aliases_dict_path, "r", encoding="utf-8") as f:
    characters_aliases_dict = json.load(f)

# Sanity check: number of top-level entries in the alias dictionary.
print(len(characters_aliases_dict))

163


In [7]:
from propp_fr import load_entities_df, save_entities_df

# Stems of every .entities file in the processed-files folder, in sorted order.
entities_files = sorted(p.stem for p in Path(files_directory).iterdir() if p.suffix == ".entities")

# For each book, add a "COREF_name" column: the character name when the chain
# can be tied to a known character, otherwise the chain id as a string.
for file_name in tqdm(entities_files, desc="Assigning character names"):
    entities_df = load_entities_df(file_name, files_directory)

    # Default label for every mention: its own COREF id, stringified.
    entities_df["COREF_name"] = entities_df["COREF"].astype(str)

    # None when the book has no entry in the alias dictionary.
    characters_alias_list = characters_aliases_dict.get(file_name)

    if characters_alias_list is not None:
        for char_name, char_aliases in characters_alias_list.items():
            # A character with no aliases cannot be located in the mentions.
            if not char_aliases:
                continue

            # The chain is identified through the first mention whose text equals
            # the character's first alias; the remaining aliases are not searched.
            alias_chain_ids = entities_df.loc[entities_df["text"] == char_aliases[0], "COREF"]
            if alias_chain_ids.empty:
                continue
            char_COREF = alias_chain_ids.iloc[0]

            # Boolean mask rather than index labels, so only rows that really
            # belong to the chain are renamed even if index labels repeat.
            entities_df.loc[entities_df["COREF"] == char_COREF, "COREF_name"] = char_name

    save_entities_df(entities_df, file_name, files_directory)

Processing .txt Files:   0%|          | 0/163 [00:00<?, ?it/s]