# Exploratory Data Analysis

## Imports and constants

In [None]:
%load_ext autoreload
%autoreload 2

import os
from typing import Dict, Any
import logging 

logging.basicConfig(
     level=logging.INFO, 
     format= '[%(asctime)s|%(levelname)s|%(module)s.py:%(lineno)s] %(message)s',
     datefmt='%H:%M:%S'
 )
import itertools
import pandas as pd
import matplotlib.pyplot as plt
from tqdm import tqdm
# Create new `pandas` methods which use `tqdm` progress
# (can use tqdm_gui, optional kwargs, etc.)
tqdm.pandas()
from transformers import AutoTokenizer
from datasets import load_dataset

from defi_textmine_2025.data.problem_formulation import (    
    TextToMultiLabelDataGenerator,
    Mention2TypeDataGenerator,
    EntityBracketTaggingDataGenerator
)
from defi_textmine_2025.data.utils import (
    load_labeled_raw_data,
    load_test_raw_data,
    clean_raw_dataset,
    print_value_types,
    save_data,
    convert_text_to_entity_spans
)
from defi_textmine_2025.data.utils import TARGET_COL, INPUT_COLS, INTERIM_DIR, EDA_DIR

VALIDATION_RATE = 0.25
BASE_CHECKPOINT = "camembert/camembert-base"

tokenizer = AutoTokenizer.from_pretrained(BASE_CHECKPOINT)
def count_tokens(text: str, entity_types: list=None) -> int:
    """Return the number of input ids the module-level ``tokenizer`` produces.

    Args:
        text: the text to tokenize.
        entity_types: optional list of entity type names. When non-empty, the
            names are joined with ``"->"`` and given to the tokenizer as the
            first argument, with ``text`` as the second one.

    Returns:
        The length of the ``"input_ids"`` sequence returned by the tokenizer.
    """
    if not entity_types:
        encoded = tokenizer(text)
    else:
        type_prefix = "->".join(entity_types)
        encoded = tokenizer(type_prefix, text)
    return len(encoded["input_ids"])

## Loading data

In [None]:
labeled_raw_df = load_labeled_raw_data()
labeled_raw_df

In [None]:
test_raw_df = load_test_raw_data()
test_raw_df

## Analysing raw data

### Check value types

In [None]:
labeled_raw_df.info()

In [None]:
test_raw_df.info()

In [None]:
# types of values in train data
print_value_types(labeled_raw_df)

In [None]:
# types of values in test data
print_value_types(test_raw_df)

### Check for duplicated rows

In [None]:
labeled_raw_df[labeled_raw_df.duplicated()]

In [None]:
test_raw_df[test_raw_df.duplicated()]

### Check for missing values

In [None]:
labeled_raw_df.isnull().sum()

In [None]:
test_raw_df.isnull().sum()

In [None]:
labeled_raw_df.relations.sort_values()

## Data Cleaning/Wrangling

### Fix value typing

In [None]:
labeled_clean_df = clean_raw_dataset(labeled_raw_df)
test_clean_df = clean_raw_dataset(test_raw_df)

In [None]:
assert (labeled_raw_df.index == labeled_clean_df.index).all()
assert (test_raw_df.index == test_clean_df.index).all()

In [None]:
test_raw_df.head(1)

In [None]:
test_clean_df.head(1)

### Check value types

In [None]:
# types of values in train data
print_value_types(labeled_clean_df)

In [None]:
# types of values in test data
print_value_types(test_clean_df)

### Save cleaned datasets

For manual analysis

In [None]:
save_data(labeled_clean_df, os.path.join(INTERIM_DIR, "train_cleaned.csv"))
save_data(test_clean_df, os.path.join(INTERIM_DIR, "test_cleaned.csv"))

## Analyze clean data

### All relation types

In [None]:
# Distinct relation types over all labeled texts. Each cleaned `relations` value
# is iterated as (e1, relation_type, e2) triples. A single set comprehension
# replaces `sum(list_of_lists, [])`, which re-copies the accumulated list at
# every step.
relation_classes = {
    relation_type
    for text_relations in labeled_clean_df.relations
    for (_e1, relation_type, _e2) in text_relations
}
print(len(relation_classes))
relation_classes

#### Summary stats on relation classes

In [None]:
# Number of texts containing each relation category, plotted as a horizontal bar chart.
# NOTE(review): the match is a regex/substring search on the raw (unparsed) `relations`
# string, so a category whose name is contained in another category's name would be
# counted for both — confirm that no relation name is a substring of another, or count
# from the parsed `labeled_clean_df.relations` instead.

plt.figure(figsize=(15, 10))
n_texts_per_relation_type = pd.Series({cat: labeled_raw_df[labeled_raw_df.relations.str.contains(cat)].shape[0] for cat in relation_classes}).sort_values(ascending=True)
n_texts_per_relation_type.plot.barh(xlabel="Number of texts containing the relation", ylabel="Relation")

#### number of relation instances per text

In [None]:
nb_rel_occur_per_text_df = pd.DataFrame({cat: labeled_raw_df.relations.str.count(cat) for cat in relation_classes})
nb_rel_occur_per_text_df

In [None]:
# distribution of the occurences of each category among texts
nb_rel_occur_per_text_df.describe().T.sort_values("50%", ascending=False)

#### Number of instances per relation type

In [None]:
nb_rel_occur_per_text_df.sum(axis=0).sort_values()

In [None]:
labeled_raw_df.relations.str.count("IS_LOCATED_IN")

In [None]:
labeled_clean_df.relations.loc[181]

In [None]:
# Total occurences of each category in the labeled dataset
pd.Series({cat: labeled_raw_df.relations.str.count(cat).sum() for cat in relation_classes}).sort_values(ascending=False)

In [None]:
plt.figure(figsize=(15, 10))
pd.Series({cat: labeled_raw_df.relations.str.count(cat).sum() for cat in relation_classes}).sort_values(ascending=True).plot.barh(xlabel="Number of labeled relations of a given category", ylabel="Category of relation", title="Total occurences of each category in the labeled dataset")
# pd.Series({cat: labeled_raw_df.relations.str.count(cat).sum() for cat in relation_classes}).sort_values(ascending=False)

#### How many relations can we expect to extract from a text?

In [None]:
n_relations_per_text_df = labeled_clean_df.relations.map(lambda x: len(x))
print(n_relations_per_text_df.describe())
n_relations_per_text_df.hist()

### All entity types

In [None]:
# Distinct entity types over all labeled texts. Each cleaned `entities` value is
# iterated as entity records exposing a "type" key. A single set comprehension
# replaces `sum(list_of_lists, [])`, which re-copies the accumulated list at
# every step.
entity_classes = {
    entity["type"]
    for text_entities in labeled_clean_df.entities
    for entity in text_entities
}
print(len(entity_classes))
entity_classes

#### Summary stats on entity classes

* entity type frequencies

In [None]:
labeled_raw_df[labeled_raw_df.entities.str.contains("MATERIEL")].head()

In [None]:
plt.figure(figsize=(15, 10))
# Count the texts mentioning each entity type from the *parsed* entities, using
# exact equality on the "type" field. Substring matching on the raw string
# over-counts any type whose name is contained in another one, e.g. "MILITARY"
# also matches "NON_MILITARY_GOVERNMENT_ORGANISATION".
entity_types_per_text = labeled_clean_df.entities.map(
    lambda text_entities: {entity["type"] for entity in text_entities}
)
n_texts_per_entity_type = pd.Series(
    {
        cat: int(entity_types_per_text.map(lambda types: cat in types).sum())
        for cat in entity_classes
    }
).sort_values(ascending=True)
n_texts_per_entity_type.plot.barh(xlabel="Number of texts containing the entity", ylabel="Entity")

### Stats about the co-occurrence between entity categories and relation categories

For each relation category:
- how many times is each entity category e1?
- how many times is each entity category e2?
- which pairs of entity categories are never in a relation?

In [None]:
labeled_clean_df.values

In [None]:
for text_idx, text, text_entities, text_relations in labeled_clean_df.reset_index().values:
    pass

In [None]:

# One row per labeled relation: the text it belongs to, the ids of its two
# entities, their types, and the relation category.
# All rows are collected first and a single DataFrame is built from them. This
# avoids creating one small frame per text and concatenating them (including
# empty frames for texts without relations), and gives the result a unique
# 0..n-1 index instead of index labels that restart at 0 for every text.
relation_rows = [
    [text_idx, e1_id, e2_id, text_entities[e1_id]["type"], r_cat, text_entities[e2_id]["type"]]
    for text_idx, _text, text_entities, text_relations in labeled_clean_df.reset_index().values
    for e1_id, r_cat, e2_id in text_relations
]
entity_relation_cat_df = pd.DataFrame(
    relation_rows,
    columns=["text_id", "e1_id", "e2_id", "e1_cat", "r_cat", "e2_cat"],
)
logging.info(f"{entity_relation_cat_df.shape=}")
entity_relation_cat_df.head()

In [None]:
print(len(relation_classes), relation_classes)
print(len(entity_classes), entity_classes)

In [None]:
entity_relation_cat_df.query("r_cat=='HAS_CONTROL_OVER'").e1_cat.value_counts(normalize=True)

In [None]:
entity_relation_cat_df.query("r_cat=='HAS_CONTROL_OVER'").e2_cat.value_counts(normalize=True)

#### For a relation, how many times is an entity category e1 or e2?

In [None]:
# plt.figure(figsize=(15, 10))
# Absolute and relative frequency of every observed (e1_cat, r_cat, e2_cat) triple.
# Only the three category columns are counted: `value_counts()` on the full frame
# would also group by text_id / e1_id / e2_id, so it would count individual
# relation instances rather than category triples.
triple_cat_df = entity_relation_cat_df[["e1_cat", "r_cat", "e2_cat"]]
df = pd.concat(
    [
        # explicit names keep the two columns distinguishable on every pandas version
        triple_cat_df.value_counts(normalize=False).rename("count"),
        triple_cat_df.value_counts(normalize=True).rename("proportion"),
    ],
    axis=1,
).reset_index(drop=False)
n_relation_classes = len(relation_classes)
n_entity_classes = len(entity_classes)
logging.info(f"{df.shape[0]} existing relations vs. {n_entity_classes * n_relation_classes * n_entity_classes=} imaginable relations !")
df.to_csv(os.path.join(EDA_DIR, "e1_cat-r_cat-e2_cat-freq.csv")) #.plot.barh()
df

In [None]:
entity_relation_cat_df.query("r_cat=='STARTED_IN'").drop("r_cat", axis=1).value_counts().rename("count_STARTED_IN")

In [None]:
entity_relation_cat_df.query("r_cat=='WAS_CREATED_IN'").drop("r_cat", axis=1).value_counts()

In [None]:
entity_relation_cat_df.query("e2_cat=='TIME_EXACT'").drop(["e1_cat", "e2_cat", "e1_id", "e2_id"], axis=1).value_counts()

In [None]:
entity_relation_cat_df.query("e2_cat=='TIME_FUZZY'").drop(["e1_cat", "e2_cat", "e1_id", "e2_id"], axis=1).value_counts()

In [None]:
# Number of distinct relation categories observed between each pair of entity categories.
# Only the three category columns are kept before de-duplicating: with text_id still
# present, the same (e1_cat, r_cat, e2_cat) triple would be counted once per text.
entity_relation_cat_df[["e1_cat", "r_cat", "e2_cat"]].drop_duplicates().groupby(["e1_cat", "e2_cat"]).count().sort_values(by="r_cat")

#### what pairs of entity categories are in any relationship in the train dataset?

In [None]:
entity_cat_pair_in_relation_df = entity_relation_cat_df[["e1_cat", "e2_cat", "e1_id", "e2_id"]]#.drop_duplicates(subset=["e1_cat", "e2_cat"])
entity_cat_pair_in_relation_df

In [None]:
# Absolute and relative frequency of every (e1_cat, e2_cat) pair found in a relation.
# Only the two category columns are counted: `entity_cat_pair_in_relation_df` also
# carries e1_id / e2_id, and counting over them would split each category pair by
# entity ids.
cat_pair_df = entity_cat_pair_in_relation_df[["e1_cat", "e2_cat"]]
df = pd.concat(
    [
        # explicit names keep the two columns distinguishable on every pandas version
        cat_pair_df.value_counts(normalize=False).rename("count"),
        cat_pair_df.value_counts(normalize=True).rename("proportion"),
    ],
    axis=1,
).reset_index(drop=False)
n_entity_classes = len(entity_classes)
logging.info(f"{df.shape[0]} existing entity category pairs in relation vs. {n_entity_classes * n_entity_classes=} imaginable entity category pairs !")
df.to_csv(os.path.join(EDA_DIR, "e1_cat-e2_cat-freq.csv")) #.plot.barh()
df

### How many times does a relation involve two entities of the same type?

In [None]:
e1_cat_equal_e2_cat_df = entity_cat_pair_in_relation_df.query("e1_cat==e2_cat").drop(["e1_id", "e2_id"], axis=1)
e1_cat_equal_e2_cat_df

In [None]:
df = pd.concat([e1_cat_equal_e2_cat_df.value_counts(normalize=False), e1_cat_equal_e2_cat_df.value_counts(normalize=True)], axis=1).reset_index(drop=False)
n_entity_classes = len(entity_classes)
logging.info(f"{df.shape[0]} existing entity of identical categories in relation vs. {n_entity_classes=} imaginable entity category !")
df.to_csv(os.path.join(EDA_DIR, "e1_cat-equal-e2_cat-freq.csv")) #.plot.barh()
df

In [None]:
per_r_cat_e1_cat_equal_e2_cat_df = entity_relation_cat_df.query("e1_cat==e2_cat").drop(["e1_id", "e2_id"], axis=1)
per_r_cat_e1_cat_equal_e2_cat_df

In [None]:
# Per relation category: absolute and relative frequency of relations whose two
# entities share the same category.
# Only the category columns are counted: the source frame still carries text_id,
# and counting over it would produce one group per text instead of one per
# (e1_cat, r_cat, e2_cat) triple.
same_cat_triple_df = per_r_cat_e1_cat_equal_e2_cat_df[["e1_cat", "r_cat", "e2_cat"]]
df = pd.concat(
    [
        # explicit names keep the two columns distinguishable on every pandas version
        same_cat_triple_df.value_counts(normalize=False).rename("count"),
        same_cat_triple_df.value_counts(normalize=True).rename("proportion"),
    ],
    axis=1,
).reset_index(drop=False)
n_entity_classes = len(entity_classes)
logging.info(f"{df.shape[0]} existing entity of identical categories in relation vs. {n_entity_classes=} imaginable entity category !")
df.to_csv(os.path.join(EDA_DIR, "per_r_cat-e1_cat-equal-e2_cat_df-freq.csv")) #.plot.barh()
df

#### what pairs of entity categories never have any relationship?

- we don't need to attempt to classify pairs of entities of these types

In [None]:
# Every ordered (e1_cat, e2_cat) combination of entity categories, stored as the
# MultiIndex of a frame that has no other column.
ordered_cat_pairs = list(itertools.product(entity_classes, repeat=2))
all_possible_entity_cat_pairs_df = pd.DataFrame(
    ordered_cat_pairs, columns=["e1_cat", "e2_cat"]
).set_index(["e1_cat", "e2_cat"])
all_possible_entity_cat_pairs_df

In [None]:
entity_cat_pair_in_relation_df.drop_duplicates().set_index(["e1_cat", "e2_cat"])

##### Binary relations

In [None]:
entity_type_pairs_in_binary_relation = set(entity_cat_pair_in_relation_df.query("e1_id != e2_id")[["e1_cat", "e2_cat"]].set_index(["e1_cat", "e2_cat"]).index.to_list())
print(entity_type_pairs_in_binary_relation)
len(entity_type_pairs_in_binary_relation)

In [None]:
print(set(all_possible_entity_cat_pairs_df.index.to_list()))

In [None]:
print(set(all_possible_entity_cat_pairs_df.index.to_list()))

In [None]:
entity_pairs_never_in_relation_df = pd.DataFrame(index=all_possible_entity_cat_pairs_df.index.difference(entity_cat_pair_in_relation_df.drop_duplicates().set_index(["e1_cat", "e2_cat"]).index)).reset_index(drop=False)
entity_pairs_never_in_relation_df.to_csv(os.path.join(EDA_DIR, "entity_pairs_never_in_relation.csv"))
entity_pairs_never_in_relation_df

##### Unary relation

In [None]:
entity_cat_pair_in_relation_df.query("e1_id == e2_id")

In [None]:
entity_types_in_unary_relation = set(entity_cat_pair_in_relation_df.query("e1_id == e2_id")[["e1_cat"]].set_index(["e1_cat"]).index.to_list())
print(entity_types_in_unary_relation)
len(entity_types_in_unary_relation)

In [None]:
entity_relation_cat_df.query("e1_id == e2_id & e1_cat== 'PLACE'")

In [None]:
entity_relation_cat_df.query("e1_id == e2_id & e1_cat== 'ACCIDENT'")

In [None]:
entity_relation_cat_df.query("e1_id == e2_id & e1_cat== 'CIVILIAN'").r_cat.value_counts()

In [None]:
entity_relation_cat_df.query("e1_id == e2_id & e1_cat== 'TERRORIST_OR_CRIMINAL'").r_cat.value_counts()

In [None]:
entity_relation_cat_df.query("e1_id == e2_id & e1_cat== 'NON_MILITARY_GOVERNMENT_ORGANISATION'")

In [None]:
entity_relation_cat_df.query("e1_id == e2_id & e1_cat== 'GROUP_OF_INDIVIDUALS'")

In [None]:
entity_relation_cat_df.query("e1_id == e2_id & e1_cat== 'MILITARY'")

In [None]:
entity_relation_cat_df.shape

### What are the entity types involved in minority relation types

... for data augmentation by entity replacement

In [None]:
text, text_entities, text_relations = labeled_clean_df.iloc[100].to_list()
text

In [None]:
text_entities[0]

In [None]:
convert_text_to_entity_spans(text, text_entities)

In [None]:
plt.figure(figsize=(15, 10))
n_texts_per_entity_type.plot.barh()

In [None]:
entity_relation_cat_df.query("e2_cat=='COLOR'")#.shape[0]

In [None]:
entity_relation_cat_df.query("e1_cat=='MATERIEL'").shape[0]

In [None]:
entity_relation_cat_df.query("r_cat=='HAS_CONSEQUENCE'").shape[0]

In [None]:
entity_relation_cat_df.query("r_cat=='GENDER_FEMALE'")#.shape[0]

In [None]:
print(n_texts_per_relation_type.rename("n_examples").to_frame().query("n_examples < 500").index.shape[0])
n_texts_per_relation_type.rename("n_examples").to_frame().query("n_examples < 500")

In [None]:
n_texts_per_relation_type.index

In [None]:
entity_relation_cat_df.query(f"r_cat=='IS_IN_CONTACT_WITH'")#.text_id.duplicated()

In [None]:
entity_relation_cat_df.query("e1_cat == e2_cat")

In [None]:
labeled_clean_df.loc[2455].text

In [None]:
labeled_clean_df[labeled_raw_df.relations.str.contains("HAS_LATITUDE")]

### Text length

#### Number of chars

In [None]:
# Summary statistics of the text length in characters, one column per dataset.
frames_by_set_name = {"labeled": labeled_clean_df, "test": test_clean_df}
pd.DataFrame(
    {
        set_name: frame.text.apply(len).describe()
        for set_name, frame in frames_by_set_name.items()
    }
)

#### Number of tokens (subwords from CamemBERT tokenizer)

In [None]:
# Summary statistics of the text length in tokens (as counted by `count_tokens`),
# one column per dataset.
frames_by_set_name = {"labeled": labeled_clean_df, "test": test_clean_df}
pd.DataFrame(
    {
        set_name: frame.text.apply(count_tokens).describe()
        for set_name, frame in frames_by_set_name.items()
    }
)

## Generate interim datasets

For each text, generate a CSV file containing all the texts generated from it, annotated for a multi-label text classification task.

The csv file is named after the index of the text in the raw data


In [None]:
excluded_entity_pairs = entity_pairs_never_in_relation_df.set_index(['e1_cat', 'e2_cat']).index.to_list()
excluded_entity_pairs[:3]

In [None]:
entity_pairs_never_in_relation_df.query("e1_cat == e2_cat")

### Method 1: tag entity role and types

In [None]:
# Destination root for the "entity role and type" tagged dataset; one sub-folder per split.
generated_data_dir_path = os.path.join(INTERIM_DIR, "entity_role_n_type_tagged_text_dataset")
# assert not os.path.exists(
#     generated_data_dir_path
# ), f"You must delete this folder first {generated_data_dir_path}!"

# The generator is restricted to the entity type pairs / types observed in the
# labeled data (computed in the "Binary relations" and "Unary relation" cells).
data_generator = TextToMultiLabelDataGenerator(
    allowed_binary_relation_entity_type_pairs=entity_type_pairs_in_binary_relation,
    allowed_unary_relation_entity_types=entity_types_in_unary_relation
)

for split_name, clean_df in zip(
    ["test", "train"],
    [test_clean_df, labeled_clean_df],
):
    dest_dir_path = os.path.join(generated_data_dir_path, split_name)
    # `pb` is bound with the walrus operator so the progress-bar description can be
    # updated from inside the loop; the bar total is the number of rows of the split.
    for multilabel_data in (
        pb := tqdm(
            # data_generator.generate_row_multilabel_data(clean_df, only_w_relation=True if split_name!="test" else False),
            data_generator.generate_row_multilabel_data(clean_df, only_w_relation=False),
            total=clean_df.shape[0],
            desc=f"{dest_dir_path} <- ",
        )
    ):
        # The output file is named after the text index read from the first generated row.
        text_index = multilabel_data.iloc[0][data_generator.text_index_col]
        dest_csv_file = os.path.join(dest_dir_path, f"{text_index}.csv")
        pb.set_description(f"{dest_csv_file} <-")
        # NOTE(review): meaning of the third positional argument (False) is defined by
        # `save_data` in defi_textmine_2025.data.utils — confirm there.
        save_data(multilabel_data, dest_csv_file, False)
        # Stops after the first generated item of each split, so only one CSV per split
        # is written — presumably a preview; remove to generate the whole dataset.
        break
    # break
# Display the last generated frame.
multilabel_data

In [None]:
multilabel_data.iloc[1].text

### Method 2: Replace each entity mention by its type and role in a single tag

In [None]:
# Destination root for the "mention replaced by type" tagged dataset; one sub-folder per split.
generated_data_dir_path = os.path.join(INTERIM_DIR, "entity_mention2type_tagged_text_dataset")
# assert not os.path.exists(
#     generated_data_dir_path
# ), f"You must delete this folder first {generated_data_dir_path}!"

# The generator is restricted to the entity type pairs / types observed in the
# labeled data (computed in the "Binary relations" and "Unary relation" cells).
data_generator = Mention2TypeDataGenerator(
    allowed_binary_relation_entity_type_pairs=entity_type_pairs_in_binary_relation,
    allowed_unary_relation_entity_types=entity_types_in_unary_relation
)

for split_name, clean_df in zip(
    ["test", "train"],
    [test_clean_df, labeled_clean_df],
):
    dest_dir_path = os.path.join(generated_data_dir_path, split_name)
    # `pb` is bound with the walrus operator so the progress-bar description can be
    # updated from inside the loop; the bar total is the number of rows of the split.
    for multilabel_data in (
        pb := tqdm(
            # data_generator.generate_row_multilabel_data(clean_df, only_w_relation=True if split_name!="test" else False),
            data_generator.generate_row_multilabel_data(clean_df, only_w_relation=False),
            total=clean_df.shape[0],
            desc=f"{dest_dir_path} <- ",
        )
    ):
        # The output file is named after the text index read from the first generated row.
        text_index = multilabel_data.iloc[0][data_generator.text_index_col]
        dest_csv_file = os.path.join(dest_dir_path, f"{text_index}.csv")
        pb.set_description(f"{dest_csv_file} <-")
        # NOTE(review): meaning of the third positional argument (False) is defined by
        # `save_data` in defi_textmine_2025.data.utils — confirm there.
        save_data(multilabel_data, dest_csv_file, False)
        # Stops after the first generated item of each split, so only one CSV per split
        # is written — presumably a preview; remove to generate the whole dataset.
        break
    # break
# Display the last generated frame.
multilabel_data

In [None]:
multilabel_data.loc[1]["text"]

### Method 3: Tagging entity mentions with brackets only, to express their role in the relation

- `{...}`: for the first entity
- `[...]`: for the second entity

In [None]:
# Destination root for the bracket-tagged dataset; one sub-folder per split.
generated_data_dir_path = os.path.join(INTERIM_DIR, "entity_bracket_tagging_dataset")
# assert not os.path.exists(
#     generated_data_dir_path
# ), f"You must delete this folder first {generated_data_dir_path}!"

# The generator is restricted to the entity type pairs / types observed in the
# labeled data (computed in the "Binary relations" and "Unary relation" cells).
data_generator = EntityBracketTaggingDataGenerator(
    allowed_binary_relation_entity_type_pairs=entity_type_pairs_in_binary_relation,
    allowed_unary_relation_entity_types=entity_types_in_unary_relation
)

for split_name, clean_df in zip(
    ["test", "train"],
    [test_clean_df, labeled_clean_df],
):
    dest_dir_path = os.path.join(generated_data_dir_path, split_name)
    # `pb` is bound with the walrus operator so the progress-bar description can be
    # updated from inside the loop; the bar total is the number of rows of the split.
    for multilabel_data in (
        pb := tqdm(
            # data_generator.generate_row_multilabel_data(clean_df, only_w_relation=True if split_name!="test" else False),
            data_generator.generate_row_multilabel_data(clean_df, only_w_relation=False),
            total=clean_df.shape[0],
            desc=f"{dest_dir_path} <- ",
        )
    ):
        # The output file is named after the text index read from the first generated row.
        text_index = multilabel_data.iloc[0][data_generator.text_index_col]
        dest_csv_file = os.path.join(dest_dir_path, f"{text_index}.csv")
        pb.set_description(f"{dest_csv_file} <-")
        # NOTE(review): meaning of the third positional argument (False) is defined by
        # `save_data` in defi_textmine_2025.data.utils — confirm there.
        save_data(multilabel_data, dest_csv_file, False)
        # No early exit here: every item yielded by the generator is written, for both splits.
        # break
    # break
# Display the last generated frame.
multilabel_data

In [None]:
multilabel_data.query("e1_id == e2_id")

## Check tagged text size using a data loader to load data from CSV files

to know whether the tagged texts will fit in the model's input (i.e. a maximum of 512 tokens)

##### add special tokens to the tokenizer

In [None]:
# define special tokens to add to the tokenizer
# task_special_tokens = ["<e1>", "</e1>", "<e2>", "</e2>"] + [
#     f"<{entity_class}>" for entity_class in entity_classes
# ]

# One "<TYPE>" token per entity class, to specify the type of the 1st and 2nd entity.
# `entity_classes` is a set, whose iteration order for strings is not stable between
# interpreter runs; sorting makes the token list (and hence the order in which the
# tokens are handed to the tokenizer) reproducible.
task_special_tokens = [f"<{entity_class}>" for entity_class in sorted(entity_classes)]
print(task_special_tokens)

In [None]:
# add special tokens to the tokenizer
num_added_tokens = tokenizer.add_tokens(task_special_tokens, special_tokens=True)
num_added_tokens

In [None]:
tokenizer

### initialize the data loader

In [None]:
# Load the bracket-tagged CSV files with the `datasets` CSV loader.
# The directory is built from INTERIM_DIR, exactly as it was when the files were
# written, instead of a hard-coded relative path that only resolves from one
# particular working directory.
interim_dataset = load_dataset(
    "csv",
    name="multilabel_tagged_text_dataset",
    data_dir=os.path.join(INTERIM_DIR, "entity_bracket_tagging_dataset"),
    streaming=False,
)

In [None]:
interim_dataset

In [None]:
interim_dataset["train"][0]

In [None]:
interim_dataset["train"][1]

In [None]:
interim_dataset["test"][7]

### Count tokens

In [None]:
def count_token_in_dataset_element(example: Dict[str, Any]) -> Dict[str, int]:
    """Count the tokens of one dataset example, entity type prefix included.

    Args:
        example: a mapping with the keys "text", "e1_id", "e2_id", "e1_type"
            and "e2_type".

    Returns:
        ``{"n_tokens": <count>}`` as computed by ``count_tokens``. Both entity
        types are passed for a binary relation (two distinct entity ids), and
        only the first entity type for a unary one (same id on both sides).
    """
    # Fix: the original compared `e2_id` with itself, which is never different,
    # so the second entity type was never taken into account.
    if example["e1_id"] != example["e2_id"]:
        entity_types = [example["e1_type"], example["e2_type"]]
    else:
        entity_types = [example["e1_type"]]
    return {"n_tokens": count_tokens(example["text"], entity_types)}


interim_dataset = interim_dataset.map(count_token_in_dataset_element)
interim_dataset

In [None]:
split2ntokens_df = pd.DataFrame(
    {
        split_name: pd.Series(
            [e["n_tokens"] for e in tqdm(interim_dataset[split_name], split_name)],
            name=f"{split_name}_text_n_tokens",
        )
        for split_name in interim_dataset.keys()
    }
)
split2ntokens_df.describe()

In [None]:
split2ntokens_df.hist()

In [None]:
interim_dataset["test"].filter(lambda x: x['n_tokens'] < 70)[:]

## Keep only sentences of interest: those mentioning the entities

### Test `stanza` to split text into sentences

With `re` to filter sentences of interest

In [None]:
# interim_dataset["train"].filter(lambda x: x["text_index"] == 1175 and x["relations"] == ['IS_LOCATED_IN'])
interim_dataset["train"].filter(lambda x: x["text_index"] == 1175 and x["e1_id"] == x['e2_id'] and x["relations"] == "['IS_LOCATED_IN']")[:]

In [None]:
import stanza
from stanza import DownloadMethod
import re

lang = "fr"

# text = "Treize personnes ont trouvé la mort le 28 juin au matin dans le < sud du Togo >. Le minibus dans lequel ils se trouvaient a heurté un arbre, suite à l'éclatement de l'un de ses pneus sous l'effet d'une vitesse excessive. Selon un communiqué du ministre de la Sécurité, M. Billel Alibert, le bus qui transportait principalement des commerçants a dérapé à la suite de l'éclatement du pneu avant-droit. Le minibus s'est ensuite retrouvé sous le pont de la rivière, là où les corps ont été découverts. Dans sa chute, le bus a heurté un teck de 6 mètres de hauteur avant de se renverser sur son flanc droit, provoquant d'importants dégâts matériels et humains. L'accident s'est produit sur la nationale 1, reliant le < Togo > au Burkina Faso."
# text = """La 12e édition de la Journée de la Street Food s'est tenue ce [ 17 octobre 2015 ] à Londres. Malrgé la pluie, les { amateurs } de bonne cuisine étaient présents."""
text = """Des milliers de personnes se sont retrouvées sur la Place de Cybèle pour exprimer leur soutien aux familles touchées par le drame du 06 mai 2015. En effet, près de 50 personnes ont perdu la vie dans l’explosion d’une centrale nucléaire. Les <victimes> étaient pour la plupart des <travailleurs> de la centrale. Lili-May Lopez, veuve d’un des ingénieurs, a créé l’association Justice pour nos Défunts pour réclamer que la lumière soit faite sur ce qui s’est réellement passé. Le Syndicat des Travailleurs Libres s’est joint à cette association afin de demander en plus, des conditions de travail plus sécurisées dans les centrales nucléaires du pays. Selon le responsable de ce syndicat, l’uranium n’est pas stocké selon les normes et les générateurs de vapeur et mobiliers sont vétustes. On retrouve même des rats dans les bâtiments qui rongent les câbles et documents importants. Le gouvernement a assuré aux manifestants que leurs doléances seront prises en compte."""

nlp = stanza.Pipeline(lang=lang, processors='tokenize', download_method=DownloadMethod.REUSE_RESOURCES)
# Sentence-level markers: `{`, `}`, `<` or `>` for the first entity, `[` or `]` for
# the second one. Raw strings are used so that `\{`, `\}`, `\[` and `\]` reach the
# regex engine as written instead of being invalid escapes in a normal literal.
e1_pattern = re.compile(r".*[\{\}<>].*")
e2_pattern = re.compile(r".*[\[\]].*")
final_text_sentences = []
doc = nlp(text)
for i, sentence in enumerate(doc.sentences):
    # print(f'====== Sentence {i+1} tokens =======')
    sentence_text = " ".join([token.text for token in sentence.tokens])
    if e1_pattern.match(sentence_text) and e2_pattern.match(sentence_text):
        final_text_sentences = [sentence_text]
        break
    if e1_pattern.match(sentence_text) or e2_pattern.match(sentence_text):
        final_text_sentences.append(sentence_text)
" ".join(final_text_sentences)

In [2]:
import stanza
from stanza import DownloadMethod
import re

lang = "fr"

text = """Des milliers de personnes se sont retrouvées sur la Place de Cybèle pour exprimer leur soutien aux familles touchées par le drame du 06 mai 2015. En effet, près de 50 personnes ont perdu la vie dans l’explosion d’une centrale nucléaire. Les <victimes> étaient pour la plupart des <travailleurs> de la centrale. Lili-May Lopez, veuve d’un des ingénieurs, a créé l’association Justice pour nos Défunts pour réclamer que la lumière soit faite sur ce qui s’est réellement passé. Le Syndicat des Travailleurs Libres s’est joint à cette association afin de demander en plus, des conditions de travail plus sécurisées dans les centrales nucléaires du pays. Selon le responsable de ce syndicat, l’uranium n’est pas stocké selon les normes et les générateurs de vapeur et mobiliers sont vétustes. On retrouve même des rats dans les bâtiments qui rongent les câbles et documents importants. Le gouvernement a assuré aux manifestants que leurs doléances seront prises en compte."""

nlp = stanza.Pipeline(lang=lang, processors='tokenize', download_method=DownloadMethod.REUSE_RESOURCES)
# Sentence-level markers: `{`, `}`, `<` or `>` for the first entity, `[` or `]` for
# the second one. Raw strings are used so that `\{`, `\}`, `\[` and `\]` reach the
# regex engine as written instead of being invalid escapes in a normal literal.
e1_pattern = re.compile(r".*[\{\}<>].*")
e2_pattern = re.compile(r".*[\[\]].*")
final_text_sentences = []
doc = nlp(text)
for i, sentence in enumerate(doc.sentences):
    print(sentence.text)

2024-10-06 16:10:51 INFO: Loading these models for language: fr (French):
| Processor | Package  |
------------------------
| tokenize  | combined |
| mwt       | combined |

2024-10-06 16:10:51 INFO: Using device: cuda
2024-10-06 16:10:51 INFO: Loading: tokenize
2024-10-06 16:10:52 INFO: Loading: mwt
2024-10-06 16:10:52 INFO: Done loading processors!


Des milliers de personnes se sont retrouvées sur la Place de Cybèle pour exprimer leur soutien aux familles touchées par le drame du 06 mai 2015.
En effet, près de 50 personnes ont perdu la vie dans l’explosion d’une centrale nucléaire.
Les <victimes> étaient pour la plupart des <travailleurs> de la centrale.
Lili-May Lopez, veuve d’un des ingénieurs, a créé l’association Justice pour nos Défunts pour réclamer que la lumière soit faite sur ce qui s’est réellement passé.
Le Syndicat des Travailleurs Libres s’est joint à cette association afin de demander en plus, des conditions de travail plus sécurisées dans les centrales nucléaires du pays.
Selon le responsable de ce syndicat, l’uranium n’est pas stocké selon les normes et les générateurs de vapeur et mobiliers sont vétustes.
On retrouve même des rats dans les bâtiments qui rongent les câbles et documents importants.
Le gouvernement a assuré aux manifestants que leurs doléances seront prises en compte.


### Apply over the whole datasets

In [None]:
interim_dataset["train"]

In [None]:
from typing import List
import stanza
from stanza import DownloadMethod
import re

lang = "fr"

nlp = stanza.Pipeline(lang=lang, processors='tokenize', download_method=DownloadMethod.REUSE_RESOURCES)
# Sentence-level markers: `{`, `}`, `<` or `>` for the first entity, `[` or `]` for
# the second one.
# - Raw strings: `\{`, `\}`, `\[` and `\]` reach the regex engine as written instead
#   of being invalid escapes in a normal literal.
# - re.DOTALL: these patterns are applied with `.match()` to whole sentence texts;
#   without it `.*` stops at the first line break and a marker located after a
#   newline inside a sentence would be missed.
e1_pattern = re.compile(r".*[\{\}<>].*", re.DOTALL)
e2_pattern = re.compile(r".*[\[\]].*", re.DOTALL)

def reduce_a_text_to_text_of_interest(tagged_text: str) -> str:
    """Keep only the sentences of ``tagged_text`` that carry an entity marker.

    The text is split into sentences with the module-level stanza pipeline
    ``nlp``; each sentence is tested against the module-level ``e1_pattern``
    and ``e2_pattern``.

    Args:
        tagged_text: a text in which the entity mentions are tagged.

    Returns:
        The first sentence matching both patterns, alone, if there is one.
        Otherwise every sentence matching either pattern, in order, joined by
        a single space (an empty string when no sentence matches).
    """
    sentences_of_interest = []
    # Fix: segment the function argument. The original segmented the
    # notebook-global `text`, so every call returned the same result.
    for sentence in nlp(tagged_text).sentences:
        mentions_e1 = bool(e1_pattern.match(sentence.text))
        mentions_e2 = bool(e2_pattern.match(sentence.text))
        if mentions_e1 and mentions_e2:
            # Fix: the original referenced `sentence_text`, which is not defined
            # in this function (NameError, or a stale global from another cell).
            return sentence.text
        if mentions_e1 or mentions_e2:
            sentences_of_interest.append(sentence.text)
    return " ".join(sentences_of_interest)

def reduce_texts_to_text_of_interest(examples) -> Dict[str, List[Any]]:
    """Add a ``text_of_interest`` column to a batch of examples.

    Args:
        examples: a batch in column form, i.e. a mapping from column name to
            the list of values of that column; ``examples["text"]`` must be a
            list of texts.

    Returns:
        A plain dict holding every input column plus ``"text_of_interest"``,
        the reduced version of each text (same order and length as "text").
    """
    # `{**examples, ...}` only needs a mapping, so it works whether the batch is a
    # plain dict or another mapping type handed over by `datasets`.
    return {
        **examples,
        "text_of_interest": [reduce_a_text_to_text_of_interest(text) for text in examples["text"]],
    }

# batched=True is required: the function iterates over `examples["text"]` as a list of
# texts. Without it, `map` passes one example at a time, "text" is a single string and
# the loop would run over its characters.
interim_dataset["train"] = interim_dataset["train"].map(reduce_texts_to_text_of_interest, batched=True)