In [1]:
# Import required libraries and install any necessary packages
import spacy
from spacy.tokens import DocBin
from tqdm import tqdm
import json

# Check the installed version of spaCy
spacy.__version__

# Check GPU information
!nvidia-smi

  from .autonotebook import tqdm as notebook_tqdm


Mon Jun  3 09:10:16 2024       
+-----------------------------------------------------------------------------------------+
| NVIDIA-SMI 552.22                 Driver Version: 552.22         CUDA Version: 12.4     |
|-----------------------------------------+------------------------+----------------------+
| GPU  Name                     TCC/WDDM  | Bus-Id          Disp.A | Volatile Uncorr. ECC |
| Fan  Temp   Perf          Pwr:Usage/Cap |           Memory-Usage | GPU-Util  Compute M. |
|                                         |                        |               MIG M. |
|   0  NVIDIA GeForce RTX 3070 ...  WDDM  |   00000000:01:00.0  On |                  N/A |
| N/A   48C    P8             17W /  128W |    2407MiB /   8192MiB |      2%      Default |
|                                         |                        |                  N/A |
+-----------------------------------------+------------------------+----------------------+
                                                

In [10]:
import json

# Charger les données depuis spacy_cleaned_data.json
with open('spacy_cleaned_data.json', 'r', encoding='utf-8') as f:
    data = json.load(f)

# Préparer la structure pour annotations.json
annotations = []

# Parcourir chaque entrée dans le fichier spacy_cleaned_data.json
for entry in data:
    text = entry['text']
    ents = entry['ents']
    
    entities = []
    for ent in ents:
        start, end, label = ent['start'], ent['end'], ent['label']
        entities.append([start, end, label])
    
    # Ajouter l'entrée formatée dans la liste des annotations
    annotations.append([text, {"entities": entities}])

# Enregistrer les données formatées dans annotations.json
with open('annotations.json', 'w', encoding='utf-8') as f:
    json.dump(annotations, f, ensure_ascii=False, indent=4)

print("Conversion terminée. Les données sont enregistrées dans annotations.json.")


Conversion terminée. Les données sont enregistrées dans annotations.json.


In [5]:
import re

def trim_entity_spans(data: list) -> list:
    """Removes leading and trailing white spaces from entity spans.

    Args:
        data (list): The data to be cleaned in spaCy JSON format.

    Returns:
        list: The cleaned data.
    """
    invalid_span_tokens = re.compile(r'\s')

    cleaned_data = []
    for text, annotations in data:
        entities = annotations['entities']
        valid_entities = []
        for start, end, label in entities:
            valid_start = start
            valid_end = end
            while valid_start < len(text) and invalid_span_tokens.match(text[valid_start]):
                valid_start += 1
            while valid_end > valid_start and valid_end <= len(text) and invalid_span_tokens.match(text[valid_end - 1]):
                valid_end -= 1
            if valid_start < valid_end:  # Ensure the span is still valid
                valid_entities.append([valid_start, valid_end, label])
        cleaned_data.append([text, {'entities': valid_entities}])

    return cleaned_data

In [11]:
# Load the annotated data from a JSON file
cv_data = json.load(open('./annotations.json','r'))

# Display the number of items in the dataset
#len(cv_data)
#print(cv_data)
trimed_data = trim_entity_spans(cv_data)
#print(trimed_data)

# Display the first item in the dataset
cv_data[3]

['408048 infirmier(Ã¨re infirmier(Ã¨re \n clinicien(ne Ste Anne PÃ©rade \n\n IMPRIMER \n\n\n\n employeur \n centre intÃ©grÃ© universitaire santÃ© SERVICES \n sociau mauricie centre QUÃ‰BEC \n site web \n https://www.travaillerensante.com/ \n adresse lieu travail \n 752 rue couvent Saint-Tite quÃ©bec Canada G0X3H0 \n appellation emploi \n infirmier diplÃ´mÃ© infirmier diplÃ´mÃ© \n nombre poste \n 1 \n contacter \n ciuss MCQ \n Courriel 04codes@ssss.gouv.qc.ca \n description entreprise \n ciuss MCQ crÃ©er premier avril 2015 issu 12 Ã©tablissement public santÃ© service social rÃ©gion sociosanitair \n responsabilitÃ© assurer intÃ©gration soin service offrir population rÃ©seau territorial veille organisation service \n complÃ©mentaritÃ© cadre mission CH CLSC chsld centre protection enfance jeunesse centre \n rÃ©adaptation santÃ© public fonction besoin population rÃ©alitÃ© territorial \n description offrir emploi \n relation aide domaine passionn reconnaÃ®tre leadership orientÃ©e clientÃ¨le 

In [12]:
# Define a function to create spaCy DocBin objects from the annotated data
def get_spacy_doc(file, data):
  # Create a blank spaCy pipeline
  nlp = spacy.blank('en')
  db = DocBin()

  # Iterate through the data
  for text, annot in tqdm(data):
    doc = nlp.make_doc(text)
    annot = annot['entities']

    ents = []
    entity_indices = []

    # Extract entities from the annotations
    for start, end, label in annot:
      skip_entity = False
      for idx in range(start, end):
        if idx in entity_indices:
          skip_entity = True
          break
      if skip_entity:
        continue

      entity_indices = entity_indices + list(range(start, end))
      try:
        span = doc.char_span(start, end, label=label, alignment_mode='strict')
      except:
        continue

      if span is None:
        # Log errors for annotations that couldn't be processed
        err_data = str([start, end]) + "    " + str(text) + "\n"
        file.write(err_data)
      else:
        ents.append(span)

    try:
      doc.ents = ents
      db.add(doc)
    except:
      pass

  return db

In [13]:
# Split the annotated data into training and testing sets
from sklearn.model_selection import train_test_split
train, test = train_test_split(trimed_data, test_size=0.2)

# Display the number of items in the training and testing sets
len(train), len(test)

# Open a file to log errors during annotation processing
file = open('train_file.txt','w')

# Create spaCy DocBin objects for training and testing data
db = get_spacy_doc(file, train)
db.to_disk('train_data.spacy')

db = get_spacy_doc(file, test)
db.to_disk('test_data.spacy')

# Close the error log file
file.close()

100%|██████████| 11/11 [00:00<00:00, 305.57it/s]
100%|██████████| 3/3 [00:00<00:00, 187.51it/s]


## Ligne de commande à executer dans le terminal:
 python -m spacy train ./config.cfg  --output ./output  --paths.train ./train_data.spacy  --paths.dev ./test_data.spacy --gpu-id 0