In [1]:
import json
import os
import spacy
from spacy.tokens import DocBin
from tqdm import tqdm

In [2]:
current_directory = os.getcwd()


Transformation du fichier label studio au format compatible avec spacy

In [3]:
INPUTFILE_LABEL_STUDIO_EXPORT_JSON = os.path.join(current_directory, '..', 'data', 'raw_data', 'raw-entities.json')

OUPUTFILE_SPACY_JSON = os.path.join(current_directory, '..', 'data', 'raw_data', 'spacy_output_format.json')

with open(INPUTFILE_LABEL_STUDIO_EXPORT_JSON, 'r') as f:

    # Lecture du fichier JSON
    data = json.load(f)

    output_list = []

    # Boucle sur chaque element du fichier JSON
    for item in data:
        annotations = item["annotations"]
        text_data = item["data"]["text"]

        entities = []
        for annotation in annotations:
            results = annotation["result"]
            for result in results:
                try:
                    value = result["value"]
                    start = value["start"]
                    end = value["end"]
                    
                   
                    # Convertir les labels en liste
                    labels = ",".join(value["labels"])
                    
                    entities.append([start, end, labels])

                except (KeyError, IndexError):
                    print("Error: Value n'existe pas.")

        output_list.append([text_data, {"entities": entities}])

    # Création du nouveau fichier
    with open(OUPUTFILE_SPACY_JSON, "w", encoding="utf-8") as outfile:
        json.dump(output_list, outfile, indent=2, ensure_ascii=False)

    print(f'Fichier créé {OUPUTFILE_SPACY_JSON} .')


Fichier créé c:\M2\Projet_Chatbot_NLP\scripts\..\data\raw_data\spacy_output_format.json .


In [6]:
# 
INPUTFILE_SPACY_JSON=os.path.join(current_directory, '..', 'data', 'raw_data', 'spacy_output_format.json')

OUTPUT_directory = os.path.join(current_directory, '..', 'data')

with open(INPUTFILE_SPACY_JSON, 'r') as f:

    # ouvrir le fichier JSON
    data = json.load(f)
    
    
    count_items = len(data)
    print("Dataset:", count_items)


TRAIN_DATA = data[:150]
TEST_DATA = data[150:]

print('\n--- TRAIN_DATA')
print(TRAIN_DATA)

print('\n--- TEST_DATA')
print(TEST_DATA)

    
# train data
def convert(path, dataset):
    nlp = spacy.blank("pt")
    db = DocBin()
    for text, annot in tqdm(dataset):
        doc = nlp.make_doc(text)
        ents = []
        for start, end, label in annot["entities"]:
            span = doc.char_span(start, end, label=label, alignment_mode="contract")
            if span is None:
                print("Skipping entity")
            else:
                ents.append(span)
                doc.ents = ents
                db.add(doc)
                db.to_disk(path)

print(f'Fichier créé {INPUTFILE_SPACY_JSON}.')

convert(os.path.join(OUTPUT_directory, 'train.spacy') , TRAIN_DATA)
convert(os.path.join(OUTPUT_directory, 'test.spacy'), TEST_DATA)

Dataset: 200

--- TRAIN_DATA
[["I'd like a Mediterranean salad and a bottle of water.", {'entities': [[11, 30, 'food'], [47, 52, 'food'], [35, 36, 'qnt'], [9, 10, 'qnt']]}], ['Hello, can I have 2 duck breasts, 1 glass of red wine, and a bottle of water?', {'entities': [[20, 32, 'food'], [45, 53, 'food'], [71, 76, 'food'], [18, 19, 'qnt'], [34, 35, 'qnt'], [59, 60, 'qnt']]}], ['Give me a Mediterranean salad and 2 glasses of orange juice.', {'entities': [[10, 29, 'food'], [47, 59, 'food'], [8, 9, 'qnt'], [34, 35, 'qnt']]}], ['Could you get 3 beef carpaccios, 2 bottles of mineral water, and a glass of orange juice?', {'entities': [[16, 31, 'food'], [46, 59, 'food'], [14, 15, 'qnt'], [33, 34, 'qnt'], [76, 88, 'food'], [65, 66, 'qnt']]}], ['Hello, could I please get a grilled salmon and 2 glasses of red wine?', {'entities': [[28, 42, 'food'], [60, 68, 'food'], [26, 27, 'qnt'], [47, 48, 'qnt']]}], ["I'll take 2 duck breasts and 1 glass of orange juice.", {'entities': [[12, 24, 'food'], [40, 

100%|██████████| 150/150 [00:02<00:00, 55.23it/s]
100%|██████████| 50/50 [00:00<00:00, 107.89it/s]


In [7]:
modelpath=os.path.join(current_directory, '..', 'models', 'output','model-best')
nlps = spacy.load(modelpath)

In [8]:
text = "I'll have one duck breast, 2 beef tartares, and 2 glasses of red wine."
doc = nlps(text)

In [9]:
for ent in doc.ents:
    print(ent.text, ent.label_)

one qnt
duck breast food
2 qnt
beef tartares food
red wine food
