# Elaboration du fichier json et entrainement de LUIS

In [1]:
import pandas as pd
import json
import numpy as np
from sklearn.model_selection import train_test_split


In [2]:
# Chagement du  fichier Frame

with open('frames\\frames.json') as fichier_frames:
    data = json.load(fichier_frames)
print(data[1]['turns'][0])

{'text': 'Hello, I am looking to book a vacation from Gotham City to Mos Eisley for $2100.', 'labels': {'acts': [{'args': [{'val': 'book', 'key': 'intent'}], 'name': 'inform'}, {'args': [{'val': 'Mos Eisley', 'key': 'dst_city'}, {'val': 'Gotham City', 'key': 'or_city'}, {'val': '2100', 'key': 'budget'}], 'name': 'inform'}, {'args': [], 'name': 'greeting'}], 'acts_without_refs': [{'args': [{'val': 'book', 'key': 'intent'}], 'name': 'inform'}, {'args': [{'val': 'Mos Eisley', 'key': 'dst_city'}, {'val': 'Gotham City', 'key': 'or_city'}, {'val': '2100', 'key': 'budget'}], 'name': 'inform'}, {'args': [], 'name': 'greeting'}], 'active_frame': 1, 'frames': [{'info': {'intent': [{'val': 'book', 'negated': False}], 'or_city': [{'val': 'Gotham City', 'negated': False}], 'dst_city': [{'val': 'Mos Eisley', 'negated': False}], 'budget': [{'val': '2100.0', 'negated': False}]}, 'frame_id': 1, 'requests': [], 'frame_parent_id': None, 'binary_questions': [], 'compare_requests': []}]}, 'author': 'user',

## Transformation du fichier frames en dataset avec les données utiles à LUIS

In [3]:
class create_luis_df:
    def __init__(self, frames_file):
        self.data = None
        self.df_data = None
        self.frames_file = frames_file
    
    def init_df_data(self):
        self.df_data = pd.DataFrame(columns=['text',
                                                    'from',
                                                    'from_start_pos',
                                                    'from_end_pos',
                                                    'to',
                                                    'to_start_pos',
                                                    'to_end_pos',
                                                    'str_date',
                                                    'str_date_start_pos',
                                                    'str_date_end_pos',
                                                    'end_date',
                                                    'end_date_start_pos',
                                                    'end_date_end_pos',
                                                    'budget',
                                                    'budget_start_pos',
                                                    'budget_end_pos',
                                                    'stratification'
                                            ])

    def get_data(self):
        with open(self.frames_file) as fichier_frames:
            self.data = json.load(fichier_frames)
    
    def get_pos(self, first_turn, keyword):
        start_pos, expression = self.get_start_pos(first_turn, keyword)
        end_pos = self.get_end_pos(start_pos, expression)
        return(start_pos, end_pos, expression)

    def get_start_pos(self, first_turn, keyword) -> int:
        if (type(first_turn['labels']['acts'])==list and len(first_turn['labels']['acts'])>1):
            for act in first_turn['labels']['acts']:
                if('args' in act):
                    dicts=act['args']
                    for dict in dicts:                  
                        if (dict['key']==keyword):
                            if('val' in dict):
                                expression = dict['val']
                                start_pos = first_turn['text'].find(expression)
                                if(start_pos>0):                    
                                    return (start_pos, expression)
                                else:
                                    return(None, None)
                            else:
                                print(dict)
        return(None, None)

    def get_end_pos(self, start_pos, expression) -> int:
        if(not start_pos == None):
            return (start_pos + len(expression)-1)
        else:
            return None

    def charge_df_data(self):       
        for one_data in self.data:
            first_turn = one_data['turns'][0]
            text = one_data['turns'][0]['text']
            from_start_pos, from_end_pos, or_city = self.get_pos(first_turn, 'or_city')
            to_start_pos, to_end_pos, dst_city = self.get_pos(first_turn, 'dst_city')
            depart_date_start_pos, depart_date_end_pos, str_date = self.get_pos(first_turn, 'str_date')
            return_date_start_pos, return_date_end_pos, end_date = self.get_pos(first_turn, 'end_date')
            max_price_start_pos, max_price_end_pos, budget = self.get_pos(first_turn, 'budget')
            stratification=("0" if(or_city==None) else "1")+("0" if(dst_city==None) else "1")+("0" if(str_date==None) else "1")+("0" if(end_date==None) else "1")+("0" if(budget==None) else "1")

            df_new_row = pd.DataFrame(  data=np.array([[
                                                        text,
                                                        or_city,
                                                        from_start_pos,
                                                        from_end_pos,
                                                        dst_city,
                                                        to_start_pos,
                                                        to_end_pos,
                                                        str_date,
                                                        depart_date_start_pos,
                                                        depart_date_end_pos,
                                                        end_date,                                                        
                                                        return_date_start_pos,
                                                        return_date_end_pos,
                                                        budget,
                                                        max_price_start_pos,
                                                        max_price_end_pos,
                                                        stratification
                                                    ]]),
                                        columns=[
                                                    'text',
                                                    'from',
                                                    'from_start_pos',
                                                    'from_end_pos',
                                                    'to',
                                                    'to_start_pos',
                                                    'to_end_pos',
                                                    'str_date',
                                                    'str_date_start_pos',
                                                    'str_date_end_pos',
                                                    'end_date',
                                                    'end_date_start_pos',
                                                    'end_date_end_pos',
                                                    'budget',
                                                    'budget_start_pos',
                                                    'budget_end_pos',
                                                    'stratification'
                                                ])
            self.df_data = pd.concat([self.df_data,df_new_row], ignore_index=True)
    
    def pipeline(self):
        self.init_df_data()
        self.get_data()
        self.charge_df_data()


monEssai = create_luis_df("frames\\frames.json")
monEssai.pipeline()

{'key': 'dst_city'}
{'key': 'dst_city'}
{'key': 'dst_city'}


In [4]:
monEssai.df_data.describe()

Unnamed: 0,text,from,from_start_pos,from_end_pos,to,to_start_pos,to_end_pos,str_date,str_date_start_pos,str_date_end_pos,end_date,end_date_start_pos,end_date_end_pos,budget,budget_start_pos,budget_end_pos,stratification
count,1369,653,653,653,752,752,752,305,305,305,173,173,173,184,184,184,1369
unique,1329,228,174,175,228,180,177,177,140,145,128,114,119,106,128,130,25
top,hi,Beijing,32,38,Punta Cana,16,25,August 27th,48,58,16,62,71,3200,7,42,0
freq,10,13,18,18,17,33,26,12,10,8,5,6,5,7,6,4,383


In [5]:
monEssai.df_data.head()

Unnamed: 0,text,from,from_start_pos,from_end_pos,to,to_start_pos,to_end_pos,str_date,str_date_start_pos,str_date_end_pos,end_date,end_date_start_pos,end_date_end_pos,budget,budget_start_pos,budget_end_pos,stratification
0,I'd like to book a trip to Atlantis from Capri...,Caprica,41.0,47.0,Atlantis,27,34,"Saturday, August 13, 2016",52.0,76.0,,,,1700,117.0,120.0,11101
1,"Hello, I am looking to book a vacation from Go...",Gotham City,44.0,54.0,Mos Eisley,59,68,,,,,,,2100,75.0,78.0,11001
2,Hello there i am looking to go on a vacation w...,,,,Gotham City,63,73,,,,,,,,,,1000
3,"Hi I'd like to go to Caprica from Busan, betwe...",Busan,34.0,38.0,Caprica,21,27,"Sunday August 21, 2016",49.0,70.0,"Wednesday August 31, 2016",76.0,100.0,,,,11110
4,"Hello, I am looking to book a trip for 2 adult...",Kochi,106.0,110.0,Denver,116,121,,,,,,,"$21,300",67.0,73.0,11001


In [6]:
monEssai.df_data['stratification'].value_counts()

00000    383
11000    248
01000    213
10000     97
11001     70
11100     59
11110     56
11111     23
10100     23
10110     23
01100     21
01110     21
00110     21
10001     19
00001     17
11101     15
10111     15
00100     14
01001     11
01101      5
01111      5
11010      5
00111      3
00010      1
00101      1
Name: stratification, dtype: int64

## Transformation du dataset en json exploitable par LUIS

In [7]:
class create_luis_data:
    def __init__(
                    self,
                    df_frames,
                    file_name_json          ="D:\\projets_ocr\\01_projets_python\\Projet_10_JSON\\data.json",
                    luis_schema_version     ="3.2.0",
                    versionId               ="0.1",
                    name                    ="projet_10_fly_booking",
                    desc                    ="A LUIS model for booking a fly.",
                    culture                 ="en-us",
                    tokenizerVersion        ="1.0.0",
                    intents_list            =['BookFlight'],
                    entities_list           =['from', 'to', 'str_date', 'end_date', 'budget'],
                    composites_list         =[],
                    patternAnyEntities_list =[],
                    regex_entities_list     =[],
                    prebuiltEntities_list   =[],
                    model_features_list     =[],
                    regex_features_list     =[],
                    patterns_list           =[],
                    utterances_list         =[],
                ):
        self.df_data                    = df_frames
        self.file_name_json             = file_name_json
        self.luis_schema_version        = luis_schema_version
        self.versionId                  = versionId
        self.name                       = name
        self.desc                       = desc
        self.culture                    = culture
        self.tokenizerVersion           = tokenizerVersion
        self.intents_list               = intents_list
        self.entities_list              = entities_list
        self.composites_list            = composites_list
        self.patternAnyEntities_list    = patternAnyEntities_list
        self.regex_entities_list        = regex_entities_list
        self.prebuiltEntities_list      = prebuiltEntities_list
        self.regex_features_list        = regex_features_list
        self.patterns_list              = patterns_list
        self.utterances_list            = utterances_list
        

    def luis_structure_python(self)->dict:
        struct_pyth = {}
        struct_pyth["luis_schema_version"]      = self.luis_schema_version
        struct_pyth["versionId"]                = self.versionId
        struct_pyth["name"]                     = self.name
        struct_pyth["desc"]                     = self.desc
        struct_pyth["culture"]                  = self.culture
        struct_pyth["tokenizerVersion"]         = self.tokenizerVersion
        struct_pyth["intents"]                  = self.get_intents()
        struct_pyth["entities"]                 = self.get_entities()
        struct_pyth["composites"]               = self.get_composites()
        struct_pyth["closedLists"]              = self.get_closedLists()
        struct_pyth["patternAnyEntities"]       = self.get_patternAnyEntities()
        struct_pyth["regex_entities"]           = self.get_regex_entities()
        struct_pyth["prebuiltEntities"]         = self.get_prebuiltEntities()
        struct_pyth["model_features"]           = self.get_model_features()
        struct_pyth["regex_features"]           = self.get_regex_features()
        struct_pyth["patterns"]                 = self.get_patterns()
        struct_pyth["utterances"]               = self.get_utterances()
        struct_pyth["settings"]                 = self.get_settings()
        return  struct_pyth

    def get_intents(self)->list:
        result = []        
        for intent in self.intents_list:
            w_dict = {}
            w_dict['name'] = intent
            result.append(w_dict)
        return result

    def get_entities(self)->list:
        result = []
        for entite in self.entities_list:
            w_dict = {}
            w_dict['name'] = entite
            result.append(w_dict)
        return result

    def get_composites(self)->list:
        result = []
        return result
    
    def get_closedLists(self)->list:
        result = []
        return result

    def get_patternAnyEntities(self)->list:
        result = []
        return result

    def get_regex_entities(self)->list:
        result = []
        return result

    def get_prebuiltEntities(self)->list:
        result = [{
                    "name": "datetimeV2",
                    "roles": []
                }]
        return result

    def get_model_features(self)->list:
        result = []
        return result

    def get_regex_features(self)->list:
        result = []
        return result

    def get_patterns(self)->list:
        result = []
        return result

    def get_utterances(self)->list:
        result = []
        for i in range(len(self.df_data)):
            w_dict = {}
            w_dict['text'] = self.df_data.loc[i,"text"]
            w_dict['intentName'] = "BookFlight"
            w_dict['entityLabels'] = self.get_utterances_entities(i)
            # il faut coder la fonction get_utterances_entities qui permet de récupérer
            # les entities pour une phrase
            result.append(w_dict)
        return result

    def get_utterances_entities(self, line)->list:
        START_POS="_start_pos"
        END_POS="_end_pos"
        result = []
        for entity in self.entities_list:
            # print(self.entities_list)
            dict={}
            # print(entity+START_POS)
            # print(self.df_data.loc[line,entity+START_POS])
            # print(entity+END_POS)
            # print(self.df_data.loc[line,entity+END_POS])

            if self.df_data.loc[line,entity+START_POS] != None and self.df_data.loc[line,entity+END_POS] != None:
                dict['entityName']=entity
                dict['startCharIndex']=self.df_data.loc[line,entity+START_POS]
                dict['endCharIndex']=self.df_data.loc[line,entity+END_POS]
                result.append(dict)
        return result

    def get_settings(self)->list:
        result = []
        return result

    def save_jsons(self, struct_pyth, file_name):
        with open(file_name, 'w') as fp:
            json.dump(struct_pyth, fp, indent=4)

    def json_pipeline(self):
        struct_pyth = self.luis_structure_python()
        self.save_jsons(struct_pyth, self.file_name_json)


## Entrainement, test et déploiement de LUIS

In [8]:
from azure.cognitiveservices.language.luis.authoring import LUISAuthoringClient
from azure.cognitiveservices.language.luis.authoring.models import ApplicationCreateObject, PrebuiltEntity, PrebuiltEntityExtractor
from azure.cognitiveservices.language.luis.runtime import LUISRuntimeClient
from msrest.authentication import CognitiveServicesCredentials
from functools import reduce

import json, time, uuid,os

In [9]:
from config import DefaultConfig
CONF= DefaultConfig()
auth_endpoint: str = CONF.LUIS_AUTHORING_END_POINT
auth_key: str = CONF.LUIS_AUTHORING_KEY
pred_endpoint: str = CONF.LUIS_PREDICTION_END_POINT
pred_key: str = CONF.LUIS_PREDICTION_KEY
luis_app_id : str = CONF.LUIS_APP_ID

In [10]:
# On prépare les données d'entrainement et les données de test
# X_train, X_test = train_test_split(monEssai.df_data, test_size=0.2, stratify=monEssai.df_data['stratification'])
df_complet = monEssai.df_data.drop(columns=["stratification"])
df_train, df_test = train_test_split(df_complet, test_size=0.2, random_state=20)

df_train=df_train.reset_index(drop=True)
df_test=df_test.reset_index(drop=True)

In [11]:
df_train.index

RangeIndex(start=0, stop=1095, step=1)

In [12]:
print("nombre total de lignes : ", df_complet.shape)
print("nombre de lignes d'entrainement : ",df_train.shape[0])
print("nombre de lignes de test : ",df_test.shape[0])


nombre total de lignes :  (1369, 16)
nombre de lignes d'entrainement :  1095
nombre de lignes de test :  274


In [13]:
class create_luis_app():
    def __init__(self,authoring_endpoint,authoring_key,appName,versionId,culture,app_id=None,luis_app_json=None) -> None:
        self.endpoint           =authoring_endpoint
        self.credentials        =CognitiveServicesCredentials(authoring_key)
        self.appName            =appName
        self.versionId          =versionId
        self.client             =LUISAuthoringClient(self.endpoint, self.credentials)
        self.luis_app_json      =luis_app_json
        self.app_id             =app_id
        self.culture            =culture
        if(self.app_id==None) : self.create_app()
    
    def create_app(self):
        # define app basics
        appDefinition = ApplicationCreateObject(name=self.appName, initial_version_id=self.versionId, culture=self.culture)
        # create app
        self.app_id = self.client.apps.add(appDefinition)

    def add_intents(self, intents):
        for intent in intents:
            print(intent['name'])
            self.client.model.add_intent(self.app_id, self.versionId, intent['name'])
    
    def add_entities(self, entities):
        for entitie in entities:
            print(entitie['name'])
            modelId=self.client.model.add_entity(self.app_id, self.versionId, name=entitie["name"])
            print("modelId",modelId)
            model_object=self.client.model.get_entity(self.app_id, self.versionId, modelId)
            print("model_object",model_object)
    
    def add_utterances(self, utterances):
        for utterance in utterances:
            self.client.examples.add(self.app_id, self.versionId, utterance)
    
    def train_luis(self):
        self.client.train.train_version(self.app_id, self.versionId)
        waiting = True
        while waiting:
            info = self.client.train.get_status(self.app_id, self.versionId)
            # get_status returns a list of training statuses, one for each model.
            # Loop through them and make sure all are done.
            waiting = any(map(lambda x: 'Queued' == x.details.status or 'InProgress' == x.details.status, info))
            if waiting:
                print ("Waiting 10 seconds for training to complete...")
                time.sleep(10)
            else: 
                print ("trained")
                waiting = False
    
    def publish_luis(self):
        # Mark the app as public so we can query it using any prediction endpoint.
        # Note: For production scenarios, you should instead assign the app to your own LUIS prediction endpoint. See:
        # https://docs.microsoft.com/en-gb/azure/cognitive-services/luis/luis-how-to-azure-subscription#assign-a-resource-to-an-app
        self.client.apps.update_settings(self.app_id, is_public=True)
        responseEndpointInfo = self.client.apps.publish(self.app_id, self.versionId, is_staging=False)


In [14]:
my_luis_data = create_luis_data(df_train)
luis_dict = my_luis_data.luis_structure_python()
my_app_luis=create_luis_app(
                            auth_endpoint,
                            auth_key,
                            "FlightBooking",
                            "0.1",
                            "en-us",
                            )

In [15]:
my_app_luis.add_intents(luis_dict["intents"])

BookFlight


In [16]:
my_app_luis.add_entities(luis_dict["entities"])

from
modelId 36791a25-b7a4-433f-9c55-167218cc0f3f
model_object {'additional_properties': {}, 'id': '36791a25-b7a4-433f-9c55-167218cc0f3f', 'name': 'from', 'type_id': 1, 'readable_type': 'Entity Extractor', 'roles': [], 'custom_prebuilt_domain_name': None, 'custom_prebuilt_model_name': None, 'children': []}
to
modelId e791a37a-35fe-40d6-928c-77c97da259ab
model_object {'additional_properties': {}, 'id': 'e791a37a-35fe-40d6-928c-77c97da259ab', 'name': 'to', 'type_id': 1, 'readable_type': 'Entity Extractor', 'roles': [], 'custom_prebuilt_domain_name': None, 'custom_prebuilt_model_name': None, 'children': []}
str_date
modelId 321f733d-0e11-49df-b85a-5e2c4bd3a0e6
model_object {'additional_properties': {}, 'id': '321f733d-0e11-49df-b85a-5e2c4bd3a0e6', 'name': 'str_date', 'type_id': 1, 'readable_type': 'Entity Extractor', 'roles': [], 'custom_prebuilt_domain_name': None, 'custom_prebuilt_model_name': None, 'children': []}
end_date
modelId 52e082d0-77e5-4aca-b9fa-6df663492a5b
model_object {'add

In [17]:
my_app_luis.add_utterances(luis_dict["utterances"])

In [18]:
my_app_luis.client.model.add_prebuilt(app_id=my_app_luis.app_id,version_id='0.1', prebuilt_extractor_names=["datetimeV2"])

[<azure.cognitiveservices.language.luis.authoring.models._models_py3.PrebuiltEntityExtractor at 0x26ddd4d2c70>]

In [19]:
my_app_luis.train_luis()

Waiting 10 seconds for training to complete...
Waiting 10 seconds for training to complete...
trained


In [20]:
my_app_luis.publish_luis()

In [21]:
auth_endpoint

'https://p10luis2023-authoring.cognitiveservices.azure.com/'

In [22]:
my_app_luis.versionId

'0.1'

In [23]:
my_luis_test= create_luis_data(df_test)
struct_pyth_test = my_luis_data.luis_structure_python()
utterance = struct_pyth_test["utterances"][0]

In [24]:
for col in df_test.columns:
    if(df_test.loc[1,col]!=None):
        print(col)

text
from
from_start_pos
from_end_pos
to
to_start_pos
to_end_pos


In [25]:
from azure.cognitiveservices.language.luis.authoring import LUISAuthoringClient
from azure.cognitiveservices.language.luis.runtime import LUISRuntimeClient
from azure.cognitiveservices.language.luis.runtime.models import PredictionRequest
from msrest.authentication import CognitiveServicesCredentials

# On récupère les information de connexion
endpoint = pred_endpoint
authoring_key = pred_key
runtime_key = pred_key
app_id = my_app_luis.app_id
true_entities={}
wrong_entities={}

# Création des instances de LUISAuthoringClient et LUISRuntimeClient
authoring_client = LUISAuthoringClient(endpoint, CognitiveServicesCredentials(authoring_key))
runtime_client = LUISRuntimeClient(endpoint, CognitiveServicesCredentials(runtime_key))

# Pour chaque phrase du dataset de test
for line in range(len(df_test)):
    sorted_list_entitie=[]
    best_result_entitie=""
    instance={}
    # On définit la phrase à envoyer à LUIS
    input_text = df_test.loc[line,"text"]

    # On Crée une requête de prédiction pour LUIS
    prediction_request = PredictionRequest(query=input_text)

    # On envoie la requête de prédiction à LUIS
    prediction_response = runtime_client.prediction.get_slot_prediction(app_id=app_id, slot_name="production", prediction_request=prediction_request, verbose=True)

    # On obtient la prédiction de l'intent
    predicted_entities = prediction_response.prediction.entities
    predicted_intent = prediction_response.prediction.intents
    # On vérifie si la prédiction de l'intent correspond à ce qui est prévu
    if "BookFlight" in predicted_intent:
        for key, value in predicted_entities.items():
            if(key not in ['$instance', 'datetimeV2']):
                if predicted_entities.get(key, [{"$instance": {}}])[0]:
                    instance=predicted_entities.get("$instance", {}).get(key, [])
                    sorted_list_entitie = sorted(instance, key=lambda x: x['score'], reverse=True)
                    best_result_entitie = instance[0]["text"]

                    if (df_test.loc[line,key]==best_result_entitie):
                        # Bonne prédiction
                        if key in true_entities:
                            true_entities[key]=true_entities[key]+1
                        else:
                            true_entities[key]=1
                    else:
                            # Mauvaise prédiction
                            if key in wrong_entities:
                                wrong_entities[key]=wrong_entities[key]+1
                            else:
                                wrong_entities[key]=1
                else:
                    # on trouve pas donc c'est faux
                    if key in wrong_entities:
                            wrong_entities[key]=wrong_entities[key]+1
                    else:
                        wrong_entities[key]=1
    else:
        print("pas BookFlight")
        print(input_text)
        for col in df_test.columns:
            if(col not in ['text','from_start_pos','from_end_pos','to_start_pos','to_end_pos'] and df_test.loc[1,col]!=None):
                # on n'a pas trouvé l'intent donc chaque intent de la phrase n'est pas trouvé
                if key in wrong_entities:
                    wrong_entities[col]=wrong_entities[col]+1
                else:
                    wrong_entities[col]=1
    

pas BookFlight
psssstttttt


In [26]:
print(true_entities)
print(wrong_entities)

{'end_date': 31, 'from': 110, 'to': 119, 'str_date': 41, 'budget': 24}
{'from': 20, 'end_date': 13, 'to': 22, 'budget': 9, 'str_date': 17}


In [27]:
entities_acc={}
for key, value in true_entities.items():
    counterEntityFunc = df_test[key].apply(
        lambda x: True if x != None else False)
    nb = len(counterEntityFunc[counterEntityFunc == True].index)
    entities_acc[key]= int(value)/nb

print(entities_acc)

{'end_date': 0.8857142857142857, 'from': 0.7971014492753623, 'to': 0.7986577181208053, 'str_date': 0.6507936507936508, 'budget': 0.7272727272727273}


In [28]:
# # Création d'une application luis complète avec un json

# import requests

# struct_pyth = luis_json.luis_structure_python()
# struct_json = json.dumps(struct_pyth)
# json.dump

# url = 'https://luisp10-authoring.cognitiveservices.azure.com/luis/authoring/v3.0-preview/apps/import'

# headers={
#     'content-type': 'application/json',
#     'Ocp-apim-subscription-key':'99ce3768cc2c485ea84238e1e9a52e8d'
# }
# params={
#     'Endpoint':'https://luisp10-authoring.cognitiveservices.azure.com/'
# }

# x = requests.post(url, headers=headers, params=params, data=struct_json)

# print(x.text)