# Step 2: Generate queries

From Step 1, we now have a set of EN/FR pairs that we can use for information retrieval (IR) and cross-language information retrieval (CLIR) evaluations.

But first, we need queries. In particular, we want natural-language queries, since that is how users are expected to interact with the system.

In [1]:
import pandas as pd
import numpy as np

# Read the pairs file produced in Step 1 and replace missing values with
# empty strings.
df = (
    pd.read_csv("laws_pairs.csv.zip")
    .fillna("")
)

In [5]:
# Display the column labels of df.
df.columns

Index(['section_id', 'doc_id', 'type', 'doc_title_eng', 'doc_title_fra',
       'section_str_eng', 'section_str_fra', 'heading_str_eng',
       'heading_str_fra', 'text_eng', 'text_fra', 'char_cnt_eng',
       'char_cnt_fra', 'token_cnt_eng', 'token_cnt_fra'],
      dtype='object')

In [20]:
def combine_text(row):
    """Build the combined English and French text for one row.

    Each combined text consists of the document title, an optional
    `` > <heading>`` line (left out when the heading string is empty), the
    section string, a ``---`` separator line, and the section text.

    Args:
        row: Mapping-like object (for example a DataFrame row) that can be
            indexed by the ``doc_title_*``, ``heading_str_*``,
            ``section_str_*`` and ``text_*`` keys for ``eng`` and ``fra``.

    Returns:
        A ``(english_text, french_text)`` tuple of strings.
    """
    layout = "{}\n{}{}\n---\n{}"

    def _render(lang):
        # The heading line is only emitted when the heading is non-empty.
        heading = row["heading_str_" + lang]
        heading_line = " > " + heading + "\n" if heading else ""
        return layout.format(
            row["doc_title_" + lang],
            heading_line,
            row["section_str_" + lang],
            row["text_" + lang],
        )

    return _render("eng"), _render("fra")

# Run combine_text on every row; each result is indexed below as an
# (English, French) pair.
combined_texts = df.apply(combine_text, axis=1)

# Split the pairs into one list per language and store them as new columns.
eng_texts, fra_texts = [], []
for pair in combined_texts:
    eng_texts.append(pair[0])
    fra_texts.append(pair[1])
df["text_combined_eng"] = eng_texts
df["text_combined_fra"] = fra_texts

# Show five randomly chosen rows of the two new columns.
df[['text_combined_eng', 'text_combined_fra']].sample(5)

Unnamed: 0,text_combined_eng,text_combined_fra
32470,Telecommunications Act\n > Investigation and E...,Loi sur les télécommunications\n > Enquêtes et...
58177,Canada Occupational Health and Safety Regulati...,Règlement canadien sur la santé et la sécurité...
13221,Excise Tax Act\n > Air Transportation Tax > Pe...,Loi sur la taxe d’accise\n > Taxe de transport...
50300,Apprentice Loans Regulations\n > Removal of Re...,Règlement sur les prêts aux apprentis\n > Levé...
28950,Pension Act\n > Pensions > Pensions for Death\...,Loi sur les pensions\n > Pensions > Pensions p...


In [35]:
# Seed NumPy's global RNG before sampling (df.sample is given no
# random_state of its own), then draw 1000 rows from df.
np.random.seed(seed=42)
small_df = df.sample(n=1000)

## Question generation with LlamaIndex

In [94]:
import os
import textwrap as tr

import nest_asyncio
from azure.identity import DefaultAzureCredential, ManagedIdentityCredential
from azure.keyvault.secrets import SecretClient
from dotenv import load_dotenv
from llama_index.core import ServiceContext, set_global_service_context
from llama_index.embeddings.azure_openai import AzureOpenAIEmbedding
from llama_index.llms.azure_openai import AzureOpenAI
from llama_index.core.prompts import ChatMessage, ChatPromptTemplate, MessageRole
from tqdm import tqdm

# This is a hack to get some things to work in Jupyter Notebooks.
# NOTE(review): presumably needed because the notebook kernel already runs an
# asyncio event loop and some library calls below start their own — confirm.
nest_asyncio.apply()

def pwrap(text):
    """Print ``str(text)`` wrapped to a width of 80 characters."""
    wrapped_lines = tr.wrap(str(text), width=80)
    print("\n".join(wrapped_lines))

# Load environment variables from .env file.
# This stays best effort: a failure is reported but does not stop the
# notebook. The handler is narrowed from a bare `except:` so that
# KeyboardInterrupt / SystemExit are no longer swallowed.
try:
    load_dotenv(dotenv_path=".env")
except Exception as exc:
    print(f"Could not load .env file: {exc!r}")

# If we're running on Azure, use the Managed Identity to get the secrets.
# CREDENTIAL_TYPE may be unset: default to "" so .lower() is never called on
# None, and fall through to DefaultAzureCredential in that case.
if os.environ.get("CREDENTIAL_TYPE", "").lower() == "managed":
    credential = ManagedIdentityCredential()
else:
    credential = DefaultAzureCredential()

# Login to KeyVault using Azure credentials
client = SecretClient(
    vault_url=os.environ.get("AZURE_KEYVAULT_URL"), credential=credential
)

# Endpoint and API version come from the environment; the API key is read
# from the Key Vault secret named "OPENAI-SERVICE-KEY".
OPENAI_API_BASE = os.environ.get("AZURE_OPENAI_ENDPOINT")
OPENAI_API_VERSION = os.environ.get("AZURE_OPENAI_VERSION")
OPENAI_API_KEY = client.get_secret("OPENAI-SERVICE-KEY").value

# Lower-case aliases used by the query-generation functions below.
api_key = OPENAI_API_KEY
azure_endpoint = OPENAI_API_BASE
api_version = OPENAI_API_VERSION

### Generate easy questions with GPT-3.5

In [None]:

# User-message template: places the context chunk ({context_str}) between
# separator lines and asks for a question based only on that context.
QUESTION_GEN_USER_TMPL = (
    "Context information is below.\n"
    "---------------------\n"
    "{context_str}\n"
    "---------------------\n"
    "Given the context information and not prior knowledge, "
    "generate the relevant question."
)

# System-message template for the "easy" questions; {language} is filled in
# when the messages are formatted.
# NOTE(review): "an cross-language" is a typo, but this text is sent to the
# model, so it is left untouched here to keep the prompt unchanged.
QUESTION_GEN_SYS_TMPL = """\
You are labelling an cross-language information retrieval (CLIR) dataset.
You are given a chunk of context information, which will be in {language}.
Generate a question, in Canadian {language}, that relates to the context information.
The questions will be used to evaluate the quality of the information retrieval system.
Restrict the question to the context information provided.\
"""

# Chat prompt made of the system message followed by the user message.
question_gen_template = ChatPromptTemplate(
    message_templates=[
        ChatMessage(role=MessageRole.SYSTEM, content=QUESTION_GEN_SYS_TMPL),
        ChatMessage(role=MessageRole.USER, content=QUESTION_GEN_USER_TMPL),
    ]
)

def generate_queries(texts, language="english"):
    """Generate one "easy" question per context text with the GPT-3.5 deployment.

    Args:
        texts: Iterable of context strings; each one is inserted into the
            prompt as ``context_str``.
        language: Language name substituted into the prompt templates.

    Returns:
        A list of question strings in input order, one per text. A request
        that fails (or returns no content) contributes ``""`` so the list
        stays aligned with ``texts`` and the gap can be filled in later.
    """
    llm = AzureOpenAI(
        model="gpt-35-turbo",
        deployment_name="gpt-35-turbo-unfiltered",
        api_key=api_key,
        azure_endpoint=azure_endpoint,
        api_version=api_version,
        temperature=0.1,
    )
    queries = []
    for text in tqdm(texts):
        fmt_messages = question_gen_template.format_messages(
            context_str=text,
            language=language,
        )
        try:
            chat_response = llm.chat(fmt_messages)
            # The content may be None; store "" instead so that joining the
            # queries and the `== ""` missing-value check both keep working.
            queries.append(chat_response.message.content or "")
        except Exception:
            # Best effort: record a placeholder and keep going. Only
            # Exception is caught so Ctrl-C can still interrupt the loop.
            queries.append("")

    return queries

In [None]:
from joblib import Parallel, delayed

# joblib.Parallel returns the results in the same order as the submitted
# tasks, so the flattened list below lines up with the rows of small_df.
# Each task handles a batch of 5 consecutive rows. small_df keeps the shuffled
# index labels from df.sample, so the batches are taken by position with .iloc.
easy_eng_queries = Parallel(n_jobs=-1, backend="loky")(
    delayed(generate_queries)(small_df["text_combined_eng"].iloc[i:i+5], "English")
    for i in range(0, len(small_df), 5)
)

# Flatten the per-batch lists into a single list of queries.
easy_eng_queries = [q for sublist in easy_eng_queries for q in sublist]

with open("easy_queries_eng.txt", "w", encoding="utf-8") as f:
    f.write("\n".join(easy_eng_queries))

In [79]:
# Generate the easy French queries in batches of 5 rows. small_df keeps the
# shuffled index labels from df.sample, so the batches are taken by position
# with .iloc.
easy_fra_queries = Parallel(n_jobs=-1, backend="loky")(
    delayed(generate_queries)(small_df["text_combined_fra"].iloc[i:i+5], "French")
    for i in range(0, len(small_df), 5)
)

# Flatten the per-batch lists into a single list of queries.
easy_fra_queries = [q for sublist in easy_fra_queries for q in sublist]

with open("easy_queries_fra.txt", "w", encoding="utf-8") as f:
    f.write("\n".join(easy_fra_queries))



### Generate fuzzier questions with GPT-4

The previously generated questions might actually be too easy for a keyword retriever, since they tend to reuse the exact words of the source text.

This time, we'll try prompting the model to write fuzzier questions that use synonyms, paraphrasing, and similar variations.

In [73]:
# System-message template for the "hard" questions: it asks for paraphrased,
# possibly misspelled or more general questions of at most 10 words.
# {language} is filled in when the messages are formatted.
# NOTE(review): "an cross-language" is a typo, but this text is sent to the
# model, so it is left untouched here to keep the prompt unchanged.
QUESTION_GEN_SYS_TMPL_HARD = """\
You are labelling an cross-language information retrieval (CLIR) dataset.
You are given a chunk of context information, which will be in {language}.
Generate a question, in Canadian {language}, that relates to the context information.
The questions will be used to evaluate the quality of the information retrieval system.

Make the question slightly difficult for the IR system!
For example, use synonyms or paraphrasing rather than the exact words in the context.
You can include misspellings!
You may also ask more general questions, which the context only partially answers,
but it should still be possible for a very good retriever to find this context chunk
when given your question.
The questions should be in simple {language} and at most 10 words long.

Restrict the question to the context information provided.\
"""

# Chat prompt made of the hard system message followed by the user message
# template QUESTION_GEN_USER_TMPL.
question_gen_template_hard = ChatPromptTemplate(
    message_templates=[
        ChatMessage(role=MessageRole.SYSTEM, content=QUESTION_GEN_SYS_TMPL_HARD),
        ChatMessage(role=MessageRole.USER, content=QUESTION_GEN_USER_TMPL),
    ]
)

def generate_queries_hard(texts, language="English"):
    """Generate one "hard" (fuzzier) question per context text with the GPT-4 deployment.

    Args:
        texts: Iterable of context strings; each one is inserted into the
            prompt as ``context_str``.
        language: Language name substituted into the prompt templates.

    Returns:
        A list of question strings in input order, one per text. A request
        that fails (or returns no content) contributes ``""`` so the list
        stays aligned with ``texts`` and the gap can be filled in later.
    """
    llm = AzureOpenAI(
        model="gpt-4",
        deployment_name="gpt-4-unfiltered",
        api_key=api_key,
        azure_endpoint=azure_endpoint,
        api_version=api_version,
        temperature=0.5,
    )
    queries = []
    for text in tqdm(texts):
        fmt_messages = question_gen_template_hard.format_messages(
            context_str=text,
            language=language,
        )
        try:
            chat_response = llm.chat(fmt_messages)
            # The content may be None; store "" instead so that joining the
            # queries and the `== ""` missing-value check both keep working.
            queries.append(chat_response.message.content or "")
        except Exception:
            # Best effort: record a placeholder and keep going. Only
            # Exception is caught so Ctrl-C can still interrupt the loop.
            queries.append("")

    return queries

In [74]:
# Generate the hard English queries in batches of 5 rows. small_df keeps the
# shuffled index labels from df.sample, so the batches are taken by position
# with .iloc.
hard_eng_queries = Parallel(n_jobs=-1, backend="loky")(
    delayed(generate_queries_hard)(small_df["text_combined_eng"].iloc[i:i+5], "English")
    for i in range(0, len(small_df), 5)
)

# Flatten the per-batch lists into a single list of queries.
hard_eng_queries = [q for sublist in hard_eng_queries for q in sublist]

# Save queries to hard_queries_eng.txt
with open("hard_queries_eng.txt", "w", encoding="utf-8") as f:
    f.write("\n".join(hard_eng_queries))

In [91]:
# Generate the hard French queries in batches of 5 rows. small_df keeps the
# shuffled index labels from df.sample, so the batches are taken by position
# with .iloc.
hard_fra_queries = Parallel(n_jobs=-1, backend="loky")(
    delayed(generate_queries_hard)(small_df["text_combined_fra"].iloc[i:i+5], "French")
    for i in range(0, len(small_df), 5)
)

# Flatten the per-batch lists into a single list of queries.
hard_fra_queries = [q for sublist in hard_fra_queries for q in sublist]

# Save queries to hard_queries_fra.txt
with open("hard_queries_fra.txt", "w", encoding="utf-8") as f:
    f.write("\n".join(hard_fra_queries))

In [92]:
# Attach the four generated query lists to small_df as new columns, then
# write the frame to small_df.csv without the index.
generated_columns = {
    "easy_eng_queries": easy_eng_queries,
    "easy_fra_queries": easy_fra_queries,
    "hard_eng_queries": hard_eng_queries,
    "hard_fra_queries": hard_fra_queries,
}
for column_name, column_values in generated_columns.items():
    small_df[column_name] = column_values

small_df.to_csv("small_df.csv", index=False)

In [93]:
# Some of the parallel Azure API calls may have failed, leaving "" in the
# query columns. Regenerate those entries sequentially (no parallelization),
# write them back into small_df, and save the frame again.
def _rows_missing(column, label):
    """Return the rows of small_df whose `column` is "" and print how many there are."""
    rows = small_df[small_df[column] == ""]
    print(len(rows), f"missing {label} queries")
    return rows

missing_easy_eng = _rows_missing("easy_eng_queries", "easy English")
missing_hard_eng = _rows_missing("hard_eng_queries", "hard English")
missing_easy_fra = _rows_missing("easy_fra_queries", "easy French")
missing_hard_fra = _rows_missing("hard_fra_queries", "hard French")

# Re-run the generators only on the rows that are still missing a query.
fill_easy_eng = generate_queries(missing_easy_eng["text_combined_eng"], "English")
fill_hard_eng = generate_queries_hard(missing_hard_eng["text_combined_eng"], "English")
fill_easy_fra = generate_queries(missing_easy_fra["text_combined_fra"], "French")
fill_hard_fra = generate_queries_hard(missing_hard_fra["text_combined_fra"], "French")

# Write each batch of regenerated queries back to the rows it came from.
for rows, column, refill in (
    (missing_easy_eng, "easy_eng_queries", fill_easy_eng),
    (missing_hard_eng, "hard_eng_queries", fill_hard_eng),
    (missing_easy_fra, "easy_fra_queries", fill_easy_fra),
    (missing_hard_fra, "hard_fra_queries", fill_hard_fra),
):
    small_df.loc[rows.index, column] = refill

small_df.to_csv("small_df.csv", index=False)

0 missing easy English queries
0 missing hard English queries
1 missing easy French queries
18 missing hard French queries


0it [00:00, ?it/s]
0it [00:00, ?it/s]
100%|██████████| 1/1 [00:01<00:00,  1.12s/it]
100%|██████████| 18/18 [00:34<00:00,  1.93s/it]
