## GENERAZIONE DI DOMANDE ATTINENTI AL CONTESTO

In [None]:
import os

# Gemini API keys are read from the environment instead of being hard-coded:
# credentials written into a notebook leak to anyone who can read the file.
# Export JV_GEMINI_TOKEN, RS_GEMINI_TOKEN and VM_GEMINI_TOKEN before running
# this cell; a variable that is not set yields None.
# NOTE(review): the keys that used to be committed here should be revoked
# and reissued.
JV_GEMINI_TOKEN = os.environ.get("JV_GEMINI_TOKEN")
RS_GEMINI_TOKEN = os.environ.get("RS_GEMINI_TOKEN")
VM_GEMINI_TOKEN = os.environ.get("VM_GEMINI_TOKEN")

In [None]:
import re

def extract_sections(file_path):
    """Return the non-empty sections of a UTF-8 text file.

    The file content is cut at every ``<----------section---------->``
    marker; each piece is stripped of surrounding whitespace and pieces
    that end up empty are dropped.

    Args:
        file_path: Path of the text file to read.

    Returns:
        A list of section strings, in file order.
    """
    marker = '<----------section---------->'

    with open(file_path, 'r', encoding='utf-8') as handle:
        raw_chunks = handle.read().split(marker)

    trimmed_chunks = (chunk.strip() for chunk in raw_chunks)
    return [chunk for chunk in trimmed_chunks if chunk]

# Source text to split; `sections` (a list of stripped, non-empty strings)
# is consumed by the question-generation loop further down.
file_path = "data/3Steps_6Marzo2025.txt"  
sections = extract_sections(file_path)

In [None]:

from tqdm import tqdm
import random
import time
import google.generativeai as genai

# API keys used in rotation by the generation loop below;
# current_token_index is the position in TOKENS of the key currently in use.
TOKENS = [ VM_GEMINI_TOKEN, JV_GEMINI_TOKEN, RS_GEMINI_TOKEN]
current_token_index = 0

def call_llm(prompt, token):
    """Send ``prompt`` to the ``gemini-1.5-flash`` model and return its text.

    Args:
        prompt: Prompt string passed to ``generate_content``.
        token: API key passed to ``genai.configure`` before the call.

    Returns:
        The ``text`` attribute of the model response.
    """
    genai.configure(api_key=token)
    return genai.GenerativeModel("gemini-1.5-flash").generate_content(prompt).text

# One dict per generated question: {"section_index": int, "question": str}
question_data = []

for section_index, section in enumerate(tqdm(sections, desc="Generating Questions"), start=1):

    # NOTE(review): the last three literals are concatenated with no space or
    # newline between them, so the model receives the instruction sentences
    # run together -- confirm whether that is intended.
    prompt = (
        "generate 5 questions based on the following text: \n\n"
        f"{section}\n\n\n"
        "separate the questions with separator <----------question---------->"
        "give me only the questions separated by a row and the separator <----------question---------->"
        "not add any other text or information like answers or context or enumeration"
    )

    response = call_llm(prompt, TOKENS[current_token_index])
    questions = response.split("<----------question---------->")

    for question in questions:
        question_text = question.strip()
        if question_text:  # skip empty fragments around the separators
            question_data.append({"section_index": section_index, "question": question_text})

    # section_index counts the processed sections starting at 1, so it drives
    # the "every third section" rotation directly. The previous code
    # incremented a `cont` variable that was never initialised in this cell,
    # which raised NameError on the first iteration.
    if section_index % 3 == 0:
        # Extra pause of 5-10 s (the value is in seconds despite the name)
        tempo_casuale_ms = random.randint(5000, 10000) / 1000
        time.sleep(tempo_casuale_ms)
        # Move to the next key, wrapping around at the end of TOKENS
        current_token_index = (current_token_index + 1) % len(TOKENS)
    # Pause of 4-7.5 s after every request
    tempo_casuale_ms = random.randint(4000, 7500) / 1000
    time.sleep(tempo_casuale_ms)

In [None]:
import json
import os

output_file = "data/questions/6Marzo2025__ALL.json"

# Create the destination folder first so the save does not fail on a fresh
# checkout, matching what the later response-saving cell does.
os.makedirs(os.path.dirname(output_file), exist_ok=True)

# ensure_ascii=False keeps accented characters readable in the output file.
with open(output_file, 'w', encoding='utf-8') as file:
    json.dump(question_data, file, ensure_ascii=False, indent=4)

## TEST

In [None]:
import json
import os
import time
import random
from tqdm import tqdm
from datetime import datetime
from Assistant import Assistant

# Directory where the per-question result files are written
dir_risultati = "data/questions/response_13Marzo2025"
os.makedirs(dir_risultati, exist_ok=True)

# JSON file holding the index to resume from (read by carica_checkpoint,
# written by salva_checkpoint)
file_checkpoint = "data/questions/13Marzo2025.json"

# Load the generated questions; a missing file is reported and re-raised.
try:
    f = open("data/questions/6Marzo2025__ALL.json", "r", encoding="utf-8")
except FileNotFoundError:
    print("File domande non trovato.")
    raise
else:
    with f:
        domande = json.load(f)
    print(f"Caricate {len(domande)} domande")

# Load checkpoint
def carica_checkpoint():
    """Return the question index to resume from.

    Reads ``last_index`` from the JSON file at ``file_checkpoint``.

    Returns:
        The stored ``last_index``, or 0 when the checkpoint file does not
        exist or has no ``last_index`` key.
    """
    if not os.path.exists(file_checkpoint):
        return 0

    with open(file_checkpoint, "r", encoding="utf-8") as f:
        stato = json.load(f)
    return stato.get("last_index", 0)

# Save checkpoint
def salva_checkpoint(indice):
    """Persist the index to resume from in the checkpoint file.

    The JSON payload (``last_index`` plus an ISO ``timestamp``) is written to
    a sibling ``.tmp`` file and then moved over ``file_checkpoint`` with
    ``os.replace``. An interruption during the write therefore cannot leave
    a half-written checkpoint that would break the next resume.

    Args:
        indice: Index of the next question to process.
    """
    stato = {"last_index": indice, "timestamp": datetime.now().isoformat()}
    file_temporaneo = f"{file_checkpoint}.tmp"

    with open(file_temporaneo, "w", encoding="utf-8") as f:
        json.dump(stato, f)

    # Swap the complete file into place in a single rename step.
    os.replace(file_temporaneo, file_checkpoint)

def test_domande():
    """Ask the assistant every pending question and store one JSON per question.

    Starts from the index returned by ``carica_checkpoint`` and walks the
    rest of ``domande``. For each question the assistant history is cleared,
    the question is asked and the answer is timed. A success is written to
    ``question_<n>.json`` and a failure to ``question_<n>_error.json`` inside
    ``dir_risultati``. In both cases the checkpoint advances past the
    question, so a failed question is not retried on resume.
    """

    def _scrivi_json(percorso, contenuto):
        # Shared writer for result and error records.
        with open(percorso, "w", encoding="utf-8") as f:
            json.dump(contenuto, f, ensure_ascii=False, indent=2)

    assistente = Assistant(
        faiss_index="data/faiss_index/ALL__11Marzo2025__bge-m3",
        log_file="data/logs/TestAssistant13Marzo2025.log",
    )

    indice_inizio = carica_checkpoint()
    print(f"Inizio dal numero {indice_inizio}")

    for i in tqdm(range(indice_inizio, len(domande))):
        voce = domande[i]
        domanda = voce["question"]
        sezione = voce["section_index"]

        # Fields shared by the success record and the error record.
        comune = {
            "question_id": i + 1,
            "section_index": sezione,
            "question": domanda,
        }

        try:
            assistente.clear_history()
            inizio = time.time()
            risposta = assistente.ask(domanda)["final_response"]
            tempo_risposta = time.time() - inizio

            risultato = {
                **comune,
                "response": risposta,
                "response_time": tempo_risposta,
                "timestamp": datetime.now().isoformat(),
            }
            _scrivi_json(f"{dir_risultati}/question_{i+1}.json", risultato)

            salva_checkpoint(i + 1)
            time.sleep(random.uniform(3, 8))

        except Exception as e:
            # Record the failure and move on to the next question.
            print(f"Errore domanda {i+1}: {e}")
            errore = {
                **comune,
                "error": str(e),
                "timestamp": datetime.now().isoformat(),
            }
            _scrivi_json(f"{dir_risultati}/question_{i+1}_error.json", errore)

            salva_checkpoint(i + 1)
            time.sleep(random.uniform(5, 10))

    print(f"Test completato! Risultati salvati in {dir_risultati}")

# Start the batch only when this code runs as the main program, not on import.
if __name__ == "__main__":
    test_domande()

## RECUPERO DOMANDE CASUALI

In [None]:
import json
import random

# File produced by the question-generation step: a JSON list of records,
# each with a "question" key.
input_file = "data/questions/6Marzo2025__ALL.json"

with open(input_file, 'r', encoding='utf-8') as file:
    data = json.load(file)

# Keep only the question text of each record.
questions = [item["question"] for item in data]
num_questions = len(questions)

#print('Total Number of questions:', num_questions)

#-------------------------------------------------------------------------#
# Upper bound on how many questions to sample.
NUM_OF_QUESTIONS = 100

# Cap at the number available so random.sample does not raise ValueError.
# No seed is set in this cell, so the sample differs from run to run.
sample_size = min(NUM_OF_QUESTIONS, num_questions)  
sample_questions = random.sample(questions, sample_size)


### TEST più modelli

In [None]:
from Assistant import Assistant
from tqdm import tqdm
import torch
# Release cached, currently unused GPU memory held by PyTorch before the
# assistants are constructed below.
torch.cuda.empty_cache()

import os

# API keys are read from the environment instead of being hard-coded:
# credentials written into a notebook leak to anyone who can read the file.
# Export the variables named below before running this cell; a variable
# that is not set yields None.
# NOTE(review): the keys that used to be committed here should be revoked
# and reissued.
JV_GEMINI_TOKEN = os.environ.get("JV_GEMINI_TOKEN")
RS_GEMINI_TOKEN = os.environ.get("RS_GEMINI_TOKEN")
EZ_GEMINI_TOKEN = os.environ.get("EZ_GEMINI_TOKEN")
VM_GEMINI_TOKEN = os.environ.get("VM_GEMINI_TOKEN")
VM_GEMINI_TOKEN2 = os.environ.get("VM_GEMINI_TOKEN2")
JV_COHERE_TOKEN = os.environ.get("JV_COHERE_TOKEN")
RS_COHERE_TOKEN = os.environ.get("RS_COHERE_TOKEN")


# Keyword arguments common to both assistants: same FAISS index, log file,
# embedding model and generation_model1/token1 pair.
_shared_assistant_kwargs = dict(
    faiss_index="data/faiss_index/ALL__22_03_2025__BGE-M3__MAX_INNER_PRODUCT",
    log_file="data/logs/test_2_assistant.log",
    embedding_model="BAAI/bge-m3",
    generation_model1="GEMINI",
    token1=VM_GEMINI_TOKEN2,
)

# The two assistants differ only in generation_model2 and its token.
assistant1 = Assistant(
    **_shared_assistant_kwargs,
    generation_model2="GEMINI",
    token2=EZ_GEMINI_TOKEN,
)

assistant2 = Assistant(
    **_shared_assistant_kwargs,
    generation_model2="COMMAND_R_PLUS",
    token2=RS_COHERE_TOKEN,
)

In [None]:
import random
import time

# Questions whose 1-based position is <= skip_first are not processed.
skip_first = 0
cont = 0

# Maps each question text to the final responses of the two assistants.
responses = {}


for cont, q in enumerate(tqdm(sample_questions, desc="Processing questions"), start=1):
    if cont > skip_first:
        responses[q] = {
            "Assistant1": assistant1.ask(q)['final_response'],
            "Assistant2": assistant2.ask(q)['final_response']
        }

        # Pause 5-10 s between questions (value is in seconds despite the name).
        tempo_casuale_ms = random.randint(5000, 10000) / 1000
        time.sleep(tempo_casuale_ms)

output_file = "data/questions/responses/test_last2Models.json"

import os
cartella_output = os.path.dirname(output_file)
os.makedirs(cartella_output, exist_ok=True)
with open(output_file, "w", encoding="utf-8") as f:
    json.dump(responses, f, indent=4, ensure_ascii=False)

print(f"Risposte salvate in {output_file}")

## TEST PIU' MODELLI - Manuale

In [None]:
import json
import random
import time
from tqdm import tqdm

import os

# API keys are read from the environment instead of being hard-coded:
# credentials written into a notebook leak to anyone who can read the file.
# Export the variables named below before running this cell; a variable
# that is not set yields None.
# NOTE(review): the keys that used to be committed here should be revoked
# and reissued.
JV_GEMINI_TOKEN = os.environ.get("JV_GEMINI_TOKEN")
RS_GEMINI_TOKEN = os.environ.get("RS_GEMINI_TOKEN")
EZ_GEMINI_TOKEN = os.environ.get("EZ_GEMINI_TOKEN")
VM_GEMINI_TOKEN = os.environ.get("VM_GEMINI_TOKEN")
VM_GEMINI_TOKEN2 = os.environ.get("VM_GEMINI_TOKEN2")
JV_COHERE_TOKEN = os.environ.get("JV_COHERE_TOKEN")

# Reload the Assistant module so that edits made to it since it was first
# imported take effect, then rebind the name `Assistant` from the module to
# the class it defines.
import importlib
import Assistant
importlib.reload(Assistant)
from Assistant import Assistant


# Assistant used by the manual tests below: GEMINI as generation_model1 and
# COMMAND_R_PLUS as generation_model2, on the 19_03_2025 FAISS index.
assistant = Assistant(faiss_index="data/faiss_index/ALL__19_03_2025__BGE-M3__MAX_INNER_PRODUCT", 
                       log_file="data/logs/assistant.log",
                       generation_model1="GEMINI",
                       token1=EZ_GEMINI_TOKEN,
                       generation_model2="COMMAND_R_PLUS",
                       token2=JV_COHERE_TOKEN)

# Out-of-context questions in English, unrelated to the indexed documents.
# The list below holds 92 entries (the original comment said 100).
domande_fuori_contesto_en = [
    "What is the capital of France?",
    "Who won the last Super Bowl?",
    "How do you cook a Margherita pizza?",
    "What is the average temperature on Mars?",
    "Who wrote 'Pride and Prejudice'?",
    "What is the best exercise to train biceps?",
    "How much does a ticket to Disneyland cost?",
    "What is the latest iPhone model released?",
    "How do you say 'good morning' in Japanese?",
    "Who directed the movie 'Inception'?",
    "What is the recipe for tiramisu?",
    "How do you obtain American citizenship?",
    "What are the symptoms of seasonal flu?",
    "Who is the current president of the United States?",
    "How can I book a flight to Tokyo?",
    "What is the difference between a dolphin and a whale?",
    "How can I invest in the stock market?",
    "What is the most titled football team in the world?",
    "Who discovered penicillin?",
    "What is the best science fiction movie of 2024?",
    "What is the fastest way to lose weight?",
    "How do you play chess?",
    "Who invented the steam engine?",
    "Where is the Grand Canyon located?",
    "What is the most listened song on Spotify?",
    "How can I learn to play the guitar?",
    "Who won the Nobel Prize in Literature in 2023?",
    "How do you tie a tie?",
    "What is the difference between a virus and a bacterium?",
    "What is the best-selling book of all time?",
    "What is the formula for speed?",
    "How can I start meditating?",
    "What are the best restaurants in Rome?",
    "How do you make homemade bread?",
    "Who is the founder of Tesla?",
    "What are the benefits of yoga?",
    "What is the smartest dog breed?",
    "How can I improve my memory?",
    "Where is the Great Wall of China located?",
    "What is the difference between green tea and black tea?",
    "How can I quit smoking?",
    "Who wrote '1984'?",
    "What is the deepest lake in the world?",
    "How can I save more money each month?",
    "What are the signs of the zodiac?",
    "Who invented the Internet?",
    "What is the official currency of Japan?",
    "How can I improve my posture?",
    "What is the largest planet in the solar system?",
    "What are the most spoken languages in the world?",
    "Who discovered America?",
    "How do you prepare a Mojito cocktail?",
    "What are the causes of climate change?",
    "Who is the highest paid soccer player in the world?",
    "What is the most populous city on Earth?",
    "How can I improve my productivity?",
    "Where is the Colosseum located?",
    "What are the benefits of meditation?",
    "Who painted the Mona Lisa?",
    "What is the longest river in the world?",
    "How can I learn a new language quickly?",
    "Who wrote 'The Little Prince'?",
    "What are the symptoms of a panic attack?",
    "How can I increase my self-esteem?",
    "What is the best way to relax after a stressful day?",
    "Who invented the radio?",
    "What are the best smartphones of 2025?",
    "How can I improve my diet?",
    "What is the most active volcano in the world?",
    "What are the best tourist destinations for 2025?",
    "Who discovered gravity?",
    "What is the difference between a psychologist and a psychiatrist?",
    "How can I make friends more easily?",
    "What is the most popular color in interior design?",
    "Where is the Eiffel Tower located?",
    "How can I protect my online data?",
    "Who wrote 'The Lord of the Rings'?",
    "What is the most innovative technology of 2025?",
    "What are the best action movies of all time?",
    "How can I improve my posture while working?",
    "What is the best video game released this year?",
    "What is the difference between a star and a planet?",
    "What are the best books to read in 2025?",
    "How can I better organize my time?",
    "What is the largest number ever discovered?",
    "Who won the Oscar for Best Picture in 2024?",
    "How can I start an online business?",
    "What are the healthiest foods?",
    "What is the most visited city in the world?",
    "Who wrote 'Don Quixote'?",
    "How can I sleep better at night?",
    "What is the best advice for success in life?"
]

# Accumulator for the assistant's responses to the out-of-context questions;
# filled by the loop in the next cell.
risultati = []

In [None]:
cont = 0
# Questions whose 0-based position is < skip_first are not processed.
skip_first = 0
for i, domanda in enumerate(tqdm(domande_fuori_contesto_en, desc="Processing questions")):
    if i >= skip_first:
        cont += 1
        response = assistant.ask(domanda)
        risultati.append(response)

        # Pause 5-10 s between questions (value is in seconds despite the name).
        tempo_casuale_ms = random.randint(5000, 10000) / 1000
        time.sleep(tempo_casuale_ms)

# Save the collected responses to a JSON file
with open("test_riformulazione_OUT_OF_CONTEXT2.json", "w", encoding="utf-8") as f:
    json.dump(risultati, f, ensure_ascii=False, indent=4)

print("Test completato. I risultati sono stati salvati.")

In [None]:

# Save the results to a JSON file. This repeats the save step at the end of
# the loop cell, so it can be run on its own to write whatever `risultati`
# currently holds; the file is overwritten.
with open("test_riformulazione_OUT_OF_CONTEXT2.json", "w", encoding="utf-8") as f:
    json.dump(risultati, f, ensure_ascii=False, indent=4)

print("Test completato. I risultati sono stati salvati.")

In [None]:
assistant.clear_history()

# Run only the query-reformulation chain on the questions sampled from the
# generated (in-context) question file and collect the reformulated queries.
file_riformulazione = "test_riformulazione_en.json"

risultati = []
for domanda in tqdm(sample_questions, desc="Processing questions"):
    reformulated_query = assistant.query_reformulation_chain.invoke({
        "question": domanda,
        "chat_history": assistant.get_history_for_prompt()
    })["text"].strip()
    risultati.append({"domanda": domanda, "query_riformulata": reformulated_query})

    # Pause 5-10 s between questions (value is in seconds despite the name).
    tempo_casuale_ms = random.randint(5000, 10000) / 1000
    time.sleep(tempo_casuale_ms)


# Save the results to a JSON file
with open(file_riformulazione, "w", encoding="utf-8") as f:
    json.dump(risultati, f, ensure_ascii=False, indent=4)

# Report the file actually written; the old message named a different file
# ('test_riformulazione_IN_CONTEXT.json') from the one opened above.
print(f"Test completato. I risultati sono stati salvati in '{file_riformulazione}'.")