## 1. Install Dependencies

In [1]:
# We'll install all necessary libraries:
#  - 'transformers' and 'datasets' from GitHub (latest dev version)
#  - 'accelerate', 'sentencepiece' for T5 tokenizers and GPU acceleration
#  - 'overpy' for OpenStreetMap queries
#  - 'gradio' for the UI
#  - 'sentence-transformers' for building embeddings in RAG

!pip install git+https://github.com/huggingface/transformers.git  \
             git+https://github.com/huggingface/datasets.git      \
             accelerate sentencepiece overpy gradio \
             sentence-transformers

Collecting git+https://github.com/huggingface/transformers.git
  Cloning https://github.com/huggingface/transformers.git to /tmp/pip-req-build-pkrn_t_g
  Running command git clone --filter=blob:none --quiet https://github.com/huggingface/transformers.git /tmp/pip-req-build-pkrn_t_g
  Resolved https://github.com/huggingface/transformers.git to commit 94ae1ba5b55e79ba766582de8a199d8ccf24a021
  Installing build dependencies ... [?25l[?25hdone
  Getting requirements to build wheel ... [?25l[?25hdone
  Preparing metadata (pyproject.toml) ... [?25l[?25hdone
Collecting git+https://github.com/huggingface/datasets.git
  Cloning https://github.com/huggingface/datasets.git to /tmp/pip-req-build-6qtbuf5v
  Running command git clone --filter=blob:none --quiet https://github.com/huggingface/datasets.git /tmp/pip-req-build-6qtbuf5v
  Resolved https://github.com/huggingface/datasets.git to commit f693f4e93aabafa878470c80fd42ddb10ec550d6
  Installing build dependencies ... [?25l[?25hdone
  Gett

## 2. Download & Unzip CamRest, Inspect Files

In [2]:
# 1) Clone the "ConvLab/camrest" dataset from huggingface.co
!git clone https://huggingface.co/datasets/ConvLab/camrest

# 2) Unzip data.zip into a 'data' folder
!unzip camrest/data.zip -d camrest/data

# List the extracted files
!ls camrest/data

Cloning into 'camrest'...
remote: Enumerating objects: 24, done.[K
remote: Total 24 (delta 0), reused 0 (delta 0), pack-reused 24 (from 1)[K
Unpacking objects: 100% (24/24), 23.62 KiB | 2.36 MiB/s, done.
Archive:  camrest/data.zip
  inflating: camrest/data/data/CamRestDB.json  
  inflating: camrest/data/data/ontology.json  
  inflating: camrest/data/data/dialogues.json  
data


## 3. Parse CamRest (User–System) Dialogues

In [3]:
import json

# We assume dialogues.json is in /content/camrest/data/ after unzipping
with open("camrest/data/data/dialogues.json", "r") as f:
    all_dialogues = json.load(f)

def extract_user_system_pairs(dialogue):
    """
    Given a single 'dialogue' dict with a 'turns' list,
    produce (user_text, system_text) pairs.
    """
    pairs = []
    turns = dialogue["turns"]
    for i in range(len(turns) - 1):
        if turns[i]["speaker"] == "user" and turns[i+1]["speaker"] == "system":
            user_text = turns[i]["utterance"]
            system_text = turns[i+1]["utterance"]
            pairs.append((user_text, system_text))
    return pairs

all_pairs = []
for d in all_dialogues:
    all_pairs.extend(extract_user_system_pairs(d))

print("Number of user–system pairs:", len(all_pairs))
print("Example pair:\n", all_pairs[0])

Number of user–system pairs: 2744
Example pair:
 ("I need to find an expensive restauant that's in the south section of the city.", 'There are several restaurants in the south part of town that serve expensive food. Do you have a cuisine preference?')


## 4. Build a Hugging Face Dataset

In [4]:
from datasets import Dataset

# We'll transform each (user_text, system_text) into (input_text, target_text).
# "input_text" is a prompt for T5: "User: ...\nSystem:"
# "target_text" is the system's reply.

input_texts = []
target_texts = []
for user_text, system_text in all_pairs:
    prompt = f"User: {user_text}\nSystem:"
    input_texts.append(prompt)
    target_texts.append(system_text)

hf_data = {
    "input_text": input_texts,
    "target_text": target_texts
}

camrest_dataset = Dataset.from_dict(hf_data)
print(camrest_dataset)

Dataset({
    features: ['input_text', 'target_text'],
    num_rows: 2744
})


## 5. Load Flan-T5 (Large), Tokenize, & Prepare for Fine-Tuning

In [5]:
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM

model_name = "google/flan-t5-large"  # or "google/flan-t5-base", "google/flan-t5-xl"
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForSeq2SeqLM.from_pretrained(model_name)

def tokenize_function(example):
    # We'll do truncation and rely on dynamic padding from the data collator
    inputs = tokenizer(example["input_text"], truncation=True, max_length=256)
    with tokenizer.as_target_tokenizer():
        labels = tokenizer(example["target_text"], truncation=True, max_length=128)
    inputs["labels"] = labels["input_ids"]
    return inputs

# Map the tokenize function across all samples
tokenized_dataset = camrest_dataset.map(tokenize_function, batched=True)

# Remove the original text columns; keep "input_ids", "attention_mask", "labels"
tokenized_dataset = tokenized_dataset.remove_columns(["input_text", "target_text"])
tokenized_dataset.set_format("torch")
print(tokenized_dataset)

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/2.54k [00:00<?, ?B/s]

spiece.model:   0%|          | 0.00/792k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/2.42M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/2.20k [00:00<?, ?B/s]

config.json:   0%|          | 0.00/662 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/3.13G [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/147 [00:00<?, ?B/s]

Map:   0%|          | 0/2744 [00:00<?, ? examples/s]



Dataset({
    features: ['input_ids', 'attention_mask', 'labels'],
    num_rows: 2744
})


## Fine-Tune Flan-T5 with Trainer

In [6]:
from transformers import DataCollatorForSeq2Seq, TrainingArguments, Trainer

# DataCollatorForSeq2Seq handles dynamic padding for seq2seq tasks
data_collator = DataCollatorForSeq2Seq(
    tokenizer=tokenizer,
    model=model
)

# Basic training hyperparams
training_args = TrainingArguments(
    output_dir="camrest_finetuned_model",
    num_train_epochs=3,              # you can increase if you want more thorough training
    per_device_train_batch_size=2,
    gradient_accumulation_steps=4,
    evaluation_strategy="no",        # or "steps"/"epoch" if you have a val set
    save_strategy="no",
    logging_steps=50,
    fp16=True,                       # half-precision if your GPU supports it
    report_to="none"
)

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_dataset,  # The tokenized dataset
    tokenizer=tokenizer,
    data_collator=data_collator
)

trainer.train()
trainer.save_model("camrest_finetuned_model")

  trainer = Trainer(
  batch["labels"] = torch.tensor(batch["labels"], dtype=torch.int64)
Passing a tuple of `past_key_values` is deprecated and will be removed in Transformers v4.48.0. You should pass an instance of `EncoderDecoderCache` instead, e.g. `past_key_values=EncoderDecoderCache.from_legacy_cache(past_key_values)`.


Step,Training Loss
50,0.0
100,0.0
150,0.0
200,0.0


Step,Training Loss
50,0.0
100,0.0
150,0.0
200,0.0
250,0.0
300,0.0
350,0.0
400,0.0
450,0.0
500,0.0


### 7. Quick Test of the Fine-Tuned Model

In [None]:
from transformers import pipeline

finetuned_model = AutoModelForSeq2SeqLM.from_pretrained("camrest_finetuned_model")
finetuned_tokenizer = AutoTokenizer.from_pretrained("camrest_finetuned_model")

inference_pipeline = pipeline(
    "text2text-generation",
    model=finetuned_model,
    tokenizer=finetuned_tokenizer
)

test_input = "User: I'd like a cheap Chinese restaurant.\nSystem:"
res = inference_pipeline(test_input, max_new_tokens=60)
print("Model's system reply:", res[0]["generated_text"])

## 8. Download OSM Restaurants for Lyon

In [7]:
import overpy
import pandas as pd

api = overpy.Overpass()
query = """
[out:json];
area["name"="Lyon"]->.searchArea;
(
  node["amenity"="restaurant"](area.searchArea);
  way["amenity"="restaurant"](area.searchArea);
  relation["amenity"="restaurant"](area.searchArea);
);
out center;
"""
result = api.query(query)

restaurant_data = []

def parse_tags(obj):
    tags = obj.tags
    lat = obj.lat if hasattr(obj, "lat") else obj.center_lat
    lon = obj.lon if hasattr(obj, "lon") else obj.center_lon
    return {
        "name": tags.get("name", "Unknown"),
        "cuisine": tags.get("cuisine", "Unknown"),
        "phone": tags.get("phone", "N/A"),
        "website": tags.get("website", "N/A"),
        "opening_hours": tags.get("opening_hours", "N/A"),
        "lat": lat,
        "lon": lon
    }

for node in result.nodes:
    restaurant_data.append(parse_tags(node))
for way in result.ways:
    restaurant_data.append(parse_tags(way))
for rel in result.relations:
    restaurant_data.append(parse_tags(rel))

df_restaurants = pd.DataFrame(restaurant_data).drop_duplicates().reset_index(drop=True)
print("Number of OSM restaurants in Lyon:", len(df_restaurants))
df_restaurants.head()

Number of OSM restaurants in Lyon: 2400


Unnamed: 0,name,cuisine,phone,website,opening_hours,lat,lon
0,L'Esprit Bistrot,french,+33 4 78 74 38 42,https://www.lespritbistrot.com/lesprit-bistrot...,"Mo-Su 12:00-14:00, 19:30-22:00",45.7410332,4.8689407
1,Comptoir des Marronniers,Unknown,+33 4 72 77 10 00,http://lecomptoirdesmarronniers.fr/,,45.7569848,4.8346121
2,Léon de Lyon,Unknown,+33 4 72 10 11 12,,"Tu-Fr 12:00-14:00, 19:00-22:00; Sa 12:00-14:30...",45.7660411,4.8335068
3,Restaurant de la Plaine,Unknown,,,,45.7444471,4.7842502
4,Flam's Lyon,Unknown,+33 4 78 37 51 61,https://flams.fr/index.php/nos-restaurants/fla...,"Mo 12:00-14:00,18:30-22:30; Tu-Fr 12:00-14:00,...",45.7628507,4.8335475


## 9. Encode Restaurants (SentenceTransformers) & Define RAG Retrieval

In [8]:
from sentence_transformers import SentenceTransformer
from sklearn.metrics.pairwise import cosine_similarity
import numpy as np

embedding_model = SentenceTransformer("sentence-transformers/all-MiniLM-L6-v2")

def build_description(row):
    return (
        f"Name: {row['name']}. "
        f"Cuisine: {row['cuisine']}. "
        f"Phone: {row['phone']}. "
        f"Website: {row['website']}. "
        f"Opening hours: {row['opening_hours']}. "
        f"Location: lat={row['lat']}, lon={row['lon']}."
    )

descriptions = df_restaurants.apply(build_description, axis=1).tolist()
embeddings = embedding_model.encode(descriptions, convert_to_numpy=True)

def retrieve_restaurants(query, top_k=3):
    # 1) Encode the user query
    q_emb = embedding_model.encode([query], convert_to_numpy=True)
    # 2) Cosine similarity with each restaurant
    sims = cosine_similarity(q_emb, embeddings)[0]
    # 3) Get top_k indices
    indices = sims.argsort()[-top_k:][::-1]
    # 4) Build final results
    results = []
    for idx in indices:
        row = df_restaurants.iloc[idx]
        results.append({
            "name": row["name"],
            "cuisine": row["cuisine"],
            "phone": row["phone"],
            "website": row["website"],
            "opening_hours": row["opening_hours"],
            "lat": row["lat"],
            "lon": row["lon"],
            "similarity": sims[idx],
            "description": descriptions[idx]
        })
    return results

modules.json:   0%|          | 0.00/349 [00:00<?, ?B/s]

config_sentence_transformers.json:   0%|          | 0.00/116 [00:00<?, ?B/s]

README.md:   0%|          | 0.00/10.5k [00:00<?, ?B/s]

sentence_bert_config.json:   0%|          | 0.00/53.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/612 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/90.9M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/350 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/190 [00:00<?, ?B/s]

## 10. Chat Function Combining CamRest-Fine-Tuned + RAG

In [9]:
def chat(user_query):
    # 1) Retrieve top 3
    retrieved = retrieve_restaurants(user_query, top_k=3)

    if not retrieved:
        context_str = "No restaurants found in our local data.\n"
    else:
        context_str = "Here are some possible restaurants in Lyon:\n"
        for i, item in enumerate(retrieved, start=1):
            context_str += (
                f"{i}. {item['name']} - Cuisine: {item['cuisine']}\n"
                f"   Phone: {item['phone']}, Website: {item['website']}\n"
                f"   Opening Hours: {item['opening_hours']}\n"
                f"   Location: lat={item['lat']}, lon={item['lon']}\n\n"
            )

    # 2) Build a final prompt for the fine-tuned T5
    prompt = (
        "You are a restaurant assistant. "
        "Use the local context plus your knowledge to answer the user in a helpful way.\n\n"
        f"Context:\n{context_str}\n"
        f"User: {user_query}\n"
        "System:"
    )

    # 3) Generate with the fine-tuned T5
    inputs = finetuned_tokenizer([prompt], return_tensors="pt").to(finetuned_model.device)
    outputs = finetuned_model.generate(
        **inputs,
        max_new_tokens=80,
        num_beams=2
    )
    answer = finetuned_tokenizer.batch_decode(outputs, skip_special_tokens=True)[0]
    return answer

## 11. Gradio Chat Interface

In [10]:
import gradio as gr

def gradio_chat(user_message, chat_history):
    bot_answer = chat(user_message)
    chat_history.append((user_message, bot_answer))
    return chat_history, chat_history

with gr.Blocks() as demo:
    gr.Markdown("<h2>Lyon Restaurant Chatbot (Fine-Tuned on CamRest + RAG from OSM)</h2>")
    chatbot = gr.Chatbot([], label="Chat")
    user_box = gr.Textbox(label="Ask about restaurants in Lyon")
    clear_btn = gr.Button("Clear")

    # On submit, we call 'gradio_chat', appending the user input and bot answer to the chat.
    user_box.submit(gradio_chat, [user_box, chatbot], [chatbot, chatbot])
    clear_btn.click(lambda: [], None, chatbot, queue=False)

demo.launch()


  chatbot = gr.Chatbot([], label="Chat")


Running Gradio in a Colab notebook requires sharing enabled. Automatically setting `share=True` (you can turn this off by setting `share=False` in `launch()` explicitly).

Colab notebook detected. To show errors in colab notebook, set debug=True in launch()
* Running on public URL: https://441f4bfefc1704bf29.gradio.live

This share link expires in 72 hours. For free permanent hosting and GPU upgrades, run `gradio deploy` from the terminal in the working directory to deploy to Hugging Face Spaces (https://huggingface.co/spaces)




Here are a few prompts to try in your Gradio chatbot:

*   Recommend a cheap Chinese restaurant in Lyon.
*   I’d like a French restaurant with a phone number in the city center.
*   Any vegetarian options near lat=45.77, lon=4.83?
*   How many restaurants do we have in the database?
*   Do you have the website for an Italian place open late?
