#### Toulouse Translation DB

In [1]:
# 1. load data
import os
import pandas as pd
import src.utils as ut
import prompt_builder as pb

from dotenv import load_dotenv
from src.data_loader import load_and_prepare_projects, load_test_dataset
from prompt_utils import tokens_counter, prompt_cost


# Input CSVs for the two Toulouse datasets used in this section.
path_22 = 'data/tls22_projects.csv'
path_24 = 'data/tls24_projects.csv'

# The project loader returns a pair; `df` is the frame every later cell works on.
df, df22_shuffled = load_and_prepare_projects(path_22, path_24, city="Toulouse")

# Test set sized to the number of rows whose `year` equals 2024.
test = load_test_dataset(
    df,
    rows=len(df.loc[df['year'] == 2024]),
    city='Toulouse',
)

# Load variables from a local .env file into the process environment.
load_dotenv()

# PostgreSQL connection settings; a missing variable raises KeyError right here.
conn_params = {
    key: os.environ[env_name]
    for key, env_name in (
        ("host", "PG_HOST"),
        ("database", "PG_DATABASE"),
        ("user", "PG_USER"),
        ("password", "PG_PASSWORD"),
    )
}

In [5]:
def build_translation_prompt(project_name, source_language="French"):
    """Build the LLM prompt asking for an English translation of one text.

    Args:
        project_name: Text to translate. Despite the name, the notebook also
            passes project descriptions through this function.
        source_language: Language named in the prompt as the source language.
            Defaults to "French", which reproduces the original prompt.

    Returns:
        The prompt string: the instruction line, a ``Text:`` section holding
        ``project_name``, and a closing note about keeping proper names.
    """
    prompt = f"""You are a professional translator. Please translate the following text from {source_language} to English, preserving the tone, style, and meaning as accurately as possible. Do not add any explanations or comments — return only the translated text.
Text:
{project_name}
If the text contains geographic references or proper names (e.g., cities, people, institutions), keep them in their original form.
"""
    return prompt

In [None]:
# 1. Build one translation prompt per project name.
# Only a single column is needed, so apply the builder column-wise instead of
# iterating whole rows with DataFrame.apply(axis=1).
df['prompt'] = df['project_name'].apply(build_translation_prompt)

In [8]:
# Sanity check: show the first generated prompt in full.
print(df['prompt'].iloc[0])

You are a professional translator. Please translate the following text from French to English, preserving the tone, style, and meaning as accurately as possible. Do not add any explanations or comments — return only the translated text.
Text:
Piste cyclable avenue Saint-Exupéry
If the text contains geographic references or proper names (e.g., cities, people, institutions), keep them in their original form.



In [9]:
# Estimate prompt size and cost for the project-name prompts before calling the API.
df['n_tokens'] = df['prompt'].apply(tokens_counter)
df['cost_usd'] = df['n_tokens'].apply(lambda n_tokens: prompt_cost(n_tokens, 'gpt-4-turbo'))

print('mean tokens by prompt: {:.2f}'.format(df.n_tokens.mean()))
# The per-prompt cost is well below one cent, so two decimals always rendered
# as $0.00; four decimals keep the figure readable.
print('avg.cost of each prediction: ${:.4f}'.format(df.cost_usd.mean()))
print('experiment total cost: ${:.2f}'.format(df.cost_usd.sum()))

mean tokens by prompt: 87.98
avg.cost of each prediction: $0.00
experiment total cost: 0.34


In [10]:
# Run the experiment: send every project-name prompt to the model and keep
# the response in `out`.
from llm_client import call_openai_model

api_key = os.getenv('OPENAI_API_KEY')

df['out'] = df['prompt'].apply(
    lambda prompt: call_openai_model(prompt=prompt, api_key=api_key)
)

LLM response receive!
LLM response receive!
LLM response receive!
LLM response receive!
LLM response receive!
LLM response receive!
LLM response receive!
LLM response receive!
LLM response receive!
LLM response receive!
LLM response receive!
LLM response receive!
LLM response receive!
LLM response receive!
LLM response receive!
LLM response receive!
LLM response receive!
LLM response receive!
LLM response receive!
LLM response receive!
LLM response receive!
LLM response receive!
LLM response receive!
LLM response receive!
LLM response receive!
LLM response receive!
LLM response receive!
LLM response receive!
LLM response receive!
LLM response receive!
LLM response receive!
LLM response receive!
LLM response receive!
LLM response receive!
LLM response receive!
LLM response receive!
LLM response receive!
LLM response receive!
LLM response receive!
LLM response receive!
LLM response receive!
LLM response receive!
LLM response receive!
LLM response receive!
LLM response receive!
LLM respon

In [11]:
# 2. Build one translation prompt per project description.
# Only a single column is needed, so apply the builder column-wise instead of
# iterating whole rows with DataFrame.apply(axis=1).
df['prompt_2'] = df['description'].apply(build_translation_prompt)

In [12]:
# Estimate prompt size and cost for the description prompts.
# Note: this reuses (and overwrites) the `n_tokens` / `cost_usd` columns.
df['n_tokens'] = df['prompt_2'].apply(tokens_counter)
df['cost_usd'] = df['n_tokens'].apply(lambda n_tokens: prompt_cost(n_tokens, 'gpt-4-turbo'))

print('mean tokens by prompt: {:.2f}'.format(df.n_tokens.mean()))
# The per-prompt cost is well below one cent, so two decimals always rendered
# as $0.00; four decimals keep the figure readable.
print('avg.cost of each prediction: ${:.4f}'.format(df.cost_usd.mean()))
print('experiment total cost: ${:.2f}'.format(df.cost_usd.sum()))

mean tokens by prompt: 328.19
avg.cost of each prediction: $0.00
experiment total cost: 1.26


In [13]:
# Run the LLM on every description prompt; responses go to `description_eng`.
from llm_client import call_openai_model

api_key = os.getenv('OPENAI_API_KEY')

df['description_eng'] = df['prompt_2'].apply(
    lambda prompt: call_openai_model(prompt=prompt, api_key=api_key)
)

LLM response receive!
LLM response receive!
LLM response receive!
LLM response receive!
LLM response receive!
LLM response receive!
LLM response receive!
LLM response receive!
LLM response receive!
LLM response receive!
LLM response receive!
LLM response receive!
LLM response receive!
LLM response receive!
LLM response receive!
LLM response receive!
LLM response receive!
LLM response receive!
LLM response receive!
LLM response receive!
LLM response receive!
LLM response receive!
LLM response receive!
LLM response receive!
LLM response receive!
LLM response receive!
LLM response receive!
LLM response receive!
LLM response receive!
LLM response receive!
LLM response receive!
LLM response receive!
LLM response receive!
LLM response receive!
LLM response receive!
LLM response receive!
LLM response receive!
LLM response receive!
LLM response receive!
LLM response receive!
LLM response receive!
LLM response receive!
LLM response receive!
LLM response receive!
LLM response receive!
LLM respon

In [15]:
# Keep the identifiers plus the two translated fields, and expose the
# translated project name under an explicit column name.
translated_projects_tls = (
    df
    .filter(items=['project_id', 'year', 'out', 'description_eng'])
    .rename(columns={'out': 'project_name_eng'})
)
translated_projects_tls

Unnamed: 0,project_id,year,project_name_eng,description_eng
0,136,2022,Bike lane on Saint-Exupéry Avenue,"When you live near Place de l'Ormeau, cycling ..."
1,7,2022,Create dedicated and well-defined bike and ped...,To promote cohabitation between pedestrians an...
2,5,2022,Greening of the Place du Capitole,To add even more majesty to this iconic locati...
3,132,2022,Mosquito Control,Installation of mosquito traps in the Pont des...
4,71,2022,Plant trees and shared gardens wherever possible!,A neighbor is using a piece of land from the l...
...,...,...,...,...
378,372,2024,Installation of a bike repair stand at La Reyn...,Fleury: Place André Abbal (Reynerie): It would...
379,356,2024,Remove the asphalt from the parking spaces on ...,I propose removing the asphalt from the parkin...
380,230,2024,Installation of waste bins for dog feces in Se...,"Install in the Sept-Deniers neighborhood, tras..."
381,370,2024,Set up tables and chairs at Jean Gilles' Maury...,There are not enough tables and chairs to acco...


In [16]:
# Persist the Toulouse translations as a semicolon-separated file, without the index.
translated_projects_tls.to_csv(
    'data/tls_projects_translated_to_eng.csv',
    sep=";",
    index=False,
)

#### Wroclaw Translation DB

In [1]:
# 1. load data
import os
import pandas as pd
import src.utils as ut
import prompt_builder as pb

from dotenv import load_dotenv
from src.data_loader import load_and_prepare_projects, load_test_dataset
from prompt_utils import tokens_counter, prompt_cost


# Input CSVs for the two Wroclaw datasets used in this section.
path_16 = 'data/wrc16_projects.csv'
path_17 = 'data/wrc17_projects.csv'

# The project loader returns a pair; `df` is the frame every later cell works on.
df, df16_shuffled = load_and_prepare_projects(path_16, path_17, city='Wroclaw')

# Load variables from a local .env file into the process environment.
load_dotenv()

# PostgreSQL connection settings; a missing variable raises KeyError right here.
conn_params = {
    key: os.environ[env_name]
    for key, env_name in (
        ("host", "PG_HOST"),
        ("database", "PG_DATABASE"),
        ("user", "PG_USER"),
        ("password", "PG_PASSWORD"),
    )
}

In [19]:
def build_translation_prompt(project_name, source_language="Polish"):
    """Build the LLM prompt asking for an English translation of one text.

    Args:
        project_name: Text to translate. Despite the name, the notebook also
            passes project descriptions through this function.
        source_language: Language named in the prompt as the source language.
            Defaults to "Polish", which reproduces the original prompt.

    Returns:
        The prompt string: the instruction line, a ``Text:`` section holding
        ``project_name``, and a closing note about keeping proper names.
    """
    prompt = f"""You are a professional translator. Please translate the following text from {source_language} to English, preserving the tone, style, and meaning as accurately as possible. Do not add any explanations or comments — return only the translated text.
Text:
{project_name}
If the text contains geographic references or proper names (e.g., cities, people, institutions), keep them in their original form.
"""
    return prompt

In [None]:
# 1. Build one translation prompt per project name.
# Only a single column is needed, so apply the builder column-wise instead of
# iterating whole rows with DataFrame.apply(axis=1).
df['prompt'] = df['project_name'].apply(build_translation_prompt)

In [22]:
# Sanity check: show the first generated prompt in full.
print(df['prompt'].iloc[0])

You are a professional translator. Please translate the following text from Polish to English, preserving the tone, style, and meaning as accurately as possible. Do not add any explanations or comments — return only the translated text.
Text:
Drzewa dla Wrocławia - nasadzenia w całym mieście!
If the text contains geographic references or proper names (e.g., cities, people, institutions), keep them in their original form.



In [18]:
# Estimate prompt size and cost for the project-name prompts before calling the API.
df['n_tokens'] = df['prompt'].apply(tokens_counter)
df['cost_usd'] = df['n_tokens'].apply(lambda n_tokens: prompt_cost(n_tokens, 'gpt-4-turbo'))

print('mean tokens by prompt: {:.2f}'.format(df.n_tokens.mean()))
# The per-prompt cost is well below one cent, so two decimals always rendered
# as $0.00; four decimals keep the figure readable.
print('avg.cost of each prediction: ${:.4f}'.format(df.cost_usd.mean()))
print('experiment total cost: ${:.2f}'.format(df.cost_usd.sum()))

mean tokens by prompt: 95.89
avg.cost of each prediction: $0.00
experiment total cost: 0.10


In [None]:
#3. run experiment! :)
from llm_client import call_openai_model

api_key = os.getenv('OPENAI_API_KEY')

# This cell appears more than once in the notebook, and each run calls the
# model once per row. Skip the pass when `out` has already been filled so that
# "Run All" does not repeat it.
if 'out' in df.columns:
    print("'out' already present - drop the column to force a new run")
else:
    df['out'] = df['prompt'].apply(
        lambda prompt: call_openai_model(prompt=prompt, api_key=api_key)
    )

In [None]:
#3. run experiment! :)
from llm_client import call_openai_model

api_key = os.getenv('OPENAI_API_KEY')

# This cell appears more than once in the notebook, and each run calls the
# model once per row. Skip the pass when `out` has already been filled so that
# "Run All" does not repeat it.
if 'out' in df.columns:
    print("'out' already present - drop the column to force a new run")
else:
    df['out'] = df['prompt'].apply(
        lambda prompt: call_openai_model(prompt=prompt, api_key=api_key)
    )

In [23]:
#3. run experiment! :)
from llm_client import call_openai_model

api_key = os.getenv('OPENAI_API_KEY')

# This cell appears more than once in the notebook, and each run calls the
# model once per row. Skip the pass when `out` has already been filled so that
# "Run All" does not repeat it.
if 'out' in df.columns:
    print("'out' already present - drop the column to force a new run")
else:
    df['out'] = df['prompt'].apply(
        lambda prompt: call_openai_model(prompt=prompt, api_key=api_key)
    )

LLM response receive!
LLM response receive!
LLM response receive!
LLM response receive!
LLM response receive!
LLM response receive!
LLM response receive!
LLM response receive!
LLM response receive!
LLM response receive!
LLM response receive!
LLM response receive!
LLM response receive!
LLM response receive!
LLM response receive!
LLM response receive!
LLM response receive!
LLM response receive!
LLM response receive!
LLM response receive!
LLM response receive!
LLM response receive!
LLM response receive!
LLM response receive!
LLM response receive!
LLM response receive!
LLM response receive!
LLM response receive!
LLM response receive!
LLM response receive!
LLM response receive!
LLM response receive!
LLM response receive!
LLM response receive!
LLM response receive!
LLM response receive!
LLM response receive!
LLM response receive!
LLM response receive!
LLM response receive!
LLM response receive!
LLM response receive!
LLM response receive!
LLM response receive!
LLM response receive!
LLM respon

In [26]:
# 2. Build one translation prompt per project description.
# Only a single column is needed, so apply the builder column-wise instead of
# iterating whole rows with DataFrame.apply(axis=1).
df['prompt_2'] = df['description'].apply(build_translation_prompt)

In [27]:
# Run the LLM on every description prompt; responses go to `description_eng`.
from llm_client import call_openai_model

api_key = os.getenv('OPENAI_API_KEY')

df['description_eng'] = df['prompt_2'].apply(
    lambda prompt: call_openai_model(prompt=prompt, api_key=api_key)
)

LLM response receive!
LLM response receive!
LLM response receive!
LLM response receive!
LLM response receive!
LLM response receive!
LLM response receive!
LLM response receive!
LLM response receive!
LLM response receive!
LLM response receive!
LLM response receive!
LLM response receive!
LLM response receive!
LLM response receive!
LLM response receive!
LLM response receive!
LLM response receive!
LLM response receive!
LLM response receive!
LLM response receive!
LLM response receive!
LLM response receive!
LLM response receive!
LLM response receive!
LLM response receive!
LLM response receive!
LLM response receive!
LLM response receive!
LLM response receive!
LLM response receive!
LLM response receive!
LLM response receive!
LLM response receive!
LLM response receive!
LLM response receive!
LLM response receive!
LLM response receive!
LLM response receive!
LLM response receive!
LLM response receive!
LLM response receive!
LLM response receive!
LLM response receive!
LLM response receive!
LLM respon

In [31]:
# Export the translated Wroclaw projects as a semicolon-separated file without
# the index. `filter` keeps only the listed columns that exist in `df`.
# NOTE(review): the path has no file extension and `out` keeps its raw name,
# unlike the Toulouse export - confirm that downstream readers expect this.
df.filter(
    items=[
        'project_id',
        'out',
        'description_eng',
        'category',
        'cost',
        'district',
        'votes',
        'district_number',
        'rank',
        'year',
    ]
).to_csv('data/wrc_projects_eng', sep=";", index=False)