#### TLS 24

In [5]:
# 1. load data
import os
import pandas as pd
import src.utils as ut
import prompt_builder as pb

from dotenv import load_dotenv
from src.data_loader import load_and_prepare_projects, load_test_dataset
from prompt_utils import tokens_counter, prompt_cost


# Input CSV files passed to the project loader for the Toulouse section.
path_22 = 'data/tls22_projects.csv'
path_24 = 'data/tls24_projects.csv'

df, df22_shuffled = load_and_prepare_projects(path_22, path_24, city="Toulouse")

# The test set is requested with as many rows as `df` has for year 2024.
test = load_test_dataset(
    df,
    rows=len(df.loc[df['year'] == 2024]),
    city='Toulouse',
)

load_dotenv()

# Connection settings read from PG_* environment variables; a missing
# variable raises KeyError right here.
# NOTE(review): `conn_params` is not used by any cell visible in this notebook.
conn_params = dict(
    host=os.environ["PG_HOST"],
    database=os.environ["PG_DATABASE"],
    user=os.environ["PG_USER"],
    password=os.environ["PG_PASSWORD"],
)

In [7]:
#2. build prompt
# One prompt per row: the template file is filled with four fields taken
# from the project row.
test['prompt'] = test.apply(
    lambda row: pb.build_prompt(
        'prompts/prompt_simple_fr.txt',
        {
            field: row[field]
            for field in ('project_name', 'cost', 'district', 'description')
        },
    ),
    axis=1,
)

In [8]:
# Show the first rendered prompt to check the template was filled in.
print(test['prompt'].iloc[0])

Vous êtes un modèle expert dans l’analyse des élections de budgets participatifs, spécifiquement dans le contexte de la ville de Toulouse, en France.

À Toulouse, la municipalité organise des élections de budget participatif pour financer des projets citoyens. Chaque habitant peut voter pour entre 1 et 3 initiatives sur le site web officiel. 

Une fois la période de vote terminée, les projets sont classés selon le nombre de voix obtenues et se voient attribuer un rang. Ensuite, les projets gagnants sont sélectionnés à l’aide d’un algorithme greedy : on commence par le projet ayant obtenu le plus de voix, puis on ajoute successivement les projets suivants les plus votés, tant que le budget disponible le permet.
Lorsque le budget restant ne permet plus de financer le projet suivant dans la liste, on passe au suivant qui peut l’être, et ainsi de suite, jusqu’à épuisement du budget total de 8000000 euros.


Votre tâche consiste à analyser le projet suivant, proposé dans le cadre d'une nouv

In [9]:
# Token count per prompt, then the price of each prompt for 'gpt-4-turbo'.
test['n_tokens'] = test['prompt'].apply(tokens_counter)
test['cost_usd'] = test['n_tokens'].apply(
    lambda count: prompt_cost(count, 'gpt-4-turbo')
)

# Summary figures for the whole experiment.
print(f"mean tokens by prompt: {test['n_tokens'].mean():.2f}")
print(f"avg.cost of each prediction: ${test['cost_usd'].mean():.2f}")
print(f"experiment total cost: {test['cost_usd'].sum():.2f}")

mean tokens by prompt: 818.34
avg.cost of each prediction: $0.01
experiment total cost: 1.50


In [10]:
#3. run experiment! :)
api_key = os.getenv('OPENAI_API_KEY')
from llm_client import call_openai_model

# One call_openai_model request per prompt, row by row; whatever the call
# returns is stored in the `out` column.
test['out'] = test['prompt'].apply(
    lambda text: call_openai_model(prompt=text, api_key=api_key)
)

LLM response receive!
LLM response receive!
LLM response receive!
LLM response receive!
LLM response receive!
LLM response receive!
LLM response receive!
LLM response receive!
LLM response receive!
LLM response receive!
LLM response receive!
LLM response receive!
LLM response receive!
LLM response receive!
LLM response receive!
LLM response receive!
LLM response receive!
LLM response receive!
LLM response receive!
LLM response receive!
LLM response receive!
LLM response receive!
LLM response receive!
LLM response receive!
LLM response receive!
LLM response receive!
LLM response receive!
LLM response receive!
LLM response receive!
LLM response receive!
LLM response receive!
LLM response receive!
LLM response receive!
LLM response receive!
LLM response receive!
LLM response receive!
LLM response receive!
LLM response receive!
LLM response receive!
LLM response receive!
LLM response receive!
LLM response receive!
LLM response receive!
LLM response receive!
LLM response receive!
LLM respon

In [34]:
import json
import re

def get_json_from_llm_response(out):
    """Extract the JSON object contained in a raw LLM response.

    Two strategies are tried in order:

    1. parse the whole response as JSON and keep it if it is an object;
    2. scan the fenced ```json {...} ``` blocks and return the first one
       that decodes.

    Args:
        out: Raw text returned by the model. Any non-string value
            (``None``, ``NaN``, ...) is treated as "no response".

    Returns:
        dict | None: The decoded JSON object, or ``None`` when no valid
        JSON object can be found.
    """
    # Guard first: the regex scan below raises TypeError on non-strings.
    if not isinstance(out, str):
        return None

    # 1. First attempt: the whole string is a valid JSON object.
    try:
        data = json.loads(out)
    except json.JSONDecodeError:
        data = None  # not plain JSON, fall through to the fenced-block scan
    if isinstance(data, dict):
        return data

    # 2. Second attempt: look for ```json {...} ``` blocks. A malformed block
    #    no longer hides a valid one that appears later in the response.
    for match in re.finditer(r"```json\s*({.*?})\s*```", out, re.DOTALL):
        try:
            return json.loads(match.group(1))
        except json.JSONDecodeError:
            continue

    # 3. No valid JSON could be found.
    return None

def get_predicted_votes(df, out_json, city = 'Toulouse'):
    """Rescale the model's vote estimate from the 2022 total to the 2024 total.

    The estimate found under ``'voix_estimées'`` is divided by the sum of
    ``votes`` over the 2022 rows of ``df`` and multiplied by the sum of
    ``votes`` over the 2024 rows.

    Args:
        df: DataFrame with at least the columns ``year`` and ``votes``.
        out_json: Parsed model response (dict), a JSON string, or ``None``.
        city: Only ``'Toulouse'`` is accepted.

    Returns:
        The rescaled estimate, or ``None`` when the response is missing,
        is not a JSON object, has no usable ``'voix_estimées'`` value, or
        when the 2022 vote total is zero.

    Raises:
        ValueError: If ``city`` is not ``'Toulouse'``.
    """
    if city != 'Toulouse':
        raise ValueError(f"City '{city}' not supported")

    if isinstance(out_json, str):
        try:
            out_json = json.loads(out_json)
        except json.JSONDecodeError:
            return None

    # The parser returns None when it finds no JSON; do not crash on it.
    if not isinstance(out_json, dict):
        return None

    # Missing key -> float(None) -> TypeError; non-numeric text -> ValueError.
    try:
        estimated_votes = float(out_json.get('voix_estimées'))
    except (TypeError, ValueError):
        return None

    votes_2022 = df.loc[df['year'] == 2022, 'votes'].sum()
    votes_2024 = df.loc[df['year'] == 2024, 'votes'].sum()

    # Without a baseline the ratio is undefined.
    if votes_2022 == 0:
        return None

    relative_estimation = estimated_votes / votes_2022
    return relative_estimation * votes_2024

def get_predicted_rank(out_json):
    """Return the rank the model predicted for a project.

    Args:
        out_json: Parsed model response (dict), a JSON string, or ``None``.

    Returns:
        The value stored under ``'position_attendue'``, or ``None`` when the
        response is missing, is not a JSON object, or lacks that key.
    """
    if isinstance(out_json, str):
        try:
            out_json = json.loads(out_json)
        except json.JSONDecodeError:
            return None

    # Unparseable responses reach this function as None: report "no rank".
    if not isinstance(out_json, dict):
        return None

    return out_json.get('position_attendue')


def get_if_is_top5(out_json):
    """Return the model's top-5 flag for a project.

    Args:
        out_json: Parsed model response (dict), a JSON string, or ``None``.

    Returns:
        The value stored under ``'dans_top_5'``, or ``None`` when the
        response is missing, is not a JSON object, or lacks that key
        (not every response carries it).
    """
    if isinstance(out_json, str):
        try:
            out_json = json.loads(out_json)
        except json.JSONDecodeError:
            return None

    # Unparseable responses reach this function as None.
    if not isinstance(out_json, dict):
        return None

    return out_json.get('dans_top_5')

def get_if_is_top10(out_json):
    """Return the model's top-10 flag for a project.

    Args:
        out_json: Parsed model response (dict), a JSON string, or ``None``.

    Returns:
        The value stored under ``'dans_top_10'``, or ``None`` when the
        response is missing, is not a JSON object, or lacks that key
        (not every response carries it).
    """
    if isinstance(out_json, str):
        try:
            out_json = json.loads(out_json)
        except json.JSONDecodeError:
            return None

    # Unparseable responses reach this function as None.
    if not isinstance(out_json, dict):
        return None

    return out_json.get('dans_top_10')


In [32]:
# Peek at the first parsed response. The `out_json` column is created by a
# cell placed further down, so that cell has to be executed first.
test['out_json'].iloc[0]

{'voix_estimées': 1500,
 'intervalle_confiance': [1200, 1800],
 'position_attendue': 25}

In [35]:
# Turn every raw response into a parsed JSON object.
test['out_json'] = test['out'].apply(get_json_from_llm_response)

# Derive the predicted vote count and rank from the parsed responses.
test['predicted_votes'] = test['out_json'].apply(
    lambda parsed: get_predicted_votes(df, parsed)
)
test['predicted_rank'] = test['out_json'].apply(get_predicted_rank)

In [None]:
# Keep the identifier, the real outcome, the prompt/response pair and the
# two predictions.
results = test.filter(
    items=[
        'project_id',
        'real_votes',
        'real_rank',
        'prompt',
        'out',
        'out_json',
        'predicted_votes',
        'predicted_rank',
    ]
)
results

In [None]:
# to_csv() does not create missing folders, so make sure the target exists
# before writing the semicolon-separated results file.
os.makedirs('output/predictions', exist_ok=True)
results.to_csv('output/predictions/tls_simple_prompt_full_1.csv', sep=";", index=False)
print('results ready!')

results ready!


#### WRC 17

In [None]:
# 1. load data
import os
import pandas as pd
import src.utils as ut
import prompt_builder as pb

from dotenv import load_dotenv
from src.data_loader import load_and_prepare_projects, load_test_dataset
from prompt_utils import tokens_counter, prompt_cost


# Input CSV files passed to the project loader for the Wroclaw section.
path_16 = 'data/wrc16_projects.csv'
path_17 = 'data/wrc17_projects.csv'

# NOTE(review): `df` and `test` are rebound here, replacing the frames the
# Toulouse section stored under the same names.
df, df16_shuffled = load_and_prepare_projects(path_16, path_17, city='Wroclaw')
test = load_test_dataset(df, rows=50, city='Wroclaw')

load_dotenv()

# Connection settings read from PG_* environment variables; a missing
# variable raises KeyError right here.
# NOTE(review): `conn_params` is not used by any cell visible in this notebook.
conn_params = dict(
    host=os.environ["PG_HOST"],
    database=os.environ["PG_DATABASE"],
    user=os.environ["PG_USER"],
    password=os.environ["PG_PASSWORD"],
)

Unnamed: 0,project_id,project_name,description,cost,district,district_number,real_votes,real_rank,year
0,10,Toalety we Wrocławskich parkach i na terenach ...,Uzasadnienie\nW parkach i na terenach zielonyc...,900000,Śródmieście,4.0,10857,1,2017
1,50,Zielona rowerowo-piesza obwodnica Wrocławia; E...,Uzasadnienie\nWalczymy o II ETAP. Ideą jest po...,1000000,Stare Miasto,5.0,10796,2,2017
2,18,Rowerowy Wrocław 2017,Uzasadnienie\nProjekt przewiduje poprawę bezpi...,1000000,Stare Miasto,5.0,8640,3,2017
3,675,"Zieleń dla Wrocławia – parki kieszonkowe, nasa...",Uzasadnienie\nWe Wrocławiu nastąpiła deklaraty...,1000000,Psie Pole,3.0,5398,4,2017
4,260,Budki dla kotów wolno żyjących,Uzasadnienie,70000,Psie Pole,3.0,4998,5,2017
5,550,Pieszo z Brochowa - uzupełnienie brakującego c...,Uzasadnienie\nZ Brochowa nie sposób wydostać s...,1000000,Krzyki,2.0,4468,6,2017
6,12,Bulwar Fizyków - naukowy plac zabaw w przestrz...,Uzasadnienie\nProponujemy stworzenie instalacj...,1000000,Psie Pole,3.0,4149,7,2017
7,159,Baza małych astronautów czyli kosmiczny plac z...,Uzasadnienie\nW ramach Budżetu Obywatelskiego ...,1000000,Krzyki,2.0,4098,8,2017
8,656,Toalety miejskie w parkach na miarę Wrocławia.,Uzasadnienie\nInicjatywa jest odpowiedzią na p...,1000000,Śródmieście,4.0,4008,9,2017
9,499,"""Konikowo"" - Budowa autorskiego placu zabaw na...","Uzasadnienie\nDrodzy Wrocławianie, Drodzy Sąsi...",1000000,Stare Miasto,5.0,3949,10,2017


In [43]:
#2. build prompt
# One prompt per row: the template file is filled with four fields taken
# from the project row.
# NOTE(review): the rendered prompt printed by the next cell is in English
# and quotes a budget in euros - confirm this template is the intended one
# for Wroclaw.
test['prompt'] = test.apply(
    lambda row: pb.build_prompt(
        'prompts/prompt_simple_pl.txt',
        {
            field: row[field]
            for field in ('project_name', 'cost', 'district', 'description')
        },
    ),
    axis=1,
)

In [45]:
# Show the first rendered prompt to check the template was filled in.
print(test['prompt'].iloc[0])

You are an expert model in the analysis of participatory budgeting elections, specifically in the context of the city of Wroclaw, Poland.
In Wroclaw, the municipality organizes participatory budgeting elections to fund citizen-led projects. Each resident can vote for between 1 and 3 initiatives on the official website.

Once the voting period ends, the projects are ranked according to the number of votes received. Then, the winning projects are selected using a greedy algorithm: starting with the project that received the most votes, projects are added one by one in decreasing order of votes, as long as the available budget allows. If the next project in the list exceeds the remaining budget, it is skipped in favor of the next affordable one, and so on, until the total budget of €8,000,000 is exhausted.
Your task is to analyze the following project, proposed as part of a new participatory budgeting election:

Project: Toalety we Wrocławskich parkach i na terenach rekreacyjnych.
Cost: 9

In [46]:
# Token count per prompt, then the price of each prompt for 'gpt-4-turbo'.
test['n_tokens'] = test['prompt'].apply(tokens_counter)
test['cost_usd'] = test['n_tokens'].apply(
    lambda count: prompt_cost(count, 'gpt-4-turbo')
)

# Summary figures for the whole experiment.
print(f"mean tokens by prompt: {test['n_tokens'].mean():.2f}")
print(f"avg.cost of each prediction: ${test['cost_usd'].mean():.2f}")
print(f"experiment total cost: {test['cost_usd'].sum():.2f}")

mean tokens by prompt: 876.80
avg.cost of each prediction: $0.01
experiment total cost: 0.44


In [47]:
#3. run experiment! :)
api_key = os.getenv('OPENAI_API_KEY')
from llm_client import call_openai_model

# One call_openai_model request per prompt, row by row; whatever the call
# returns is stored in the `out` column.
test['out'] = test['prompt'].apply(
    lambda text: call_openai_model(prompt=text, api_key=api_key)
)

LLM response receive!
LLM response receive!
LLM response receive!
LLM response receive!
LLM response receive!
LLM response receive!
LLM response receive!
LLM response receive!
LLM response receive!
LLM response receive!
LLM response receive!
LLM response receive!
LLM response receive!
LLM response receive!
LLM response receive!
LLM response receive!
LLM response receive!
LLM response receive!
LLM response receive!
LLM response receive!
LLM response receive!
LLM response receive!
LLM response receive!
LLM response receive!
LLM response receive!
LLM response receive!
LLM response receive!
LLM response receive!
LLM response receive!
LLM response receive!
LLM response receive!
LLM response receive!
LLM response receive!
LLM response receive!
LLM response receive!
LLM response receive!
LLM response receive!
LLM response receive!
LLM response receive!
LLM response receive!
LLM response receive!
LLM response receive!
LLM response receive!
LLM response receive!
LLM response receive!
LLM respon

In [56]:
import json
import re

def get_json_from_llm_response(out):
    """Extract the JSON object contained in a raw LLM response.

    Two strategies are tried in order:

    1. parse the whole response as JSON and keep it if it is an object;
    2. scan the fenced ```json {...} ``` blocks and return the first one
       that decodes.

    Args:
        out: Raw text returned by the model. Any non-string value
            (``None``, ``NaN``, ...) is treated as "no response".

    Returns:
        dict | None: The decoded JSON object, or ``None`` when no valid
        JSON object can be found.
    """
    # Guard first: the regex scan below raises TypeError on non-strings.
    if not isinstance(out, str):
        return None

    # 1. First attempt: the whole string is a valid JSON object.
    try:
        data = json.loads(out)
    except json.JSONDecodeError:
        data = None  # not plain JSON, fall through to the fenced-block scan
    if isinstance(data, dict):
        return data

    # 2. Second attempt: look for ```json {...} ``` blocks. A malformed block
    #    no longer hides a valid one that appears later in the response.
    for match in re.finditer(r"```json\s*({.*?})\s*```", out, re.DOTALL):
        try:
            return json.loads(match.group(1))
        except json.JSONDecodeError:
            continue

    # 3. No valid JSON could be found.
    return None

def get_predicted_votes(df, out_json, city = 'Wroclaw'):
    """Rescale the model's vote estimate from the 2016 total to the 2017 total.

    The estimate found under ``'estimated_votes'`` is divided by the sum of
    ``votes`` over the 2016 rows of ``df`` and multiplied by the sum of
    ``votes`` over the 2017 rows.

    Note: this definition replaces the Toulouse function of the same name
    defined earlier in the notebook.

    Args:
        df: DataFrame with at least the columns ``year`` and ``votes``.
        out_json: Parsed model response (dict), a JSON string, or ``None``.
        city: Only ``'Wroclaw'`` is accepted.

    Returns:
        The rescaled estimate, or ``None`` when the response is missing,
        is not a JSON object, has no usable ``'estimated_votes'`` value, or
        when the 2016 vote total is zero.

    Raises:
        ValueError: If ``city`` is not ``'Wroclaw'``.
    """
    if city != 'Wroclaw':
        raise ValueError(f"City '{city}' not supported")

    if isinstance(out_json, str):
        try:
            out_json = json.loads(out_json)
        except json.JSONDecodeError:
            return None

    # The parser returns None when it finds no JSON; do not crash on it.
    if not isinstance(out_json, dict):
        return None

    # Missing key -> float(None) -> TypeError; non-numeric text -> ValueError.
    try:
        estimated_votes = float(out_json.get('estimated_votes'))
    except (TypeError, ValueError):
        return None

    votes_2016 = df.loc[df['year'] == 2016, 'votes'].sum()
    votes_2017 = df.loc[df['year'] == 2017, 'votes'].sum()

    # Without a baseline the ratio is undefined.
    if votes_2016 == 0:
        return None

    relative_estimation = estimated_votes / votes_2016
    return relative_estimation * votes_2017

def get_predicted_rank(out_json):
    """Return the rank the model predicted for a project.

    Args:
        out_json: Parsed model response (dict), a JSON string, or ``None``.

    Returns:
        The value stored under ``'expected_rank'``, or ``None`` when the
        response is missing, is not a JSON object, or lacks that key.
    """
    if isinstance(out_json, str):
        try:
            out_json = json.loads(out_json)
        except json.JSONDecodeError:
            return None

    # Unparseable responses reach this function as None: report "no rank".
    if not isinstance(out_json, dict):
        return None

    return out_json.get('expected_rank')


In [55]:
# Peek at the first parsed response. The `out_json` column is created by a
# cell placed further down, so that cell has to be executed first.
test['out_json'].iloc[0]

{'estimated_votes': 3500,
 'confidence_interval': [3000, 4000],
 'expected_rank': 5}

In [52]:
# Turn every raw response into a parsed JSON object.
test['out_json'] = test['out'].apply(get_json_from_llm_response)

In [57]:
# Derive the predicted vote count and rank from the parsed responses.
test['predicted_votes'] = test['out_json'].apply(
    lambda parsed: get_predicted_votes(df, parsed)
)
test['predicted_rank'] = test['out_json'].apply(get_predicted_rank)

Unnamed: 0,project_id,project_name,description,cost,district,district_number,real_votes,real_rank,year,prompt,n_tokens,cost_usd,out,out_json,predicted_votes,predicted_rank
0,10,Toalety we Wrocławskich parkach i na terenach ...,Uzasadnienie\nW parkach i na terenach zielonyc...,900000,Śródmieście,4.0,10857,1,2017,You are an expert model in the analysis of par...,699,0.00699,1. **Relevance and Demand**: The project addre...,"{'estimated_votes': 3500, 'confidence_interval...",3287.610954,5
1,50,Zielona rowerowo-piesza obwodnica Wrocławia; E...,Uzasadnienie\nWalczymy o II ETAP. Ideą jest po...,1000000,Stare Miasto,5.0,10796,2,2017,You are an expert model in the analysis of par...,814,0.00814,1. **Project Appeal and Relevance**: The proje...,"{'estimated_votes': 4500, 'confidence_interval...",4226.928369,15
2,18,Rowerowy Wrocław 2017,Uzasadnienie\nProjekt przewiduje poprawę bezpi...,1000000,Stare Miasto,5.0,8640,3,2017,You are an expert model in the analysis of par...,850,0.0085,"1. **Project Relevance**: The project ""Rowerow...","{'estimated_votes': 4500, 'confidence_interval...",4226.928369,5
3,675,"Zieleń dla Wrocławia – parki kieszonkowe, nasa...",Uzasadnienie\nWe Wrocławiu nastąpiła deklaraty...,1000000,Psie Pole,3.0,5398,4,2017,You are an expert model in the analysis of par...,994,0.00994,"1. **Relevance of the Project**: The project ""...","{'estimated_votes': 4500, 'confidence_interval...",4226.928369,5
4,260,Budki dla kotów wolno żyjących,Uzasadnienie,70000,Psie Pole,3.0,4998,5,2017,You are an expert model in the analysis of par...,445,0.00445,"1. **Project Theme and Appeal**: The project ""...","{'estimated_votes': 1500, 'confidence_interval...",1408.976123,25
5,550,Pieszo z Brochowa - uzupełnienie brakującego c...,Uzasadnienie\nZ Brochowa nie sposób wydostać s...,1000000,Krzyki,2.0,4468,6,2017,You are an expert model in the analysis of par...,930,0.0093,1. **Project Relevance**: The project addresse...,"{'estimated_votes': 4500, 'confidence_interval...",4226.928369,5
6,12,Bulwar Fizyków - naukowy plac zabaw w przestrz...,Uzasadnienie\nProponujemy stworzenie instalacj...,1000000,Psie Pole,3.0,4149,7,2017,You are an expert model in the analysis of par...,853,0.00853,1. **Project Appeal and Relevance**: The proje...,"{'estimated_votes': 4500, 'confidence_interval...",4226.928369,5
7,159,Baza małych astronautów czyli kosmiczny plac z...,Uzasadnienie\nW ramach Budżetu Obywatelskiego ...,1000000,Krzyki,2.0,4098,8,2017,You are an expert model in the analysis of par...,963,0.00963,"1. **Project Appeal**: The project, aimed at c...","{'estimated_votes': 4500, 'confidence_interval...",4226.928369,10
8,656,Toalety miejskie w parkach na miarę Wrocławia.,Uzasadnienie\nInicjatywa jest odpowiedzią na p...,1000000,Śródmieście,4.0,4008,9,2017,You are an expert model in the analysis of par...,1043,0.01043,1. **Relevance and Demand**: The project addre...,"{'estimated_votes': 4500, 'confidence_interval...",4226.928369,5
9,499,"""Konikowo"" - Budowa autorskiego placu zabaw na...","Uzasadnienie\nDrodzy Wrocławianie, Drodzy Sąsi...",1000000,Stare Miasto,5.0,3949,10,2017,You are an expert model in the analysis of par...,1082,0.01082,1. The project is located in a popular and acc...,"{'estimated_votes': 4500, 'confidence_interval...",4226.928369,5


In [60]:
# Keep the identifier, the real outcome, the prompt/response pair and the
# two predictions.
results = test.filter(
    items=[
        'project_id',
        'real_votes',
        'real_rank',
        'prompt',
        'out',
        'out_json',
        'predicted_votes',
        'predicted_rank',
    ]
)
results

Unnamed: 0,project_id,real_votes,real_rank,prompt,out,out_json,predicted_votes,predicted_rank
0,10,10857,1,You are an expert model in the analysis of par...,1. **Relevance and Demand**: The project addre...,"{'estimated_votes': 3500, 'confidence_interval...",3287.610954,5
1,50,10796,2,You are an expert model in the analysis of par...,1. **Project Appeal and Relevance**: The proje...,"{'estimated_votes': 4500, 'confidence_interval...",4226.928369,15
2,18,8640,3,You are an expert model in the analysis of par...,"1. **Project Relevance**: The project ""Rowerow...","{'estimated_votes': 4500, 'confidence_interval...",4226.928369,5
3,675,5398,4,You are an expert model in the analysis of par...,"1. **Relevance of the Project**: The project ""...","{'estimated_votes': 4500, 'confidence_interval...",4226.928369,5
4,260,4998,5,You are an expert model in the analysis of par...,"1. **Project Theme and Appeal**: The project ""...","{'estimated_votes': 1500, 'confidence_interval...",1408.976123,25
5,550,4468,6,You are an expert model in the analysis of par...,1. **Project Relevance**: The project addresse...,"{'estimated_votes': 4500, 'confidence_interval...",4226.928369,5
6,12,4149,7,You are an expert model in the analysis of par...,1. **Project Appeal and Relevance**: The proje...,"{'estimated_votes': 4500, 'confidence_interval...",4226.928369,5
7,159,4098,8,You are an expert model in the analysis of par...,"1. **Project Appeal**: The project, aimed at c...","{'estimated_votes': 4500, 'confidence_interval...",4226.928369,10
8,656,4008,9,You are an expert model in the analysis of par...,1. **Relevance and Demand**: The project addre...,"{'estimated_votes': 4500, 'confidence_interval...",4226.928369,5
9,499,3949,10,You are an expert model in the analysis of par...,1. The project is located in a popular and acc...,"{'estimated_votes': 4500, 'confidence_interval...",4226.928369,5


In [61]:
# to_csv() does not create missing folders, so make sure the target exists
# before writing the semicolon-separated results file.
os.makedirs('output/predictions', exist_ok=True)
results.to_csv('output/predictions/wrc_simple_prompt_full_1.csv', sep=";", index=False)
print('results ready!')

results ready!
