### TLS 24

In [14]:
# 1. load data
import os
import pandas as pd
import src.utils as ut
import prompt_builder as pb

from dotenv import load_dotenv
from src.data_loader import load_and_prepare_projects, load_test_dataset
from prompt_utils import tokens_counter, prompt_cost


# Input files for the two Toulouse elections (2022 and 2024).
path_22 = 'data/tls22_projects.csv'
path_24 = 'data/tls24_projects.csv'

# Load both files; the second return value is handed to the prompt builder later on.
df, df22_shuffled = load_and_prepare_projects(path_22, path_24, city="Toulouse")

# Ask for as many test rows as `df` has rows with year == 2024.
test = load_test_dataset(
    df,
    rows=len(df[df['year'] == 2024]),
    city='Toulouse',
)

# Populate the process environment from the local .env file.
load_dotenv()

# Database connection settings; every lookup raises KeyError when the variable is unset.
conn_params = {
    key: os.environ[env_name]
    for key, env_name in (
        ("host", "PG_HOST"),
        ("database", "PG_DATABASE"),
        ("user", "PG_USER"),
        ("password", "PG_PASSWORD"),
    )
}

In [15]:
#2. build prompt
# Description of the previous election's projects; the same text goes into every prompt.
last_projects_results = pb.get_all_projects_from_22_election(df22_shuffled)

# One prompt per test row: the shared context plus that row's own fields.
test['prompt'] = test.apply(
    lambda row: pb.build_prompt(
        'prompts/prompt_in_context_fr.txt',
        {
            'last_projects_results': last_projects_results,
            **{
                field: row[field]
                for field in ('project_name', 'cost', 'district', 'description')
            },
        },
    ),
    axis=1,
)

In [16]:
# Sanity check: print the first generated prompt (first row by position).
print(test['prompt'].iloc[0])

Vous êtes un modèle expert dans l’analyse des élections de budgets participatifs, spécifiquement dans le contexte de la ville de Toulouse, en France.

À Toulouse, la municipalité organise des élections de budget participatif pour financer des projets citoyens. Chaque habitant peut voter pour entre 1 et 3 initiatives sur le site web officiel. 
Les votes sont anonymes.

Une fois la période de vote terminée, les projets sont classés selon le nombre de voix obtenues et se voient attribuer un rang. Ensuite, les projets gagnants sont sélectionnés à l’aide d’un algorithme greedy : on commence par le projet ayant obtenu le plus de voix, puis on ajoute successivement les projets suivants les plus votés, tant que le budget disponible le permet.
Lorsque le budget restant ne permet plus de financer le projet suivant dans la liste, on passe au suivant qui peut l’être, et ainsi de suite, jusqu’à épuisement du budget total de 8000000 euros.

Voici la liste des 200 initiatives proposées lors de l’élec

In [17]:
# Token count of every prompt, then the estimated price of each one for 'gpt-4-turbo'.
test['n_tokens'] = test['prompt'].apply(tokens_counter)
test['cost_usd'] = test['n_tokens'].apply(
    lambda count: prompt_cost(count, 'gpt-4-turbo')
)

# Summary figures for the whole experiment.
print(f'mean tokens by prompt: {test.n_tokens.mean():.2f}')
print(f'avg.cost of each prediction: ${test.cost_usd.mean():.2f}')
print(f'experiment total cost: {test.cost_usd.sum():.2f}')

mean tokens by prompt: 13511.34
avg.cost of each prediction: $0.14
experiment total cost: 24.73


In [None]:
#3. run experiment! :)
from llm_client import call_openai_model

# The key is read from the environment; os.getenv yields None when it is not set.
api_key = os.getenv('OPENAI_API_KEY')

# One model call per prompt; the raw answers are stored in the 'out' column.
test['out'] = test['prompt'].apply(
    lambda prompt_text: call_openai_model(prompt=prompt_text, api_key=api_key)
)

In [None]:
import json
import re

def get_json_from_llm_response(out):
    """Extract the JSON object contained in an LLM answer.

    Two formats are tried, in order:
      1. the whole answer is itself a JSON object;
      2. the answer contains a fenced ```json ... ``` block holding an object.

    Parameters
    ----------
    out : str
        Raw text returned by the model. Any non-string value (None, NaN, ...)
        is treated as "no JSON" instead of raising inside ``re.search``.

    Returns
    -------
    dict or None
        The decoded object, or None when nothing parseable was found.
    """
    if not isinstance(out, str):
        return None

    # Case 1: the answer is pure JSON. Non-dict JSON (list, number, ...) is ignored.
    try:
        data = json.loads(out)
    except json.JSONDecodeError:
        data = None
    if isinstance(data, dict):
        return data

    # Case 2: reasoning text followed by a fenced JSON block.
    match = re.search(r"```json\s*({.*?})\s*```", out, re.DOTALL)
    if match is None:
        return None
    try:
        return json.loads(match.group(1))
    except json.JSONDecodeError:
        return None

def get_predicted_votes(df, out_json, city = 'Toulouse'):
    """Rescale the model's vote estimate from the 2022 to the 2024 vote total.

    The estimate ('voix_estimées') is divided by the sum of ``votes`` over the
    rows with year == 2022 and multiplied by the sum over the rows with
    year == 2024.

    Parameters
    ----------
    df : pandas.DataFrame
        Must provide the columns 'year' and 'votes'.
    out_json : dict, str or None
        Parsed model answer, or a JSON string that decodes to it.
    city : str
        Only 'Toulouse' is supported.

    Returns
    -------
    float or None
        The rescaled estimate; None when the answer is missing, not a JSON
        object, lacks 'voix_estimées', or when the 2022 total is zero.

    Raises
    ------
    ValueError
        If ``city`` is not 'Toulouse'.
    """
    if city != 'Toulouse':
        raise ValueError(f"City '{city}' not supported")

    if isinstance(out_json, str):
        try:
            out_json = json.loads(out_json)
        except json.JSONDecodeError:
            return None

    # A failed JSON extraction upstream produces None; do not subscript it.
    if not isinstance(out_json, dict):
        return None
    estimated_votes = out_json.get('voix_estimées')
    if estimated_votes is None:
        return None

    votes_2022 = df[df['year'] == 2022].votes.sum()
    votes_2024 = df[df['year'] == 2024].votes.sum()
    if votes_2022 == 0:
        # No reference total: a relative share cannot be computed.
        return None

    relative_estimation = estimated_votes / votes_2022
    return relative_estimation * votes_2024

def get_predicted_rank(out_json):
    """Return the rank predicted by the model ('position_attendue').

    ``out_json`` may be the parsed answer (dict) or a JSON string. None is
    returned when the answer is missing, is not a JSON object, or has no
    'position_attendue' key.
    """
    if isinstance(out_json, str):
        try:
            out_json = json.loads(out_json)
        except json.JSONDecodeError:
            return None

    # A failed JSON extraction upstream produces None; do not subscript it.
    if not isinstance(out_json, dict):
        return None
    return out_json.get('position_attendue')


def get_if_is_top5(out_json):
    """Return the model's 'dans_top_5' flag.

    ``out_json`` may be the parsed answer (dict) or a JSON string. None is
    returned when the answer is missing, is not a JSON object, or has no
    'dans_top_5' key.
    """
    if isinstance(out_json, str):
        try:
            out_json = json.loads(out_json)
        except json.JSONDecodeError:
            return None

    # A failed JSON extraction upstream produces None; do not subscript it.
    if not isinstance(out_json, dict):
        return None
    return out_json.get('dans_top_5')

def get_if_is_top10(out_json):
    """Return the model's 'dans_top_10' flag.

    ``out_json`` may be the parsed answer (dict) or a JSON string. None is
    returned when the answer is missing, is not a JSON object, or has no
    'dans_top_10' key.
    """
    if isinstance(out_json, str):
        try:
            out_json = json.loads(out_json)
        except json.JSONDecodeError:
            return None

    # A failed JSON extraction upstream produces None; do not subscript it.
    if not isinstance(out_json, dict):
        return None
    return out_json.get('dans_top_10')

# Preview the first parsed answer. Guarded because the 'out_json' column is only
# assigned further down the notebook, so it may not exist yet when this cell runs.
test['out_json'].iloc[0] if 'out_json' in test.columns else None

In [None]:
# Parse every raw model answer with the extraction helper defined in this notebook.
test['out_json'] = test['out'].apply(get_json_from_llm_response)

In [None]:
# Derive the prediction columns from the parsed answers using the src.utils helpers.
test['predicted_votes'] = test['out_json'].apply(
    lambda parsed: ut.get_predicted_votes(df, parsed)
)
test['predicted_rank'] = test['out_json'].apply(ut.get_predicted_rank)
test['is_top5'] = test['out_json'].apply(ut.get_if_is_top5)
test['is_top10'] = test['out_json'].apply(ut.get_if_is_top10)

In [None]:
# Keep identifiers, ground truth, prompt/answer and prediction columns.
# DataFrame.filter silently skips any listed column that is absent.
results = test.filter(
    items=[
        'project_id',
        'real_votes',
        'real_rank',
        'prompt',
        'out',
        'out_json',
        'predicted_votes',
        'predicted_rank',
        'is_top5',
        'is_top10',
    ]
)

results

In [None]:
# to_csv does not create missing folders, so make sure the target directory exists.
os.makedirs('output/predictions', exist_ok=True)
results.to_csv('output/predictions/tls_incontext_prompt.csv', sep=";", index=False)
print('results ready!')

### TLS 24

In [1]:
# 1. load data
import os
import pandas as pd
import src.utils as ut
import prompt_builder as pb

from dotenv import load_dotenv
from src.data_loader import load_and_prepare_projects, load_test_dataset
from prompt_utils import tokens_counter, prompt_cost


# Input files for the two Toulouse elections (2022 and 2024).
path_22 = 'data/tls22_projects.csv'
path_24 = 'data/tls24_projects.csv'

# Load both files; the second return value is handed to the prompt builder later on.
df, df22_shuffled = load_and_prepare_projects(path_22, path_24, city="Toulouse")

# Ask for as many test rows as `df` has rows with year == 2024.
test = load_test_dataset(
    df,
    rows=len(df[df['year'] == 2024]),
    city='Toulouse',
)

# Populate the process environment from the local .env file.
load_dotenv()

# Database connection settings; every lookup raises KeyError when the variable is unset.
conn_params = {
    key: os.environ[env_name]
    for key, env_name in (
        ("host", "PG_HOST"),
        ("database", "PG_DATABASE"),
        ("user", "PG_USER"),
        ("password", "PG_PASSWORD"),
    )
}

In [4]:
#2. build prompt

# Description of the previous election's projects; the same text goes into every prompt.
last_projects_results = pb.get_all_projects_from_22_election(df22_shuffled)

# One prompt per test row: the shared context plus that row's own fields.
test['prompt'] = test.apply(
    lambda row: pb.build_prompt(
        'prompts/prompt_in_context_fr.txt',
        {
            'last_projects_results': last_projects_results,
            **{
                field: row[field]
                for field in ('project_name', 'cost', 'district', 'description')
            },
        },
    ),
    axis=1,
)

In [6]:
# Sanity check: print the first generated prompt (first row by position).
print(test['prompt'].iloc[0])

Vous êtes un modèle expert dans l’analyse des élections de budgets participatifs, spécifiquement dans le contexte de la ville de Toulouse, en France.

À Toulouse, la municipalité organise des élections de budget participatif pour financer des projets citoyens. Chaque habitant peut voter pour entre 1 et 3 initiatives sur le site web officiel. 
Les votes sont anonymes.

Une fois la période de vote terminée, les projets sont classés selon le nombre de voix obtenues et se voient attribuer un rang. Ensuite, les projets gagnants sont sélectionnés à l’aide d’un algorithme greedy : on commence par le projet ayant obtenu le plus de voix, puis on ajoute successivement les projets suivants les plus votés, tant que le budget disponible le permet.
Lorsque le budget restant ne permet plus de financer le projet suivant dans la liste, on passe au suivant qui peut l’être, et ainsi de suite, jusqu’à épuisement du budget total de 8000000 euros.

Voici la liste des 200 initiatives proposées lors de l’élec

In [7]:
# Token count of every prompt, then the estimated price of each one for 'gpt-4-turbo'.
test['n_tokens'] = test['prompt'].apply(tokens_counter)
test['cost_usd'] = test['n_tokens'].apply(
    lambda count: prompt_cost(count, 'gpt-4-turbo')
)

# Summary figures for the whole experiment.
print(f'mean tokens by prompt: {test.n_tokens.mean():.2f}')
print(f'avg.cost of each prediction: ${test.cost_usd.mean():.2f}')
print(f'experiment total cost: {test.cost_usd.sum():.2f}')

mean tokens by prompt: 13511.34
avg.cost of each prediction: $0.14
experiment total cost: 24.73


In [8]:
#3. run experiment! :)
from llm_client import call_openai_model

# The key is read from the environment; os.getenv yields None when it is not set.
api_key = os.getenv('OPENAI_API_KEY')

# One model call per prompt; the raw answers are stored in the 'out' column.
test['out'] = test['prompt'].apply(
    lambda prompt_text: call_openai_model(prompt=prompt_text, api_key=api_key)
)

LLM response receive!
LLM response receive!
LLM response receive!
LLM response receive!
LLM response receive!
LLM response receive!
LLM response receive!
LLM response receive!
LLM response receive!
LLM response receive!
LLM response receive!
LLM response receive!
LLM response receive!
LLM response receive!
LLM response receive!
LLM response receive!
LLM response receive!
LLM response receive!
LLM response receive!
LLM response receive!
LLM response receive!
LLM response receive!
LLM response receive!
LLM response receive!
LLM response receive!
LLM response receive!
LLM response receive!
LLM response receive!
LLM response receive!
LLM response receive!
LLM response receive!
LLM response receive!
LLM response receive!
LLM response receive!
LLM response receive!
LLM response receive!
LLM response receive!
LLM response receive!
LLM response receive!
LLM response receive!
LLM response receive!
LLM response receive!
LLM response receive!
LLM response receive!
LLM response receive!
LLM respon

In [14]:
import json
import re

def get_json_from_llm_response(out):
    """Extract the JSON object contained in an LLM answer.

    Two formats are tried, in order:
      1. the whole answer is itself a JSON object;
      2. the answer contains a fenced ```json ... ``` block holding an object.

    Parameters
    ----------
    out : str
        Raw text returned by the model. Any non-string value (None, NaN, ...)
        is treated as "no JSON" instead of raising inside ``re.search``.

    Returns
    -------
    dict or None
        The decoded object, or None when nothing parseable was found.
    """
    if not isinstance(out, str):
        return None

    # Case 1: the answer is pure JSON. Non-dict JSON (list, number, ...) is ignored.
    try:
        data = json.loads(out)
    except json.JSONDecodeError:
        data = None
    if isinstance(data, dict):
        return data

    # Case 2: reasoning text followed by a fenced JSON block.
    match = re.search(r"```json\s*({.*?})\s*```", out, re.DOTALL)
    if match is None:
        return None
    try:
        return json.loads(match.group(1))
    except json.JSONDecodeError:
        return None

def get_predicted_votes(df, out_json, city = 'Toulouse'):
    """Rescale the model's vote estimate from the 2022 to the 2024 vote total.

    The estimate ('voix_estimées') is divided by the sum of ``votes`` over the
    rows with year == 2022 and multiplied by the sum over the rows with
    year == 2024.

    Parameters
    ----------
    df : pandas.DataFrame
        Must provide the columns 'year' and 'votes'.
    out_json : dict, str or None
        Parsed model answer, or a JSON string that decodes to it.
    city : str
        Only 'Toulouse' is supported.

    Returns
    -------
    float or None
        The rescaled estimate; None when the answer is missing, not a JSON
        object, lacks 'voix_estimées', or when the 2022 total is zero.

    Raises
    ------
    ValueError
        If ``city`` is not 'Toulouse'.
    """
    if city != 'Toulouse':
        raise ValueError(f"City '{city}' not supported")

    if isinstance(out_json, str):
        try:
            out_json = json.loads(out_json)
        except json.JSONDecodeError:
            return None

    # A failed JSON extraction upstream produces None; do not subscript it.
    if not isinstance(out_json, dict):
        return None
    estimated_votes = out_json.get('voix_estimées')
    if estimated_votes is None:
        return None

    votes_2022 = df[df['year'] == 2022].votes.sum()
    votes_2024 = df[df['year'] == 2024].votes.sum()
    if votes_2022 == 0:
        # No reference total: a relative share cannot be computed.
        return None

    relative_estimation = estimated_votes / votes_2022
    return relative_estimation * votes_2024

def get_predicted_rank(out_json):
    """Return the rank predicted by the model ('position_attendue').

    ``out_json`` may be the parsed answer (dict) or a JSON string. None is
    returned when the answer is missing, is not a JSON object, or has no
    'position_attendue' key.
    """
    if isinstance(out_json, str):
        try:
            out_json = json.loads(out_json)
        except json.JSONDecodeError:
            return None

    # A failed JSON extraction upstream produces None; do not subscript it.
    if not isinstance(out_json, dict):
        return None
    return out_json.get('position_attendue')


def get_if_is_top5(out_json):
    """Return the model's 'dans_top_5' flag.

    ``out_json`` may be the parsed answer (dict) or a JSON string. None is
    returned when the answer is missing, is not a JSON object, or has no
    'dans_top_5' key.
    """
    if isinstance(out_json, str):
        try:
            out_json = json.loads(out_json)
        except json.JSONDecodeError:
            return None

    # A failed JSON extraction upstream produces None; do not subscript it.
    if not isinstance(out_json, dict):
        return None
    return out_json.get('dans_top_5')

def get_if_is_top10(out_json):
    """Return the model's 'dans_top_10' flag.

    ``out_json`` may be the parsed answer (dict) or a JSON string. None is
    returned when the answer is missing, is not a JSON object, or has no
    'dans_top_10' key.
    """
    if isinstance(out_json, str):
        try:
            out_json = json.loads(out_json)
        except json.JSONDecodeError:
            return None

    # A failed JSON extraction upstream produces None; do not subscript it.
    if not isinstance(out_json, dict):
        return None
    return out_json.get('dans_top_10')

In [15]:
# Parse every raw model answer with the extraction helper defined in this notebook.
test['out_json'] = test['out'].apply(get_json_from_llm_response)

# Show the first parsed answer.
test['out_json'].iloc[0]

{'voix_estimées': 75,
 'intervalle_confiance': [50, 100],
 'position_attendue': 50,
 'dans_top_5': 0,
 'dans_top_10': 0}

In [16]:
# Derive the prediction columns from the parsed answers using the notebook's own getters.
test['predicted_votes'] = test['out_json'].apply(
    lambda parsed: get_predicted_votes(df, parsed)
)
test['predicted_rank'] = test['out_json'].apply(get_predicted_rank)
test['is_top5'] = test['out_json'].apply(get_if_is_top5)
test['is_top10'] = test['out_json'].apply(get_if_is_top10)

In [18]:
#4. save results.
# Keep identifiers, ground truth, prompt/answer and prediction columns.
# DataFrame.filter silently skips any listed column that is absent.
results = test.filter(
    items=[
        'project_id',
        'real_votes',
        'real_rank',
        'prompt',
        'out',
        'out_json',
        'predicted_votes',
        'predicted_rank',
        'is_top5',
        'is_top10',
    ]
)
results

Unnamed: 0,project_id,real_votes,real_rank,prompt,out,out_json,predicted_votes,predicted_rank,is_top5,is_top10
0,263,590,1,Vous êtes un modèle expert dans l’analyse des ...,1. Le projet concerne l'installation de racks ...,"{'voix_estimées': 75, 'intervalle_confiance': ...",140.746166,50,0,0
1,320,559,2,Vous êtes un modèle expert dans l’analyse des ...,1. Le projet de rénovation de la piste cyclabl...,"{'voix_estimées': 150, 'intervalle_confiance':...",281.492332,20,0,0
2,394,436,3,Vous êtes un modèle expert dans l’analyse des ...,1. Le projet proposé concerne la lutte contre ...,"{'voix_estimées': 85, 'intervalle_confiance': ...",159.512321,45,0,0
3,333,366,4,Vous êtes un modèle expert dans l’analyse des ...,1. Le projet de rénovation du skatepark à Rang...,"{'voix_estimées': 85, 'intervalle_confiance': ...",159.512321,45,0,0
4,265,334,5,Vous êtes un modèle expert dans l’analyse des ...,"Raisonnement pour l'analyse du projet ""Refaire...","{'voix_estimées': 120, 'intervalle_confiance':...",225.193865,20,0,0
...,...,...,...,...,...,...,...,...,...,...
178,372,20,179,Vous êtes un modèle expert dans l’analyse des ...,1. Le projet proposé concerne l'installation d...,"{'voix_estimées': 75, 'intervalle_confiance': ...",140.746166,50,0,0
179,356,19,180,Vous êtes un modèle expert dans l’analyse des ...,1. Le projet proposé concerne la débitumisatio...,"{'voix_estimées': 120, 'intervalle_confiance':...",225.193865,25,0,0
180,230,19,181,Vous êtes un modèle expert dans l’analyse des ...,"Raisonnement pour l'analyse du projet ""Install...","{'voix_estimées': 50, 'intervalle_confiance': ...",93.830777,80,0,0
181,370,18,182,Vous êtes un modèle expert dans l’analyse des ...,1. Le projet proposé concerne l'installation d...,"{'voix_estimées': 90, 'intervalle_confiance': ...",168.895399,40,0,0


In [19]:
# to_csv does not create missing folders, so make sure the target directory exists.
os.makedirs('output/predictions', exist_ok=True)
results.to_csv('output/predictions/tls_in_context_full_1.csv', sep=";", index=False)
print('results ready!')

results ready!


### WRC 17

In [21]:
# 1. load data
import os
import pandas as pd
import src.utils as ut
import prompt_builder as pb

from dotenv import load_dotenv
from src.data_loader import load_and_prepare_projects, load_test_dataset
from prompt_utils import tokens_counter, prompt_cost


# Input files for the two Wroclaw elections (2016 and 2017).
path_16 = 'data/wrc16_projects.csv'
path_17 = 'data/wrc17_projects.csv'

# Load both files; the second return value is handed to the prompt builder later on.
df, df16_shuffled = load_and_prepare_projects(path_16, path_17, city='Wroclaw')

# Build the test set, requesting 50 rows.
test = load_test_dataset(df, rows=50, city='Wroclaw')

# Populate the process environment from the local .env file.
load_dotenv()

# Database connection settings; every lookup raises KeyError when the variable is unset.
conn_params = {
    key: os.environ[env_name]
    for key, env_name in (
        ("host", "PG_HOST"),
        ("database", "PG_DATABASE"),
        ("user", "PG_USER"),
        ("password", "PG_PASSWORD"),
    )
}

test

Unnamed: 0,project_id,project_name,description,cost,district,district_number,real_votes,real_rank,year
0,10,Toalety we Wrocławskich parkach i na terenach ...,Uzasadnienie\nW parkach i na terenach zielonyc...,900000,Śródmieście,4.0,10857,1,2017
1,50,Zielona rowerowo-piesza obwodnica Wrocławia; E...,Uzasadnienie\nWalczymy o II ETAP. Ideą jest po...,1000000,Stare Miasto,5.0,10796,2,2017
2,18,Rowerowy Wrocław 2017,Uzasadnienie\nProjekt przewiduje poprawę bezpi...,1000000,Stare Miasto,5.0,8640,3,2017
3,675,"Zieleń dla Wrocławia – parki kieszonkowe, nasa...",Uzasadnienie\nWe Wrocławiu nastąpiła deklaraty...,1000000,Psie Pole,3.0,5398,4,2017
4,260,Budki dla kotów wolno żyjących,Uzasadnienie,70000,Psie Pole,3.0,4998,5,2017
5,550,Pieszo z Brochowa - uzupełnienie brakującego c...,Uzasadnienie\nZ Brochowa nie sposób wydostać s...,1000000,Krzyki,2.0,4468,6,2017
6,12,Bulwar Fizyków - naukowy plac zabaw w przestrz...,Uzasadnienie\nProponujemy stworzenie instalacj...,1000000,Psie Pole,3.0,4149,7,2017
7,159,Baza małych astronautów czyli kosmiczny plac z...,Uzasadnienie\nW ramach Budżetu Obywatelskiego ...,1000000,Krzyki,2.0,4098,8,2017
8,656,Toalety miejskie w parkach na miarę Wrocławia.,Uzasadnienie\nInicjatywa jest odpowiedzią na p...,1000000,Śródmieście,4.0,4008,9,2017
9,499,"""Konikowo"" - Budowa autorskiego placu zabaw na...","Uzasadnienie\nDrodzy Wrocławianie, Drodzy Sąsi...",1000000,Stare Miasto,5.0,3949,10,2017


In [22]:
#2. build prompt

# Description of the previous election's projects; the same text goes into every prompt.
# (The helper's name mentions the '22 election, but here it receives the 2016 data.)
last_projects_results = pb.get_all_projects_from_22_election(df16_shuffled)

# One prompt per test row: the shared context plus that row's own fields.
test['prompt'] = test.apply(
    lambda row: pb.build_prompt(
        'prompts/prompt_in_context_pl.txt',
        {
            'last_projects_results': last_projects_results,
            **{
                field: row[field]
                for field in ('project_name', 'cost', 'district', 'description')
            },
        },
    ),
    axis=1,
)

In [23]:
# Positional access (as in the other preview cells), so the preview does not
# depend on a row being labelled 0 in the index.
print(test['prompt'].iloc[0])

You are an expert model in analyzing participatory budgeting elections, specifically in the context of the city of Wroclaw, Poland.

In Wroclaw, the municipality organizes participatory budgeting elections to fund citizen projects.

Once the voting period is over, the projects are ranked based on the number of votes they received. Then, the winning projects are selected using a greedy algorithm: starting with the project that received the most votes, the next most voted projects are added in sequence as long as the remaining budget allows it.
When the next project on the list exceeds the available budget, the algorithm skips to the next project that can be funded, and so on, until the total budget of 4500000 € is exhausted.

Here is the list of the 52 initiatives proposed during the election, including their name, cost, and district:
- Budowa wrocławskiej wypożyczalni rowerów integracyjnych typu handbike wraz z odpowiednim zapleczem socjalno-sanitarnym: (Coût: 550000 €, District: Psie 

In [24]:
# Token count of every prompt, then the estimated price of each one for 'gpt-4-turbo'.
test['n_tokens'] = test['prompt'].apply(tokens_counter)
test['cost_usd'] = test['n_tokens'].apply(
    lambda count: prompt_cost(count, 'gpt-4-turbo')
)

# Summary figures for the whole experiment.
print(f'mean tokens by prompt: {test.n_tokens.mean():.2f}')
print(f'avg.cost of each prediction: ${test.cost_usd.mean():.2f}')
print(f'experiment total cost: {test.cost_usd.sum():.2f}')

mean tokens by prompt: 3757.80
avg.cost of each prediction: $0.04
experiment total cost: 1.88


In [26]:
#3. run experiment! :)
from llm_client import call_openai_model

# The key is read from the environment; os.getenv yields None when it is not set.
api_key = os.getenv('OPENAI_API_KEY')

# One model call per prompt; the raw answers are stored in the 'out' column.
test['out'] = test['prompt'].apply(
    lambda prompt_text: call_openai_model(prompt=prompt_text, api_key=api_key)
)

LLM response receive!
LLM response receive!
LLM response receive!
LLM response receive!
LLM response receive!
LLM response receive!
LLM response receive!
LLM response receive!
LLM response receive!
LLM response receive!
LLM response receive!
LLM response receive!
LLM response receive!
LLM response receive!
LLM response receive!
LLM response receive!
LLM response receive!
LLM response receive!
LLM response receive!
LLM response receive!
LLM response receive!
LLM response receive!
LLM response receive!
LLM response receive!
LLM response receive!
LLM response receive!
LLM response receive!
LLM response receive!
LLM response receive!
LLM response receive!
LLM response receive!
LLM response receive!
LLM response receive!
LLM response receive!
LLM response receive!
LLM response receive!
LLM response receive!
LLM response receive!
LLM response receive!
LLM response receive!
LLM response receive!
LLM response receive!
LLM response receive!
LLM response receive!
LLM response receive!
LLM respon

In [28]:
def get_json_from_llm_response(out):
    """Extract the JSON object contained in an LLM answer.

    The answer may be a bare JSON object or contain a fenced ```json ... ```
    block holding one.

    Parameters
    ----------
    out : str
        Raw text returned by the model. Non-string values are accepted and
        treated as "no JSON".

    Returns
    -------
    dict or str
        The decoded object, or the sentinel string "No JSON in LLM answer..."
        when the input is not a string, holds no JSON object, or the fenced
        block cannot be decoded.
    """
    no_json = "No JSON in LLM answer..."
    if not isinstance(out, str):
        return no_json

    # A bare JSON object is accepted as-is.
    try:
        data = json.loads(out)
    except json.JSONDecodeError:
        data = None
    if isinstance(data, dict):
        return data

    match = re.search(r"```json\s*({.*?})\s*```", out, re.DOTALL)
    if not match:
        return no_json
    try:
        return json.loads(match.group(1))
    except json.JSONDecodeError:
        # Malformed JSON inside the fence is reported like a missing block.
        return no_json
    
def get_predicted_votes(df, out_json):
    """Rescale the model's vote estimate from the 2016 to the 2017 vote total.

    The estimate ('estimated_votes') is divided by the sum of ``votes`` over
    the rows with year == 2016 and multiplied by the sum over the rows with
    year == 2017.

    Parameters
    ----------
    df : pandas.DataFrame
        Must provide the columns 'year' and 'votes'.
    out_json : dict
        Parsed model answer. Anything that is not a dict (None, or the string
        returned when no JSON was found) yields None.

    Returns
    -------
    float or None
        The rescaled estimate; None when the answer is unusable, lacks
        'estimated_votes', or when the 2016 total is zero.
    """
    if not isinstance(out_json, dict):
        return None
    estimated_votes = out_json.get('estimated_votes')
    if estimated_votes is None:
        return None

    votes_2016 = df[df['year'] == 2016].votes.sum()
    votes_2017 = df[df['year'] == 2017].votes.sum()
    if votes_2016 == 0:
        # No reference total: a relative share cannot be computed.
        return None

    relative_estimation = estimated_votes / votes_2016
    return relative_estimation * votes_2017
    
def get_predicted_rank(out_json):
    """Return the model's 'expected_rank', or None when the answer is not a dict or lacks the key."""
    if not isinstance(out_json, dict):
        return None
    return out_json.get('expected_rank')

def get_if_is_top5(out_json):
    """Return the model's 'in_top_5' flag, or None when the answer is not a dict or lacks the key."""
    if not isinstance(out_json, dict):
        return None
    return out_json.get('in_top_5')

def get_if_is_top10(out_json):
    """Return the model's 'in_top_10' flag, or None when the answer is not a dict or lacks the key."""
    if not isinstance(out_json, dict):
        return None
    return out_json.get('in_top_10')

# Parse the raw answers with the src.utils helper (not the function defined above in this cell).
test['out_json'] = test['out'].apply(ut.get_json_from_llm_response)

# The prediction columns use the getters defined above in this cell.
test['predicted_votes'] = test['out_json'].apply(
    lambda parsed: get_predicted_votes(df, parsed)
)
test['predicted_rank'] = test['out_json'].apply(get_predicted_rank)
test['is_top5'] = test['out_json'].apply(get_if_is_top5)
test['is_top10'] = test['out_json'].apply(get_if_is_top10)

In [30]:
# Keep identifiers, ground truth, prompt/answer and prediction columns.
# DataFrame.filter silently skips any listed column that is absent.
results = test.filter(
    items=[
        'project_id',
        'real_votes',
        'real_rank',
        'prompt',
        'out',
        'out_json',
        'predicted_votes',
        'predicted_rank',
        'is_top5',
        'is_top10',
    ]
)
results

Unnamed: 0,project_id,real_votes,real_rank,prompt,out,out_json,predicted_votes,predicted_rank,is_top5,is_top10
0,10,10857,1,You are an expert model in analyzing participa...,1. **Project Relevance and Demand**: The proje...,"{'estimated_votes': 4000, 'confidence_interval...",3757.269661,10,0,1
1,50,10796,2,You are an expert model in analyzing participa...,1. **Project Theme and Popularity**: The proje...,"{'estimated_votes': 11000, 'confidence_interva...",10332.491568,3,1,1
2,18,8640,3,You are an expert model in analyzing participa...,1. **Project Theme and Popularity**: The proje...,"{'estimated_votes': 5000, 'confidence_interval...",4696.587077,7,0,1
3,675,5398,4,You are an expert model in analyzing participa...,1. **Project Theme and Popularity**: The proje...,"{'estimated_votes': 8000, 'confidence_interval...",7514.539322,8,0,1
4,260,4998,5,You are an expert model in analyzing participa...,"1. **Project Theme and Cost**: The project ""Bu...","{'estimated_votes': 1200, 'confidence_interval...",1127.180898,25,0,0
5,550,4468,6,You are an expert model in analyzing participa...,1. **Project Relevance and Urgency**: The proj...,"{'estimated_votes': 4500, 'confidence_interval...",4226.928369,8,0,1
6,12,4149,7,You are an expert model in analyzing participa...,1. **Project Theme and Appeal**: The proposed ...,"{'estimated_votes': 4000, 'confidence_interval...",3757.269661,8,0,1
7,159,4098,8,You are an expert model in analyzing participa...,"1. **Project Theme and Appeal**: The project ""...","{'estimated_votes': 5000, 'confidence_interval...",4696.587077,8,0,1
8,656,4008,9,You are an expert model in analyzing participa...,1. **Project Relevance and Demand**: The proje...,"{'estimated_votes': 6500, 'confidence_interval...",6105.563199,8,0,1
9,499,3949,10,You are an expert model in analyzing participa...,1. **Project Theme and Appeal**: The project f...,"{'estimated_votes': 6500, 'confidence_interval...",6105.563199,8,0,1


In [None]:
# Display the results table again.
results

In [31]:
# to_csv does not create missing folders, so make sure the target directory exists.
os.makedirs('output/predictions', exist_ok=True)
results.to_csv('output/predictions/wrc_in_context_full_1.csv', sep=";", index=False)
print('results ready!')

results ready!


In [None]:
#test['predicted_votes'] = test['out_json'].apply(lambda x: ut.get_predicted_votes(x))



#4. save results.
# NOTE(review): this cell writes to 'wrc_simple_prompt.csv', while the prompts in
# this section are built from 'prompts/prompt_in_context_pl.txt' - confirm that
# the file name is intended before running it.
results = test.filter(
    items=[
        'project_id',
        'real_votes',
        'real_rank',
        'prompt',
        'out',
        'out_json',
        'predicted_votes',
        'predicted_rank',
        'is_top5',
        'is_top10',
    ]
)

# to_csv does not create missing folders, so make sure the target directory exists.
os.makedirs('output/predictions', exist_ok=True)
results.to_csv('output/predictions/wrc_simple_prompt.csv', sep=";", index=False)
print('results ready!')