### TLS 24

In [1]:
# 1. load data
import os
import pandas as pd
import src.utils as ut
import prompt_builder as pb

from dotenv import load_dotenv
from src.data_loader import load_and_prepare_projects, load_test_dataset
from prompt_utils import tokens_counter, prompt_cost


path_22 = 'data/tls22_projects.csv'
path_24 = 'data/tls24_projects.csv'

df, df22_shuffled = load_and_prepare_projects(path_22,path_24, city = "Toulouse")
test = load_test_dataset(df, rows = len(df[df['year'] == 2024]), city = 'Toulouse')

load_dotenv()

conn_params = {
    "host": os.environ["PG_HOST"],
    "database": os.environ["PG_DATABASE"],
    "user": os.environ["PG_USER"],
    "password": os.environ['PG_PASSWORD']
}

In [2]:
#2. build prompt
last_projects_results = pb.get_all_projects_from_22_election(df22_shuffled)

test['prompt'] = test.apply(
    lambda x: pb.build_prompt(
        'prompts/tls_prompt_incontext_fr.txt',
        {
            'last_projects_results': last_projects_results,
            'project_name': x['project_name'],
            'cost': x['cost'],
            'district': x['district'],
            'description': x['description']
        }
    ),
    axis=1
)

In [3]:
print(test['prompt'].iloc[0])

Vous êtes un modèle expert dans l’analyse des élections de budgets participatifs, spécifiquement dans le contexte de la ville de Toulouse, en France.

À Toulouse, la municipalité organise des élections de budget participatif pour financer des projets citoyens. Chaque habitant peut voter pour entre 1 et 3 initiatives sur le site web officiel. 
Les votes sont anonymes.

Une fois la période de vote terminée, les projets sont classés selon le nombre de voix obtenues et se voient attribuer un rang. Ensuite, les projets gagnants sont sélectionnés à l’aide d’un algorithme greedy : on commence par le projet ayant obtenu le plus de voix, puis on ajoute successivement les projets suivants les plus votés, tant que le budget disponible le permet.
Lorsque le budget restant ne permet plus de financer le projet suivant dans la liste, on passe au suivant qui peut l’être, et ainsi de suite, jusqu’à épuisement du budget total de 8000000 euros.

Voici la liste des 200 initiatives proposées lors de l’élec

In [4]:
test['n_tokens'] = test['prompt'].apply(lambda p: tokens_counter(p))
test['cost_usd'] = test['n_tokens'].apply(lambda n_tokens: prompt_cost(n_tokens, 'gpt-4-turbo'))

print('mean tokens by prompt: {:.2f}'.format(test.n_tokens.mean()))
print('avg.cost of each prediction: ${:.2f}'.format(test.cost_usd.mean()))
print('experiment total cost: {:.2f}'.format(test.cost_usd.sum()))

mean tokens by prompt: 13513.34
avg.cost of each prediction: $0.14
experiment total cost: 24.73


In [None]:
#3. run experiment! :)
api_key=os.getenv('OPENAI_API_KEY')
from llm_client import call_openai_model 

test['out'] = test['prompt'].apply(lambda prompt: 
                     call_openai_model(prompt=prompt, api_key=api_key))

In [None]:
import json
import re

def get_json_from_llm_response(out):
    
    if isinstance(out, str):
        try:
            data = json.loads(out)
            if isinstance(data, dict):
                return data
        except json.JSONDecodeError:
            pass

    match = re.search(r"```json\s*({.*?})\s*```", out, re.DOTALL)
    if match:
        json_str = match.group(1)
        try:
            return json.loads(json_str)
        except json.JSONDecodeError:
            return None

    return None

def get_predicted_votes(df, out_json, city = 'Toulouse'):
    
    if city != 'Toulouse':
        raise ValueError(f"City '{city}' not supported")

    if isinstance(out_json, str):
        try:
            out_json = json.loads(out_json)
        except json.JSONDecodeError:
            return None
    
    votes_2022 = df[df['year'] == 2022].votes.sum()
    votes_2024 = df[df['year'] == 2024].votes.sum()

    relative_estimation = out_json['voix_estimées'] / votes_2022
    adjusted_prediction = relative_estimation * votes_2024

    return adjusted_prediction

def get_predicted_rank(out_json):
    
    if isinstance(out_json, str):
        try:
            out_json = json.loads(out_json)
        except json.JSONDecodeError:
            return None
        
    return out_json['position_attendue']


def get_if_is_top5(out_json):

    if isinstance(out_json, str):
        try:
            out_json = json.loads(out_json)
        except json.JSONDecodeError:
            return None
        
    return out_json['dans_top_5']

def get_if_is_top10(out_json):

    if isinstance(out_json, str):
        try:
            out_json = json.loads(out_json)
        except json.JSONDecodeError:
            return None
        
    return out_json['dans_top_10']

test['out_json'].iloc[0]

In [None]:
test['out_json'] = test['out'].apply(
    lambda o: get_json_from_llm_response(o))

In [None]:
test['predicted_votes'] = test['out_json'].apply(lambda x: ut.get_predicted_votes(df, x))
test['predicted_rank'] = test['out_json'].apply(lambda x: ut.get_predicted_rank(x))
test['is_top5'] = test['out_json'].apply(lambda x: ut.get_if_is_top5(x))
test['is_top10'] = test['out_json'].apply(lambda x: ut.get_if_is_top10(x))

In [None]:
results = test.filter(['project_id', 
             'real_votes', 
             'real_rank',
             'prompt',
             'out',
             'out_json', 
             'predicted_votes', 
             'predicted_rank', 
             'is_top5', 
             'is_top10']
             )

results

In [None]:
results.to_csv('output/predictions/tls_incontext_prompt.csv', sep=";", index=False)
print('results ready!')

#### TLS 24

In [None]:
# 1. load data
import os
import pandas as pd
import src.utils as ut
import prompt_builder as pb

from dotenv import load_dotenv
from src.data_loader import load_and_prepare_projects, load_test_dataset
from prompt_utils import tokens_counter, prompt_cost


path_22 = 'data/tls22_projects.csv'
path_24 = 'data/tls24_projects.csv'

df, df22_shuffled = load_and_prepare_projects(path_22,path_24, city = "Toulouse")
test = load_test_dataset(df, rows = len(df[df['year'] == 2024]), city = 'Toulouse')

load_dotenv()

conn_params = {
    "host": os.environ["PG_HOST"],
    "database": os.environ["PG_DATABASE"],
    "user": os.environ["PG_USER"],
    "password": os.environ['PG_PASSWORD']
}

In [None]:
#2. build prompt

last_projects_results = pb.get_all_projects_from_22_election(df22_shuffled)

test['prompt'] = test.apply(
    lambda x: pb.build_prompt(
        'prompts/prompt_in_context_fr.txt',
        {
            'last_projects_results': last_projects_results,
            'project_name': x['project_name'],
            'cost': x['cost'],
            'district': x['district'],
            'description': x['description']
        }
    ),
    axis=1
)

In [None]:
print(test['prompt'].iloc[0])

In [None]:
test['n_tokens'] = test['prompt'].apply(lambda p: tokens_counter(p))
test['cost_usd'] = test['n_tokens'].apply(lambda n_tokens: prompt_cost(n_tokens, 'gpt-4-turbo'))

print('mean tokens by prompt: {:.2f}'.format(test.n_tokens.mean()))
print('avg.cost of each prediction: ${:.2f}'.format(test.cost_usd.mean()))
print('experiment total cost: {:.2f}'.format(test.cost_usd.sum()))

In [None]:
#3. run experiment! :)
api_key=os.getenv('OPENAI_API_KEY')
from llm_client import call_openai_model 

test['out'] = test['prompt'].apply(lambda prompt: 
                     call_openai_model(prompt=prompt, api_key=api_key))

In [None]:
import json
import re

def get_json_from_llm_response(out):
    
    if isinstance(out, str):
        try:
            data = json.loads(out)
            if isinstance(data, dict):
                return data
        except json.JSONDecodeError:
            pass

    match = re.search(r"```json\s*({.*?})\s*```", out, re.DOTALL)
    if match:
        json_str = match.group(1)
        try:
            return json.loads(json_str)
        except json.JSONDecodeError:
            return None

    return None

def get_predicted_votes(df, out_json, city = 'Toulouse'):
    
    if city != 'Toulouse':
        raise ValueError(f"City '{city}' not supported")

    if isinstance(out_json, str):
        try:
            out_json = json.loads(out_json)
        except json.JSONDecodeError:
            return None
    
    votes_2022 = df[df['year'] == 2022].votes.sum()
    votes_2024 = df[df['year'] == 2024].votes.sum()

    relative_estimation = out_json['voix_estimées'] / votes_2022
    adjusted_prediction = relative_estimation * votes_2024

    return adjusted_prediction

def get_predicted_rank(out_json):
    
    if isinstance(out_json, str):
        try:
            out_json = json.loads(out_json)
        except json.JSONDecodeError:
            return None
        
    return out_json['position_attendue']


def get_if_is_top5(out_json):

    if isinstance(out_json, str):
        try:
            out_json = json.loads(out_json)
        except json.JSONDecodeError:
            return None
        
    return out_json['dans_top_5']

def get_if_is_top10(out_json):

    if isinstance(out_json, str):
        try:
            out_json = json.loads(out_json)
        except json.JSONDecodeError:
            return None
        
    return out_json['dans_top_10']

In [None]:
test['out_json'] = test['out'].apply(
    lambda o: get_json_from_llm_response(o))

test['out_json'].iloc[0]

In [None]:
test['predicted_votes'] = test['out_json'].apply(lambda x: get_predicted_votes(df, x))
test['predicted_rank'] = test['out_json'].apply(lambda x: get_predicted_rank(x))
test['is_top5'] = test['out_json'].apply(lambda x: get_if_is_top5(x))
test['is_top10'] = test['out_json'].apply(lambda x: get_if_is_top10(x))

In [None]:
#4. save results.
results = test.filter(['project_id', 
             'real_votes', 
             'real_rank',
             'prompt',
             'out',
             'out_json', 
             'predicted_votes', 
             'predicted_rank', 
             'is_top5', 
             'is_top10']
             )
results

In [None]:
results.to_csv('output/predictions/tls_in_context_full_1.csv', sep=";", index=False)
print('results ready!')

#### Tls 24 - In-Context (English DB)

In [5]:
# 1. load data
import os
import pandas as pd
import src.utils as ut
import prompt_builder as pb

from dotenv import load_dotenv
from src.data_loader import load_and_prepare_projects, load_test_dataset
from prompt_utils import tokens_counter, prompt_cost

load_dotenv()

conn_params = {
    "host": os.environ["PG_HOST"],
    "database": os.environ["PG_DATABASE"],
    "user": os.environ["PG_USER"],
    "password": os.environ['PG_PASSWORD']
}

path_22 = 'data/tls22_projects.csv'
path_24 = 'data/tls24_projects.csv'

df, df22_shuffled = load_and_prepare_projects(path_22,path_24, city = "Toulouse")
test = load_test_dataset(df, rows = len(df[df['year'] == 2024]), city = 'Toulouse')

english_df  = pd.read_csv("data/tls_projects_translated_to_eng.csv", sep=";")

df = pd.merge(
    left=df,
    right=english_df,
    on=['project_id','year'],
    how='inner'
)

df22_shuffled = pd.merge(
    left=df22_shuffled,
    right=english_df,
    on=['project_id','year'],
    how='inner'
)

test = pd.merge(left=test,
         right=english_df,
         on=['project_id'],
         how='inner')

In [6]:
#2. build prompt
last_projects_results = pb.get_all_projects_from_22_election_eng(df22_shuffled)

test['prompt'] = test.apply(
    lambda x: pb.build_prompt(
        'prompts/tls_prompt_incontext_eng.txt',
        {
            'last_projects_results': last_projects_results,
            'project_name': x['project_name_eng'],
            'cost': x['cost'],
            'district': x['district'],
            'description': x['description_eng']
        }
    ),
    axis=1
)

In [10]:
print(test['prompt'].iloc[0])

You are an expert model in analyzing participatory budgeting elections, specifically in the context of the city of Toulouse, France.

In Toulouse, the municipality organizes participatory budgeting elections to fund citizen-led projects. Each resident can vote for between 1 and 3 initiatives via the official website.
Votes are anonymous.

Once the voting period ends, projects are ranked according to the number of votes received. Each project is assigned a position in the ranking. Then, the winning projects are selected using a greedy algorithm: funding starts with the most voted project and continues down the list, as long as the available budget allows.
If the next project cannot be funded due to insufficient remaining budget, the algorithm skips it and moves to the next affordable one — continuing this way until the entire €8,000,000 budget is allocated.

Below is the list of the 200 initiatives proposed in the previous election, including their name, cost, and district:
- Bike stree

In [7]:
test['n_tokens'] = test['prompt'].apply(lambda p: tokens_counter(p))
test['cost_usd'] = test['n_tokens'].apply(lambda n_tokens: prompt_cost(n_tokens, 'gpt-4-turbo'))

print('mean tokens by prompt: {:.2f}'.format(test.n_tokens.mean()))
print('avg.cost of each prediction: ${:.2f}'.format(test.cost_usd.mean()))
print('experiment total cost: {:.2f}'.format(test.cost_usd.sum()))

mean tokens by prompt: 11775.94
avg.cost of each prediction: $0.12
experiment total cost: 21.55


In [12]:
#3. run experiment! :)
api_key=os.getenv('OPENAI_API_KEY')
from llm_client import call_openai_model 

test['out'] = test['prompt'].apply(lambda prompt: 
                     call_openai_model(prompt=prompt, api_key=api_key))

LLM response receive!
LLM response receive!
LLM response receive!
LLM response receive!
LLM response receive!
LLM response receive!
LLM response receive!
LLM response receive!
LLM response receive!
LLM response receive!
LLM response receive!
LLM response receive!
LLM response receive!
LLM response receive!
LLM response receive!
LLM response receive!
LLM response receive!
LLM response receive!
LLM response receive!
LLM response receive!
LLM response receive!
LLM response receive!
LLM response receive!
LLM response receive!
LLM response receive!
LLM response receive!
LLM response receive!
LLM response receive!
LLM response receive!
LLM response receive!
LLM response receive!
LLM response receive!
LLM response receive!
LLM response receive!
LLM response receive!
LLM response receive!
LLM response receive!
LLM response receive!
LLM response receive!
LLM response receive!
LLM response receive!
LLM response receive!
LLM response receive!
LLM response receive!
LLM response receive!
LLM respon

In [13]:
import json
import re

def get_json_from_llm_response(out):
    
    if isinstance(out, str):
        try:
            data = json.loads(out)
            if isinstance(data, dict):
                return data
        except json.JSONDecodeError:
            pass

    match = re.search(r"```json\s*({.*?})\s*```", out, re.DOTALL)
    if match:
        json_str = match.group(1)
        try:
            return json.loads(json_str)
        except json.JSONDecodeError:
            return None

    return None

def get_predicted_votes(df, out_json, city = 'Toulouse'):
    
    if city != 'Toulouse':
        raise ValueError(f"City '{city}' not supported")

    if isinstance(out_json, str):
        try:
            out_json = json.loads(out_json)
        except json.JSONDecodeError:
            return None
    
    votes_2022 = df[df['year'] == 2022].votes.sum()
    votes_2024 = df[df['year'] == 2024].votes.sum()

    relative_estimation = out_json['estimated_votes'] / votes_2022
    adjusted_prediction = relative_estimation * votes_2024

    return adjusted_prediction

def get_predicted_rank(out_json):
    
    if isinstance(out_json, str):
        try:
            out_json = json.loads(out_json)
        except json.JSONDecodeError:
            return None
        
    return out_json['expected_position']


def get_if_is_top5(out_json):

    if isinstance(out_json, str):
        try:
            out_json = json.loads(out_json)
        except json.JSONDecodeError:
            return None
        
    return out_json['likely_in_top_5']

def get_if_is_top10(out_json):

    if isinstance(out_json, str):
        try:
            out_json = json.loads(out_json)
        except json.JSONDecodeError:
            return None
        
    return out_json['likely_in_top_10']

In [14]:
test['out_json'] = test['out'].apply(
    lambda o: get_json_from_llm_response(o))

In [15]:
test['out_json'].iloc[0]

{'estimated_votes': 75,
 'confidence_interval': [50, 100],
 'expected_position': 50,
 'likely_in_top_5': 0,
 'likely_in_top_10': 0}

In [16]:
test['predicted_votes'] = test['out_json'].apply(lambda x: get_predicted_votes(df, x))
test['predicted_rank'] = test['out_json'].apply(lambda x: get_predicted_rank(x))
test['is_top5'] = test['out_json'].apply(lambda x: get_if_is_top5(x))
test['is_top10'] = test['out_json'].apply(lambda x: get_if_is_top10(x))

In [17]:
#4. save results.
results = test.filter(['project_id', 
             'real_votes', 
             'real_rank',
             'prompt',
             'out',
             'out_json', 
             'predicted_votes', 
             'predicted_rank', 
             'is_top5', 
             'is_top10']
             )
results

Unnamed: 0,project_id,real_votes,real_rank,prompt,out,out_json,predicted_votes,predicted_rank,is_top5,is_top10
0,263,590,1,You are an expert model in analyzing participa...,**Reasoning Summary:**\n\n1. **District and Th...,"{'estimated_votes': 75, 'confidence_interval':...",140.746166,50,0,0
1,320,559,2,You are an expert model in analyzing participa...,**Key Points for Analysis:**\n\n1. **Project T...,"{'estimated_votes': 180, 'confidence_interval'...",337.790798,15,0,0
2,394,436,3,You are an expert model in analyzing participa...,**Key Points for Analysis:**\n\n1. **Theme Rel...,"{'estimated_votes': 120, 'confidence_interval'...",225.193865,22,0,0
3,333,366,4,You are an expert model in analyzing participa...,**Key Points for Analysis:**\n\n1. **Project T...,"{'estimated_votes': 120, 'confidence_interval'...",225.193865,22,0,0
4,265,334,5,You are an expert model in analyzing participa...,**Key Points for Analysis:**\n\n1. **District ...,"{'estimated_votes': 350, 'confidence_interval'...",656.815440,15,0,0
...,...,...,...,...,...,...,...,...,...,...
178,372,20,179,You are an expert model in analyzing participa...,**Reasoning Summary:**\n\n1. **Project Theme a...,"{'estimated_votes': 85, 'confidence_interval':...",159.512321,45,0,0
179,356,19,180,You are an expert model in analyzing participa...,**Key Points for Analysis:**\n\n1. **Project T...,"{'estimated_votes': 120, 'confidence_interval'...",225.193865,35,0,0
180,230,19,181,You are an expert model in analyzing participa...,**Key Points for Analysis:**\n\n1. **Project T...,"{'estimated_votes': 55, 'confidence_interval':...",103.213855,85,0,0
181,370,18,182,You are an expert model in analyzing participa...,**Key Points for Analysis:**\n\n1. **District ...,"{'estimated_votes': 120, 'confidence_interval'...",225.193865,22,0,0


In [18]:
results.to_csv('output/predictions/tls_in_context_full_eng_1.csv', sep=";", index=False)
print('results ready!')

results ready!


##### WRC 17

In [1]:
# 1. load data
import os
import pandas as pd
import src.utils as ut
import prompt_builder as pb

from dotenv import load_dotenv
from src.data_loader import load_and_prepare_projects, load_test_dataset
from prompt_utils import tokens_counter, prompt_cost


path_16 = 'data/wrc16_projects.csv'
path_17 = 'data/wrc17_projects.csv'

df, df16_shuffled = load_and_prepare_projects(path_16,path_17, city = 'Wroclaw')
test = load_test_dataset(df, rows = 50, city='Wroclaw')

load_dotenv()

conn_params = {
    "host": os.environ["PG_HOST"],
    "database": os.environ["PG_DATABASE"],
    "user": os.environ["PG_USER"],
    "password": os.environ['PG_PASSWORD']
}

test

Unnamed: 0,project_id,project_name,description,cost,district,district_number,real_votes,real_rank,year
0,10,Toalety we Wrocławskich parkach i na terenach ...,Uzasadnienie\nW parkach i na terenach zielonyc...,900000,Śródmieście,4.0,10857,1,2017
1,50,Zielona rowerowo-piesza obwodnica Wrocławia; E...,Uzasadnienie\nWalczymy o II ETAP. Ideą jest po...,1000000,Stare Miasto,5.0,10796,2,2017
2,18,Rowerowy Wrocław 2017,Uzasadnienie\nProjekt przewiduje poprawę bezpi...,1000000,Stare Miasto,5.0,8640,3,2017
3,675,"Zieleń dla Wrocławia – parki kieszonkowe, nasa...",Uzasadnienie\nWe Wrocławiu nastąpiła deklaraty...,1000000,Psie Pole,3.0,5398,4,2017
4,260,Budki dla kotów wolno żyjących,Uzasadnienie,70000,Psie Pole,3.0,4998,5,2017
5,550,Pieszo z Brochowa - uzupełnienie brakującego c...,Uzasadnienie\nZ Brochowa nie sposób wydostać s...,1000000,Krzyki,2.0,4468,6,2017
6,12,Bulwar Fizyków - naukowy plac zabaw w przestrz...,Uzasadnienie\nProponujemy stworzenie instalacj...,1000000,Psie Pole,3.0,4149,7,2017
7,159,Baza małych astronautów czyli kosmiczny plac z...,Uzasadnienie\nW ramach Budżetu Obywatelskiego ...,1000000,Krzyki,2.0,4098,8,2017
8,656,Toalety miejskie w parkach na miarę Wrocławia.,Uzasadnienie\nInicjatywa jest odpowiedzią na p...,1000000,Śródmieście,4.0,4008,9,2017
9,499,"""Konikowo"" - Budowa autorskiego placu zabaw na...","Uzasadnienie\nDrodzy Wrocławianie, Drodzy Sąsi...",1000000,Stare Miasto,5.0,3949,10,2017


In [2]:
#2. build prompt

last_projects_results = pb.get_all_projects_from_22_election(df16_shuffled)

test['prompt'] = test.apply(
    lambda x: pb.build_prompt(
        'prompts/prompt_in_context_pl.txt',
        {
            'last_projects_results': last_projects_results,
            'project_name': x['project_name'],
            'cost': x['cost'],
            'district': x['district'],
            'description': x['description']
        }
    ),
    axis=1
)

In [3]:
print(test['prompt'][0])

You are an expert model in analyzing participatory budgeting elections, specifically in the context of the city of Wroclaw, Poland.

In Wroclaw, the municipality organizes participatory budgeting elections to fund citizen projects.

Once the voting period is over, the projects are ranked based on the number of votes they received. Then, the winning projects are selected using a greedy algorithm: starting with the project that received the most votes, the next most voted projects are added in sequence as long as the remaining budget allows it.
When the next project on the list exceeds the available budget, the algorithm skips to the next project that can be funded, and so on, until the total budget of 4500000 € is exhausted.

Here is the list of the 52 initiatives proposed during the election, including their name, cost, and district:
- Budowa wrocławskiej wypożyczalni rowerów integracyjnych typu handbike wraz z odpowiednim zapleczem socjalno-sanitarnym: (Coût: 550000 €, District: Psie 

In [4]:
test['n_tokens'] = test['prompt'].apply(lambda p: tokens_counter(p))
test['cost_usd'] = test['n_tokens'].apply(lambda n_tokens: prompt_cost(n_tokens, 'gpt-4-turbo'))

print('mean tokens by prompt: {:.2f}'.format(test.n_tokens.mean()))
print('avg.cost of each prediction: ${:.2f}'.format(test.cost_usd.mean()))
print('experiment total cost: {:.2f}'.format(test.cost_usd.sum()))

mean tokens by prompt: 3759.80
avg.cost of each prediction: $0.04
experiment total cost: 1.88


In [None]:
#3. run experiment! :)
api_key=os.getenv('OPENAI_API_KEY')
from llm_client import call_openai_model 

test['out'] = test['prompt'].apply(lambda prompt: 
                     call_openai_model(prompt=prompt, api_key=api_key))

In [None]:
def get_json_from_llm_response(out):
    match = re.search(r"```json\s*({.*?})\s*```", out, re.DOTALL)
    if match:
        json_str = match.group(1)
        data = json.loads(json_str)
        return data
    else:
        return "No JSON in LLM answer..."
    
def get_predicted_votes(df, out_json):
    relative_estimation = (out_json['estimated_votes'] / df[df['year']==2016].votes.sum())
    return relative_estimation*df[df['year']==2017].votes.sum()
    
def get_predicted_rank(out_json):
    return out_json['expected_rank']

def get_if_is_top5(out_json):
    return out_json['in_top_5']

def get_if_is_top10(out_json):
    return out_json['in_top_10']

test['out_json'] = test['out'].apply(
    lambda o: ut.get_json_from_llm_response(o))

test['predicted_votes'] = test['out_json'].apply(lambda x: get_predicted_votes(df, x))
test['predicted_rank'] = test['out_json'].apply(lambda x: get_predicted_rank(x))
test['is_top5'] = test['out_json'].apply(lambda x: get_if_is_top5(x))
test['is_top10'] = test['out_json'].apply(lambda x: get_if_is_top10(x))

In [None]:
results = test.filter(['project_id', 
             'real_votes', 
             'real_rank',
             'prompt',
             'out',
             'out_json', 
             'predicted_votes', 
             'predicted_rank', 
             'is_top5', 
             'is_top10']
             )
results

In [None]:
results

In [None]:
results.to_csv('output/predictions/wrc_in_context_full_1.csv', sep=";", index=False)
print('results ready!')

In [None]:
#test['predicted_votes'] = test['out_json'].apply(lambda x: ut.get_predicted_votes(x))



#4. save results.
results = test.filter(['project_id', 
             'real_votes', 
             'real_rank',
             'prompt',
             'out',
             'out_json', 
             'predicted_votes', 
             'predicted_rank', 
             'is_top5', 
             'is_top10']
             )

results.to_csv('output/predictions/wrc_simple_prompt.csv', sep=";", index=False)
print('results ready!')

### WRC 17 (English)

In [5]:
# 1. load data
import os
import pandas as pd
import src.utils as ut
import prompt_builder as pb

from dotenv import load_dotenv
from src.data_loader import load_and_prepare_projects, load_test_dataset
from prompt_utils import tokens_counter, prompt_cost


df = pd.read_csv("data/wrc_projects_eng.csv", sep=";")
df16_shuffled = df.sample(frac=1, random_state=42).reset_index(drop=True)
test = df[df['year'] == 2017].sample(frac=1, random_state=42).reset_index(drop=True)


load_dotenv()

conn_params = {
    "host": os.environ["PG_HOST"],
    "database": os.environ["PG_DATABASE"],
    "user": os.environ["PG_USER"],
    "password": os.environ['PG_PASSWORD']
}

In [6]:
#2. build prompt

last_projects_results = pb.get_all_projects_from_22_election(df16_shuffled)

test['prompt'] = test.apply(
    lambda x: pb.build_prompt(
        'prompts/prompt_in_context_pl.txt',
        {
            'last_projects_results': last_projects_results,
            'project_name': x['project_name'],
            'cost': x['cost'],
            'district': x['district'],
            'description': x['description']
        }
    ),
    axis=1
)

In [7]:
print(test['prompt'][0])

You are an expert model in analyzing participatory budgeting elections, specifically in the context of the city of Wroclaw, Poland.

In Wroclaw, the municipality organizes participatory budgeting elections to fund citizen projects.

Once the voting period is over, the projects are ranked based on the number of votes they received. Then, the winning projects are selected using a greedy algorithm: starting with the project that received the most votes, the next most voted projects are added in sequence as long as the remaining budget allows it.
When the next project on the list exceeds the available budget, the algorithm skips to the next project that can be funded, and so on, until the total budget of 4500000 € is exhausted.

Here is the list of the 52 initiatives proposed during the election, including their name, cost, and district:
- Improvement of the Bogedaina Street surface: (Coût: 750000 €, District: Krzyki). 983 Voix, Classement: 31
- Renovation of the Historic Fredruś Bus: (Coû

In [8]:
test['n_tokens'] = test['prompt'].apply(lambda p: tokens_counter(p))
test['cost_usd'] = test['n_tokens'].apply(lambda n_tokens: prompt_cost(n_tokens, 'gpt-4-turbo'))

print('mean tokens by prompt: {:.2f}'.format(test.n_tokens.mean()))
print('avg.cost of each prediction: ${:.2f}'.format(test.cost_usd.mean()))
print('experiment total cost: {:.2f}'.format(test.cost_usd.sum()))

mean tokens by prompt: 5435.10
avg.cost of each prediction: $0.05
experiment total cost: 2.72


In [None]:
#3. run experiment! :)
api_key=os.getenv('OPENAI_API_KEY')
from llm_client import call_openai_model 

test['out'] = test['prompt'].apply(lambda prompt: 
                     call_openai_model(prompt=prompt, api_key=api_key))

In [None]:
def get_json_from_llm_response(out):
    match = re.search(r"```json\s*({.*?})\s*```", out, re.DOTALL)
    if match:
        json_str = match.group(1)
        data = json.loads(json_str)
        return data
    else:
        return "No JSON in LLM answer..."
    
def get_predicted_votes(df, out_json):
    relative_estimation = (out_json['estimated_votes'] / df[df['year']==2016].votes.sum())
    return relative_estimation*df[df['year']==2017].votes.sum()
    
def get_predicted_rank(out_json):
    return out_json['expected_rank']

def get_if_is_top5(out_json):
    return out_json['in_top_5']

def get_if_is_top10(out_json):
    return out_json['in_top_10']

test['out_json'] = test['out'].apply(
    lambda o: ut.get_json_from_llm_response(o))

In [None]:
test['predicted_votes'] = test['out_json'].apply(lambda x: get_predicted_votes(df, x))
test['predicted_rank'] = test['out_json'].apply(lambda x: get_predicted_rank(x))
test['is_top5'] = test['out_json'].apply(lambda x: get_if_is_top5(x))
test['is_top10'] = test['out_json'].apply(lambda x: get_if_is_top10(x))

In [None]:
results = test.filter(['project_id', 
             'votes', 
             'real_rank',
             'prompt',
             'out',
             'out_json', 
             'predicted_votes', 
             'predicted_rank', 
             'is_top5', 
             'is_top10']
             )
results

In [None]:
results.to_csv('output/predictions/wrc_in_context_full_eng_1.csv', sep=";", index=False)
print('results ready!')