### TLS 24 - RAG CoT

In [1]:
# 1. load data
import os
import pandas as pd
import src.utils as ut
import prompt_builder as pb

from dotenv import load_dotenv
from src.data_loader import load_and_prepare_projects, load_test_dataset
from prompt_utils import tokens_counter, prompt_cost


# Project CSVs for the two Toulouse elections used in this section.
path_22 = 'data/tls22_projects.csv'
path_24 = 'data/tls24_projects.csv'

df, df22_shuffled = load_and_prepare_projects(path_22, path_24, city="Toulouse")

# The test set is sized to the number of rows in df whose year is 2024.
is_2024 = df['year'] == 2024
test = load_test_dataset(df, rows=len(df[is_2024]), city='Toulouse')

# Must run before reading os.environ so values from a .env file are available.
load_dotenv()

# Database connection settings, read from PG_HOST / PG_DATABASE / PG_USER / PG_PASSWORD.
# A missing variable raises KeyError immediately.
conn_params = {
    field: os.environ[f"PG_{field.upper()}"]
    for field in ("host", "database", "user", "password")
}

In [3]:
# 2. Build one prompt per test project.
# Election-wide context: computed once here and reused in every prompt.
top_k_voted_22 = pb.get_top_k_projects_from_22_election(df, k=15)
proj_count_avg_by_district= pb.get_project_count_and_avg_votes_by_district(df)
# NOTE(review): proj_count_avg_by_categories is not passed to build_prompt below — confirm it is still needed.
proj_count_avg_by_categories = pb.get_project_count_and_avg_votes_by_categories(df)

# Per-project context: the helper calls inside the dict run once for every row of `test`.
# The similarity helpers receive conn_params — presumably to query the database; verify in prompt_builder.
test['prompt'] = test.apply(
    lambda x: pb.build_prompt(
        'prompts/prompt_rag_CoT_fr.txt',
        {
            'top_k_voted_22': top_k_voted_22,
            'project_count_and_avg_votes_by_district': proj_count_avg_by_district,
            'project_name': x['project_name'],
            'cost': x['cost'],
            'district': x['district'],
            'description': x['description'],
            'top_k_similar_projects_in_22': pb.get_top_k_similar_projects_in_22(df,x['project_id'],conn_params, k=10),
            'count_of_projects22_in_quartier': pb.get_count_of_projects_in_quartier(df,x['district']),
            'top_k_voted_in_district': pb.get_top_k_voted_in_district(df, x['district'], k=5),
            'count_of_projects24_in_quartier': pb.get_count_of_projects_in_quartier(df,x['district'], year=2024),
            'top_k_similar_projects_in_district': pb.get_top_k_similar_projects_in_22_by_district(df, x['project_id'], conn_params,k=5)
        }
    ),
    axis=1
)

In [4]:
# Sanity check: show the first rendered prompt in full.
print(test['prompt'].iloc[0])

Vous êtes un modèle expert dans l’analyse des élections de budgets participatifs, spécifiquement dans le contexte de la ville de Toulouse, en France.

À Toulouse, la municipalité organise des élections de budget participatif pour financer des projets citoyens. Chaque habitant peut voter pour entre 1 et 3 initiatives sur le site web officiel. 
Les votes sont anonymes.

Voici comment les projets sont sélectionnés :
- Tous les projets sont classés selon le nombre de voix obtenues.
- Un algorithme « greedy » est utilisé : on finance les projets dans l’ordre décroissant des voix, tant que le budget total n’est pas dépassé.
- Si un projet est trop coûteux pour le budget restant, on passe au suivant.

---

Lors de l’élection de 2022 :
- Il y a eu **4532 votants**, chaque personne pouvant voter pour jusqu'à 3 projets.
- Au total, environ **11918 voix** ont été exprimées sur **200 projets**.
- Le nombre de voix par projet variait de *2* à 492, avec une moyenne autour de **58** voix et un écart-

In [5]:
# Estimate prompt size and API cost per row before any request is sent.
test['n_tokens'] = test['prompt'].apply(tokens_counter)
test['cost_usd'] = test['n_tokens'].apply(
    lambda token_count: prompt_cost(token_count, 'gpt-4-turbo')
)

print(f'mean tokens by prompt: {test.n_tokens.mean():.2f}')
print(f'avg.cost of each prediction: ${test.cost_usd.mean():.2f}')
print(f'experiment total cost: {test.cost_usd.sum():.2f}')


mean tokens by prompt: 4102.49
avg.cost of each prediction: $0.04
experiment total cost: 7.51


In [6]:
#3. run experiment! :)
# os.getenv returns None (no error) when OPENAI_API_KEY is not set.
api_key=os.getenv('OPENAI_API_KEY')
from llm_client import call_openai_model 

# One call_openai_model invocation per prompt, row by row; the returned value is stored in 'out'.
test['out'] = test['prompt'].apply(lambda prompt: 
                     call_openai_model(prompt=prompt, api_key=api_key))

LLM response receive!
LLM response receive!
LLM response receive!
LLM response receive!
LLM response receive!
LLM response receive!
LLM response receive!
LLM response receive!
LLM response receive!
LLM response receive!
LLM response receive!
LLM response receive!
LLM response receive!
LLM response receive!
LLM response receive!
LLM response receive!
LLM response receive!
LLM response receive!
LLM response receive!
LLM response receive!
LLM response receive!
LLM response receive!
LLM response receive!
LLM response receive!
LLM response receive!
LLM response receive!
LLM response receive!
LLM response receive!
LLM response receive!
LLM response receive!
LLM response receive!
LLM response receive!
LLM response receive!
LLM response receive!
LLM response receive!
LLM response receive!
LLM response receive!
LLM response receive!
LLM response receive!
LLM response receive!
LLM response receive!
LLM response receive!
LLM response receive!
LLM response receive!
LLM response receive!
LLM respon

In [8]:
import json
import re

def get_json_from_llm_response(out):
    """Extract the JSON object contained in an LLM answer.

    Strategies are tried in order:
      1. the whole string is a JSON object;
      2. the first ```json ... ``` fenced block holds a JSON object;
      3. the first ``{...}`` span anywhere in the text that decodes to a
         JSON object (covers answers where the model omitted the fence).

    Args:
        out: Raw model answer. A dict is returned unchanged; any other
            non-string value (None, NaN, ...) yields None instead of raising.

    Returns:
        dict | None: The decoded object, or None when nothing decodable
        was found.
    """
    if isinstance(out, dict):
        return out
    if not isinstance(out, str):
        # Failed or missing answers must not crash the whole .apply() pass.
        return None

    # 1. The complete answer is plain JSON.
    try:
        data = json.loads(out)
    except json.JSONDecodeError:
        data = None
    if isinstance(data, dict):
        return data

    # 2. JSON wrapped in a ```json fenced block after the reasoning text.
    match = re.search(r"```json\s*({.*?})\s*```", out, re.DOTALL)
    if match:
        try:
            return json.loads(match.group(1))
        except json.JSONDecodeError:
            pass  # fall through to the scan below

    # 3. Unfenced JSON: try to decode an object starting at each '{'.
    decoder = json.JSONDecoder()
    start = out.find("{")
    while start != -1:
        try:
            candidate, _ = decoder.raw_decode(out, start)
        except json.JSONDecodeError:
            candidate = None
        if isinstance(candidate, dict):
            return candidate
        start = out.find("{", start + 1)

    return None

def get_predicted_votes(df, out_json, city = 'Toulouse'):
    """Rescale the model's vote estimate from the 2022 vote total to the 2024 one.

    The estimate is divided by the sum of ``votes`` over the 2022 rows of
    ``df`` and multiplied by the sum over the 2024 rows.

    Args:
        df: DataFrame with at least the columns ``year`` and ``votes``.
        out_json: Parsed model answer (dict) or its JSON string; must carry
            the key ``'voix_estimées'``.
        city: Only ``'Toulouse'`` is supported.

    Returns:
        The rescaled estimate, or None when the answer is missing, cannot be
        parsed, lacks a numeric ``'voix_estimées'``, or when the 2022 vote
        total is zero.

    Raises:
        ValueError: If ``city`` is not ``'Toulouse'``.
    """
    if city != 'Toulouse':
        raise ValueError(f"City '{city}' not supported")

    if isinstance(out_json, str):
        try:
            out_json = json.loads(out_json)
        except json.JSONDecodeError:
            return None

    # Answers that could not be parsed arrive here as None/NaN.
    if not isinstance(out_json, dict):
        return None

    try:
        estimated_votes = float(out_json['voix_estimées'])
    except (KeyError, TypeError, ValueError):
        return None

    votes_2022 = df[df['year'] == 2022].votes.sum()
    votes_2024 = df[df['year'] == 2024].votes.sum()
    if not votes_2022:
        # No reference votes: the ratio is undefined.
        return None

    relative_estimation = estimated_votes / votes_2022
    return relative_estimation * votes_2024

def get_predicted_rank(out_json):
    """Return the ``'position_attendue'`` field of a model answer.

    Args:
        out_json: Parsed answer (dict) or its JSON string.

    Returns:
        The stored value, or None when the answer is missing, is not a JSON
        object, cannot be parsed, or lacks the key.
    """
    if isinstance(out_json, str):
        try:
            out_json = json.loads(out_json)
        except json.JSONDecodeError:
            return None

    # Unparsed answers reach this function as None/NaN; do not subscript them.
    if not isinstance(out_json, dict):
        return None

    return out_json.get('position_attendue')


def get_if_is_top5(out_json):
    """Return the ``'dans_top_5'`` field of a model answer.

    Args:
        out_json: Parsed answer (dict) or its JSON string.

    Returns:
        The stored value, or None when the answer is missing, is not a JSON
        object, cannot be parsed, or lacks the key.
    """
    if isinstance(out_json, str):
        try:
            out_json = json.loads(out_json)
        except json.JSONDecodeError:
            return None

    # Unparsed answers reach this function as None/NaN; do not subscript them.
    if not isinstance(out_json, dict):
        return None

    return out_json.get('dans_top_5')

def get_if_is_top10(out_json):
    """Return the ``'dans_top_10'`` field of a model answer.

    Args:
        out_json: Parsed answer (dict) or its JSON string.

    Returns:
        The stored value, or None when the answer is missing, is not a JSON
        object, cannot be parsed, or lacks the key.
    """
    if isinstance(out_json, str):
        try:
            out_json = json.loads(out_json)
        except json.JSONDecodeError:
            return None

    # Unparsed answers reach this function as None/NaN; do not subscript them.
    if not isinstance(out_json, dict):
        return None

    return out_json.get('dans_top_10')

In [10]:
# Parse each raw answer into a dict (None when no JSON could be extracted).
test['out_json'] = test['out'].apply(get_json_from_llm_response)

In [11]:
# Inspect the parsed JSON of the first answer.
test['out_json'].iloc[0]

{'voix_estimées': 150,
 'intervalle_confiance': [100, 200],
 'position_attendue': 8,
 'dans_top_5': 0,
 'dans_top_10': 1}

In [13]:
# Derive the prediction columns from the parsed answers.
test['predicted_votes'] = test['out_json'].apply(
    lambda parsed: get_predicted_votes(df, parsed)
)
test['predicted_rank'] = test['out_json'].apply(get_predicted_rank)
test['is_top5'] = test['out_json'].apply(get_if_is_top5)
test['is_top10'] = test['out_json'].apply(get_if_is_top10)

In [15]:
#4. save results.
results = test.filter(['project_id', 
             'real_votes', 
             'real_rank',
             'prompt',
             'out',
             'out_json', 
             'predicted_votes', 
             'predicted_rank', 
             'is_top5', 
             'is_top10']
             )
results

Unnamed: 0,project_id,real_votes,real_rank,prompt,out,out_json,predicted_votes,predicted_rank,is_top5,is_top10
0,263,590,1,Vous êtes un modèle expert dans l’analyse des ...,1. Le quartier 1 - Capitole / Arnaud Bernard /...,"{'voix_estimées': 150, 'intervalle_confiance':...",281.492332,8,0,1
1,320,559,2,Vous êtes un modèle expert dans l’analyse des ...,1. **Analyse des projets similaires**: Le proj...,"{'voix_estimées': 480, 'intervalle_confiance':...",900.775461,2,1,1
2,394,436,3,Vous êtes un modèle expert dans l’analyse des ...,1. **Analyse des projets similaires**: Les pro...,"{'voix_estimées': 125, 'intervalle_confiance':...",234.576943,10,0,1
3,333,366,4,Vous êtes un modèle expert dans l’analyse des ...,"1. Le projet ""Rénovation skatepark Rangueil"" e...","{'voix_estimées': 100, 'intervalle_confiance':...",187.661554,30,0,0
4,265,334,5,Vous êtes un modèle expert dans l’analyse des ...,1. Le quartier Capitole / Arnaud Bernard / Car...,"{'voix_estimées': 180, 'intervalle_confiance':...",337.790798,8,0,1
...,...,...,...,...,...,...,...,...,...,...
178,372,20,179,Vous êtes un modèle expert dans l’analyse des ...,1. Le quartier 17 - Mirail-Université / Reyner...,"{'voix_estimées': 20, 'intervalle_confiance': ...",37.532311,150,0,0
179,356,19,180,Vous êtes un modèle expert dans l’analyse des ...,"1. Le projet ""Débitumiser les places de statio...","{'voix_estimées': 75, 'intervalle_confiance': ...",140.746166,45,0,0
180,230,19,181,Vous êtes un modèle expert dans l’analyse des ...,"1. Le projet ""Installation de poubelles à déje...","{'voix_estimées': 35, 'intervalle_confiance': ...",65.681544,100,0,0
181,370,18,182,Vous êtes un modèle expert dans l’analyse des ...,1. Le quartier 17 - Mirail-Université / Reyner...,"{'voix_estimées': 25, 'intervalle_confiance': ...",46.915389,130,0,0


In [16]:
# Write the predictions as a ';'-separated CSV without the index column.
results.to_csv(
    'output/predictions/tls_rag_cot_full_1.csv',
    index=False,
    sep=";",
)
print('results ready!')

results ready!


### WRC 17 - RAG CoT

In [19]:
# 1. load data
import os
import pandas as pd
import src.utils as ut
import prompt_builder as pb

from dotenv import load_dotenv
from src.data_loader import load_and_prepare_projects, load_test_dataset
from prompt_utils import tokens_counter, prompt_cost


# Project CSVs for the two Wroclaw elections used in this section.
path_16 = 'data/wrc16_projects.csv'
path_17 = 'data/wrc17_projects.csv'

df, df16_shuffled = load_and_prepare_projects(path_16, path_17, city='Wroclaw')
test = load_test_dataset(df, rows=50, city='Wroclaw')

# Must run before reading os.environ so values from a .env file are available.
load_dotenv()

# Database connection settings, read from PG_HOST / PG_DATABASE / PG_USER / PG_PASSWORD.
# A missing variable raises KeyError immediately.
conn_params = {
    field: os.environ[f"PG_{field.upper()}"]
    for field in ("host", "database", "user", "password")
}

test

Unnamed: 0,project_id,project_name,description,cost,district,district_number,real_votes,real_rank,year
0,10,Toalety we Wrocławskich parkach i na terenach ...,Uzasadnienie\nW parkach i na terenach zielonyc...,900000,Śródmieście,4.0,10857,1,2017
1,50,Zielona rowerowo-piesza obwodnica Wrocławia; E...,Uzasadnienie\nWalczymy o II ETAP. Ideą jest po...,1000000,Stare Miasto,5.0,10796,2,2017
2,18,Rowerowy Wrocław 2017,Uzasadnienie\nProjekt przewiduje poprawę bezpi...,1000000,Stare Miasto,5.0,8640,3,2017
3,675,"Zieleń dla Wrocławia – parki kieszonkowe, nasa...",Uzasadnienie\nWe Wrocławiu nastąpiła deklaraty...,1000000,Psie Pole,3.0,5398,4,2017
4,260,Budki dla kotów wolno żyjących,Uzasadnienie,70000,Psie Pole,3.0,4998,5,2017
5,550,Pieszo z Brochowa - uzupełnienie brakującego c...,Uzasadnienie\nZ Brochowa nie sposób wydostać s...,1000000,Krzyki,2.0,4468,6,2017
6,12,Bulwar Fizyków - naukowy plac zabaw w przestrz...,Uzasadnienie\nProponujemy stworzenie instalacj...,1000000,Psie Pole,3.0,4149,7,2017
7,159,Baza małych astronautów czyli kosmiczny plac z...,Uzasadnienie\nW ramach Budżetu Obywatelskiego ...,1000000,Krzyki,2.0,4098,8,2017
8,656,Toalety miejskie w parkach na miarę Wrocławia.,Uzasadnienie\nInicjatywa jest odpowiedzią na p...,1000000,Śródmieście,4.0,4008,9,2017
9,499,"""Konikowo"" - Budowa autorskiego placu zabaw na...","Uzasadnienie\nDrodzy Wrocławianie, Drodzy Sąsi...",1000000,Stare Miasto,5.0,3949,10,2017


In [20]:
# Database connection settings, read from PG_HOST / PG_DATABASE / PG_USER / PG_PASSWORD.
# The data-loading cell of this section builds the same dict.
conn_params = {
    field: os.environ[f"PG_{field.upper()}"]
    for field in ("host", "database", "user", "password")
}

#2. build prompt
# Election-wide context, computed once and reused in every prompt.
top_k_voted_16 = pb.get_top_k_projects_from_16_election(df, k=15)
proj_count_avg_by_district = pb.get_project_count_and_avg_votes_by_district(df, city='Wroclaw')
proj_count_avg_by_categories = pb.get_project_count_and_avg_votes_by_categories(df, city='Wroclaw')


In [21]:
# Build one prompt per test project from the template 'prompts/prompt_rag2_CoT_pl.txt'.
# The helper calls inside the dict run once for every row of `test`; the similarity
# helpers receive conn_params — presumably to query the database; verify in prompt_builder.
test['prompt'] = test.apply(
    lambda x: pb.build_prompt(
        'prompts/prompt_rag2_CoT_pl.txt',
        {
            'top_k_voted_16': top_k_voted_16,
            'project_count_and_avg_votes_by_district': proj_count_avg_by_district,
            'project_name': x['project_name'],
            'cost': x['cost'],
            'district': x['district'],
            'description': x['description'],
            'top_k_similar_projects_in_16': pb.get_top_k_similar_projects_in_16(df, x['project_id'], x['year'], conn_params,k=5),
            'count_of_projects16_in_quartier': pb.get_count_of_projects_in_quartier(df,x['district'], year=2016),
            'top_k_voted_in_district': pb.get_top_k_voted_in_district_eng(df, x['district'], k=5, year=2016),
            'count_of_projects17_in_quartier': pb.get_count_of_projects_in_quartier(df,x['district'], year=2017),
            'top_k_similar_projects_in_district': pb.get_top_k_similar_projects_in_16_by_district(df, x['project_id'], x['year'], conn_params,k=5)
        }
    ),
    axis=1
)

In [22]:
# Sanity check: show the first rendered prompt in full.
print(test['prompt'].iloc[0])

You are an expert model in the analysis of participatory budgeting elections, specifically in the context of the city of Wroclaw, Poland.

In Wroclaw, the municipality organizes participatory budgeting elections to fund citizen-led projects. Each resident can vote for between 1 and 3 initiatives on the official website.
Votes are anonymous.

Here is how the projects are selected:
- All projects are ranked by the number of votes received.
- A "greedy" algorithm is used: projects are funded in descending order of votes, as long as the total budget is not exceeded.
- If a project is too expensive for the remaining budget, it is skipped and the next one is considered.

---
In the 2016 election:
- There were **67103 voters**, each allowed to vote for up to 3 projects.
- In total, approximately **119194 votes** were cast across **52 projets**.
- The number of votes per project ranged from **120** to **13938**, with an average of **2292.19** votes and an estimated standard deviation of **2784

In [24]:
# Estimate prompt size and API cost per row before any request is sent.
test['n_tokens'] = test['prompt'].apply(tokens_counter)
test['cost_usd'] = test['n_tokens'].apply(
    lambda token_count: prompt_cost(token_count, 'gpt-4-turbo')
)

print(f'mean tokens by prompt: {test.n_tokens.mean():.2f}')
print(f'avg.cost of each prediction: ${test.cost_usd.mean():.2f}')
print(f'experiment total cost: {test.cost_usd.sum():.2f}')

mean tokens by prompt: 2936.76
avg.cost of each prediction: $0.03
experiment total cost: 1.47


In [25]:
#3. run experiment! :)
# os.getenv returns None (no error) when OPENAI_API_KEY is not set.
api_key=os.getenv('OPENAI_API_KEY')
from llm_client import call_openai_model 

# One call_openai_model invocation per prompt, row by row; the returned value is stored in 'out'.
test['out'] = test['prompt'].apply(lambda prompt: 
                     call_openai_model(prompt=prompt, api_key=api_key))


LLM response receive!
LLM response receive!
LLM response receive!
LLM response receive!
LLM response receive!
LLM response receive!
LLM response receive!
LLM response receive!
LLM response receive!
LLM response receive!
LLM response receive!
LLM response receive!
LLM response receive!
LLM response receive!
LLM response receive!
LLM response receive!
LLM response receive!
LLM response receive!
LLM response receive!
LLM response receive!
LLM response receive!
LLM response receive!
LLM response receive!
LLM response receive!
LLM response receive!
LLM response receive!
LLM response receive!
LLM response receive!
LLM response receive!
LLM response receive!
LLM response receive!
LLM response receive!
LLM response receive!
LLM response receive!
LLM response receive!
LLM response receive!
LLM response receive!
LLM response receive!
LLM response receive!
LLM response receive!
LLM response receive!
LLM response receive!
LLM response receive!
LLM response receive!
LLM response receive!
LLM respon

In [31]:
import json
import re

def get_json_from_llm_response(out):
    """Extract the JSON object contained in an LLM answer.

    Strategies are tried in order:
      1. the whole string is a JSON object;
      2. the first ```json ... ``` fenced block holds a JSON object;
      3. the first ``{...}`` span anywhere in the text that decodes to a
         JSON object (covers answers where the model omitted the fence).

    Args:
        out: Raw model answer. A dict is returned unchanged; any other
            non-string value (None, NaN, ...) yields None instead of raising.

    Returns:
        dict | None: The decoded object, or None when nothing decodable
        was found.
    """
    if isinstance(out, dict):
        return out
    if not isinstance(out, str):
        # Failed or missing answers must not crash the whole .apply() pass.
        return None

    # 1. First attempt: the complete string is valid JSON.
    try:
        data = json.loads(out)
    except json.JSONDecodeError:
        data = None  # not plain JSON, keep going
    if isinstance(data, dict):
        return data

    # 2. Second attempt: look for a ```json {...} ``` block.
    match = re.search(r"```json\s*({.*?})\s*```", out, re.DOTALL)
    if match:
        try:
            return json.loads(match.group(1))
        except json.JSONDecodeError:
            pass  # fall through to the scan below

    # 3. Third attempt: unfenced JSON, decode an object starting at each '{'.
    decoder = json.JSONDecoder()
    start = out.find("{")
    while start != -1:
        try:
            candidate, _ = decoder.raw_decode(out, start)
        except json.JSONDecodeError:
            candidate = None
        if isinstance(candidate, dict):
            return candidate
        start = out.find("{", start + 1)

    # 4. No valid JSON object could be found.
    return None

def get_predicted_votes(df, out_json, city = 'Wroclaw'):
    """Rescale the model's vote estimate from the 2016 vote total to the 2017 one.

    The estimate is divided by the sum of ``votes`` over the 2016 rows of
    ``df`` and multiplied by the sum over the 2017 rows.

    Args:
        df: DataFrame with at least the columns ``year`` and ``votes``.
        out_json: Parsed model answer (dict) or its JSON string; must carry
            the key ``'estimated_votes'``.
        city: Only ``'Wroclaw'`` is supported.

    Returns:
        The rescaled estimate, or None when the answer is missing, cannot be
        parsed, lacks a numeric ``'estimated_votes'``, or when the 2016 vote
        total is zero.

    Raises:
        ValueError: If ``city`` is not ``'Wroclaw'``.
    """
    if city != 'Wroclaw':
        raise ValueError(f"City '{city}' not supported")

    if isinstance(out_json, str):
        try:
            out_json = json.loads(out_json)
        except json.JSONDecodeError:
            return None

    # Answers that could not be parsed arrive here as None/NaN.
    if not isinstance(out_json, dict):
        return None

    try:
        estimated_votes = float(out_json['estimated_votes'])
    except (KeyError, TypeError, ValueError):
        return None

    votes_2016 = df[df['year'] == 2016].votes.sum()
    votes_2017 = df[df['year'] == 2017].votes.sum()
    if not votes_2016:
        # No reference votes: the ratio is undefined.
        return None

    relative_estimation = estimated_votes / votes_2016
    return relative_estimation * votes_2017

def get_predicted_rank(out_json):
    """Return the ``'expected_rank'`` field of a model answer.

    Args:
        out_json: Parsed answer (dict) or its JSON string.

    Returns:
        The stored value, or None when the answer is missing, is not a JSON
        object, cannot be parsed, or lacks the key.
    """
    if isinstance(out_json, str):
        try:
            out_json = json.loads(out_json)
        except json.JSONDecodeError:
            return None

    # Unparsed answers reach this function as None/NaN; do not subscript them.
    if not isinstance(out_json, dict):
        return None

    return out_json.get('expected_rank')

def get_if_is_top5(out_json):
    """Return the ``'in_top_5'`` field of a model answer.

    Accepts the same inputs as ``get_predicted_rank``: a parsed dict or its
    JSON string.

    Returns:
        The stored value, or None when the answer is missing, is not a JSON
        object, cannot be parsed, or lacks the key.
    """
    if isinstance(out_json, str):
        try:
            out_json = json.loads(out_json)
        except json.JSONDecodeError:
            return None

    # Unparsed answers reach this function as None/NaN; do not subscript them.
    if not isinstance(out_json, dict):
        return None

    return out_json.get('in_top_5')

def get_if_is_top10(out_json):
    """Return the ``'in_top_10'`` field of a model answer.

    Accepts the same inputs as ``get_predicted_rank``: a parsed dict or its
    JSON string.

    Returns:
        The stored value, or None when the answer is missing, is not a JSON
        object, cannot be parsed, or lacks the key.
    """
    if isinstance(out_json, str):
        try:
            out_json = json.loads(out_json)
        except json.JSONDecodeError:
            return None

    # Unparsed answers reach this function as None/NaN; do not subscript them.
    if not isinstance(out_json, dict):
        return None

    return out_json.get('in_top_10')


In [30]:
# Parse each raw answer into a dict (None when no JSON could be extracted).
test['out_json'] = test['out'].apply(get_json_from_llm_response)

In [29]:
# Inspect the parsed JSON of the first answer.
test['out_json'].iloc[0]

{'estimated_votes': 4000,
 'confidence_interval': [3000, 5000],
 'expected_rank': 10,
 'in_top_5': 0,
 'in_top_10': 1}

In [32]:
# Derive the prediction columns from the parsed answers.
test['predicted_votes'] = test['out_json'].apply(
    lambda parsed: get_predicted_votes(df, parsed)
)
test['predicted_rank'] = test['out_json'].apply(get_predicted_rank)
test['is_top5'] = test['out_json'].apply(get_if_is_top5)
test['is_top10'] = test['out_json'].apply(get_if_is_top10)

In [33]:
#4. save results.
results = test.filter(['project_id', 
             'real_votes', 
             'real_rank',
             'prompt',
             'out',
             'out_json', 
             'predicted_votes', 
             'predicted_rank', 
             'is_top5', 
             'is_top10']
             )
results

Unnamed: 0,project_id,real_votes,real_rank,prompt,out,out_json,predicted_votes,predicted_rank,is_top5,is_top10
0,10,10857,1,You are an expert model in the analysis of par...,1. **Project Theme and Demand**: The proposed ...,"{'estimated_votes': 4000, 'confidence_interval...",3757.269661,10,0,1
1,50,10796,2,You are an expert model in the analysis of par...,1. **Project Theme and Historical Success**: T...,"{'estimated_votes': 4000, 'confidence_interval...",3757.269661,8,0,1
2,18,8640,3,You are an expert model in the analysis of par...,1. **Project Theme and Popularity**: The proje...,"{'estimated_votes': 4000, 'confidence_interval...",3757.269661,8,0,1
3,675,5398,4,You are an expert model in the analysis of par...,1. **Project Theme and Popularity**: The proje...,"{'estimated_votes': 3200, 'confidence_interval...",3005.815729,12,0,0
4,260,4998,5,You are an expert model in the analysis of par...,1. **District Voting Patterns**: Psie Pole has...,"{'estimated_votes': 1600, 'confidence_interval...",1502.907864,20,0,0
5,550,4468,6,You are an expert model in the analysis of par...,1. **District Voting Trends**: Krzyki district...,"{'estimated_votes': 6000, 'confidence_interval...",5635.904492,5,1,1
6,12,4149,7,You are an expert model in the analysis of par...,1. **District Voting Patterns**: Psie Pole has...,"{'estimated_votes': 3200, 'confidence_interval...",3005.815729,12,0,1
7,159,4098,8,You are an expert model in the analysis of par...,1. **District Voting Trends**: The district of...,"{'estimated_votes': 6500, 'confidence_interval...",6105.563199,8,0,1
8,656,4008,9,You are an expert model in the analysis of par...,1. **Project Cost and District Analysis**: The...,"{'estimated_votes': 5000, 'confidence_interval...",4696.587077,8,0,1
9,499,3949,10,You are an expert model in the analysis of par...,1. **District Voting Trends**: Stare Miasto ha...,"{'estimated_votes': 4000, 'confidence_interval...",3757.269661,8,0,1


In [34]:
# Write the predictions as a ';'-separated CSV without the index column.
results.to_csv(
    'output/predictions/wrc_rag_cot_full_1.csv',
    index=False,
    sep=";",
)
print('results ready!')

results ready!


### WRC 17 - RAG SB

In [1]:
# 1. load data
import os
import pandas as pd
import src.utils as ut
import prompt_builder as pb

from dotenv import load_dotenv
from src.data_loader import load_and_prepare_projects, load_prediction_set
from prompt_utils import tokens_counter, prompt_cost

# Load variables from a .env file before anything reads os.environ.
load_dotenv()

# Project CSVs for the two Wroclaw elections used in this section.
path_16 = 'data/wrc16_projects.csv'
path_17 = 'data/wrc17_projects.csv'

df, df16_shuffled = load_and_prepare_projects(path_16,path_17, city="Wroclaw")

# The projects to predict come from a separate CSV rather than from load_test_dataset.
ids_to_predict_path = 'data/wrc17_projects_to_predict.csv'
test = load_prediction_set(df, ids_to_predict_path, city='Wroclaw')
# Display the combined projects table.
df

Unnamed: 0,project_id,project_name,description,category,cost,district,votes,district_number,rank,year
0,710,Drzewa dla Wrocławia - nasadzenia w całym mieś...,Uzasadnienie\nZnikające drzewa z krajobrazu mi...,greenery/recreation,1000000,Fabryczna,13938,1.0,1,2016
1,15,Zielona rowerowo-piesza obwodnica Wrocławia; E...,Uzasadnienie,walking/cycling infrastructure,1000000,Krzyki,12348,2.0,2,2016
2,764,Oświetlenie Parku Grabiszyńskiego (Alei Romera...,Uzasadnienie\nTaki mamy klimat... że przez pół...,other,1000000,Krzyki,7291,2.0,3,2016
3,685,Plac Zabaw dla Starszaków w parku Grabiszyński...,Uzasadnienie\nPlac Zabaw dla Starszaków w park...,playgrounds,560000,Krzyki,6663,2.0,4,2016
4,379,"Parking przy ""Dobrzyńskiej"" - ułatwienie dojaz...",Uzasadnienie\nProjekt polega na utworzeniu w c...,roads,750000,Krzyki,6383,2.0,5,2016
...,...,...,...,...,...,...,...,...,...,...
97,422,"Akcja Plac - Gry uliczne ""Oswajamy beton""",Uzasadnienie\nProjekt zakłada lokowanie w prze...,greenery/recreation,150000,Śródmieście,236,4.0,46,2017
98,406,Budowa wrocławskiej wypożyczalni rowerów integ...,Uzasadnienie\nProjekt przeznaczony jest zarówn...,walking/cycling infrastructure,600000,Krzyki,178,2.0,47,2017
99,629,Wrocław na dotknięcie ręki – tylfograficzne ma...,"Uzasadnienie\nRynek i wszystkie miejsca, takie...",other,130000,Stare Miasto,156,5.0,48,2017
100,720,1997,"Uzasadnienie\nIdeą projektu ""1997"" było upamię...",other,50000,Krzyki,118,2.0,49,2017


In [2]:
# Database connection settings, read from PG_HOST / PG_DATABASE / PG_USER / PG_PASSWORD.
# A missing variable raises KeyError immediately.
conn_params = {
    field: os.environ[f"PG_{field.upper()}"]
    for field in ("host", "database", "user", "password")
}

#2. build prompt
# Election-wide context, computed once and reused in every prompt.
top_k_voted_16 = pb.get_top_k_projects_from_16_election(df, k=15)
proj_count_avg_by_district = pb.get_project_count_and_avg_votes_by_district(df, city='Wroclaw')
proj_count_avg_by_categories = pb.get_project_count_and_avg_votes_by_categories(df, city='Wroclaw')

In [3]:
# Build one prompt per project to predict from the template 'prompts/prompt_rag2_SB_pl.txt'.
# The helper calls inside the dict run once for every row of `test`; the similarity
# helpers receive conn_params — presumably to query the database; verify in prompt_builder.
test['prompt'] = test.apply(
    lambda x: pb.build_prompt(
        'prompts/prompt_rag2_SB_pl.txt',
        {
            'top_k_voted_16': top_k_voted_16,
            'project_count_and_avg_votes_by_district': proj_count_avg_by_district,
            'project_name': x['project_name'],
            'cost': x['cost'],
            'district': x['district'],
            'description': x['description'],
            'top_k_similar_projects_in_16': pb.get_top_k_similar_projects_in_16(df, x['project_id'], x['year'], conn_params,k=5),
            'count_of_projects16_in_quartier': pb.get_count_of_projects_in_quartier(df,x['district'], year=2016),
            'top_k_voted_in_district': pb.get_top_k_voted_in_district_eng(df, x['district'], k=5, year=2016),
            'count_of_projects17_in_quartier': pb.get_count_of_projects_in_quartier(df,x['district'], year=2017),
            'top_k_similar_projects_in_district': pb.get_top_k_similar_projects_in_16_by_district(df, x['project_id'], x['year'], conn_params,k=5)
        }
    ),
    axis=1
)

In [7]:
# Sanity check: show the first rendered prompt in full.
print(test['prompt'].iloc[0])

You are an expert model in the analysis of participatory budgeting elections, specifically in the context of the city of Wroclaw, Poland.

In Wroclaw, the municipality organizes participatory budgeting elections to fund citizen-led projects. Each resident can vote for between 1 and 3 initiatives on the official website.
Votes are anonymous.

Here is how the projects are selected:
- All projects are ranked by the number of votes received.
- A "greedy" algorithm is used: projects are funded in descending order of votes, as long as the total budget is not exceeded.
- If a project is too expensive for the remaining budget, it is skipped and the next one is considered.

---
In the 2016 election:
- There were **67103 voters**, each allowed to vote for up to 3 projects.
- In total, approximately **119194 votes** were cast across **52 projets**.
- The number of votes per project ranged from **120** to **13938**, with an average of **2292.19** votes and an estimated standard deviation of **2784