In [1]:
# 1. load data
import json
import os
import re

import pandas as pd
from dotenv import load_dotenv

import prompt_builder as pb
import src.utils as ut
from prompt_utils import tokens_counter, prompt_cost
from src.data_loader import load_and_prepare_projects, load_prediction_set


# Input CSV locations (file names suggest the 2016 and 2017 editions).
path_16 = 'data/wrc16_projects.csv'
path_17 = 'data/wrc17_projects.csv'
ids_to_predict_path = 'data/wrc17_projects_to_predict.csv'

# Combined frame plus the shuffled 2016 frame used later to build the prompt context.
df, df16_shuffled = load_and_prepare_projects(path_16, path_17, city='Wroclaw')

# Rows of `df` selected through the ids file; these are the projects to predict.
test = load_prediction_set(df, ids_to_predict_path)

# Load .env values into the process environment before they are read below.
load_dotenv()

# Database connection settings; a missing variable raises KeyError right here.
conn_params = {
    param: os.environ[env_name]
    for param, env_name in (
        ("host", "PG_HOST"),
        ("database", "PG_DATABASE"),
        ("user", "PG_USER"),
        ("password", "PG_PASSWORD"),
    )
}

In [9]:
# Sum of the `votes` column over the rows whose `year` is 2017.
df.loc[df['year'] == 2017, 'votes'].sum()

np.int64(111961)

In [28]:
#2. build prompt

# Context shared by every prompt, built once from df16_shuffled.
last_projects_results = pb.get_all_projects_from_22_election(df16_shuffled)

# One prompt per row: the shared context plus that row's own fields.
test['prompt'] = test.apply(
    lambda row: pb.build_prompt(
        'prompts/prompt_template1_pl.txt',
        {
            'last_projects_results': last_projects_results,
            **{
                field: row[field]
                for field in ('project_name', 'cost', 'district', 'description')
            },
        },
    ),
    axis=1,
)

In [29]:
# Show the prompt built for the row labelled 0 as a sanity check.
print(test.loc[0, 'prompt'])

You are an expert model in analyzing participatory budgeting elections, specifically in the context of the city of Wroclaw, Poland.

In Wroclaw, the municipality organizes participatory budgeting elections to fund citizen projects.
Votes are anonymous.

Once the voting period is over, the projects are ranked based on the number of votes they received. Then, the winning projects are selected using a greedy algorithm: starting with the project that received the most votes, the next most voted projects are added in sequence as long as the remaining budget allows it.
When the next project on the list exceeds the available budget, the algorithm skips to the next project that can be funded, and so on, until the total budget of 4500000 euros is exhausted.

Here is the list of the 52 initiatives proposed during the election, including their name, cost, and district:
- Budowa wrocławskiej wypożyczalni rowerów integracyjnych typu handbike wraz z odpowiednim zapleczem socjalno-sanitarnym: (Coût: 

In [30]:
# Token count of every prompt, then its price under the 'gpt-4-turbo' rate.
test['n_tokens'] = test['prompt'].apply(tokens_counter)
test['cost_usd'] = test['n_tokens'].apply(
    lambda count: prompt_cost(count, 'gpt-4-turbo')
)

# Summary figures for the whole prediction set.
print('mean tokens by prompt: {:.2f}'.format(test['n_tokens'].mean()))
print('avg.cost of each prediction: ${:.2f}'.format(test['cost_usd'].mean()))
print('experiment total cost: {:.2f}'.format(test['cost_usd'].sum()))

mean tokens by prompt: 5240.31
avg.cost of each prediction: $0.05
experiment total cost: 1.89


In [31]:
#3. run experiment! :)
from llm_client import call_openai_model

# Read the key with os.environ[...] (as the PG_* settings are) so a missing
# variable raises KeyError immediately instead of api_key=None being passed
# to every request.
api_key = os.environ['OPENAI_API_KEY']

# One model call per prompt; the raw text answer is stored in `out`.
test['out'] = test['prompt'].apply(
    lambda prompt: call_openai_model(prompt=prompt, api_key=api_key)
)

LLM response receive!
LLM response receive!
LLM response receive!
LLM response receive!
LLM response receive!
LLM response receive!
LLM response receive!
LLM response receive!
LLM response receive!
LLM response receive!
LLM response receive!
LLM response receive!
LLM response receive!
LLM response receive!
LLM response receive!
LLM response receive!
LLM response receive!
LLM response receive!
LLM response receive!
LLM response receive!
LLM response receive!
LLM response receive!
LLM response receive!
LLM response receive!
LLM response receive!
LLM response receive!
LLM response receive!
LLM response receive!
LLM response receive!
LLM response receive!
LLM response receive!
LLM response receive!
LLM response receive!
LLM response receive!
LLM response receive!
LLM response receive!


In [34]:
# Show the raw model answer for the row labelled 0.
print(test.loc[0, 'out'])

1. **Project Appeal and Context**: The project "Plac Zabaw dla Starszaków w parku Grabiszyńskim i coś ekstra dla maluchów" targets children and families, a significant demographic in Wroclaw. It builds on a previously successful project, indicating community interest and potential support.

2. **Cost and Accessibility**: The project cost (560,000 €) is moderate compared to other projects in previous elections, making it financially feasible within the budget constraints. Its location in multiple districts enhances its accessibility and appeal to a broader audience.

3. **Comparison with Similar Projects**: A similar project, "Plac Zabaw dla Starszaków w parku Grabiszyńskim i coś ekstra dla maluchów," ranked 4th with 6663 votes in a previous election. This suggests a strong precedent for community support for playground projects in this area.

4. **Voting Dynamics**: Projects that cater to children and families, especially those enhancing urban green spaces and recreational facilities, 

In [64]:
def get_json_from_llm_response(out):
    """Parse the JSON object found in the first ```json fenced block of `out`.

    Parameters
    ----------
    out : str
        Raw text returned by the LLM.

    Returns
    -------
    dict or str
        The decoded JSON object when a fenced block is found; otherwise the
        sentinel string "No JSON in LLM answer...". The sentinel is also
        returned when `out` is not a string (for example None or NaN),
        which previously raised TypeError inside `re.search`.

    Raises
    ------
    json.JSONDecodeError
        If a fenced block is found but its content is not valid JSON.
    """
    no_json_message = "No JSON in LLM answer..."

    # A missing answer (None / NaN) cannot contain a JSON block.
    if not isinstance(out, str):
        return no_json_message

    # DOTALL lets the lazy `.*?` span the line breaks inside the JSON object.
    match = re.search(r"```json\s*({.*?})\s*```", out, re.DOTALL)
    if match is None:
        return no_json_message

    return json.loads(match.group(1))
    
def get_predicted_votes(df, out_json):
    """Rescale the LLM vote estimate from the 2016 vote total to the 2017 one.

    The estimate is first expressed as a share of all votes in rows with
    ``year == 2016``; that share is then applied to the total votes in rows
    with ``year == 2017``.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame with `year` and `votes` columns.
    out_json : dict
        Parsed LLM answer; must contain the key 'estimated_votes'.

    Returns
    -------
    The estimated share multiplied by the 2017 vote total.
    """
    estimate = out_json['estimated_votes']
    total_2016 = df.loc[df['year'] == 2016, 'votes'].sum()
    share_of_2016 = estimate / total_2016
    total_2017 = df.loc[df['year'] == 2017, 'votes'].sum()
    return share_of_2016 * total_2017
    
def get_predicted_rank(out_json):
    """Return the 'expected_rank' entry of a parsed LLM answer.

    Raises KeyError when the key is absent.
    """
    predicted_rank = out_json['expected_rank']
    return predicted_rank

def get_if_is_top5(out_json):
    """Return the 'in_top_5' entry of a parsed LLM answer.

    Raises KeyError when the key is absent.
    """
    top5_flag = out_json['in_top_5']
    return top5_flag

def get_if_is_top10(out_json):
    """Return the 'in_top_10' entry of a parsed LLM answer.

    Raises KeyError when the key is absent.
    """
    top10_flag = out_json['in_top_10']
    return top10_flag

# Parse each raw answer. Note: this calls the helper imported from `ut`,
# not the same-named function defined in this cell.
test['out_json'] = test['out'].apply(ut.get_json_from_llm_response)

# Spread the parsed fields into their own columns.
test['predicted_votes'] = test['out_json'].apply(
    lambda parsed: get_predicted_votes(df, parsed)
)
test['predicted_rank'] = test['out_json'].apply(get_predicted_rank)
test['is_top5'] = test['out_json'].apply(get_if_is_top5)
test['is_top10'] = test['out_json'].apply(get_if_is_top10)

In [67]:
# Keep only the identifier, ground truth, prompt/answer and prediction columns.
results = test.filter(
    items=[
        'project_id',
        'real_votes',
        'real_rank',
        'prompt',
        'out',
        'out_json',
        'predicted_votes',
        'predicted_rank',
        'is_top5',
        'is_top10',
    ]
)

In [66]:
# Bare expression: renders the `results` frame as this cell's output.
results

Unnamed: 0,project_id,real_votes,real_rank,prompt,out,out_json,predicted_votes,predicted_rank,is_top5,is_top10
0,685,6663,4,You are an expert model in analyzing participa...,1. **Project Appeal and Context**: The project...,"{'estimated_votes': 6500, 'confidence_interval...",6105.563199,5,1,1
1,10,10857,1,You are an expert model in analyzing participa...,1. **Project Theme and Relevance**: The projec...,"{'estimated_votes': 6500, 'confidence_interval...",6105.563199,8,0,1
2,50,10796,2,You are an expert model in analyzing participa...,1. **Project Theme and Popularity**: The proje...,"{'estimated_votes': 10000, 'confidence_interva...",9393.174153,3,1,1
3,675,5398,4,You are an expert model in analyzing participa...,1. **Project Theme and Relevance**: The projec...,"{'estimated_votes': 7500, 'confidence_interval...",7044.880615,8,0,1
4,260,4998,5,You are an expert model in analyzing participa...,"1. **Project Theme and Cost**: The project ""Bu...","{'estimated_votes': 500, 'confidence_interval'...",469.658708,35,0,0
5,550,4468,6,You are an expert model in analyzing participa...,1. **Project Cost and Scope**: The project cos...,"{'estimated_votes': 3500, 'confidence_interval...",3287.610954,12,0,0
6,12,4149,7,You are an expert model in analyzing participa...,1. **Project Theme and Appeal**: The proposed ...,"{'estimated_votes': 4000, 'confidence_interval...",3757.269661,8,0,1
7,656,4008,9,You are an expert model in analyzing participa...,1. **Project Cost and Scope**: The project cos...,"{'estimated_votes': 7500, 'confidence_interval...",7044.880615,5,1,1
8,499,3949,10,You are an expert model in analyzing participa...,"1. **Project Theme and Appeal**: The project ""...","{'estimated_votes': 5000, 'confidence_interval...",4696.587077,8,0,1
9,451,3368,12,You are an expert model in analyzing participa...,1. **Project Cost and Appeal**: The project co...,"{'estimated_votes': 4500, 'confidence_interval...",4226.928369,8,0,1


In [68]:
# Create the target folder first: to_csv does not create missing directories
# and would otherwise fail on a fresh checkout.
os.makedirs('output/predictions', exist_ok=True)

# Semicolon-separated export without the index column.
results.to_csv('output/predictions/wrc_simple_prompt.csv', sep=";", index=False)
print('results ready!')

results ready!


In [None]:
# Disabled alternative that delegates the vote prediction to ut.get_predicted_votes.
#test['predicted_votes'] = test['out_json'].apply(lambda x: ut.get_predicted_votes(x))

#4. save results.
# Keep only the identifier, ground truth, prompt/answer and prediction columns.
results = test.filter(
    items=[
        'project_id',
        'real_votes',
        'real_rank',
        'prompt',
        'out',
        'out_json',
        'predicted_votes',
        'predicted_rank',
        'is_top5',
        'is_top10',
    ]
)

# Create the target folder first: to_csv does not create missing directories.
os.makedirs('output/predictions', exist_ok=True)
results.to_csv('output/predictions/wrc_simple_prompt.csv', sep=";", index=False)
print('results ready!')

KeyError: 'position_attendue'