#### TLS 24 - French Prompt and DB

In [1]:
# 1. load data
import os
import pandas as pd
import src.utils as ut
import prompt_builder as pb

from dotenv import load_dotenv
from src.data_loader import load_and_prepare_projects, load_test_dataset
from prompt_utils import tokens_counter, prompt_cost


# Project CSVs for the two Toulouse elections used below (rows are later filtered on year 2022 / 2024).
path_22 = 'data/tls22_projects.csv'
path_24 = 'data/tls24_projects.csv'

# `df` is the combined frame (it carries a 'year' column); the second return value is not used in this section.
df, df22_shuffled = load_and_prepare_projects(path_22,path_24, city = "Toulouse")
# Ask for as many test rows as there are 2024 projects in `df`.
test = load_test_dataset(df, rows = len(df[df['year'] == 2024]), city = 'Toulouse')

# Load variables from a local .env file into the process environment.
load_dotenv()

# Connection settings read from the environment; os.environ[...] raises KeyError if a variable is missing.
# NOTE(review): presumably PostgreSQL credentials (PG_* names); not used by any cell visible in this section.
conn_params = {
    "host": os.environ["PG_HOST"],
    "database": os.environ["PG_DATABASE"],
    "user": os.environ["PG_USER"],
    "password": os.environ['PG_PASSWORD']
}

In [3]:
# 2. Build one prompt per test project by filling the French template
#    with the project's own fields (placeholder name == column name).
prompt_fields = ('project_name', 'cost', 'district', 'description')

test['prompt'] = test.apply(
    lambda row: pb.build_prompt(
        'prompts/tls_prompt_simple_fr.txt',
        {field: row[field] for field in prompt_fields},
    ),
    axis=1,
)

In [4]:
# Sanity check: print the fully rendered prompt of the first test project.
print(test['prompt'].iloc[0])

Vous êtes un modèle expert dans l’analyse des élections de budgets participatifs, spécifiquement dans le contexte de la ville de Toulouse, en France.

À Toulouse, la municipalité organise des élections de budget participatif pour financer des projets citoyens. Chaque habitant peut voter pour entre 1 et 3 initiatives sur le site web officiel. 

Une fois la période de vote terminée, les projets sont classés selon le nombre de voix obtenues et se voient attribuer un rang. Ensuite, les projets gagnants sont sélectionnés à l’aide d’un algorithme greedy : on commence par le projet ayant obtenu le plus de voix, puis on ajoute successivement les projets suivants les plus votés, tant que le budget disponible le permet.
Lorsque le budget restant ne permet plus de financer le projet suivant dans la liste, on passe au suivant qui peut l’être, et ainsi de suite, jusqu’à épuisement du budget total de 8000000 euros.


Votre tâche consiste à analyser le projet suivant, proposé dans le cadre d'une nouv

In [5]:
# Pre-flight estimate: token count and API cost of every prompt, priced as 'gpt-4-turbo'.
token_counts = test['prompt'].apply(lambda text: tokens_counter(text))
test['n_tokens'] = token_counts
test['cost_usd'] = test['n_tokens'].apply(lambda count: prompt_cost(count, 'gpt-4-turbo'))

summary_lines = (
    ('mean tokens by prompt: {:.2f}', test.n_tokens.mean()),
    ('avg.cost of each prediction: ${:.2f}', test.cost_usd.mean()),
    ('experiment total cost: {:.2f}', test.cost_usd.sum()),
)
for template, value in summary_lines:
    print(template.format(value))

mean tokens by prompt: 820.34
avg.cost of each prediction: $0.01
experiment total cost: 1.50


In [10]:
# 3. Run the experiment: send every prompt to the LLM and keep the raw answer in test['out'].
# os.getenv returns None (no error) when OPENAI_API_KEY is not set.
api_key=os.getenv('OPENAI_API_KEY')
from llm_client import call_openai_model 

# One call per row, executed sequentially by Series.apply.
# NOTE(review): the repeated "LLM response receive!" lines in the output presumably come from call_openai_model.
test['out'] = test['prompt'].apply(lambda prompt: 
                     call_openai_model(prompt=prompt, api_key=api_key))

LLM response receive!
LLM response receive!
LLM response receive!
LLM response receive!
LLM response receive!
LLM response receive!
LLM response receive!
LLM response receive!
LLM response receive!
LLM response receive!
LLM response receive!
LLM response receive!
LLM response receive!
LLM response receive!
LLM response receive!
LLM response receive!
LLM response receive!
LLM response receive!
LLM response receive!
LLM response receive!
LLM response receive!
LLM response receive!
LLM response receive!
LLM response receive!
LLM response receive!
LLM response receive!
LLM response receive!
LLM response receive!
LLM response receive!
LLM response receive!
LLM response receive!
LLM response receive!
LLM response receive!
LLM response receive!
LLM response receive!
LLM response receive!
LLM response receive!
LLM response receive!
LLM response receive!
LLM response receive!
LLM response receive!
LLM response receive!
LLM response receive!
LLM response receive!
LLM response receive!
LLM respon

In [None]:
import json
import re

def get_json_from_llm_response(out):
    """Extract the JSON object contained in an LLM response.

    Two strategies are tried in order: the whole response parsed as JSON
    (accepted only if it is an object), then the first ```json ... ``` fenced
    block found in the text.

    Args:
        out: Raw response text. A dict is returned unchanged; any other
            non-string value (None, NaN, ...) yields None.

    Returns:
        The decoded JSON value (a dict for well-formed responses), or None
        when nothing parseable is found.
    """
    # Already parsed: pass through so re-running the parsing cell is harmless.
    if isinstance(out, dict):
        return out
    # A failed call can leave None/NaN in the column; re.search would raise on it.
    if not isinstance(out, str):
        return None

    try:
        data = json.loads(out)
        if isinstance(data, dict):
            return data
    except json.JSONDecodeError:
        pass  # not plain JSON, fall back to the fenced block

    match = re.search(r"```json\s*({.*?})\s*```", out, re.DOTALL)
    if match is None:
        return None
    try:
        return json.loads(match.group(1))
    except json.JSONDecodeError:
        return None

def get_predicted_votes(df, out_json, city = 'Toulouse'):
    """Rescale the LLM vote estimate from the 2022 turnout to the 2024 turnout.

    The estimate is divided by the total 2022 votes in `df` and multiplied by
    the total 2024 votes.

    Args:
        df: Frame with 'year' and 'votes' columns.
        out_json: Parsed response (dict), a JSON string, or None.
        city: Only 'Toulouse' is supported.

    Returns:
        The rescaled vote prediction, or None when the response is missing,
        unparseable, lacks 'voix_estimées', or there are no 2022 votes.

    Raises:
        ValueError: If `city` is not 'Toulouse'.
    """
    if city != 'Toulouse':
        raise ValueError(f"City '{city}' not supported")

    if isinstance(out_json, str):
        try:
            out_json = json.loads(out_json)
        except json.JSONDecodeError:
            return None

    # The parser returns None for unusable responses; skip them instead of crashing.
    if not isinstance(out_json, dict):
        return None
    estimated_votes = out_json.get('voix_estimées')
    if estimated_votes is None:
        return None

    votes_2022 = df.loc[df['year'] == 2022, 'votes'].sum()
    votes_2024 = df.loc[df['year'] == 2024, 'votes'].sum()
    # Without a baseline turnout the ratio is undefined.
    if votes_2022 == 0:
        return None

    return estimated_votes / votes_2022 * votes_2024

def get_predicted_rank(out_json):
    """Return the 'position_attendue' field of a parsed LLM response.

    Args:
        out_json: Parsed response (dict), a JSON string, or None.

    Returns:
        The value stored under 'position_attendue', or None when the response
        is missing, unparseable, or has no such key.
    """
    if isinstance(out_json, str):
        try:
            out_json = json.loads(out_json)
        except json.JSONDecodeError:
            return None

    # None (failed parse) or any non-object payload has no rank to report.
    if not isinstance(out_json, dict):
        return None
    return out_json.get('position_attendue')


def get_if_is_top5(out_json):
    """Return the 'dans_top_5' field of a parsed LLM response.

    Args:
        out_json: Parsed response (dict), a JSON string, or None.

    Returns:
        The value stored under 'dans_top_5', or None when the response is
        missing, unparseable, or has no such key.
    """
    if isinstance(out_json, str):
        try:
            out_json = json.loads(out_json)
        except json.JSONDecodeError:
            return None

    if not isinstance(out_json, dict):
        return None
    # The sample response shown in this notebook carries no 'dans_top_5' key,
    # so a missing key yields None rather than a KeyError.
    return out_json.get('dans_top_5')

def get_if_is_top10(out_json):
    """Return the 'dans_top_10' field of a parsed LLM response.

    Args:
        out_json: Parsed response (dict), a JSON string, or None.

    Returns:
        The value stored under 'dans_top_10', or None when the response is
        missing, unparseable, or has no such key.
    """
    if isinstance(out_json, str):
        try:
            out_json = json.loads(out_json)
        except json.JSONDecodeError:
            return None

    if not isinstance(out_json, dict):
        return None
    # The sample response shown in this notebook carries no 'dans_top_10' key,
    # so a missing key yields None rather than a KeyError.
    return out_json.get('dans_top_10')


In [32]:
# Inspect the parsed JSON of the first response. It is parsed on the fly because
# the 'out_json' column is only created by the following cell, so reading the
# column here raises KeyError on a fresh top-to-bottom run.
get_json_from_llm_response(test['out'].iloc[0])

{'voix_estimées': 1500,
 'intervalle_confiance': [1200, 1800],
 'position_attendue': 25}

In [35]:
# Parse every raw LLM answer, then derive the vote and rank predictions from the parsed JSON.
test['out_json'] = test['out'].apply(get_json_from_llm_response)

test['predicted_votes'] = test['out_json'].apply(
    lambda parsed: get_predicted_votes(df, parsed)
)
test['predicted_rank'] = test['out_json'].apply(get_predicted_rank)

In [None]:
# Keep only the columns needed to compare predictions with the real outcome.
# DataFrame.filter silently skips any listed column that is absent from `test`.
result_columns = [
    'project_id',
    'real_votes',
    'real_rank',
    'prompt',
    'out',
    'out_json',
    'predicted_votes',
    'predicted_rank',
]
results = test.filter(items=result_columns)
results

In [39]:
# Persist this run's predictions as a ';'-separated CSV (index column omitted).
results.to_csv('output/predictions/tls_simple_prompt_full_1.csv', sep=";", index=False)
print('results ready!')

results ready!


#### Toulouse English Prompt & Database

In [6]:
# 1. load data
import os
import pandas as pd
import src.utils as ut
import prompt_builder as pb

from dotenv import load_dotenv
from src.data_loader import load_and_prepare_projects, load_test_dataset
from prompt_utils import tokens_counter, prompt_cost

# Load variables from a local .env file into the process environment.
load_dotenv()

# Connection settings read from the environment; os.environ[...] raises KeyError if a variable is missing.
# NOTE(review): presumably PostgreSQL credentials (PG_* names); not used by any cell visible in this section.
conn_params = {
    "host": os.environ["PG_HOST"],
    "database": os.environ["PG_DATABASE"],
    "user": os.environ["PG_USER"],
    "password": os.environ['PG_PASSWORD']
}

path_22 = 'data/tls22_projects.csv'
path_24 = 'data/tls24_projects.csv'

df, df22_shuffled = load_and_prepare_projects(path_22, path_24, city="Toulouse")
# Ask for as many test rows as there are 2024 projects in `df`.
test = load_test_dataset(df, rows=len(df[df['year'] == 2024]), city='Toulouse')

# English translations of the project texts (supplies the *_eng columns used by the prompt).
english_df = pd.read_csv("data/tls_projects_translated_to_eng.csv", sep=";")

# validate='many_to_one' makes pandas raise MergeError if the translation table
# has duplicate keys, instead of silently multiplying project rows.
df = pd.merge(
    left=df,
    right=english_df,
    on=['project_id', 'year'],
    how='inner',
    validate='many_to_one',
)

# `test` is joined on project_id alone, so a project_id that appears more than once
# in english_df (e.g. once per election year) would duplicate test rows; the
# validation turns that silent duplication into an explicit error.
# Note: the inner join drops any test project that has no translation.
test = pd.merge(
    left=test,
    right=english_df,
    on=['project_id'],
    how='inner',
    validate='many_to_one',
)

In [7]:
# 2. Build one prompt per test project from the English template.
#    Name and description come from the translated (*_eng) columns.
placeholder_columns = {
    'project_name': 'project_name_eng',
    'cost': 'cost',
    'district': 'district',
    'description': 'description_eng',
}

test['prompt'] = test.apply(
    lambda row: pb.build_prompt(
        'prompts/tls_prompt_simple_eng.txt',
        {placeholder: row[column] for placeholder, column in placeholder_columns.items()},
    ),
    axis=1,
)

In [8]:
# Sanity check: print the fully rendered prompt of the first test project.
print(test['prompt'].iloc[0])

You are an expert model in analyzing participatory budgeting elections, specifically in the context of the city of Toulouse, France.

In Toulouse, the municipality organizes participatory budgeting elections to fund citizen projects. Each resident can vote for between 1 and 3 initiatives on the official website. A total of 4,532 people voted in the 2022 election..

Once the voting period ends, the projects are ranked according to the number of votes received. The winning projects are then selected using a greedy algorithm: starting with the project that received the most votes, projects are successively added in order of votes, as long as the available budget allows. 

When the remaining budget is not sufficient to fund the next project on the list, the algorithm skips to the next one that can be funded, and so on, until the total budget of 8,000,000 euros is exhausted.

Your task is to analyze the following project, proposed as part of a new participatory budgeting election in year 20

In [9]:
# Pre-flight estimate: token count and API cost of every prompt, priced as 'gpt-4-turbo'.
token_counts = test['prompt'].apply(lambda text: tokens_counter(text))
test['n_tokens'] = token_counts
test['cost_usd'] = test['n_tokens'].apply(lambda count: prompt_cost(count, 'gpt-4-turbo'))

summary_lines = (
    ('mean tokens by prompt: {:.2f}', test.n_tokens.mean()),
    ('avg.cost of each prediction: ${:.2f}', test.cost_usd.mean()),
    ('experiment total cost: {:.2f}', test.cost_usd.sum()),
)
for template, value in summary_lines:
    print(template.format(value))

mean tokens by prompt: 631.94
avg.cost of each prediction: $0.01
experiment total cost: 1.16


In [13]:
# 3. Run the experiment: send every prompt to the LLM and keep the raw answer in test['out'].
# os.getenv returns None (no error) when OPENAI_API_KEY is not set.
api_key=os.getenv('OPENAI_API_KEY')
from llm_client import call_openai_model 

# One call per row, executed sequentially by Series.apply.
# NOTE(review): the repeated "LLM response receive!" lines in the output presumably come from call_openai_model.
test['out'] = test['prompt'].apply(lambda prompt: 
                     call_openai_model(prompt=prompt, api_key=api_key))

LLM response receive!
LLM response receive!
LLM response receive!
LLM response receive!
LLM response receive!
LLM response receive!
LLM response receive!
LLM response receive!
LLM response receive!
LLM response receive!
LLM response receive!
LLM response receive!
LLM response receive!
LLM response receive!
LLM response receive!
LLM response receive!
LLM response receive!
LLM response receive!
LLM response receive!
LLM response receive!
LLM response receive!
LLM response receive!
LLM response receive!
LLM response receive!
LLM response receive!
LLM response receive!
LLM response receive!
LLM response receive!
LLM response receive!
LLM response receive!
LLM response receive!
LLM response receive!
LLM response receive!
LLM response receive!
LLM response receive!
LLM response receive!
LLM response receive!
LLM response receive!
LLM response receive!
LLM response receive!
LLM response receive!
LLM response receive!
LLM response receive!
LLM response receive!
LLM response receive!
LLM respon

In [None]:
import json
import re

def get_json_from_llm_response(out):
    """Extract the JSON object contained in an LLM response.

    Two strategies are tried in order: the whole response parsed as JSON
    (accepted only if it is an object), then the first ```json ... ``` fenced
    block found in the text.

    Args:
        out: Raw response text. A dict is returned unchanged; any other
            non-string value (None, NaN, ...) yields None.

    Returns:
        The decoded JSON value (a dict for well-formed responses), or None
        when nothing parseable is found.
    """
    # Already parsed: pass through so re-running the parsing cell is harmless.
    if isinstance(out, dict):
        return out
    # A failed call can leave None/NaN in the column; re.search would raise on it.
    if not isinstance(out, str):
        return None

    try:
        data = json.loads(out)
        if isinstance(data, dict):
            return data
    except json.JSONDecodeError:
        pass  # not plain JSON, fall back to the fenced block

    match = re.search(r"```json\s*({.*?})\s*```", out, re.DOTALL)
    if match is None:
        return None
    try:
        return json.loads(match.group(1))
    except json.JSONDecodeError:
        return None

def get_predicted_votes(df, out_json, city = 'Toulouse'):
    """Rescale the LLM vote estimate from the 2022 turnout to the 2024 turnout.

    The estimate is divided by the total 2022 votes in `df` and multiplied by
    the total 2024 votes.

    Args:
        df: Frame with 'year' and 'votes' columns.
        out_json: Parsed response (dict), a JSON string, or None.
        city: Only 'Toulouse' is supported.

    Returns:
        The rescaled vote prediction, or None when the response is missing,
        unparseable, lacks 'estimated_votes', or there are no 2022 votes.

    Raises:
        ValueError: If `city` is not 'Toulouse'.
    """
    if city != 'Toulouse':
        raise ValueError(f"City '{city}' not supported")

    if isinstance(out_json, str):
        try:
            out_json = json.loads(out_json)
        except json.JSONDecodeError:
            return None

    # The parser returns None for unusable responses; skip them instead of crashing.
    if not isinstance(out_json, dict):
        return None
    estimated_votes = out_json.get('estimated_votes')
    if estimated_votes is None:
        return None

    votes_2022 = df.loc[df['year'] == 2022, 'votes'].sum()
    votes_2024 = df.loc[df['year'] == 2024, 'votes'].sum()
    # Without a baseline turnout the ratio is undefined.
    if votes_2022 == 0:
        return None

    return estimated_votes / votes_2022 * votes_2024

def get_predicted_rank(out_json):
    """Return the 'expected_position' field of a parsed LLM response.

    Args:
        out_json: Parsed response (dict), a JSON string, or None.

    Returns:
        The value stored under 'expected_position', or None when the response
        is missing, unparseable, or has no such key.
    """
    if isinstance(out_json, str):
        try:
            out_json = json.loads(out_json)
        except json.JSONDecodeError:
            return None

    # None (failed parse) or any non-object payload has no rank to report.
    if not isinstance(out_json, dict):
        return None
    return out_json.get('expected_position')


def get_if_is_top5(out_json):
    """Return the 'dans_top_5' field of a parsed LLM response.

    Args:
        out_json: Parsed response (dict), a JSON string, or None.

    Returns:
        The value stored under 'dans_top_5', or None when the response is
        missing, unparseable, or has no such key.
    """
    if isinstance(out_json, str):
        try:
            out_json = json.loads(out_json)
        except json.JSONDecodeError:
            return None

    if not isinstance(out_json, dict):
        return None
    # NOTE(review): this key is French while the other accessors in this section
    # read English keys ('estimated_votes', 'expected_position') - confirm the
    # key the English prompt actually asks for. A missing key yields None.
    return out_json.get('dans_top_5')

def get_if_is_top10(out_json):
    """Return the 'dans_top_10' field of a parsed LLM response.

    Args:
        out_json: Parsed response (dict), a JSON string, or None.

    Returns:
        The value stored under 'dans_top_10', or None when the response is
        missing, unparseable, or has no such key.
    """
    if isinstance(out_json, str):
        try:
            out_json = json.loads(out_json)
        except json.JSONDecodeError:
            return None

    if not isinstance(out_json, dict):
        return None
    # NOTE(review): this key is French while the other accessors in this section
    # read English keys ('estimated_votes', 'expected_position') - confirm the
    # key the English prompt actually asks for. A missing key yields None.
    return out_json.get('dans_top_10')


In [21]:
# Inspect the parsed JSON of the first response. It is parsed on the fly because
# the 'out_json' column is only created by the following cell, so reading the
# column here raises KeyError on a fresh top-to-bottom run.
get_json_from_llm_response(test['out'].iloc[0])

{'estimated_votes': 450,
 'confidence_interval': [300, 600],
 'expected_position': 15}

In [None]:
# Parse every raw LLM answer, then derive the vote and rank predictions from the parsed JSON.
test['out_json'] = test['out'].apply(get_json_from_llm_response)

test['predicted_votes'] = test['out_json'].apply(
    lambda parsed: get_predicted_votes(df, parsed)
)
test['predicted_rank'] = test['out_json'].apply(get_predicted_rank)

In [25]:
# Keep only the columns needed to compare predictions with the real outcome.
# DataFrame.filter silently skips any listed column that is absent from `test`.
result_columns = [
    'project_id',
    'real_votes',
    'real_rank',
    'prompt',
    'out',
    'out_json',
    'predicted_votes',
    'predicted_rank',
]
results = test.filter(items=result_columns)
results

Unnamed: 0,project_id,real_votes,real_rank,prompt,out,out_json,predicted_votes,predicted_rank
0,263,590,1,You are an expert model in analyzing participa...,- **Relevance and Demand**: The project addres...,"{'estimated_votes': 450, 'confidence_interval'...",844.476995,15
1,320,559,2,You are an expert model in analyzing participa...,**Reasoning:**\n\n1. **Relevance and Urgency**...,"{'estimated_votes': 650, 'confidence_interval'...",1219.800103,15
2,394,436,3,You are an expert model in analyzing participa...,- **Relevance to Local Issues**: The project a...,"{'estimated_votes': 450, 'confidence_interval'...",844.476995,15
3,333,366,4,You are an expert model in analyzing participa...,- **Popularity and Utility**: The Rangueil Ska...,"{'estimated_votes': 650, 'confidence_interval'...",1219.800103,15
4,265,334,5,You are an expert model in analyzing participa...,- **Relevance and Popularity**: Cycling is a p...,"{'estimated_votes': 750, 'confidence_interval'...",1407.461658,15
...,...,...,...,...,...,...,...,...
178,372,20,179,You are an expert model in analyzing participa...,- **Local Relevance**: The project is located ...,"{'estimated_votes': 350, 'confidence_interval'...",656.815440,15
179,356,19,180,You are an expert model in analyzing participa...,- **District and Theme Relevance**: The projec...,"{'estimated_votes': 800, 'confidence_interval'...",1501.292435,15
180,230,19,181,You are an expert model in analyzing participa...,- **Local Needs and Project Relevance**: The p...,"{'estimated_votes': 350, 'confidence_interval'...",656.815440,30
181,370,18,182,You are an expert model in analyzing participa...,- **District Characteristics**: The project is...,"{'estimated_votes': 350, 'confidence_interval'...",656.815440,15


In [26]:
# Persist this run's predictions as a ';'-separated CSV (index column omitted).
results.to_csv('output/predictions/tls_simple_prompt_full_eng_1.csv', sep=";", index=False)
print('results ready!')

results ready!


#### WRC 17

In [1]:
# 1. load data
import os
import pandas as pd
import src.utils as ut
import prompt_builder as pb

from dotenv import load_dotenv
from src.data_loader import load_and_prepare_projects, load_test_dataset
from prompt_utils import tokens_counter, prompt_cost


# Project CSVs for the two Wroclaw elections used below (rows are later filtered on year 2016 / 2017).
path_16 = 'data/wrc16_projects.csv'
path_17 = 'data/wrc17_projects.csv'

# `df` is the combined frame (it carries a 'year' column); the second return value is not used in this section.
df, df16_shuffled = load_and_prepare_projects(path_16,path_17, city = 'Wroclaw')
# Unlike the Toulouse sections, the test set is capped at a fixed 50 rows.
test = load_test_dataset(df, rows = 50, city='Wroclaw')

# Load variables from a local .env file into the process environment.
load_dotenv()

# Connection settings read from the environment; os.environ[...] raises KeyError if a variable is missing.
# NOTE(review): presumably PostgreSQL credentials (PG_* names); not used by any cell visible in this section.
conn_params = {
    "host": os.environ["PG_HOST"],
    "database": os.environ["PG_DATABASE"],
    "user": os.environ["PG_USER"],
    "password": os.environ['PG_PASSWORD']
}

In [2]:
# 2. Build one prompt per test project by filling the Wroclaw template
#    with the project's own fields (placeholder name == column name).
prompt_fields = ('project_name', 'cost', 'district', 'description')

test['prompt'] = test.apply(
    lambda row: pb.build_prompt(
        'prompts/wro_prompt_simple_pl.txt',
        {field: row[field] for field in prompt_fields},
    ),
    axis=1,
)

In [3]:
# Sanity check: print the fully rendered prompt of the first test project.
print(test['prompt'].iloc[0])

You are an expert model in the analysis of participatory budgeting elections, specifically in the context of the city of Wroclaw, Poland.
In Wroclaw, the municipality organizes participatory budgeting elections to fund citizen-led projects. Each resident can vote for between 1 and 3 initiatives on the official website.

Once the voting period ends, the projects are ranked according to the number of votes received. Then, the winning projects are selected using a greedy algorithm: starting with the project that received the most votes, projects are added one by one in decreasing order of votes, as long as the available budget allows. If the next project in the list exceeds the remaining budget, it is skipped in favor of the next affordable one, and so on, until the total budget of 4500000 € is exhausted.
Your task is to analyze the following project, proposed as part of a new participatory budgeting election:

Project: Toalety we Wrocławskich parkach i na terenach rekreacyjnych.
Cost: 90

In [4]:
# Pre-flight estimate: token count and API cost of every prompt, priced as 'gpt-4-turbo'.
token_counts = test['prompt'].apply(lambda text: tokens_counter(text))
test['n_tokens'] = token_counts
test['cost_usd'] = test['n_tokens'].apply(lambda count: prompt_cost(count, 'gpt-4-turbo'))

summary_lines = (
    ('mean tokens by prompt: {:.2f}', test.n_tokens.mean()),
    ('avg.cost of each prediction: ${:.2f}', test.cost_usd.mean()),
    ('experiment total cost: {:.2f}', test.cost_usd.sum()),
)
for template, value in summary_lines:
    print(template.format(value))

mean tokens by prompt: 875.80
avg.cost of each prediction: $0.01
experiment total cost: 0.44


In [47]:
# 3. Run the experiment: send every prompt to the LLM and keep the raw answer in test['out'].
# os.getenv returns None (no error) when OPENAI_API_KEY is not set.
api_key=os.getenv('OPENAI_API_KEY')
from llm_client import call_openai_model 

# One call per row, executed sequentially by Series.apply.
# NOTE(review): the repeated "LLM response receive!" lines in the output presumably come from call_openai_model.
test['out'] = test['prompt'].apply(lambda prompt: 
                     call_openai_model(prompt=prompt, api_key=api_key))

LLM response receive!
LLM response receive!
LLM response receive!
LLM response receive!
LLM response receive!
LLM response receive!
LLM response receive!
LLM response receive!
LLM response receive!
LLM response receive!
LLM response receive!
LLM response receive!
LLM response receive!
LLM response receive!
LLM response receive!
LLM response receive!
LLM response receive!
LLM response receive!
LLM response receive!
LLM response receive!
LLM response receive!
LLM response receive!
LLM response receive!
LLM response receive!
LLM response receive!
LLM response receive!
LLM response receive!
LLM response receive!
LLM response receive!
LLM response receive!
LLM response receive!
LLM response receive!
LLM response receive!
LLM response receive!
LLM response receive!
LLM response receive!
LLM response receive!
LLM response receive!
LLM response receive!
LLM response receive!
LLM response receive!
LLM response receive!
LLM response receive!
LLM response receive!
LLM response receive!
LLM respon

In [56]:
import json
import re

def get_json_from_llm_response(out):
    """Extract the JSON object contained in an LLM response.

    Two strategies are tried in order: the whole response parsed as JSON
    (accepted only if it is an object), then the first ```json ... ``` fenced
    block found in the text.

    Args:
        out: Raw response text. A dict is returned unchanged; any other
            non-string value (None, NaN, ...) yields None.

    Returns:
        The decoded JSON value (a dict for well-formed responses), or None
        when nothing parseable is found.
    """
    # Already parsed: pass through so re-running the parsing cell is harmless.
    if isinstance(out, dict):
        return out
    # A failed call can leave None/NaN in the column; re.search would raise on it.
    if not isinstance(out, str):
        return None

    # 1. First attempt: the whole string is a valid JSON object
    try:
        data = json.loads(out)
        if isinstance(data, dict):
            return data
    except json.JSONDecodeError:
        pass  # not plain JSON, keep going

    # 2. Second attempt: look for a ```json {...} ``` block
    match = re.search(r"```json\s*({.*?})\s*```", out, re.DOTALL)
    if match is None:
        # 3. No JSON could be found
        return None
    try:
        return json.loads(match.group(1))
    except json.JSONDecodeError:
        return None

def get_predicted_votes(df, out_json, city = 'Wroclaw'):
    """Rescale the LLM vote estimate from the 2016 turnout to the 2017 turnout.

    The estimate is divided by the total 2016 votes in `df` and multiplied by
    the total 2017 votes.

    Args:
        df: Frame with 'year' and 'votes' columns.
        out_json: Parsed response (dict), a JSON string, or None.
        city: Only 'Wroclaw' is supported.

    Returns:
        The rescaled vote prediction, or None when the response is missing,
        unparseable, lacks 'estimated_votes', or there are no 2016 votes.

    Raises:
        ValueError: If `city` is not 'Wroclaw'.
    """
    if city != 'Wroclaw':
        raise ValueError(f"City '{city}' not supported")

    if isinstance(out_json, str):
        try:
            out_json = json.loads(out_json)
        except json.JSONDecodeError:
            return None

    # The parser returns None for unusable responses; skip them instead of crashing.
    if not isinstance(out_json, dict):
        return None
    estimated_votes = out_json.get('estimated_votes')
    if estimated_votes is None:
        return None

    votes_2016 = df.loc[df['year'] == 2016, 'votes'].sum()
    votes_2017 = df.loc[df['year'] == 2017, 'votes'].sum()
    # Without a baseline turnout the ratio is undefined.
    if votes_2016 == 0:
        return None

    return estimated_votes / votes_2016 * votes_2017

def get_predicted_rank(out_json):
    """Return the 'expected_rank' field of a parsed LLM response.

    Args:
        out_json: Parsed response (dict), a JSON string, or None.

    Returns:
        The value stored under 'expected_rank', or None when the response is
        missing, unparseable, or has no such key.
    """
    if isinstance(out_json, str):
        try:
            out_json = json.loads(out_json)
        except json.JSONDecodeError:
            return None

    # None (failed parse) or any non-object payload has no rank to report.
    if not isinstance(out_json, dict):
        return None
    return out_json.get('expected_rank')


In [None]:
# Parse every raw LLM answer into a dict (None when no JSON can be extracted).
test['out_json'] = test['out'].apply(get_json_from_llm_response)

In [57]:
# Derive the turnout-adjusted vote prediction and the predicted rank from the parsed JSON.
test['predicted_votes'] = test['out_json'].apply(
    lambda parsed: get_predicted_votes(df, parsed)
)
test['predicted_rank'] = test['out_json'].apply(get_predicted_rank)

In [60]:
# Keep only the columns needed to compare predictions with the real outcome.
# DataFrame.filter silently skips any listed column that is absent from `test`.
result_columns = [
    'project_id',
    'real_votes',
    'real_rank',
    'prompt',
    'out',
    'out_json',
    'predicted_votes',
    'predicted_rank',
]
results = test.filter(items=result_columns)
results

Unnamed: 0,project_id,real_votes,real_rank,prompt,out,out_json,predicted_votes,predicted_rank
0,10,10857,1,You are an expert model in the analysis of par...,1. **Relevance and Demand**: The project addre...,"{'estimated_votes': 3500, 'confidence_interval...",3287.610954,5
1,50,10796,2,You are an expert model in the analysis of par...,1. **Project Appeal and Relevance**: The proje...,"{'estimated_votes': 4500, 'confidence_interval...",4226.928369,15
2,18,8640,3,You are an expert model in the analysis of par...,"1. **Project Relevance**: The project ""Rowerow...","{'estimated_votes': 4500, 'confidence_interval...",4226.928369,5
3,675,5398,4,You are an expert model in the analysis of par...,"1. **Relevance of the Project**: The project ""...","{'estimated_votes': 4500, 'confidence_interval...",4226.928369,5
4,260,4998,5,You are an expert model in the analysis of par...,"1. **Project Theme and Appeal**: The project ""...","{'estimated_votes': 1500, 'confidence_interval...",1408.976123,25
5,550,4468,6,You are an expert model in the analysis of par...,1. **Project Relevance**: The project addresse...,"{'estimated_votes': 4500, 'confidence_interval...",4226.928369,5
6,12,4149,7,You are an expert model in the analysis of par...,1. **Project Appeal and Relevance**: The proje...,"{'estimated_votes': 4500, 'confidence_interval...",4226.928369,5
7,159,4098,8,You are an expert model in the analysis of par...,"1. **Project Appeal**: The project, aimed at c...","{'estimated_votes': 4500, 'confidence_interval...",4226.928369,10
8,656,4008,9,You are an expert model in the analysis of par...,1. **Relevance and Demand**: The project addre...,"{'estimated_votes': 4500, 'confidence_interval...",4226.928369,5
9,499,3949,10,You are an expert model in the analysis of par...,1. The project is located in a popular and acc...,"{'estimated_votes': 4500, 'confidence_interval...",4226.928369,5


In [61]:
# Persist this run's predictions as a ';'-separated CSV (index column omitted).
results.to_csv('output/predictions/wrc_simple_prompt_full_1.csv', sep=";", index=False)
print('results ready!')

results ready!


#### WRC 17 - English Database

In [5]:
# 1. load data
import os
import pandas as pd
import src.utils as ut
import prompt_builder as pb

from dotenv import load_dotenv
from src.data_loader import load_and_prepare_projects, load_test_dataset
from prompt_utils import tokens_counter, prompt_cost


# This section reads a single ';'-separated CSV directly instead of going through load_and_prepare_projects.
df = pd.read_csv("data/wrc_projects_eng.csv", sep=";")
# NOTE(review): despite the "16" in its name this shuffles every row of `df`, not only year 2016 - confirm intent.
df16_shuffled = df.sample(frac=1, random_state=42).reset_index(drop=True)
# Test set: all 2017 rows in a fixed shuffled order (seed 42).
# NOTE(review): the results table printed further down has no real_votes / real_rank columns,
# so this frame apparently lacks them - verify against the CSV schema.
test = df[df['year'] == 2017].sample(frac=1, random_state=42).reset_index(drop=True)


# Load variables from a local .env file into the process environment.
load_dotenv()

# Connection settings read from the environment; os.environ[...] raises KeyError if a variable is missing.
# NOTE(review): presumably PostgreSQL credentials (PG_* names); not used by any cell visible in this section.
conn_params = {
    "host": os.environ["PG_HOST"],
    "database": os.environ["PG_DATABASE"],
    "user": os.environ["PG_USER"],
    "password": os.environ['PG_PASSWORD']
}


In [7]:
# 2. Build one prompt per test project (placeholder name == column name).
# NOTE(review): this run reuses the same '_pl' template path as the previous
# Wroclaw section - confirm that is the intended template for the English data.
prompt_fields = ('project_name', 'cost', 'district', 'description')

test['prompt'] = test.apply(
    lambda row: pb.build_prompt(
        'prompts/wro_prompt_simple_pl.txt',
        {field: row[field] for field in prompt_fields},
    ),
    axis=1,
)

In [8]:
# Sanity check: print the fully rendered prompt of the first test project.
print(test.prompt.iloc[0])

You are an expert model in the analysis of participatory budgeting elections, specifically in the context of the city of Wroclaw, Poland.
In Wroclaw, the municipality organizes participatory budgeting elections to fund citizen-led projects. Each resident can vote for between 1 and 3 initiatives on the official website.

Once the voting period ends, the projects are ranked according to the number of votes received. Then, the winning projects are selected using a greedy algorithm: starting with the project that received the most votes, projects are added one by one in decreasing order of votes, as long as the available budget allows. If the next project in the list exceeds the remaining budget, it is skipped in favor of the next affordable one, and so on, until the total budget of 4500000 € is exhausted.
Your task is to analyze the following project, proposed as part of a new participatory budgeting election:

Project: Rędziński Park
Cost: 1000000 €
District: Krzyki
Description: Justific

In [9]:
# Pre-flight estimate: token count and API cost of every prompt, priced as 'gpt-4-turbo'.
token_counts = test['prompt'].apply(lambda text: tokens_counter(text))
test['n_tokens'] = token_counts
test['cost_usd'] = test['n_tokens'].apply(lambda count: prompt_cost(count, 'gpt-4-turbo'))

summary_lines = (
    ('mean tokens by prompt: {:.2f}', test.n_tokens.mean()),
    ('avg.cost of each prediction: ${:.2f}', test.cost_usd.mean()),
    ('experiment total cost: {:.2f}', test.cost_usd.sum()),
)
for template, value in summary_lines:
    print(template.format(value))

mean tokens by prompt: 693.10
avg.cost of each prediction: $0.01
experiment total cost: 0.35


In [12]:
# 3. Run the experiment: send every prompt to the LLM and keep the raw answer in test['out'].
# os.getenv returns None (no error) when OPENAI_API_KEY is not set.
api_key=os.getenv('OPENAI_API_KEY')
from llm_client import call_openai_model 

# One call per row, executed sequentially by Series.apply.
# NOTE(review): the repeated "LLM response receive!" lines in the output presumably come from call_openai_model.
test['out'] = test['prompt'].apply(lambda prompt: 
                     call_openai_model(prompt=prompt, api_key=api_key))

LLM response receive!
LLM response receive!
LLM response receive!
LLM response receive!
LLM response receive!
LLM response receive!
LLM response receive!
LLM response receive!
LLM response receive!
LLM response receive!
LLM response receive!
LLM response receive!
LLM response receive!
LLM response receive!
LLM response receive!
LLM response receive!
LLM response receive!
LLM response receive!
LLM response receive!
LLM response receive!
LLM response receive!
LLM response receive!
LLM response receive!
LLM response receive!
LLM response receive!
LLM response receive!
LLM response receive!
LLM response receive!
LLM response receive!
LLM response receive!
LLM response receive!
LLM response receive!
LLM response receive!
LLM response receive!
LLM response receive!
LLM response receive!
LLM response receive!
LLM response receive!
LLM response receive!
LLM response receive!
LLM response receive!
LLM response receive!
LLM response receive!
LLM response receive!
LLM response receive!
LLM respon

In [13]:
import json
import re

def get_json_from_llm_response(out):
    """Extract the JSON object contained in an LLM response.

    Two strategies are tried in order: the whole response parsed as JSON
    (accepted only if it is an object), then the first ```json ... ``` fenced
    block found in the text.

    Args:
        out: Raw response text. A dict is returned unchanged; any other
            non-string value (None, NaN, ...) yields None.

    Returns:
        The decoded JSON value (a dict for well-formed responses), or None
        when nothing parseable is found.
    """
    # Already parsed: pass through so re-running the parsing cell is harmless.
    if isinstance(out, dict):
        return out
    # A failed call can leave None/NaN in the column; re.search would raise on it.
    if not isinstance(out, str):
        return None

    # 1. First attempt: the whole string is a valid JSON object
    try:
        data = json.loads(out)
        if isinstance(data, dict):
            return data
    except json.JSONDecodeError:
        pass  # not plain JSON, keep going

    # 2. Second attempt: look for a ```json {...} ``` block
    match = re.search(r"```json\s*({.*?})\s*```", out, re.DOTALL)
    if match is None:
        # 3. No JSON could be found
        return None
    try:
        return json.loads(match.group(1))
    except json.JSONDecodeError:
        return None

def get_predicted_votes(df, out_json, city = 'Wroclaw'):
    """Rescale the LLM vote estimate from the 2016 turnout to the 2017 turnout.

    The estimate is divided by the total 2016 votes in `df` and multiplied by
    the total 2017 votes.

    Args:
        df: Frame with 'year' and 'votes' columns.
        out_json: Parsed response (dict), a JSON string, or None.
        city: Only 'Wroclaw' is supported.

    Returns:
        The rescaled vote prediction, or None when the response is missing,
        unparseable, lacks 'estimated_votes', or there are no 2016 votes.

    Raises:
        ValueError: If `city` is not 'Wroclaw'.
    """
    if city != 'Wroclaw':
        raise ValueError(f"City '{city}' not supported")

    if isinstance(out_json, str):
        try:
            out_json = json.loads(out_json)
        except json.JSONDecodeError:
            return None

    # The parser returns None for unusable responses; skip them instead of crashing.
    if not isinstance(out_json, dict):
        return None
    estimated_votes = out_json.get('estimated_votes')
    if estimated_votes is None:
        return None

    votes_2016 = df.loc[df['year'] == 2016, 'votes'].sum()
    votes_2017 = df.loc[df['year'] == 2017, 'votes'].sum()
    # Without a baseline turnout the ratio is undefined.
    if votes_2016 == 0:
        return None

    return estimated_votes / votes_2016 * votes_2017

def get_predicted_rank(out_json):
    """Return the 'expected_rank' field of a parsed LLM response.

    Args:
        out_json: Parsed response (dict), a JSON string, or None.

    Returns:
        The value stored under 'expected_rank', or None when the response is
        missing, unparseable, or has no such key.
    """
    if isinstance(out_json, str):
        try:
            out_json = json.loads(out_json)
        except json.JSONDecodeError:
            return None

    # None (failed parse) or any non-object payload has no rank to report.
    if not isinstance(out_json, dict):
        return None
    return out_json.get('expected_rank')

In [14]:
# Parse every raw LLM answer into a dict (None when no JSON can be extracted).
test['out_json'] = test['out'].apply(get_json_from_llm_response)

In [16]:
# Derive the turnout-adjusted vote prediction and the predicted rank from the parsed JSON.
test['predicted_votes'] = test['out_json'].apply(
    lambda parsed: get_predicted_votes(df, parsed)
)
test['predicted_rank'] = test['out_json'].apply(get_predicted_rank)

In [17]:
# Keep only the columns needed to compare predictions with the real outcome.
# DataFrame.filter silently skips any listed column that is absent from `test`;
# the table printed below has no real_votes / real_rank for that reason.
result_columns = [
    'project_id',
    'real_votes',
    'real_rank',
    'prompt',
    'out',
    'out_json',
    'predicted_votes',
    'predicted_rank',
]
results = test.filter(items=result_columns)
results

Unnamed: 0,project_id,prompt,out,out_json,predicted_votes,predicted_rank
0,345,You are an expert model in the analysis of par...,1. **District and Demographics**: Krzyki is on...,"{'estimated_votes': 3500, 'confidence_interval...",3287.610954,5
1,334,You are an expert model in the analysis of par...,1. **City-wide Impact**: The project impacts a...,"{'estimated_votes': 3200, 'confidence_interval...",3005.815729,5
2,668,You are an expert model in the analysis of par...,"1. **Project Appeal and Relevance**: The ""Park...","{'estimated_votes': 3200, 'confidence_interval...",3005.815729,5
3,422,You are an expert model in the analysis of par...,1. **District and Demographics**: Śródmieście ...,"{'estimated_votes': 3200, 'confidence_interval...",3005.815729,15
4,590,You are an expert model in the analysis of par...,1. **Project Appeal and Relevance**: The proje...,"{'estimated_votes': 3500, 'confidence_interval...",3287.610954,10
5,720,You are an expert model in the analysis of par...,1. **Historical Context & Emotional Appeal**: ...,"{'estimated_votes': 3200, 'confidence_interval...",3005.815729,15
6,83,You are an expert model in the analysis of par...,1. **Project Relevance and Appeal**: The proje...,"{'estimated_votes': 3200, 'confidence_interval...",3005.815729,5
7,555,You are an expert model in the analysis of par...,1. **Project Appeal and District**: The projec...,"{'estimated_votes': 3500, 'confidence_interval...",3287.610954,5
8,694,You are an expert model in the analysis of par...,"1. **Project Appeal and Relevance**: The ""Gree...","{'estimated_votes': 4500, 'confidence_interval...",4226.928369,5
9,46,You are an expert model in the analysis of par...,1. **Historical Voting Data**: The project is ...,"{'estimated_votes': 1300, 'confidence_interval...",1221.11264,5


In [18]:
# Persist this run's predictions as a ';'-separated CSV (index column omitted).
# The same path is read back and overwritten again by later cells in this notebook.
results.to_csv('output/predictions/wrc_simple_prompt_full_eng_1.csv', sep=";", index=False)
print('results ready!')

results ready!


In [28]:
import pandas as pd

# Reload the predictions saved above so the ground-truth columns can be attached to them.
t1 = pd.read_csv("output/predictions/wrc_simple_prompt_full_eng_1.csv", sep=";")

In [30]:
# Display the reloaded predictions (no real_votes / real_rank columns yet).
t1

Unnamed: 0,project_id,prompt,out,out_json,predicted_votes,predicted_rank
0,345,You are an expert model in the analysis of par...,1. **District and Demographics**: Krzyki is on...,"{'estimated_votes': 3500, 'confidence_interval...",3287.610954,5
1,334,You are an expert model in the analysis of par...,1. **City-wide Impact**: The project impacts a...,"{'estimated_votes': 3200, 'confidence_interval...",3005.815729,5
2,668,You are an expert model in the analysis of par...,"1. **Project Appeal and Relevance**: The ""Park...","{'estimated_votes': 3200, 'confidence_interval...",3005.815729,5
3,422,You are an expert model in the analysis of par...,1. **District and Demographics**: Śródmieście ...,"{'estimated_votes': 3200, 'confidence_interval...",3005.815729,15
4,590,You are an expert model in the analysis of par...,1. **Project Appeal and Relevance**: The proje...,"{'estimated_votes': 3500, 'confidence_interval...",3287.610954,10
5,720,You are an expert model in the analysis of par...,1. **Historical Context & Emotional Appeal**: ...,"{'estimated_votes': 3200, 'confidence_interval...",3005.815729,15
6,83,You are an expert model in the analysis of par...,1. **Project Relevance and Appeal**: The proje...,"{'estimated_votes': 3200, 'confidence_interval...",3005.815729,5
7,555,You are an expert model in the analysis of par...,1. **Project Appeal and District**: The projec...,"{'estimated_votes': 3500, 'confidence_interval...",3287.610954,5
8,694,You are an expert model in the analysis of par...,"1. **Project Appeal and Relevance**: The ""Gree...","{'estimated_votes': 4500, 'confidence_interval...",4226.928369,5
9,46,You are an expert model in the analysis of par...,1. **Historical Voting Data**: The project is ...,"{'estimated_votes': 1300, 'confidence_interval...",1221.11264,5


In [32]:
# Attach the 2017 project columns to each prediction; the left join keeps every row of `t1`.
# NOTE(review): relies on `df` still being in the kernel - the cells since the pandas re-import only load `t1`.
results_corr = pd.merge(left=t1, right=df[df['year']==2017], on=['project_id'], how='left')

In [33]:
# Keep only the scoring columns from the merged frame.
# DataFrame.filter silently skips any listed column that is absent; the table
# printed below still has no real_rank column.
result_columns = [
    'project_id',
    'real_votes',
    'real_rank',
    'prompt',
    'out',
    'out_json',
    'predicted_votes',
    'predicted_rank',
]
results = results_corr.filter(items=result_columns)
results

Unnamed: 0,project_id,real_votes,prompt,out,out_json,predicted_votes,predicted_rank
0,345,2484,You are an expert model in the analysis of par...,1. **District and Demographics**: Krzyki is on...,"{'estimated_votes': 3500, 'confidence_interval...",3287.610954,5
1,334,480,You are an expert model in the analysis of par...,1. **City-wide Impact**: The project impacts a...,"{'estimated_votes': 3200, 'confidence_interval...",3005.815729,5
2,668,1060,You are an expert model in the analysis of par...,"1. **Project Appeal and Relevance**: The ""Park...","{'estimated_votes': 3200, 'confidence_interval...",3005.815729,5
3,422,236,You are an expert model in the analysis of par...,1. **District and Demographics**: Śródmieście ...,"{'estimated_votes': 3200, 'confidence_interval...",3005.815729,15
4,590,2001,You are an expert model in the analysis of par...,1. **Project Appeal and Relevance**: The proje...,"{'estimated_votes': 3500, 'confidence_interval...",3287.610954,10
5,720,118,You are an expert model in the analysis of par...,1. **Historical Context & Emotional Appeal**: ...,"{'estimated_votes': 3200, 'confidence_interval...",3005.815729,15
6,83,1455,You are an expert model in the analysis of par...,1. **Project Relevance and Appeal**: The proje...,"{'estimated_votes': 3200, 'confidence_interval...",3005.815729,5
7,555,1512,You are an expert model in the analysis of par...,1. **Project Appeal and District**: The projec...,"{'estimated_votes': 3500, 'confidence_interval...",3287.610954,5
8,694,943,You are an expert model in the analysis of par...,"1. **Project Appeal and Relevance**: The ""Gree...","{'estimated_votes': 4500, 'confidence_interval...",4226.928369,5
9,46,1839,You are an expert model in the analysis of par...,1. **Historical Voting Data**: The project is ...,"{'estimated_votes': 1300, 'confidence_interval...",1221.11264,5


In [34]:
# Overwrite the previously saved file (the one read into `t1`) with the version that includes real_votes.
results.to_csv('output/predictions/wrc_simple_prompt_full_eng_1.csv', sep=";", index=False)
print('results ready!')

results ready!
