## Setup

In [None]:
import tio4550_housing_energy.utils as u

# NOTE(review): presumably a smoke test that the project package is importable
# from this notebook — confirm what a_test_function() actually does.
u.a_test_function()

# Test of OpenAI's Responses API

In [None]:
import os
import pandas as pd
import json
from openai import OpenAI
from dotenv import load_dotenv
from tqdm import tqdm
from concurrent.futures import ThreadPoolExecutor, as_completed

# --- 1. SETUP & AUTHENTICATION ---
# Load environment variables (including the API key) from a .env file, if present.
load_dotenv()

# Check for the key *before* building the client, and fail with a regular
# exception rather than exit(): inside a Jupyter kernel, exit() shuts the kernel
# down (losing all state) instead of just stopping this cell.
if not os.getenv("OPENAI_API_KEY"):
    raise RuntimeError(
        "OPENAI_API_KEY environment variable not found. "
        "Please ensure your OPENAI_API_KEY is set correctly as an environment variable."
    )

# Instantiate the OpenAI client.
# It will automatically use the OPENAI_API_KEY from your environment.
try:
    client = OpenAI()
except Exception as e:
    # Keep the original cause attached so the real failure stays visible.
    raise RuntimeError(f"Error initializing OpenAI client: {e}") from e

# --- 2. DEFINE THE SYSTEM PROMPT ---
# Sent as the system message with every ad (see analyze_ad_description).
# The prompt is written in English although the ad texts are Norwegian. It fixes
# the extraction rules and the exact JSON keys; those keys must stay in sync with
# the `expected_columns` list used when the results are parsed.
system_prompt = """
You are an expert in analyzing Norwegian housing advertisements. Your task is to extract specific information about renovations from the provided text and return it as a JSON object.

**Rules:**
1. If a feature is not mentioned, its `_bool` value is `0` and its `_år` value is `0`.
2. If a feature is mentioned but no year is given, its `_bool` value is `1` and its `_år` value is `0`.
3. If a feature and a year are mentioned, its `_bool` value is `1` and its `_år` value is the year.
4. If multiple years are mentioned for one feature, use the most recent year.
5. Treat 'oppussing' and 'oppgradering' as the same concept.
6. Your final output MUST be a single, valid JSON object with ONLY the keys specified below.

**JSON Schema:**
- `vp_bool`, `vp_år` (varmepumpe)
- `el_bool`, `el_år` (etterisolering_loft)
- `e_bool`, `e_år` (etterisolering_generelt, not loft)
- `v_bool`, `v_år` (vinduer)
- `k_bool`, `k_år` (kledning)
- `t_bool`, `t_år` (tak)
- `s_bool`, `s_år` (solceller)
- `ot_bool`, `ot_år` (oppussing_totalrenovering)
- `ob_bool`, `ob_år` (oppgradering_bad)
- `ok_bool`, `ok_år` (oppgradering_kjøkken)
- `og_bool`, `og_år` (oppgradering_gulv)
"""

# --- 3. LOAD DATASET ---
# Semicolon-delimited CSV of sampled ads. Later steps read its "description"
# column and drop "description" and "title" before saving.
input_data_path = "../data/2_interim/relevant_ads_sampled_for_chatgpt.csv"
try:
    df = pd.read_csv(input_data_path, delimiter=";")
except FileNotFoundError as e:
    # Raise instead of exit(): in a Jupyter kernel exit() shuts the kernel down,
    # whereas an exception stops the cell and keeps the session usable.
    raise FileNotFoundError(
        f"Error: The file was not found at {input_data_path}"
    ) from e
print(f"Successfully loaded {len(df)} ads from {input_data_path}")


# --- 4. DEFINE API CALL FUNCTION ---
def analyze_ad_description(description, model="gpt-5-nano", reasoning_effort="low"):
    """Extract renovation features from one ad text via the OpenAI Responses API.

    Args:
        description: The ad text. Anything that is not a non-blank string
            (for example NaN coming from pandas) is skipped without an API call.
        model: Model name passed to the API request.
        reasoning_effort: Value sent as the request's ``reasoning.effort``.

    Returns:
        dict: The JSON object parsed from the model's reply, or an empty dict
        when the input is skipped, the API call fails, the reply is not valid
        JSON, or the reply is valid JSON but not an object. Exceptions are
        caught and reported with ``print`` rather than propagated.
    """
    if not isinstance(description, str) or not description.strip():
        return {}  # Nothing to analyze

    try:
        response = client.responses.create(
            model=model,
            reasoning={"effort": reasoning_effort},
            # TODO: find a parameter that controls sampling temperature;
            # passing `temperature` doesn't work with this model right now.
            input=[
                {"role": "system", "content": system_prompt},
                {"role": "user", "content": description},
            ],
            text={"format": {"type": "json_object"}},
        )
        parsed = json.loads(response.output_text)
    except json.JSONDecodeError:
        print("Warning: Failed to parse JSON from API response for one ad.")
        return {}  # Malformed JSON
    except Exception as e:
        print(f"An API error occurred: {e}")
        return {}  # API failure

    # Callers use dict.get() on the result, so a JSON list/number/string would
    # crash them later; treat it the same as an unusable reply.
    if not isinstance(parsed, dict):
        print("Warning: API response was valid JSON but not an object for one ad.")
        return {}
    return parsed


# --- 5. PROCESS THE DATAFRAME IN PARALLEL ---
descriptions = df["description"].tolist()
# One independent placeholder dict per ad, so results can be stored by position.
api_results = [{} for _ in range(len(descriptions))]

with ThreadPoolExecutor(max_workers=5) as executor:
    # Futures finish in arbitrary order, so remember which ad each one belongs to.
    future_to_idx = {}
    for idx, desc in enumerate(descriptions):
        future_to_idx[executor.submit(analyze_ad_description, desc)] = idx

    progress = tqdm(
        as_completed(future_to_idx),
        total=len(descriptions),
        desc="Analyzing Ads",
    )
    for future in progress:
        idx = future_to_idx[future]
        try:
            outcome = future.result()
        except Exception as e:
            print(f"Error processing result for index {idx}: {e}")
            outcome = {}  # Fall back to an empty dict on any future-related error
        api_results[idx] = outcome

# --- 6. ROBUSTLY PARSE RESULTS AND SAVE ---
# Each feature code from the prompt's JSON schema produces two columns:
# "<code>_bool" (1 if the feature is mentioned) and "<code>_år" (year, 0 if none).
feature_codes = ["vp", "el", "e", "v", "k", "t", "s", "ot", "ob", "ok", "og"]
expected_columns = [
    f"{code}_{suffix}" for code in feature_codes for suffix in ("bool", "år")
]

# Normalize every API result to the full set of expected keys; any key the
# model left out (or every key, for an empty result) defaults to 0.
clean_results = [
    {col: result.get(col, 0) for col in expected_columns}
    for result in api_results
]

# Build the results table on df's own index and with a fixed column list, so the
# column-wise concat below lines rows up correctly even if df does not have a
# default 0..n-1 index, and so an empty input still yields all expected columns.
results_df = pd.DataFrame(clean_results, index=df.index, columns=expected_columns)

# Combine the original DataFrame with the new results DataFrame
final_df = pd.concat([df, results_df], axis=1)

# Drop the original text columns as they are no longer needed for the final dataset
final_df = final_df.drop(columns=["description", "title"])

# Define the output path and make sure its directory exists
output_path = "../data/3_processed/analyzed_ads_data_clean.csv"
os.makedirs(os.path.dirname(output_path), exist_ok=True)

# Save the final DataFrame
final_df.to_csv(output_path, index=False)

print(f"\n✅ Processing complete! Clean data saved to {output_path}")
print("\nHere's a preview of the clean final data (without description and title):")
print(final_df.head())

Successfully loaded 150 ads from ../data/2_interim/relevant_ads_sampled_for_chatgpt.csv


Analyzing Ads:   5%|▌         | 8/150 [00:13<03:55,  1.66s/it]


In [6]:
# Show the first rows of the combined table as this cell's output.
final_df.head()

Unnamed: 0,ad_code,vp_bool,vp_år,el_bool,el_år,e_bool,e_år,v_bool,v_år,k_bool,...,s_bool,s_år,ot_bool,ot_år,ob_bool,ob_år,ok_bool,ok_år,og_bool,og_år
0,33485571,0,0,0,0,1,0,1,0,1,...,0,0,1,0,1,2011,1,2010,1,0
1,251771188,1,0,1,0,1,0,1,0,1,...,0,0,1,2018,1,0,1,0,1,0
2,226107740,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,246084291,0,0,0,0,0,0,0,0,1,...,0,0,0,0,0,0,1,2018,0,0
4,10897928,0,0,0,0,0,0,0,0,1,...,0,0,0,0,1,0,0,0,0,0
