In [3]:
import os

os.environ["CUDA_DEVICE_ORDER"]="PCI_BUS_ID"
os.environ["CUDA_VISIBLE_DEVICES"]="0"

import re
import time
import pickle
import numpy as np
import pandas as pd
import string
from tqdm import tqdm

import openai
from utils import query_gpt

%load_ext autoreload
%autoreload 2

In [4]:
gpt_model = 'gpt-3.5-turbo'
api_key = API_KEY
save_dir = 'results'

In [52]:
target_ccode = 'KOR' # KOR / MWI / KHM
target_var = 'GRDP' # GRDP / population / education_high

max_try_num = 20
max_categorization_try_num = 10
max_interaction_try_num = 10

ensemble_num = 10
num_voting = 5

In [53]:
mapping = {
    'unemployment_rate': 'UNEMP_T'
}

country_name_mapping = {
    'KOR': 'South Korea',
    'VNM': 'Viet Nam',
    'KHM': 'Cambodia'
}

file_path_mapping = {
    'GRDP': 'GRDP',
    'population': 'Population',
    'education_high': 'Education'
}

target_var_desc_mapping = {
    'GRDP': 'a regional GDP',
    'population': 'the population',
    'education_high': 'the proportion of highly-educated individuals (at least university graduates)'
}

target_var_detailed_desc_mapping = {
    'GRDP': 'the total economic output of a specific region within a country, reflecting the value of all goods and services over a specific period.',
    'population': 'a group of individuals or entities of the same species living in a specific area or considered for a study.',
    'education_high': 'the percentage of a population that has attained at least a university-level degree.'
}

module_mapping = {
    'get_address(Loc)': ['address'],
    'get_area(Loc)': ['area'],
    'get_distance_to_nearest_target(Loc, "airport")': ['distance_airport'],
    'get_distance_to_nearest_target(Loc, "port")': ['distance_port'],
    'get_night_light(Loc)': ['Nightlight_Sum', 'Nightlight_Average'],
    'count_area(Loc, "agricultural")': ['area_agricultural'],
    'count_area(Loc, "bareland")': ['area_bareland'],
    'count_area(Loc, "building")': ['area_building'],
    'count_area(Loc, "development")': ['area_development'],
    'count_area(Loc, "rangeland")': ['area_rangeland'],
    'count_area(Loc, "road")': ['area_road'],
    'count_area(Loc, "tree")': ['area_tree'],
    'count_area(Loc, "water")': ['area_water'],
    'get_aggregate_neighbor_info(Loc, lambda x: get_night_light(loc=x))': ['neighbor_Nightlight_Sum', 
                                                                           'neighbor_Nightlight_Average'],
    'get_aggregate_neighbor_info(Loc, lambda x: get_area(loc=x))': ['neighbor_area'],
    'get_aggregate_neighbor_info(Loc, lambda x: get_distance_to_nearest_target(loc=x, "airport"))': ['neighbor_distance_airport'],
    'get_aggregate_neighbor_info(Loc, lambda x: get_distance_to_nearest_target(loc=x, "port"))': ['neighbor_distance_port'],
    'get_aggregate_neighbor_info(Loc, lambda x: count_area(loc=x, "agricultural"))': ['neighbor_area_agricultural'], 
    'get_aggregate_neighbor_info(Loc, lambda x: count_area(loc=x, "bareland"))': ['neighbor_area_bareland'], 
    'get_aggregate_neighbor_info(Loc, lambda x: count_area(loc=x, "building"))': ['neighbor_area_building'], 
    'get_aggregate_neighbor_info(Loc, lambda x: count_area(loc=x, "development"))': ['neighbor_area_development'], 
    'get_aggregate_neighbor_info(Loc, lambda x: count_area(loc=x, "rangeland"))': ['neighbor_area_rangeland'], 
    'get_aggregate_neighbor_info(Loc, lambda x: count_area(loc=x, "road"))': ['neighbor_area_road'], 
    'get_aggregate_neighbor_info(Loc, lambda x: count_area(loc=x, "tree"))': ['neighbor_area_tree'], 
    'get_aggregate_neighbor_info(Loc, lambda x: count_area(loc=x, "water"))': ['neighbor_area_water'],
}

module_desc_mapping = {
    'address': 'address',
    'area': 'area size',
    'distance_airport': 'distance to airport',
    'distance_port': 'distance to port',
    'Nightlight_Sum': 'sum of nightlight intensity',
    'Nightlight_Average': 'average of nightlight intensity',
    'area_agricultural': 'ratio of agricultural area',
    'area_bareland': 'ratio of bareland area',
    'area_building': 'ratio of building area',
    'area_development': 'ratio of development area (e.g., sidewalks, pavements, footpaths, and artificial grass areas from tennis courts and baseball fields)',
    'area_rangeland': 'ratio of rangeland area',
    'area_road': 'ratio of road area',
    'area_tree': 'ratio of tree area',
    'area_water': 'ratio of water area',
    'neighbor_Nightlight_Sum': 'sum of nightlight intensity of surrounding regions',
    'neighbor_Nightlight_Average': 'average of nightlight intensity of surrounding regions',
    'neighbor_area': 'average of area size of surrounding regions',
    'neighbor_distance_airport': 'average of disatance of airport of surrounding regions',
    'neighbor_distance_port': 'average of disatance of port of surrounding regions',
    'neighbor_area_agricultural': 'ratio of agricultural area of surrounding regions',
    'neighbor_area_bareland': 'ratio of bareland area of surrounding regions',
    'neighbor_area_building': 'ratio of building area of surrounding regions',
    'neighbor_area_development': 'ratio of development area of surrounding regions (e.g., sidewalks, pavements, footpaths, and artificial grass areas from tennis courts and baseball fields)',
    'neighbor_area_rangeland': 'ratio of rangeland area of surrounding regions',
    'neighbor_area_road': 'ratio of road area of surrounding regions',
    'neighbor_area_tree': 'ratio of tree area of surrounding regions',
    'neighbor_area_water': 'ratio of water area of surrounding regions',
}

module_detailed_desc_mapping = {
    'area': 'the area size of a given region',
    'distance_airport': 'the distance to airport of a given region',
    'distance_port': 'the distance to port of given location of the region',
    'nightlight': 'the sum/average of nightlight intensity of a given region',
    'area_agricultural': 'the ratio of agricultural area of a given region',
    'area_bareland': 'the ratio of bareland area of a given region',
    'area_building': 'the ratio of building area of a given region',
    'area_development': 'the ratio of development area of a given region (i.e., development involves areas such as sidewalks, pavements, footpaths, and artificial grass areas from tennis courts and baseball fields)',
    'area_rangeland': 'the ratio of rangeland area of a given region',
    'area_road': 'the ratio of road area of a given region',
    'area_tree': 'the ratio of tree area of a given region',
    'area_water': 'the ratio of water area of a given region',
    'neighbor_nightlight': 'the sum/average of nightlight intensity of surrounding regions',
    'neighbor_area': 'the average of area size of surrounding regions',
    'neighbor_distance_airport': 'the average of disatance of airport of surrounding regions',
    'neighbor_distance_port': 'the average of disatance of port of surrounding regions',
    'neighbor_area_agricultural': 'the ratio of agricultural area of surrounding regions',
    'neighbor_area_bareland': 'the ratio of bareland area of surrounding regions',
    'neighbor_area_building': 'the ratio of building area of surrounding regions',
    'neighbor_area_development': 'the ratio of development area of surrounding regions (i.e., development involves areas such as sidewalks, pavements, footpaths, and artificial grass areas from tennis courts and baseball fields)',
    'neighbor_area_rangeland': 'the ratio of rangeland area of surrounding regions',
    'neighbor_area_road': 'the ratio of road area of surrounding regions',
    'neighbor_area_tree': 'the ratio of tree area of surrounding regions',
    'neighbor_area_water': 'the ratio of water area of surrounding regions',
}

In [54]:
module_set = []
for key, value in zip(module_detailed_desc_mapping.keys(), module_detailed_desc_mapping.values()):
    module_set.append(f'- {key}: {value}')
all_modules = set(list(module_mapping.keys())[1:])

In [55]:
if target_ccode == 'VNM':
    adm_offset = '_adm1'
else:
    adm_offset = ''

if (target_ccode != 'KOR' and target_ccode != 'VNM') and target_var == 'GRDP':
    assert(0)

if target_var in list(mapping.keys()):
    col_var = mapping[target_var]
else:
    col_var = target_var

file_path_var = file_path_mapping[target_var]
country_name = country_name_mapping[target_ccode]
target_var_desc = target_var_desc_mapping[target_var]
target_var_detailed_desc = target_var_detailed_desc_mapping[target_var]

In [56]:
file_path_var = file_path_mapping[target_var]
gt = pd.read_csv(f'./data/labels/{target_ccode}_{file_path_var}{adm_offset}.csv', encoding='utf-8')[['area_id', col_var]]
all_results_path = f'./data/all_features/{target_ccode}.csv'
all_df = pd.read_csv(all_results_path)

In [57]:
print(file_path_var)
print(country_name)
print(target_var_desc)
print(target_var_detailed_desc)

GRDP
South Korea
a regional GDP
the total economic output of a specific region within a country, reflecting the value of all goods and services over a specific period.


In [58]:
INDICATOR = file_path_var
COUNTRY = country_name
TASK_DETAILED_DESC = target_var_detailed_desc

In [59]:
TASK_DESC = f'Estimate "{INDICATOR}" in {COUNTRY}. "{INDICATOR}" refers to {TASK_DETAILED_DESC}'

In [60]:
print(TASK_DESC)
print(TASK_DETAILED_DESC)

Estimate "GRDP" in South Korea. "GRDP" refers to the total economic output of a specific region within a country, reflecting the value of all goods and services over a specific period.
the total economic output of a specific region within a country, reflecting the value of all goods and services over a specific period.


In [61]:
all_modules = list(module_mapping.keys())[1:]
for i in all_modules:
    print(i)

get_area(Loc)
get_distance_to_nearest_target(Loc, "airport")
get_distance_to_nearest_target(Loc, "port")
get_night_light(Loc)
count_area(Loc, "agricultural")
count_area(Loc, "bareland")
count_area(Loc, "building")
count_area(Loc, "development")
count_area(Loc, "rangeland")
count_area(Loc, "road")
count_area(Loc, "tree")
count_area(Loc, "water")
get_aggregate_neighbor_info(Loc, lambda x: get_night_light(loc=x))
get_aggregate_neighbor_info(Loc, lambda x: get_area(loc=x))
get_aggregate_neighbor_info(Loc, lambda x: get_distance_to_nearest_target(loc=x, "airport"))
get_aggregate_neighbor_info(Loc, lambda x: get_distance_to_nearest_target(loc=x, "port"))
get_aggregate_neighbor_info(Loc, lambda x: count_area(loc=x, "agricultural"))
get_aggregate_neighbor_info(Loc, lambda x: count_area(loc=x, "bareland"))
get_aggregate_neighbor_info(Loc, lambda x: count_area(loc=x, "building"))
get_aggregate_neighbor_info(Loc, lambda x: count_area(loc=x, "development"))
get_aggregate_neighbor_info(Loc, lambda 

# Prompts

In [67]:
def generation_prompt_for_module_categorization(MODULE, MODULE_DETAILED_DESC):

    prompt_module_categorization = \
    f"""Assign the correlation type between "{MODULE}" and "{INDICATOR}" in {COUNTRY}. Specifically, "{MODULE}" refers to {MODULE_DETAILED_DESC} and "{INDICATOR}" refers to {TASK_DETAILED_DESC} Think step by step, and YOU MUST DETERMINE one of the following types: 

    Type A - Positively correlated (i.e., A higher value of "{MODULE}" leads to a higher value of "{INDICATOR}"); 
    Type B - Negatively correlated (i.e., A higher value of "{MODULE}" leads to a lower value of "{INDICATOR}");
    Type C - Completely not correlated.

    --- Response ---
    - Explanation: [EXPLANATION]
    - Answer: [Type A/Type B/Type C] (without the definition of above type)
    """
    
    return prompt_module_categorization

In [68]:
def generation_prompt_for_feature_discovery(MODULE_LIST):
    
    prompt_feature_discovery = \
    f"""Given a task description and a list of modules along with their descriptions, generate several non-duplicated new columns, prioritizing those that are most informative for solving the task. Think step by step for answers.

    Task description: {TASK_DESC}

    Module list:
    {MODULE_LIST}

    Refer to the examples below when generating new columns. Here, there are m example modules along with their potential interaction
types. If a single module is available, return the module itself. No need to state any reasons. Do not use the description of module when generating new columns.
    
    --- Example modules ---
    - Module 1: Description of module 1
    - Module 2: Description of module 2
    - Module 3: Description of module 3
    - Module 4: Description of module 4

    --- Possible types of interaction ---
    * Multiplication of Two Modules
    - Multiplication: (Module 1)*(Module 2)
    - Multiplication: (Module 1)*(Module 3)
    ...

    * Multiplication of Three Modules
    - Multiplication: (Module 1)*(Module 2)*(Module 3)
    ...

    * Multiplication of Over Three Modules
    - Multiplication: (Module 1)*(Module 2)*(Module 3)*...
    ...

    (Do not allow reversed new columns e.g., (Module 2)*(Module 1), (Module 3)*(Module 1), ...)

    --- Answers ---
    New column 1: COLUMN NAME (One Line Detailed Pseudo Code) | EXPLNATION
    New column 2: ...
    """
    
    return prompt_feature_discovery

In [69]:
def serialize(values):
    target_str = f""
    for attr_idx, attr_name in enumerate(values):
        target_str += f"{attr_name}"
        target_str += f": {module_desc_mapping[attr_name]} (numerical variable within range [{round(all_df[attr_name].min(), 2)}, {round(all_df[attr_name].max(), 2)}])"
        if attr_idx != len(values) -1:
            target_str += "\n"
    return target_str

def parse_interaction(sentence):
    
    feature_list = []

    if 'New column 1:' not in sentence:
        sentence = 'New column 1:' + '\n' + sentence

    sentence = sentence.replace(':', '.')
    new_features = sentence.split('. ')[1:]
    
    for feature in new_features:
        if len(feature.split('|')) != 3:
            continue
        
        feature_type = feature.split('|')[1].strip()

        if '\n\n' in feature.split('|')[2]:
            feature = feature.split('|')[2].split('\n\n')[0].strip()
        elif '\n' in feature.split('|')[2]:
            feature = feature.split('|')[2].split('\n')[0].strip()
        else:
            feature = feature.split('|')[2].strip()
        feature_list.append((feature_type, feature))

    return feature_list

# Categorization and Feature Discovery

In [1]:
all_results = []
response_results = []

for random_seed in [111, 222, 333]:
    
    for ensemble in range(ensemble_num):
        
        print(f"*********** Seed: {random_seed}, Ensemble: {ensemble} ***********")
        
        try_num = 0
        while try_num < max_try_num:
            try:
                
                # module categorization
                answer_list = {'positive':[], 'negative': [], 'mixed': []} 
                for module_item in module_set:
                    
                    num_pos = 0
                    num_neg = 0
                    num_mix = 0
                    num_no = 0

                    module = module_item.split(':')[0].replace('-', '').strip()
                    module_desc = module_detailed_desc_mapping[module]
                    
                    for i_v in range(num_voting):
                        
                        categorization_try_num = 0
                        while categorization_try_num < max_categorization_try_num:
                            try:
                                
                                prompt_categorization = generation_prompt_for_module_categorization(module, module_desc)
                                answer = query_gpt([prompt_categorization], api_key, temperature=0.5, max_tokens=1500, model=gpt_model)
                                answer = answer[0].split('Answer: ')[1]

                                if 'Type A' in answer:
                                    num_pos += 1
                                elif 'Type B' in answer:
                                    num_neg += 1
                                elif 'Type C' in answer:
                                    num_no += 1
                                else:
                                    assert(0)

                                break

                            except:
                                print(f"sepeartion failed - {categorization_try_num}/{max_categorization_try_num}")
                                categorization_try_num += 1
                                continue
                            
                    print(f'num_pos: {num_pos}, num_neg: {num_neg}, num_no: {num_no}')

                    # voting
                    if num_no < 2:
                        if num_pos == 2 or num_neg == 2:
                            answer_list['mixed'].append(module)
                        elif num_pos >= 3:
                            answer_list['positive'].append(module)
                        elif num_neg >= 3:
                            answer_list['negative'].append(module)

                module_dict = answer_list
                
                print("module_dict")
                print(module_dict)
                print('\n')
                
                assert(len(module_dict) > 0)
                
                module_columns = {}
                for key, val in module_dict.items():
                    flag = False
                    added_columns = []
                    for m in val:
                        if m == 'nightlight':
                            added_columns.append('Nightlight_Sum')
                            added_columns.append('Nightlight_Average')
                        elif m == 'neighbor_nightlight':
                            added_columns.append('neighbor_Nightlight_Sum')
                            added_columns.append('neighbor_Nightlight_Average')
                        else:
                            added_columns.append(m)
                        flag = True

                    if len(module_dict[key]) == 0:
                        module_dict[key] = []
                        module_columns[key] = []  
                    else:
                        if flag == False:
                            print("error val")
                            print(val)
                            assert(0)
                        module_dict[key] = added_columns
                        module_columns[key] = added_columns  
                    
                address_df = all_df['address']
                if len(module_columns['positive']) != 0:
                    positive_df = all_df[module_columns['positive']]
                if len(module_columns['negative']) != 0:
                    negative_df = all_df[module_columns['negative']]
                if len(module_columns['mixed']) != 0:
                    mixed_df = all_df[module_columns['mixed']]
                
                print("PASS 1: Categorization")
                
                # feature interaction
                interaction_try_num = 0
                if len(module_columns['positive']) != 0:
                    while interaction_try_num < max_interaction_try_num:
                        try:
                            MODULE_LIST = serialize(positive_df.columns)
                            prompt_feature_discovery = generation_prompt_for_feature_discovery(MODULE_LIST)
                            positive_answers = query_gpt([prompt_feature_discovery], api_key, temperature=0.5, max_tokens=2000, model=gpt_model)
                            positive_feature_list = parse_interaction(positive_answers[0])
                            print("positive_feature_list")
                            print(positive_feature_list)
                            if len(positive_feature_list) == 0:
                                assert(0)
                            break
                        except:
                            print(f"pos failed - {interaction_try_num}/{max_interaction_try_num}")
                            interaction_try_num += 1
                            continue
                else:
                    positive_feature_list = []

                interaction_try_num = 0
                if len(module_columns['negative']) != 0:
                    while interaction_try_num < max_interaction_try_num:
                        try:
                            MODULE_LIST = serialize(negative_df.columns)
                            prompt_feature_discovery = generation_prompt_for_feature_discovery(MODULE_LIST)
                            negative_answers = query_gpt([prompt_feature_discovery], api_key, temperature=0.5, max_tokens=2000, model=gpt_model)
                            negative_feature_list = parse_interaction(negative_answers[0])
                            print("negative_feature_list")
                            print(negative_feature_list)
                            if len(negative_feature_list) == 0:
                                assert(0)
                            break
                        except:
                            print(f"neg failed - {interaction_try_num}/{max_interaction_try_num}")
                            interaction_try_num += 1
                            continue
                else:
                    negative_feature_list = []

                interaction_try_num = 0
                if len(module_columns['mixed']) != 0:
                    while interaction_try_num < max_interaction_try_num:
                        try:
                            MODULE_LIST = serialize(mixed_df.columns)
                            prompt_feature_discovery = generation_prompt_for_feature_discovery(MODULE_LIST)
                            mixed_answers = query_gpt([prompt_feature_discovery], api_key, temperature=0.5, max_tokens=2000, model=gpt_model)
                            mixed_feature_list = parse_interaction(mixed_answers[0])
                            print("mixed_feature_list")
                            print(mixed_feature_list)
                            if len(mixed_feature_list) == 0:
                                assert(0)
                            break
                        except:
                            print(f"mix failed - {interaction_try_num}/{max_interaction_try_num}")
                            interaction_try_num += 1
                            continue
                else:
                    mixed_feature_list = []

                print("PASS 2: Feature Interaction")
                
                response_results.append([random_seed, ensemble, module_dict, positive_feature_list, negative_feature_list, mixed_feature_list])

                break
                    
            except:
                print(f"failed - {try_num}/{max_try_num}")
                try_num += 1
                continue

df = pd.DataFrame(response_results)
df.columns = ['seed', 'model', 'module_dict', 'pos_list', 'neg_list', 'mix_list']
df.to_csv(f'./data/{save_dir}/{target_ccode}_{target_var}_output_{ensemble_num}.csv', index=False, header=True)

AssertionError: 