In [1]:
import os
os.environ["CUDA_DEVICE_ORDER"]="PCI_BUS_ID"
os.environ["CUDA_VISIBLE_DEVICES"]="3"

import re
import time
import openai
import pickle
import numpy as np
import pandas as pd
import string

from tqdm import tqdm
from utils import query_gpt

from sklearn.model_selection import train_test_split
from sklearn.metrics import r2_score

from sklearn.preprocessing import MinMaxScaler
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA
from sklearn.linear_model import Ridge
from scipy.optimize import minimize
from sklearn.preprocessing import PolynomialFeatures

%load_ext autoreload
%autoreload 2

In [2]:
# Country code (a key of country_name_mapping) and the indicator to estimate
# (a key of file_path_mapping / target_var_desc_mapping).
target_ccode = 'KOR' # KOR
target_var = 'unemployment_rate'

# Upper bound on attempts per ensemble member in the generation loop.
max_try_num = 20
# Number of ensemble members generated for every seed.
ensemble_num = 5

# The feature-generation prompt asks for max_interaction_terms + 10 new columns.
max_interaction_terms = 45

In [3]:
# Target variable name -> label column name. Targets that are not listed here
# use their own name as the column (see how col_var is derived further down).
mapping = {
    'unemployment_rate': 'UNEMP_T',
    'labour_force_participation_rate': 'LFPR_T',
    'literacy_rate': 'LIT_R_18',
    'under_five_mortality': 'U5MR',
    'infant_mortality': 'IMR'    
}

# Target variable name -> file-name component of the label CSV that contains it
# (used as ./data/label/{country}_{component}{suffix}.csv when labels are loaded).
# NOTE(review): the trailing "(good)" / "(bad)" / "reverse" remarks are the original
# author's annotations; what they refer to is not evident from this file.
file_path_mapping = {
    'GRDP': 'GRDP',
    'population': 'Population', # (good)
    'age_working_population': 'Population',
    'age_old_dependency_ratio': 'Population', # reverse
    'education_high': 'Education',
    'education_secondary': 'Education',
    'education_primary': 'Education',
    'unemployment_rate': 'Employment', # reverse 
    'labour_force_participation_rate': 'Employment', # (bad)
    'literacy_rate': 'LIT',
    'under_five_mortality': 'MR',
    'infant_mortality': 'MR' # reverse
}

# Target variable name -> noun phrase interpolated into the task prompts
# ("Estimate {description} of a given district ...").
# NOTE: 'age_old' has a description here but no entry in file_path_mapping, so
# selecting it as target_var fails at the file_path_mapping lookup.
target_var_desc_mapping = {
    'GRDP': 'a regional GDP',
    'population': 'the population',
    'age_working_population': 'the working-age population',
    'age_old': 'the population of elderly people (aged 60 and older)',
    'age_old_dependency_ratio': 'the old-age dependency ratio',
    'education_high': 'the proportion of highly-educated individuals (at least university graduates)',
    'education_secondary': 'the proportion of secondary school graduates',
    'education_primary': 'the proportion of people with minimal educational attainment',
    'unemployment_rate': 'the unemployment rate',
    'labour_force_participation_rate': 'the labour force participation rate',
    'literacy_rate': 'the literacy rate',
    'under_five_mortality': 'the mortality rate of children under the age of 5',
    'infant_mortality': 'the infant mortality rate'    
}

# Country code -> country name as written into the task description prompt.
country_name_mapping = {
    'VNM': 'Viet Nam',
    'KOR': 'South Korea',
    'MWI': 'Malawi',
    'KHM': 'Cambodia'
}

# Module call string -> feature column name(s) selected from all_df for that module.
# The keys are compared verbatim against the module names parsed from the LLM answer,
# so answers that spell a call differently are not recognised.
# NOTE: 'get_address(Loc)' is mapped here although the module-selection prompt does
# not describe such a module.
module_mapping = {
    'get_address(Loc)': ['address'],
    'get_area(Loc)': ['area'],
    'get_distance_to_nearest_target(Loc, "airport")': ['distance_airport'],
    'get_distance_to_nearest_target(Loc, "port")': ['distance_port'],
    'get_night_light(Loc)': ['Nightlight_Sum', 'Nightlight_Average'],
    'count_area(Loc, "agricultural")': ['area_agricultural'],
    'count_area(Loc, "bareland")': ['area_bareland'],
    'count_area(Loc, "building")': ['area_building'],
    'count_area(Loc, "development")': ['area_development'],
    'count_area(Loc, "rangeland")': ['area_rangeland'],
    'count_area(Loc, "road")': ['area_road'],
    'count_area(Loc, "tree")': ['area_tree'],
    'count_area(Loc, "water")': ['area_water'],
    # Neighbor aggregates: the same modules applied to surrounding regions.
    'get_aggregate_neighbor_info(Loc, lambda x: get_night_light(loc=x))': ['neighbor_Nightlight_Sum', 
                                                                           'neighbor_Nightlight_Average'],
    'get_aggregate_neighbor_info(Loc, lambda x: get_area(loc=x))': ['neighbor_area'],
    'get_aggregate_neighbor_info(Loc, lambda x: get_distance_to_nearest_target(loc=x, "airport"))': ['neighbor_distance_airport'],
    'get_aggregate_neighbor_info(Loc, lambda x: get_distance_to_nearest_target(loc=x, "port"))': ['neighbor_distance_port'],
    'get_aggregate_neighbor_info(Loc, lambda x: count_area(loc=x, "agricultural"))': ['neighbor_area_agricultural'], 
    'get_aggregate_neighbor_info(Loc, lambda x: count_area(loc=x, "bareland"))': ['neighbor_area_bareland'], 
    'get_aggregate_neighbor_info(Loc, lambda x: count_area(loc=x, "building"))': ['neighbor_area_building'], 
    'get_aggregate_neighbor_info(Loc, lambda x: count_area(loc=x, "development"))': ['neighbor_area_development'], 
    'get_aggregate_neighbor_info(Loc, lambda x: count_area(loc=x, "rangeland"))': ['neighbor_area_rangeland'], 
    'get_aggregate_neighbor_info(Loc, lambda x: count_area(loc=x, "road"))': ['neighbor_area_road'], 
    'get_aggregate_neighbor_info(Loc, lambda x: count_area(loc=x, "tree"))': ['neighbor_area_tree'], 
    'get_aggregate_neighbor_info(Loc, lambda x: count_area(loc=x, "water"))': ['neighbor_area_water'],
}

# Feature column name -> human-readable description. serialize() writes these
# descriptions into the feature-generation prompt, so the wording is part of what
# the LLM sees: keep it correctly spelled and consistent between a feature and its
# neighbor counterpart.
module_desc_mapping = {
    'address': 'address',
    'area': 'area size',
    'distance_airport': 'distance to airport',
    'distance_port': 'distance to port',
    'Nightlight_Sum': 'sum of nightlight intensity',
    'Nightlight_Average': 'average of nightlight intensity',
    'area_agricultural': 'ratio of agricultural area',
    'area_bareland': 'ratio of bareland area',
    'area_building': 'ratio of building area',
    'area_development': 'ratio of development area',
    'area_rangeland': 'ratio of rangeland area',
    'area_road': 'ratio of road area',
    'area_tree': 'ratio of tree area',
    'area_water': 'ratio of water area',
    'neighbor_Nightlight_Sum': 'sum of nightlight intensity of surrounding regions',
    'neighbor_Nightlight_Average': 'average of nightlight intensity of surrounding regions',
    'neighbor_area': 'average of area size of surrounding regions',
    # Fixed: these two previously read "disatance of airport/port".
    'neighbor_distance_airport': 'average of distance to airport of surrounding regions',
    'neighbor_distance_port': 'average of distance to port of surrounding regions',
    'neighbor_area_agricultural': 'ratio of agricultural area of surrounding regions',
    'neighbor_area_bareland': 'ratio of bareland area of surrounding regions',
    'neighbor_area_building': 'ratio of building area of surrounding regions',
    'neighbor_area_development': 'ratio of development area of surrounding regions',
    'neighbor_area_rangeland': 'ratio of rangeland area of surrounding regions',
    'neighbor_area_road': 'ratio of road area of surrounding regions',
    'neighbor_area_tree': 'ratio of tree area of surrounding regions',
    'neighbor_area_water': 'ratio of water area of surrounding regions',
}

In [4]:
# Suffix appended to the label file name when labels are loaded; only Viet Nam uses one.
adm_offset = '_adm1' if target_ccode == 'VNM' else ''

# Refuse country/target combinations this notebook does not support. An explicit
# exception is used instead of `assert(0)` so the check carries a message and is
# not stripped when Python runs with -O.
# NOTE(review): presumably no label data exists for these combinations - confirm.
if target_var == 'GRDP' and target_ccode not in ('KOR', 'VNM'):
    raise ValueError(f"target_var 'GRDP' is only supported for KOR and VNM, got {target_ccode!r}")
if target_ccode == 'KOR' and target_var == 'literacy_rate':
    raise ValueError("target_var 'literacy_rate' is not supported for KOR")

# Label column for the target; targets without an entry in `mapping` use their own name.
col_var = mapping.get(target_var, target_var)

# File-name component of the label CSV for this target.
file_path_var = file_path_mapping[target_var]

# Country-specific literacy columns override the generic mapping.
if target_var == 'literacy_rate' and target_ccode == 'VNM':
    col_var = 'LIT_15'
elif target_var == 'literacy_rate' and target_ccode == 'KHM':
    col_var = 'LIT_T'

# Text fragments interpolated into the prompts.
target_var_desc = target_var_desc_mapping[target_var]
country_name = country_name_mapping[target_ccode]

In [5]:
# Instruction asking for a bare numeric estimate of the target for one district.
Inference_desc = f'Estimate {target_var_desc} of a given district. Answer the numeric score only'
# Task statement (target + country) embedded in both LLM prompt templates.
TASK_DESC = f'Estimate {target_var_desc} of a given district in {country_name}.'

In [6]:
# Echo both task strings, one per line, so the configured target can be checked by eye.
for task_text in (Inference_desc, TASK_DESC):
    print(task_text)

Estimate the unemployment rate of a given district. Answer the numeric score only
Estimate the unemployment rate of a given district in South Korea.


In [7]:
# Module-selection prompt: asks the LLM to sort the available modules into
# positive / negative / mixed relationship groups for the task in TASK_DESC.
# The numbered-list response format requested here is what parse_module() parses.
prompt1 = \
f"""Given a modular set and the target variable to estimate, we want to extract a list of modules that have a positive, negative, or mixed relationship with the target variable.

Task description: {TASK_DESC}

Module description:
- count_area(Loc, Class): Count the pixels of the given class in the location image. Class should be one of the element in ["bareland", "rangeland", "development", "road", "tree", "water", "agricultural", "building"].
- get_area(Loc): Get area size of given location's district.
- get_night_light(Loc): Get nightlight intensity of given location.
- get_distance_to_nearest_target(Loc, Class): Get distance of given location to class. Class should be one of the element in ["airport", "port"].
- get_aggregate_neighbor_info(Loc, Func): Get neighbor districts' information of given location, using functions defined above. The format of Func must be the lambda function (i.e., lambda x: [function name](loc=x, ...)). E.g., 'get_aggregate_neighbor_info(Loc, lambda x: get_night_light(loc=x))', 'get_aggregate_neighbor_info(Loc, lambda x: get_area(loc=x))', 'get_aggregate_neighbor_info(Loc, lambda x: get_distance_to_nearest_target(loc=x, "airport"))', 'get_aggregate_neighbor_info(Loc, lambda x: count_area(loc=x, "agricultural"))' ...

Input: 
- Location of the region - [Loc]

--- Format for response ---
- Positive relationship
1. MODULE 1
2. MODULE 2
...

- Negative relationship
1. Module 1
2. Module 2
...

- Mixed relationship
1. Module 1
2. Module 2
...
--- End of format of response ---

Answer: Format for response
"""

# Feature-generation prompt template. The '#####' placeholder is replaced by
# prompt_generation() with the serialized descriptions of the selected columns.
# The "New column k: type | name | pseudo code" response lines are what
# parse_interaction() parses.
# NOTE(review): the line beginning "--- End of format of response" precedes the
# response format instead of closing it, and the text refers to "examples" although
# the template contains none - confirm whether this wording is intended.
prompt2 = \
f"""You are a data engineer. Given the task description and the list of modules along with their descriptions, you are making a new column for the data which is informative to solve the task.

Task description: {TASK_DESC}

Module description:
#####

Given the types of operations listed below, generate n={max_interaction_terms+10} non-duplicated new columns in the order that are most informative for solving the task. Refer to the examples when generating new columns. Only use the modules listed in the module description above. If only one module exists, only a single operation is used. Note that multiple operations can be nested to generate a new column.

--- Possible type of operations ---
* 3 single operation: Logarithm, Square Root, Power
- Logarithm: log(a), log(b), ...
- Square Root: sqrt(a), sqrt(b), ...
- Power: a^2, b^2, ...

* 1 composite operation
- Multiplication: a*b, a*c, ... (Do not allow reversed new columns e.g., b*a, c*a, ..)

--- End of format of response for n={max_interaction_terms+10} new columns ---
Thought 1: [Any reasons based on examples above why the following new column would be helpful for the task]
New column 1: [Type of operation] | New_column_name | One line detailed pseudo code for generating columns
...
Thought n: ...
New column n: ...

--- Answer ---
Thought 1:

"""

In [8]:
def parse_module(answer):
    """Extract the numbered module lists from an LLM answer.

    Markdown bold markers and backticks are removed first. The text is then cut
    at every "\\n1. " occurrence; the resulting sections are labelled, in order of
    appearance, 'positive', 'negative' and 'mixed' (a fourth section raises
    IndexError). Inside a section only lines numbered consecutively from 1 are
    kept, and any explanation following the closing parenthesis of a module call
    ("):", ") -" or ")-") is dropped.

    Args:
        answer: raw answer text.

    Returns:
        dict mapping each relationship label found to its list of module strings.
    """
    cleaned = answer.replace('**', '').replace('`', '')
    sections = cleaned.split('\n1. ')[1:]
    labels = ('positive', 'negative', 'mixed')
    # Checked in this order; the first marker present anywhere in the line wins.
    trailing_markers = ('):', ') -', ')-')
    result = {}

    for position, body in enumerate(sections):
        expected_number = 1
        modules = []
        # The split consumed the leading "1. "; restore it so the first item parses too.
        for raw_line in ('1. ' + body).strip().split('\n'):
            line = raw_line.strip()
            pieces = line.split('.')
            # Blank lines and lines without a '.' (e.g. section headers) carry no item.
            if not line or len(pieces) < 2:
                continue

            item_number, candidate = pieces[0], pieces[1]
            marker = next((m for m in trailing_markers if m in line), None)
            if marker is None:
                module = candidate.strip()
            else:
                module = candidate.split(marker)[0].strip() + ')'

            # Keep the item only if it is non-empty and continues the 1, 2, 3, ... run.
            if module and item_number == str(expected_number):
                modules.append(module)
                expected_number += 1

        result[labels[position]] = modules

    return result


def serialize(values):
    """Describe each column on its own line, labelled a, b, c, ...

    Every line reads
    "<letter>: <description> (numerical variable within range [<min>, <max>])",
    with the description taken from ``module_desc_mapping`` and the bounds being
    the column's minimum and maximum in ``all_df`` rounded to two decimals.

    Args:
        values: sequence of column names present in ``all_df``.

    Returns:
        The lines joined by newlines, without a trailing newline.
    """
    described = []
    for letter_pos, column in enumerate(values):
        low = round(all_df[column].min(), 2)
        high = round(all_df[column].max(), 2)
        described.append(
            f"{string.ascii_lowercase[letter_pos]}: {module_desc_mapping[column]} "
            f"(numerical variable within range [{low}, {high}])"
        )
    return "\n".join(described)


def prompt_generation(prompt2, temp_df):
    """Fill the '#####' placeholder of a prompt template.

    Args:
        prompt2: template text containing exactly one '#####' marker.
        temp_df: DataFrame whose columns are described via ``serialize``.

    Returns:
        The template with the marker replaced by the serialized column descriptions.
    """
    column_descriptions = serialize(temp_df.columns)
    head, tail = prompt2.split('#####')
    return ''.join((head, column_descriptions, tail))


def parse_interaction(sentence):
    """Collect (operation type, pseudo code) pairs from "New column" entries.

    The text is cut at every "New column"; a piece is used only when it splits
    into exactly three '|'-separated fields. The operation type is what follows
    the first ':' of the first field, and the pseudo code is the third field up
    to the first blank line.

    Args:
        sentence: raw answer text.

    Returns:
        List of (operation_type, pseudo_code) tuples in order of appearance.
    """
    parsed = []
    for chunk in sentence.split('New column')[1:]:
        fields = chunk.split('|')
        if len(fields) != 3:
            continue
        header, _column_name, pseudo_code = fields
        operation = header.split(':')[1].strip()
        expression = pseudo_code.split('\n\n')[0].strip()
        parsed.append((operation, expression))
    return parsed


def interaction(answers, temp_df):
    """Parse the first answer into interaction features, print them and return them.

    Args:
        answers: sequence whose first element is the raw answer text.
        temp_df: accepted for interface compatibility; not used.

    Returns:
        The list produced by ``parse_interaction`` for ``answers[0]``.
    """
    parsed_features = parse_interaction(answers[0])

    for shown in ("feature_list", parsed_features):
        print(shown)

    return parsed_features


def ridge_objective_function(coeffs, X, y, lmbda):
    """Mean squared error plus an L2 penalty on the weights.

    Args:
        coeffs: parameter vector; all entries but the last are weights, the last
            entry is the intercept.
        X: feature matrix multiplied with the weights via ``@``.
        y: target values.
        lmbda: penalty strength applied to the sum of squared weights (the
            intercept is not penalised).

    Returns:
        mean((y - (X @ weights + intercept))**2) + lmbda * sum(weights**2)
    """
    weights, intercept = coeffs[:-1], coeffs[-1]
    residuals = y - (X @ weights + intercept)
    data_term = (residuals ** 2).mean()
    penalty = lmbda * (weights ** 2).sum()
    return data_term + penalty

In [9]:
# The OpenAI key is read from the environment so that no secret is stored in the notebook.
# SECURITY: a literal key used to be committed in this cell; treat that key as leaked
# and revoke it.
api_key = os.environ.get('OPENAI_API_KEY')
if not api_key:
    raise RuntimeError('Set the OPENAI_API_KEY environment variable before running this notebook.')

# Ground-truth labels: region id plus the label column selected for the target.
label_path = f'./data/label/{target_ccode}_{file_path_var}{adm_offset}.csv'
gt = pd.read_csv(label_path, encoding='utf-8')[['area_id', col_var]]

# Feature table for the country; its columns are the ones referenced by module_mapping.
all_results_path = f'./data/all_features/{target_ccode}.csv'
all_df = pd.read_csv(all_results_path)

In [10]:
# Not read in this cell; retained because later cells may rely on these names.
all_results = []
test_val_list = []
address_df = all_df['address']

# Relationship groups requested from the LLM, in the order they are stored.
RELATION_TYPES = ('positive', 'negative', 'mixed')


def _resolve_modules(module_dict):
    """Keep only modules known to ``module_mapping`` and collect their columns.

    Args:
        module_dict: relationship label -> list of module strings parsed from the answer.

    Returns:
        (filtered_module_dict, module_columns): the first keeps only recognised
        module strings per label, the second lists the matching feature columns.

    Raises:
        ValueError: if a label has no recognised module at all.
    """
    filtered, columns = {}, {}
    for relation, modules in module_dict.items():
        known = [module for module in modules if module in module_mapping]
        if not known:
            raise ValueError(f"no recognised module in '{relation}' group: {modules}")
        filtered[relation] = known
        columns[relation] = [col for module in known for col in module_mapping[module]]
    return filtered, columns


desc_results = []
# NOTE(review): random_seed only labels the stored rows here; nothing in this cell
# seeds a random number generator with it.
for random_seed in [111, 222, 333, 444, 555]:
    for ensemble in range(ensemble_num):
        print(f"Seed: {random_seed}, ensemble num: {ensemble}")
        for try_num in range(max_try_num):
            try:
                # Step 1: ask which modules relate positively / negatively / mixed.
                answers = query_gpt([prompt1], api_key, temperature=0.5, max_tokens=1500, model='gpt-4o') # -4-turbo
                module_dict = parse_module(answers[0])

                print("module_dict")
                print(module_dict)
                print('\n')

                missing = [relation for relation in RELATION_TYPES if relation not in module_dict]
                if missing:
                    raise ValueError(f"answer lacks relationship group(s): {missing}")

                module_dict, module_columns = _resolve_modules(module_dict)

                # Step 2: per group, ask for interaction terms over the selected columns.
                feature_lists = []
                for relation in RELATION_TYPES:
                    relation_df = all_df[module_columns[relation]]
                    relation_answers = query_gpt([prompt_generation(prompt2, relation_df)], api_key,
                                                 temperature=0, max_tokens=2000, model='gpt-4o')
                    feature_lists.append(parse_interaction(relation_answers[0]))

                desc_results.append([random_seed, ensemble, module_dict, *feature_lists])
                break

            except Exception as err:
                # Retry on any failure (API error, unparsable answer, ...), but show the
                # reason; KeyboardInterrupt is not caught so the run can still be stopped.
                print(f"failed - {try_num}/{max_try_num}: {type(err).__name__}: {err}")
        else:
            # Every attempt failed: make the missing row visible instead of skipping silently.
            print(f"giving up on seed {random_seed}, ensemble {ensemble} after {max_try_num} attempts")

# Passing the column names to the constructor also works when no row was collected.
df = pd.DataFrame(desc_results, columns=['seed', 'model', 'module_dict', 'pos_list', 'neg_list', 'mixed_list'])
os.makedirs('./data/all_modules', exist_ok=True)
df.to_csv(f'./data/all_modules/{target_ccode}_{target_var}_extension_paragraph_output_{max_interaction_terms+10}.csv', index=False, header=True)

Seed: 555, ensemble num: 0


100%|██████████| 1/1 [00:06<00:00,  6.30s/it]


module_dict
{'positive': ['count_area(Loc, "development")', 'get_night_light(Loc)', 'get_aggregate_neighbor_info(Loc, lambda x: get_night_light(loc=x))', 'get_aggregate_neighbor_info(Loc, lambda x: count_area(loc=x, "development"))'], 'negative': ['count_area(Loc, "agricultural")', 'count_area(Loc, "rangeland")', 'count_area(Loc, "tree")', 'count_area(Loc, "water")', 'get_area(Loc)', 'get_distance_to_nearest_target(Loc, "airport")', 'get_distance_to_nearest_target(Loc, "port")', 'get_aggregate_neighbor_info(Loc, lambda x: count_area(loc=x, "agricultural"))', 'get_aggregate_neighbor_info(Loc, lambda x: count_area(loc=x, "rangeland"))', 'get_aggregate_neighbor_info(Loc, lambda x: count_area(loc=x, "tree"))', 'get_aggregate_neighbor_info(Loc, lambda x: count_area(loc=x, "water"))', 'get_aggregate_neighbor_info(Loc, lambda x: get_area(loc=x))', 'get_aggregate_neighbor_info(Loc, lambda x: get_distance_to_nearest_target(loc=x, "airport"))', 'get_aggregate_neighbor_info(Loc, lambda x: get_dis

100%|██████████| 1/1 [00:36<00:00, 36.68s/it]
100%|██████████| 1/1 [00:30<00:00, 30.63s/it]
100%|██████████| 1/1 [00:37<00:00, 37.69s/it]


Seed: 555, ensemble num: 1


100%|██████████| 1/1 [00:08<00:00,  8.19s/it]


module_dict
{'positive': ['get_night_light(Loc)', 'get_aggregate_neighbor_info(Loc, lambda x: get_night_light(loc=x))'], 'negative': ['count_area(Loc, "development")', 'count_area(Loc, "building")', 'get_area(Loc)', 'get_distance_to_nearest_target(Loc, "airport")', 'get_distance_to_nearest_target(Loc, "port")', 'get_aggregate_neighbor_info(Loc, lambda x: count_area(loc=x, "development"))', 'get_aggregate_neighbor_info(Loc, lambda x: count_area(loc=x, "building"))', 'get_aggregate_neighbor_info(Loc, lambda x: get_area(loc=x))', 'get_aggregate_neighbor_info(Loc, lambda x: get_distance_to_nearest_target(loc=x, "airport"))', 'get_aggregate_neighbor_info(Loc, lambda x: get_distance_to_nearest_target(loc=x, "port"))'], 'mixed': ['count_area(Loc, "bareland")', 'count_area(Loc, "rangeland")', 'count_area(Loc, "road")', 'count_area(Loc, "tree")', 'count_area(Loc, "water")', 'count_area(Loc, "agricultural")', 'get_aggregate_neighbor_info(Loc, lambda x: count_area(loc=x, "bareland"))', 'get_aggre

100%|██████████| 1/1 [01:13<00:00, 73.92s/it]
100%|██████████| 1/1 [00:41<00:00, 41.11s/it]
100%|██████████| 1/1 [00:27<00:00, 27.47s/it]


Seed: 555, ensemble num: 2


100%|██████████| 1/1 [00:08<00:00,  8.67s/it]


module_dict
{'positive': ['count_area(Loc, "development")', 'get_night_light(Loc)', 'get_aggregate_neighbor_info(Loc, lambda x: get_night_light(loc=x))', 'count_area(Loc, "building")'], 'negative': ['count_area(Loc, "agricultural")', 'count_area(Loc, "rangeland")', 'count_area(Loc, "tree")', 'get_distance_to_nearest_target(Loc, "airport")', 'get_distance_to_nearest_target(Loc, "port")', 'get_aggregate_neighbor_info(Loc, lambda x: count_area(loc=x, "agricultural"))', 'get_area(Loc)'], 'mixed': ['count_area(Loc, "bareland")', 'count_area(Loc, "road")', 'count_area(Loc, "water")']}




100%|██████████| 1/1 [00:45<00:00, 45.33s/it]
100%|██████████| 1/1 [00:32<00:00, 32.06s/it]
100%|██████████| 1/1 [00:28<00:00, 28.42s/it]


Seed: 555, ensemble num: 3


100%|██████████| 1/1 [00:07<00:00,  7.21s/it]


module_dict
{'positive': ['count_area(Loc, "development")', 'get_night_light(Loc)', 'get_aggregate_neighbor_info(Loc, lambda x: get_night_light(loc=x))', 'get_aggregate_neighbor_info(Loc, lambda x: count_area(loc=x, "development"))'], 'negative': ['count_area(Loc, "agricultural")', 'count_area(Loc, "tree")', 'count_area(Loc, "water")', 'get_distance_to_nearest_target(Loc, "airport")', 'get_distance_to_nearest_target(Loc, "port")', 'get_aggregate_neighbor_info(Loc, lambda x: count_area(loc=x, "agricultural"))', 'get_aggregate_neighbor_info(Loc, lambda x: count_area(loc=x, "tree"))', 'get_aggregate_neighbor_info(Loc, lambda x: count_area(loc=x, "water"))', 'get_aggregate_neighbor_info(Loc, lambda x: get_distance_to_nearest_target(loc=x, "airport"))', 'get_aggregate_neighbor_info(Loc, lambda x: get_distance_to_nearest_target(loc=x, "port"))'], 'mixed': ['count_area(Loc, "bareland")', 'count_area(Loc, "rangeland")', 'count_area(Loc, "road")', 'count_area(Loc, "building")', 'get_area(Loc)',

100%|██████████| 1/1 [00:58<00:00, 59.00s/it]
100%|██████████| 1/1 [01:06<00:00, 66.73s/it]
100%|██████████| 1/1 [00:32<00:00, 32.18s/it]


Seed: 555, ensemble num: 4


100%|██████████| 1/1 [00:07<00:00,  7.74s/it]


module_dict
{'positive': ['get_area(Loc)', 'get_night_light(Loc)', 'get_distance_to_nearest_target(Loc, "airport")', 'get_distance_to_nearest_target(Loc, "port")', 'get_aggregate_neighbor_info(Loc, lambda x: get_night_light(loc=x))'], 'negative': ['count_area(Loc, "bareland")', 'count_area(Loc, "rangeland")', 'count_area(Loc, "development")', 'count_area(Loc, "road")', 'get_aggregate_neighbor_info(Loc, lambda x: count_area(loc=x, "bareland"))'], 'mixed': ['count_area(Loc, "tree")', 'count_area(Loc, "water")', 'count_area(Loc, "agricultural")', 'count_area(Loc, "building")', 'get_aggregate_neighbor_info(Loc, lambda x: count_area(loc=x, "agricultural"))']}




100%|██████████| 1/1 [00:30<00:00, 30.20s/it]
100%|██████████| 1/1 [00:47<00:00, 47.78s/it]
100%|██████████| 1/1 [00:26<00:00, 26.83s/it]
