## Data Selection

In [27]:
import pandas as pd
import os
import sys

#allowing for switch from interactive env to non-interactive
if '__file__' in globals():
    base_path = os.path.dirname(__file__)
else:
    base_path = os.getcwd()

#Pulling the data functions from the other dir
data_functions_path = os.path.abspath(os.path.join(base_path, '../cfpr_data_processing'))
sys.path.append(data_functions_path)

import importlib
import data_functions
importlib.reload(data_functions)
from data_functions import remove_capitals

raw_path = "../data/raw_data/"
processed_path = "../data/processed_data/"
data_utils_path = "../data/utils_data/"

This block of code is manually selecting and aggregating fine grain data to create the pool of 30 variales for the data selection experiment

In [28]:
pool_list = open(data_utils_path + "data_pool_list.txt", "r").read().split()

selects = {}
date_range = pd.date_range(start="1986-01-01", end="2024-07-01", freq="MS")

for file in pool_list:

    selects[file] = pd.read_csv(processed_path + file, index_col=0)
    selects[file].index = pd.to_datetime(selects[file].index)
    selects[file] = selects[file].reindex(date_range)

rm_list = [
    "CDEC_swe_nc_processed.csv",
    "CDEC_swe_nl_processed.csv",
    "CDEC_swe_sj_processed.csv",
    "CDEC_swe_sl_processed.csv",
    "CDEC_swe_sr_processed.csv",
    "CDEC_swe_tl_processed.csv",
    "NCEI_pdsi_processed.csv",
    "STATSCAN_fppi_processed.csv",
    "STATSCAN_milk_sold_processed.csv",
    "STATSCAN_num_work_stop_processed.csv",
    "STATSCAN_rmpi_processed.csv",
    "WB_commodity_price_index_processed.csv",
]

selects["CDEC_swe_total_processed.csv"] = pd.DataFrame(
    pd.DataFrame(
        pd.concat(
            [
#                 selects["CDEC_swe_nc_processed.csv"],
#                 selects["CDEC_swe_nl_processed.csv"],
                selects["CDEC_swe_sj_processed.csv"],
#                 selects["CDEC_swe_sl_processed.csv"],
                selects["CDEC_swe_sr_processed.csv"],
#                 selects["CDEC_swe_tl_processed.csv"],
            ],
            axis=1,
        )
    ).aggregate(func="mean", axis=1)
)
selects["NCEI_pdsi_total_processed.csv"] = pd.DataFrame(
    selects["NCEI_pdsi_processed.csv"].aggregate(func="mean", axis=1)
)
selects["STATSCAN_fppi_total_processed.csv"] = pd.DataFrame(
    selects["STATSCAN_fppi_processed.csv"]["Total index"]
)
selects["STATSCAN_milk_sold_total_processed.csv"] = pd.DataFrame(
    selects["STATSCAN_milk_sold_processed.csv"]["Milk sold off farms, total"]
)
selects["STATSCAN_work_stop_total_processed.csv"] = pd.DataFrame(
    selects["STATSCAN_num_work_stop_processed.csv"]["All industries"]
)
selects["STATSCAN_rmpi_total_processed.csv"] = pd.DataFrame(
    selects["STATSCAN_rmpi_processed.csv"]["Total, excluding crude energy products"]
)
selects["WB_commodity_food_processed.csv"] = pd.DataFrame(
    selects["WB_commodity_price_index_processed.csv"]["FOOD"]
)
selects["WB_commodity_fertilizer_processed.csv"] = pd.DataFrame(
    selects["WB_commodity_price_index_processed.csv"]["FERTILIZERS"]
)

targets = ['food', 'meat', 'fish', 'dairy', 'bakery', 'fruit', 'vegetables', 'other', 'restaurants']

# for rm in rm_list:
#     selects.pop(rm)

rm_list = ["STATSCAN_milk_sold_processed.csv", "STATSCAN_num_work_stop_processed.csv"]
for rm in rm_list:
    selects.pop(rm)

for file in (selects.keys()):
    file_name = file[:-14]  # i.e. STATSCAN_energy
    var_name = remove_capitals(file_name)[1:]  # i.e. energy

    col_names = []
    df_cols = (selects[file].columns)

    for i, col in enumerate(df_cols):
        if (var_name == 'food_cpi') | (var_name == 'rmpi') | (var_name == 'fppi') | (var_name == 'commodity_price_index') | (var_name == 'pdsi'):
            name = var_name + ": " + str(col)
        else:
            name = var_name
        col_names.append(name)
    selects[file].columns = col_names


all_select = pd.concat(selects.values(), axis=1)
all_select.index.name = 'index'

all_select.to_csv(os.path.join("/h/kupfersk/cfpr_2025/data/processed_data/all_select.csv"), index=True)

display(all_select)


Unnamed: 0_level_0,swe_sj,swe_sr,policy_uncertainty,apu000072610,cwur0000sa0,excaus,fedfunds,fmpi,impca,impch,...,food_cpi: Other food products and non-alcoholic beverages,food_cpi: Food purchased from restaurants,swe_total,pdsi_total,fppi_total,milk_sold_total,work_stop_total,rmpi_total,commodity_food,commodity_fertilizer
index,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1986-01-01,10.753226,14.725986,91.105314,0.081,108.900,1.4070,,99.800,5680.700000,459.500000,...,77.5,59.1,12.739606,1.966250,77.5,576001,128.0,47.5,46.12,30.05
1986-02-01,18.899366,26.887587,96.968702,0.075,108.500,1.4043,,99.200,5659.500000,376.600000,...,78.1,59.1,22.893477,2.256250,77.4,527618,146.0,47.5,44.79,29.26
1986-03-01,23.348971,39.425513,86.906782,0.075,107.900,1.4009,,99.000,5922.900000,401.800000,...,78.6,59.3,31.387242,1.541250,76.4,606383,144.0,47.6,45.40,29.52
1986-04-01,16.509811,38.585833,69.223383,0.074,107.600,1.3879,,98.600,5821.100000,264.900000,...,79.5,59.7,27.547822,1.323750,75.3,634262,157.0,47.3,45.51,28.63
1986-05-01,10.428860,25.978136,93.354333,0.074,107.900,1.3757,,99.300,5922.500000,319.000000,...,79.8,59.9,18.203498,1.456875,75.9,682766,180.0,47.6,43.92,27.89
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2024-03-01,36.504988,27.910696,283.283057,0.174,306.502,1.3536,5.33,255.445,34218.094789,29940.596826,...,176.6,193.1,32.207842,-1.967500,181.5,819861,95.0,135.0,117.62,115.33
2024-04-01,36.778885,30.292675,164.408376,0.173,307.811,1.3674,5.33,256.523,34883.560333,31630.360595,...,176.2,193.1,33.535780,-0.395625,185.7,796959,106.0,141.5,116.64,113.80
2024-05-01,20.660710,14.939886,248.883113,0.175,308.163,1.3667,5.33,257.218,35669.381089,35037.005823,...,177.9,194.2,17.800298,0.243750,189.4,823558,,143.8,118.11,108.45
2024-06-01,2.536432,0.934703,195.109713,0.178,308.054,1.3705,5.33,258.949,34393.224478,34113.997176,...,178.5,195.0,1.735568,0.020000,187.1,795313,104.0,141.3,115.12,118.20


In [29]:
pool_list

['CDEC_swe_sj_processed.csv',
 'CDEC_swe_sr_processed.csv',
 'EPU_policy_uncertainty_processed.csv',
 'FRED_apu000072610_processed.csv',
 'FRED_cwur0000sa0_processed.csv',
 'FRED_excaus_processed.csv',
 'FRED_fedfunds_processed.csv',
 'FRED_fmpi_processed.csv',
 'FRED_impca_processed.csv',
 'FRED_impch_processed.csv',
 'FRED_impmx_processed.csv',
 'FRED_irltlt01cam156n_processed.csv',
 'FRED_paynsa_processed.csv',
 'FRED_pcu32533253_processed.csv',
 'FRED_pcu324191324191_processed.csv',
 'FRED_pcu325311325311_processed.csv',
 'FRED_pcu333132333132_processed.csv',
 'FRED_pcu482111482111_processed.csv',
 'FRED_recprousm156n_processed.csv',
 'FRED_unratensa_processed.csv',
 'FRED_wtisplc_processed.csv',
 'NCEI_pdsi_processed.csv',
 'NOAA_enso_processed.csv',
 'STATSCAN_canola_oil_processed.csv',
 'STATSCAN_energy_cpi_processed.csv',
 'STATSCAN_fppi_processed.csv',
 'STATSCAN_milk_sold_processed.csv',
 'STATSCAN_num_work_stop_processed.csv',
 'STATSCAN_rmpi_processed.csv',
 'WB_commodity_p

In [30]:
all_select.columns

Index(['swe_sj', 'swe_sr', 'policy_uncertainty', 'apu000072610', 'cwur0000sa0',
       'excaus', 'fedfunds', 'fmpi', 'impca', 'impch',
       ...
       'food_cpi: Other food products and non-alcoholic beverages',
       'food_cpi: Food purchased from restaurants', 'swe_total', 'pdsi_total',
       'fppi_total', 'milk_sold_total', 'work_stop_total', 'rmpi_total',
       'commodity_food', 'commodity_fertilizer'],
      dtype='object', length=109)

Data description table

In [31]:
desc = pd.read_csv('llmp_desc.csv', index_col=0)
desc

Unnamed: 0_level_0,type,source,description
timeseries_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
food_cpi_food,target,Statistics Canada,Food CPI measures changes in food prices exper...
food_cpi_meat,target,Statistics Canada,Food CPI measures changes in food prices exper...
food_cpi_fish,target,Statistics Canada,Food CPI measures changes in food prices exper...
food_cpi_dairy,target,Statistics Canada,Food CPI measures changes in food prices exper...
food_cpi_bakery,target,Statistics Canada,Food CPI measures changes in food prices exper...
food_cpi_fruit,target,Statistics Canada,Food CPI measures changes in food prices exper...
food_cpi_vegetables,target,Statistics Canada,Food CPI measures changes in food prices exper...
food_cpi_other,target,Statistics Canada,Food CPI measures changes in food prices exper...
food_cpi_restaurants,target,Statistics Canada,Food CPI measures changes in food prices exper...
policy_uncertainty,economic,Policy Uncertainty,Economic Policy Uncertainty Index tracks polic...


Reading in selected lists

For the processing of the data selected by the 4 data selection methods:
* Human-selected
* LLM-selected
* Correlation-selected
* All data

Each selection method has a corresponding .txt file listing each dataset selected. These files will then be read in and parsed as lists. The lists will be used to select the datasets from the data pool.

In [32]:
#var_list = ['corr_select', 'human_select', 'llm_select']
#var_list = ['R1','R2','R3','R4','R5','R6']
#var_list = ['H1','H2','H3','H4','H5','H6']
#var_list = ['GPT','GPT_P1','GPT_P2','GPT_P3','GPT_P4','CmdR', 'Claude']
#var_list = ['economic', 'geopolitical', 'climate', 'manufacturing', 'agriculture']
#var_list = ['llm_meat', 'llm_dairy', 'llm_fish', 'llm_bakery', 'llm_vegetable', 'llm_fruit']
#var_list = ['8_meat', '8_dairy', '8_fish', '8_bakery', '8_vegetable', '8_fruit']
#var_list = ['7_P1', '7_P2', '7_P3', '7_P4']
#var_list = ['9_corr', '9_llm', '9_human']
#var_list = ['feat_imp']

select_path = 'select_lists/'

for method in var_list:
    df = all_select.copy()
    select_list = open(select_path + method + '.txt', 'r').read().splitlines()
    food_cpi = open(select_path + 'food_cpi.txt', 'r').read().splitlines()
    full_list = food_cpi + select_list
    df = df[full_list]
    df.to_csv(method + '.csv')


NameError: name 'var_list' is not defined

In [None]:
# Test dataselection

geopolitical = ["policy_uncertainty", "excaus", "impca", "impch", "impmx", "wtisplc", "commodity_food", "commodity_fertilizer"]
climate = ["enso", "swe_total", "pdsi_total"]
manufacturing = [
]
llm1 = 

all_select[climate]

In [None]:
all_select['swe_total'].plot()

In [None]:
all_select[geopolitical]

In [None]:
for column in all_select.columns:
        if pd.isna(all_select[column].iloc[-1]):
            print(column)
            print(f"value is {all_select[column].iloc[-2]}\n")

            
    
    