# Exogenous Variable Correlational Analysis

This notebook takes the `all_df` created in the data_processing notebook and splits it into a dictionary that is then passed into the `full_corr_selection` function. This function creates a lagged target df and an exogenous variable df, which are concatenated and correlated. The correlations are then compared against a threshold, and only the variables satisfying the threshold are returned.

In [30]:
import pandas as pd
import numpy as np
import os
import glob
import sys
from sklearn.preprocessing import MinMaxScaler
from datetime import datetime

# Locate the directory this code runs from: a script run defines __file__,
# an interactive kernel does not, so fall back to the working directory there.
base_path = os.path.dirname(__file__) if "__file__" in globals() else os.getcwd()

# Make the sibling data-processing directory importable by adding its
# absolute path to the module search path.
data_functions_path = os.path.abspath(os.path.join(base_path, "../cfpr_data_processing"))
sys.path.append(data_functions_path)

import importlib
import data_functions

# Re-execute the already-imported module so edits made to it on disk are
# picked up without restarting the kernel, then bind the refreshed function.
importlib.reload(data_functions)
from data_functions import split_dataframe

import corr_var_functions

# Same reload-then-import pattern for the correlation helpers.
# NOTE(review): the wildcard import hides which names come from this module
# (presumably including full_corr_selection, which is called further down) —
# consider listing them explicitly once the full set in use is confirmed.
importlib.reload(corr_var_functions)
from corr_var_functions import *

# Data directories. Each keeps its trailing slash because file names are
# appended to these by plain string concatenation.
# NOTE(review): these are relative paths, so they resolve against the current
# working directory rather than base_path computed above — confirm that is
# intended when this code is run non-interactively from another directory.
raw_path = "../data/raw_data/"
processed_path = "../data/processed_data/"
data_utils_path = "../data/utils_data/"
curated_path = "../data/curated_data/"

In [31]:
# This list includes the all_data dfs and datasets that were collected in the initial sweep,
# but have data that ends early and/or datetime frequencies that are not monthly.
# The file is read inside a `with` block so the handle is closed as soon as the
# contents are loaded; entries are split on any whitespace.
with open(data_utils_path + "file_ignore_list.txt", "r") as ignore_file:
    ignore_list = ignore_file.read().split()

### Read in data

In [32]:
# Load the pooled data with the first CSV column as the index, then replace
# that index with its datetime-parsed equivalent.
all_df = pd.read_csv(processed_path + "data_pool.csv", index_col=0)
all_df = all_df.set_index(pd.to_datetime(all_df.index))

# Split the pooled frame into the dictionary that the correlation step consumes.
all_dict = split_dataframe(all_df)

### Running variable selection and correlation

**Available Parameters:**

* **dict:** the dictionary holding all of the variable dataframes
* **target_name:** the name of the df that will be the target (e.g. 'food_cpi_meat')
* **start:** date to start range (str)
* **end:** date to end range (str)
* **ignore:** a list of df titles you want ignored from the correlation (ie 'food_cpi: Food') (auto set to None)
* **thresh:** the correlation coefficient threshold (ie 0.6)
* **select_unique:** the number of best lags to include in the final corr_selection (auto set to 1, just shows the best lag for the associated variable/correlation)
* **freq:** the datetime frequency of the data (auto set to 'MS')
* **lag_start:** At what month will the future lags start
* **lag_end:** At what month will the future lags end 
* **interpolate_cutoff:** whether to use a hard cutoff date range or a soft cutoff date range (auto set to False - hard cutoff)
* **log:** what log type to use (None, log, log10, log2)
* **scaler:** what scaler to use (None, ss, mm)


In [33]:
# Date window for the analysis, given as date strings; these are passed to
# full_corr_selection as its start and end arguments.
start_range = "1986-01-01"
end_range = "2024-01-01"

In [34]:
# Target series to analyse; each name is passed in turn as the target of one
# full_corr_selection call.
targ_names = [
    "food_cpi_food",
    "food_cpi_meat",
    "food_cpi_fish",
    "food_cpi_dairy",
    "food_cpi_bakery",
    "food_cpi_fruit",
    "food_cpi_vegetables",
    "food_cpi_other",
    "food_cpi_restaurants",
]

# Supplied as the `ignore` argument of every call below.
ig = ["food_cpi"]

# One result slot per target, so both result lists stay aligned with
# targ_names by position.
corr_selection = [[] for _ in targ_names]
selects = [[] for _ in targ_names]

# Run the selection once per target with identical settings; each call
# returns a pair that is stored at the target's position in the two lists.
for i, target in enumerate(targ_names):
    corr_selection[i], selects[i] = full_corr_selection(
        all_dict,
        target,
        start_range,
        end_range,
        thresh=0.0,
        log=None,
        scaler="ss",
        freq="MS",
        interpolate_cutoff=False,
        ignore=ig,
        lag_start=1,
    )


scaling

Correlation Summary for food_cpi_food

Threshold: 0.0
Log: None
Scaler: ss
Date Range: 1986-01-01 to 2024-01-01
Cutoff Interpolated: False
Number of exogenous regressors considered: 37
Number of exogenous regressors above threshold: 29

         Target Lag          Exo Variable      Corr
0    Lagged_1_Month           cwur0000sa0  0.988692
1    Lagged_7_Month                  fmpi  0.986118
2    Lagged_8_Month                 impmx  0.983109
3    Lagged_7_Month       pcu482111482111  0.981784
4    Lagged_3_Month       pcu324191324191  0.980160
5    Lagged_1_Month       pcu333132333132  0.967558
6    Lagged_1_Month          apu000072610  0.965797
7   Lagged_11_Month            rmpi_total  0.963280
8    Lagged_8_Month            energy_cpi  0.960349
9   Lagged_16_Month                 impch  0.950420
10  Lagged_10_Month            fppi_total  0.944577
11  Lagged_24_Month            canola_oil  0.940083
12   Lagged_8_Month           pcu32533253  0.917218
13   Lagged_1_Month       