**Variable Analysis for Provincial Data**

In [24]:
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
import seaborn as sns
from sklearn.preprocessing import StandardScaler, MinMaxScaler
import glob
import os
import re

import importlib
import corr_var_functions

importlib.reload(corr_var_functions)
from corr_var_functions import *

# Shared scikit-learn scaler instances (min-max scaling and standardisation).
# NOTE(review): neither instance is referenced in the cells visible here —
# confirm they are used further down (or by corr_var_functions) before keeping.
scaler_mm = MinMaxScaler()
scaler_ss = StandardScaler()

# Locations used by the notebook, relative to the notebook's working directory.
processed_path = "../data/processed_data/"
raw_path = "../data/raw_data/"
# NOTE(review): looks like a file-name prefix rather than a directory — confirm.
features_path = "feature_selection_"
# Folder that is globbed for "*.csv" files when the provincial data is loaded.
prov_path = "../data/provincial_data/"

In [25]:
def replace_und(text):
    """Return ``text`` with every underscore replaced by a colon and a space.

    Used to turn CSV base names such as ``"nb_cpi"`` into the display keys
    ``"nb: cpi"``.
    """
    separator = ": "
    return re.sub(pattern=r"_", repl=separator, string=text)

In [26]:
# Base names (directory part removed) of every CSV found in the provincial folder.
csv_paths = glob.glob(prov_path + "*.csv")
file_list = [os.path.basename(csv_path) for csv_path in csv_paths]

# Display names: strip the 4-character ".csv" suffix, then turn "_" into ": ".
file_names = [replace_und(csv_name[:-4]) for csv_name in file_list]

In [27]:
# Load every provincial CSV into prov_dfs, keyed by its display name, with the
# first column parsed into a DatetimeIndex.
prov_dfs = {}

ignore = ["STATSCAN"]  # ignoring the StatsCan raw files

# The display name is derived from file_list inside the loop instead of being
# read from file_names: file_names is overwritten at the end of this cell, so
# indexing it in parallel with file_list breaks as soon as the cell is re-run.
for csv_name in file_list:
    df_name = replace_und(csv_name[:-4])  # drop ".csv", "_" -> ": "

    if any(ign in df_name for ign in ignore):
        print("File %s ignored manually" % df_name)
        continue

    loaded = pd.read_csv(prov_path + csv_name, header=0, index_col=0)
    try:
        # Parse once; the index is only replaced if the whole parse succeeds.
        loaded.index = pd.to_datetime(loaded.index, format="%Y-%m-%d")
    except Exception:
        # Best effort: files whose index is not in YYYY-MM-DD form are left out
        # of prov_dfs rather than aborting the whole load.
        print("File %s skipped due to alternate datetime formatting" % df_name)
        continue

    prov_dfs[df_name] = loaded

# From here on file_names lists only the frames that were actually loaded.
file_names = list(prov_dfs.keys())

File STATSCAN: prov: fppi ignored manually
File STATSCAN: cpi: prov ignored manually
File STATSCAN: prov: energy ignored manually


In [28]:
# Need to split apart the prov_dfs that have multiple columns within them.
# Each column of a multi-column frame becomes its own entry (a Series) keyed
# "<frame name>: <column name>", and the original multi-column entry is removed.
#
# The keys are snapshotted from prov_dfs itself because the dict is modified
# inside the loop. Entries that are already one-dimensional (Series produced by
# an earlier run of this cell) are skipped: their shape has no second element,
# so evaluating shape[1] on them would raise IndexError when the cell is re-run.
for df_name in list(prov_dfs):
    df = prov_dfs[df_name]
    if df.ndim == 1 or df.shape[1] == 1:
        # already a single series / single-column frame: nothing to split
        continue

    for position, column in enumerate(df.columns):
        prov_dfs[df_name + ":" + " " + column] = df.iloc[:, position]
    # this removes the dfs with multiple columns, just leaving behind the
    # specific columns
    prov_dfs.pop(df_name, None)

file_names = list(prov_dfs.keys())
file_names

['ns: fppi',
 'nb: energy',
 'on: energy',
 'nl: fppi',
 'mb: fppi',
 'nt: energy',
 'ns: energy',
 'ab: energy',
 'sk: energy',
 'ab: fppi',
 'mb: energy',
 'qc: fppi',
 'bc: energy',
 'yt: energy',
 'pe: fppi',
 'bc: fppi',
 'sk: fppi',
 'qc: energy',
 'on: fppi',
 'pe: energy',
 'nl: energy',
 'nb: fppi',
 'nb: cpi: Meat',
 'nb: cpi: Fish, seafood and other marine products',
 'nb: cpi: Dairy products and eggs',
 'nb: cpi: Bakery and cereal products (excluding baby food)',
 'nb: cpi: Fruit, fruit preparations and nuts',
 'nb: cpi: Vegetables and vegetable preparations',
 'nb: cpi: Other food products and non-alcoholic beverages',
 'nb: cpi: Food purchased from restaurants ',
 'qc: cpi: Meat',
 'qc: cpi: Fish, seafood and other marine products',
 'qc: cpi: Dairy products and eggs',
 'qc: cpi: Bakery and cereal products (excluding baby food)',
 'qc: cpi: Fruit, fruit preparations and nuts',
 'qc: cpi: Vegetables and vegetable preparations',
 'qc: cpi: Other food products and non-alcoho

**Correlational Analysis**

In [29]:
# Date window (ISO "YYYY-MM-DD" strings) handed to exo_date_select_interpolate
# and full_exo_var_analysis in the correlation cells.
# NOTE(review): whether end_range is treated as inclusive depends on those
# helpers in corr_var_functions — confirm.
start_range = "1987-01-01"
end_range = "2024-01-01"

In [30]:
# Province / territory codes. A province's position in this list is its key in
# prov_targ (e.g. "nl" -> 0, "nb" -> 3).
provs = ["nl", "pe", "ns", "nb", "qc", "on", "mb", "sk", "ab", "bc", "yt", "nt"]

# CPI food-category series names, without the province prefix. The strings are
# kept exactly as they are (including the trailing space on the last entry),
# since they are combined with a province code to form lookup keys.
targ_names = [
    "cpi: Meat",
    "cpi: Fish, seafood and other marine products",
    "cpi: Dairy products and eggs",
    "cpi: Bakery and cereal products (excluding baby food)",
    "cpi: Fruit, fruit preparations and nuts",
    "cpi: Vegetables and vegetable preparations",
    "cpi: Other food products and non-alcoholic beverages",
    "cpi: Food purchased from restaurants ",
]

# prov_targ maps the province's position in provs to the list of fully
# qualified target names "<prov>: <target name>". Built directly as lists of
# strings; no intermediate DataFrame or row-by-row concat is needed.
prov_targ = {}

for i, prov in enumerate(provs):
    prov_targ[i] = [prov + ": " + targ_name for targ_name in targ_names]

prov_targ[0]

['nl: cpi: Meat',
 'nl: cpi: Fish, seafood and other marine products',
 'nl: cpi: Dairy products and eggs',
 'nl: cpi: Bakery and cereal products (excluding baby food)',
 'nl: cpi: Fruit, fruit preparations and nuts',
 'nl: cpi: Vegetables and vegetable preparations',
 'nl: cpi: Other food products and non-alcoholic beverages',
 'nl: cpi: Food purchased from restaurants ']

In [31]:
# Run the exogenous-variable analysis for every New Brunswick ("nb") target.
# The province is looked up by code instead of the bare index 3, so the result
# stays tied to "nb" even if the order of provs changes.
nb_targets = prov_targ[provs.index("nb")]

# One entry per target, in the same order as nb_targets.
nb_feature_descriptions = []
nb_feature_dicts = []

for nb_target in nb_targets:
    # NOTE(review): presumably scaler="ss" selects standard scaling and
    # freq="MS" a month-start frequency — confirm in corr_var_functions.
    target, exo, cut_dfs = exo_date_select_interpolate(
        prov_dfs,
        nb_target,
        start_range,
        end_range,
        log=None,
        scaler="ss",
        freq="MS",
    )
    # NOTE(review): thresh=0.5 looks like a correlation cut-off and
    # ignore=["cpi"] like a name filter for exogenous series — confirm.
    description, feature_dict = full_exo_var_analysis(
        cut_dfs, target, exo, start_range, end_range, thresh=0.5, ignore=["cpi"]
    )
    nb_feature_descriptions.append(description)
    nb_feature_dicts.append(feature_dict)

In [32]:
nb_feature_descriptions[0]

Unnamed: 0,Target Lag,Exo Variable,Corr
98,Lagged_1_Month,bc: fppi,0.980721
100,Lagged_1_Month,qc: fppi,0.96353
96,Lagged_1_Month,on: fppi,0.963352
103,Lagged_1_Month,nl: fppi,0.962862
95,Lagged_1_Month,nb: fppi,0.954293
130,Lagged_1_Month,bc: energy,0.946762
137,Lagged_1_Month,nl: energy,0.937209
135,Lagged_1_Month,nt: energy,0.93489
101,Lagged_1_Month,ab: fppi,0.931784
129,Lagged_1_Month,yt: energy,0.925212
