# Data Processing

This notebook reads in all files from data/raw_data (excluding those listed in data/utils_data/file_ignore_list.txt), processes each one with process_csv from data_functions.py, and stores the resulting time series in a dictionary (dataframes), one entry per file.

There is room for individual exploratory analysis of the dfs in the dataframes dictionary.

In [1]:
import pandas as pd
import numpy as np
import os
from sklearn.preprocessing import MinMaxScaler, StandardScaler
from datetime import datetime
import glob

import importlib
import data_functions

importlib.reload(data_functions)
from data_functions import process_csv, plot_dataframe, remove_capitals

# Data folders, relative to this notebook. Each path keeps its trailing slash
# because other cells build file paths from them by plain string concatenation.
raw_path, processed_path, data_utils_path = (
    f"../data/{folder}/" for folder in ("raw_data", "processed_data", "utils_data")
)

In [2]:
# This list includes the all_data dfs and datasets that were collected in the initial sweep,
# but have data that ends early and/or datetime frequencies that are not monthly.
# The file contents are split on any whitespace, so every whitespace-separated
# token becomes one ignored file name.
# A context manager is used so the file handle is closed as soon as it is read.
with open(data_utils_path + "file_ignore_list.txt", "r") as ignore_file:
    ignore_list = ignore_file.read().split()

## Data Uploading and Updating

Updating the processed data and concatenating it into all_df

In [3]:
# Build one DataFrame per raw file (keyed by a short variable name) and then
# join them column-wise into all_df on a shared monthly index.
dataframes = {}
all_files = sorted(os.listdir(raw_path))

# Every entry in the raw-data folder is processed unless it is named in the ignore list.
csv_files = [f for f in all_files if f not in ignore_list]
# Month-start index that every series is aligned to before concatenation.
date_range = pd.date_range(start="1986-01-01", end="2024-09-01", freq="MS")

# NOTE(review): a saved run of this cell stopped with
#   ValueError: time data "2000--" doesn't match format "%Y-%m-%d"
# while handling CDEC_swe_sr. Whether that is raised by pd.to_datetime below or
# inside process_csv is not visible from this notebook -- confirm before relying
# on all_df.

for file in csv_files:
    # Split off the real extension instead of slicing a fixed four characters,
    # so names without a 4-character extension are not silently truncated.
    file_name = os.path.splitext(file)[0]  # i.e. STATSCAN_energy
    var_name = remove_capitals(file_name)[1:]  # i.e. energy

    # processing all the data and uploading anything new
    print(file_name)
    series_df = process_csv(file_name, processed_path)

    # adding the original name before the inner column names
    series_df = series_df.add_prefix(var_name + ": ")

    # Parse the index as datetimes, then align to the shared monthly range;
    # dates in date_range with no matching row become NaN rows.
    series_df.index = pd.to_datetime(series_df.index)
    dataframes[var_name] = series_df.reindex(date_range)

# Side-by-side join of every processed series (all share date_range as index).
all_df = pd.concat(dataframes.values(), axis=1)


CDEC_swe_nc
CDEC_swe_nl
CDEC_swe_sj
CDEC_swe_sl
CDEC_swe_sr


ValueError: time data "2000--" doesn't match format "%Y-%m-%d", at position 224712. You might want to try:
    - passing `format` if your strings have a consistent format;
    - passing `format='ISO8601'` if your strings are all ISO8601 but not necessarily in exactly the same format;
    - passing `format='mixed'`, and the format will be inferred for each element individually. You might want to use `dayfirst` alongside this.

# We are having issues with a few of the time series so we need to check pre-processing

### SWE - filter to only include SJ (San Joaquin) 
Should include STATION_ID = SDF,SLT,STM,BLA,MED,ADM,SNM,LLP,HRK,MDW,RCC,SSM,LOS,CAP,FRN,SIL,HYS,GKS,BLC


### SWE-sr (Sacramento River) as suggested by Cody
Should include STATION_ID = BLK,DDM,GNL,REL,SLM,BLD,BLS,TES,DAN,SLI,TUM,HRS,WHW,PDS,KIB,STR,TNY,GIN,VLC,AGP,KSP,GRM,DPO,TMR,CHM,HNT,GRV,PSR


In [None]:
# Keep only the necessary columns: of all columns whose name contains 'swe',
# retain just the two listed below and drop the rest. Columns without 'swe'
# in their name are left untouched.
series_to_keep = ['swe_sr: SWE', 'swe_sj: SWE']

columns_to_drop = [
    name
    for name in all_df.columns
    if 'swe' in name and name not in series_to_keep
]

# Report each column that is about to be removed.
for name in columns_to_drop:
    print(f"Dropping column: {name}")

all_df = all_df.drop(columns=columns_to_drop)
all_df
        


# Save the processed dataframe 

In [None]:
# Write the combined frame (index included) into the processed-data folder,
# then show it as the cell output.
all_data_file = os.path.join(processed_path, "all_data_processed.csv")
all_df.to_csv(all_data_file, index=True)
all_df

## Exploration of individual dataframes

In [None]:
# Plot the columns of the 'food_cpi' entry of the dataframes dictionary.
food_cpi_df = dataframes['food_cpi']
food_cpi_df.plot()