---

## Integrate Generic Dataset

---

Given a time-correlated dataset, do the following:
* visualize the data
* expose and handle missing data
* create a date column to serve as the merge key
* merge with the given aggregate dataset


Requires:
* ProjectUtil (`ProjectUtil.ipynb`)

In [2]:
import pandas as pd
from datetime import datetime as dt
import datetime

import numpy as np
import math
import matplotlib.pyplot as plt
import seaborn as sns; sns.set()
# Default figure size as (width, height).
# NOTE(review): the style sheet applied below may define its own figure size
# and override this value — confirm the effective size.
plt.rcParams["figure.figsize"] = (10,6)
import warnings

# Suppress every warning for the remainder of the session.
warnings.filterwarnings('ignore')
# Apply the matplotlib style sheet named 'seaborn'.
# NOTE(review): recent matplotlib releases appear to have renamed this style
# (e.g. 'seaborn-v0_8') — confirm against the installed version.
plt.style.use('seaborn')
%matplotlib inline

In [23]:
def _impute_features(df, cols, impute_method):
  """Fill missing values in each column of `cols` in place on `df`."""
  # Map every accepted spelling onto the Series method that implements it.
  fills = {'bfill': 'bfill', 'backfill': 'bfill', 'ffill': 'ffill', 'pad': 'ffill'}

  absent = [col for col in cols if col not in df.columns]
  if absent:
    raise KeyError(f'Feature columns not found after renaming: {absent}')
  if impute_method not in fills:
    raise ValueError(f'Unsupported impute_method {impute_method!r}; use one of {sorted(fills)}')

  for col in cols:
    # Assign the result back instead of a chained `inplace=True` call on a
    # column selection, which is not guaranteed to update `df` itself.
    df[col] = getattr(df[col], fills[impute_method])()


def _assemble_dates(df):
  """Build a datetime Series from the 'year' and optional 'month'/'day' columns."""
  if 'year' not in df.columns:
    raise KeyError("A 'year' column is required when date_col is not supplied")

  parts = pd.DataFrame(index=df.index)
  parts['year'] = df['year'].astype('int32')
  for unit in ('month', 'day'):
    # A missing month or day defaults to the first of the period.
    parts[unit] = df[unit].astype('int32') if unit in df.columns else 1

  # Assembling from named components is unambiguous; parsing a "DD/MM/YYYY"
  # string without a format is read month-first and swaps day and month.
  return pd.to_datetime(parts)


def merge_dataset(data_path, df_aggr, start_date, end_date, feature_map, impute_method='bfill', date_map=None, date_col=None, plot_ylabel='CO2 (ppm)', plot_title='CO2 over Time'):
  """
  Load a time-correlated CSV, clean it, and merge it into df_aggr by date.

  Steps: read the CSV at data_path, rename columns, impute missing feature
  values, build a 'date' column of timestamps, keep rows whose date falls in
  [start_date, end_date] (inclusive), plot the features, print the spacing
  between consecutive dates, reduce the frame to the features plus 'date',
  and inner-merge the result into df_aggr on 'date'.

  Args:
    data_path: Path of the CSV file to load.
    df_aggr: Aggregate DataFrame to merge into; must have a 'date' column.
    start_date: Earliest date to keep (inclusive).
    end_date: Latest date to keep (inclusive).
    feature_map: Maps CSV column names to the feature names to keep.
    impute_method: 'bfill'/'backfill' or 'ffill'/'pad'.
    date_map: Optional mapping of CSV columns to 'year', 'month' and 'day'.
      Used when date_col is not given; 'month' and 'day' default to 1.
    date_col: Optional name of a column (after renaming) holding parseable
      dates. When set, it takes precedence over the year/month/day columns.
    plot_ylabel: Y-axis label of the diagnostic plot.
    plot_title: Title of the diagnostic plot.

  Returns:
    The inner merge of df_aggr and the cleaned dataset on 'date'.

  Raises:
    AssertionError: feature_map has no entries.
    KeyError: A feature column or the required 'year' column is missing.
    ValueError: impute_method is not supported.
  """
  print(f'### Loading data{data_path}::')
  df = pd.read_csv(data_path)

  # This is the common merge date field name
  DATE_COL = 'date'

  # Feature names as they appear after renaming
  COLS = list(feature_map.values())

  # GUARD
  if not COLS:
    raise AssertionError(f'{data_path} - Provide at least one feature_map entry')

  # Rename columns
  df = df.rename(columns=feature_map)
  if date_map is not None:
    df = df.rename(columns=date_map)

  # Handle missing values
  _impute_features(df, COLS, impute_method)

  df.info()

  # Anything still missing here could not be filled (e.g. a trailing gap
  # under 'bfill').
  print('isna() value counts::')
  print(df[COLS].isna().sum())

  print('Doing date conversion ------')
  # Standardize dates as pd.Timestamp
  if date_col is None:
    df[DATE_COL] = _assemble_dates(df)
  else:
    df[DATE_COL] = pd.to_datetime(df[date_col])

  # Now we can truncate by date; copy so later work never touches a view
  df = df[df[DATE_COL].between(start_date, end_date)].copy()

  plt.rcParams["figure.figsize"] = [8,6]
  plt.plot(df[DATE_COL], df[COLS])
  plt.xlabel('Date')
  plt.ylabel(plot_ylabel)
  plt.title(plot_title)
  plt.legend(COLS)
  plt.show()

  # Check time intervals without adding a temporary column to the frame
  intervals = df[DATE_COL].diff().rename('interval')
  print("------ Interval Counts - should be on the month ------")
  print(f"{intervals.value_counts()}")

  # Drop other unnecessary columns
  df = df_retain(df, COLS + [DATE_COL])

  df.info()
  print('Merging by date ------')

  # Merge dataset
  return pd.merge(df_aggr, df, on=DATE_COL, how='inner')

In [None]:
# Disabled scaffolding: the `if False` guard means none of this runs as
# written. It defines paths and date bounds, mounts Google Drive via the
# Colab helper, and runs the shared utility notebook.
if False:
  debug = True

  DRIVE_PATH = "/content/drive/MyDrive/data606"

  # Set the location of this script in GDrive
  SCRIPT_PATH = DRIVE_PATH + "/src/"

  # Root Path of the data on the cloud drive
  DATA_ROOT = DRIVE_PATH + "/data/"
  data_path = DATA_ROOT + "atmospheric-co2.csv"

  # Start including data from this date
  start_date =  pd.to_datetime(dt.fromisoformat('1950-01-01'))
  # Stop including data after this date
  end_date = pd.to_datetime(dt.fromisoformat('2022-12-31'))

  # NOTE(review): `features` and `debug` are not read anywhere in this cell —
  # presumably consumed by other cells; confirm.
  features = ['Seasonally Adjusted CO2 Fit (ppm)']

  # Mount drive
  from google.colab import drive
  drive.mount('/content/drive')

  # IPython magic: change the working directory to the script folder
  %cd $SCRIPT_PATH

  # Load util class (IPython magic: run the notebook in this namespace)
  %run -i "./ProjectUtil.ipynb"

