---

## Integrate Dataset

---

Given a time-correlated dataset, do the following:
* visualize
* expose and deal with missing data
* create a date column as a merge point
* merge with the given aggregate dataset


Requires:
* ProjectUtil (`ProjectUtil.ipynb`)

In [None]:
import pandas as pd
from datetime import datetime as dt
import datetime

import numpy as np
import math
import matplotlib.pyplot as plt
import seaborn as sns; sns.set()
plt.rcParams["figure.figsize"] = (10,6)
import warnings

warnings.filterwarnings('ignore')
plt.style.use('seaborn')
%matplotlib inline

In [None]:
def merge_co2_dataset(data_path, df_aggr, start_date, end_date, feature_map, impute_method='bfill', DATE_COL='date'):
  """
  Load a CSV dataset, clean it, plot it and merge it into df_aggr.

  Steps:
    1. Read the CSV at data_path (it must contain 'Year' and 'Month' columns).
    2. Build a timestamp column named DATE_COL set to the first day of each
       Year/Month pair.
    3. Keep only rows with start_date <= DATE_COL <= end_date (inclusive).
    4. Rename columns according to feature_map and fill missing values in the
       renamed columns using impute_method.
    5. Plot the renamed columns against DATE_COL and print the distribution
       of intervals between consecutive rows.
    6. Reduce the frame to the renamed columns plus DATE_COL and inner-merge
       it with df_aggr on DATE_COL.

  Parameters:
    data_path      -- path (or anything pd.read_csv accepts) of the CSV file
    df_aggr        -- DataFrame to merge into; must contain DATE_COL
    start_date     -- earliest date to keep (inclusive)
    end_date       -- latest date to keep (inclusive)
    feature_map    -- dict {source column name: new column name}; the values
                      are the feature columns that are kept
    impute_method  -- 'bfill'/'backfill' or 'ffill'/'pad'
    DATE_COL       -- name of the date column used as the merge key

  Returns:
    The inner merge of df_aggr and the cleaned dataset on DATE_COL.

  Raises:
    ValueError -- if impute_method is not one of the supported fill methods.
  """
  # Resolve the fill method up front so a bad argument fails before any I/O.
  fill_aliases = {'bfill': 'bfill', 'backfill': 'bfill', 'ffill': 'ffill', 'pad': 'ffill'}
  if impute_method not in fill_aliases:
    raise ValueError(
      f"Unsupported impute_method {impute_method!r}; expected one of {sorted(fill_aliases)}")
  fill_name = fill_aliases[impute_method]

  print(f'### Loading data{data_path}::')
  df = pd.read_csv(data_path)

  # Prints a column/dtype/non-null summary of the raw data.
  df.info()

  # set types
  df['Year'] = df['Year'].astype(dtype='int32')
  df['Month'] = df['Month'].astype(dtype='int32')

  # Assemble the first-of-month timestamp from the numeric columns in one
  # vectorized call instead of formatting and parsing a string per row.
  df[DATE_COL] = pd.to_datetime(
    pd.DataFrame({'year': df['Year'], 'month': df['Month'], 'day': 1}))

  # Truncate by date; copy so later writes hit an independent frame rather
  # than a slice of the original.
  in_range = (df[DATE_COL] >= start_date) & (df[DATE_COL] <= end_date)
  df = df[in_range].copy()

  # Rename columns
  df = df.rename(columns=feature_map)

  # Grab real column names
  value_cols = list(feature_map.values())

  # Handle missing values. Assign the result back: an in-place fillna on
  # df[col] acts on a temporary and is not guaranteed to update df.
  for col in value_cols:
    df[col] = getattr(df[col], fill_name)()

  plt.rcParams["figure.figsize"] = [8,6]
  plt.plot(df[DATE_COL], df[value_cols])
  plt.xlabel('Date')
  plt.ylabel('CO2 (ppm)')
  plt.title('CO2 over Time')
  plt.legend(value_cols)
  plt.show()

  # Check time intervals (difference between consecutive rows); computed as
  # a stand-alone Series so no scratch column has to be added and dropped.
  intervals = df[DATE_COL].diff().rename('interval')
  print("------ Interval Counts - should be on the month ------")
  print(f"{intervals.value_counts()}")

  # Drop other unnecessary columns.
  # df_retain is not defined in this notebook; presumably it comes from
  # ProjectUtil and keeps only the listed columns -- TODO confirm.
  df = df_retain(df, value_cols + [DATE_COL])

  # Merge dataset
  return pd.merge(df_aggr, df, on=DATE_COL, how='inner')

In [None]:
# Scratch dictionary holding one string and one integer value.
d = {'k1': 'val1', 'k2': 22}


In [None]:
# Stand-alone setup cell. The `if False:` guard means none of this runs;
# change it to True to execute the cell (it targets Google Colab: it mounts
# Google Drive and uses IPython magics).
if False:
  debug = True

  # Project root on the mounted Google Drive
  DRIVE_PATH = "/content/drive/MyDrive/data606"

  # Set the location of this script in GDrive
  SCRIPT_PATH = DRIVE_PATH + "/src/"

  # Root Path of the data on the cloud drive
  DATA_ROOT = DRIVE_PATH + "/data/"
  data_path = DATA_ROOT + "atmospheric-co2.csv"

  # Start including data from this date
  start_date =  pd.to_datetime(dt.fromisoformat('1950-01-01'))
  # Stop including data after this date
  end_date = pd.to_datetime(dt.fromisoformat('2022-12-31'))

  # Source column(s) of interest.
  # NOTE(review): not referenced anywhere else in the visible cells.
  features = ['Seasonally Adjusted CO2 Fit (ppm)']

  # Mount drive
  from google.colab import drive
  drive.mount('/content/drive')

  # Change the working directory to the script folder (IPython magic)
  %cd $SCRIPT_PATH

  # Load util class
  # (-i runs the notebook inside this notebook's namespace)
  %run -i "./ProjectUtil.ipynb"

