# Pull the Newest Full Data, Make a Train/Test Split and Track Both

- Clean data
    - Drop columns not required for training
    - Drop rows with null values where it makes sense
    (river discharge may be NaN where there is no river; it makes sense to keep these rows so the model can learn where rivers are)
- Think about whether or not to have separate notebooks for new data retrievals and prep
- Version Control the data
- Train test splitting
- Version control again??

In [None]:
# Install required packages.
# TODO: Create IBM Cloud Software Configuration for those
# Shell escape: runs pip to install the COS SDK, the Watson Studio Pipelines client and DVC with its S3 extra.
!pip install ibm-cos-sdk ibm_watson_studio_pipelines 'dvc[s3]' # dvc[all] alternatively, however, COS is covered by S3

In [None]:
from ibm_watson_studio_pipelines import WSPipelines
import ibm_boto3

from botocore.client import Config
from sklearn.model_selection import train_test_split
from dataclasses import dataclass
import numpy as np
import pandas as pd

import pickle
import dvc.api
import io

import logging
import os, types
import warnings

# Suppress every warning category for the rest of the session.
# NOTE(review): this also hides warnings that may point at real problems - consider narrowing the filter.
warnings.filterwarnings("ignore")

### Setup IBM Cloud and COS Credentials

**Note**: If you are running this notebook outside of a Watson Studio Pipeline execution, make sure to set the environment variables that the Pipeline environment would otherwise have passed to the notebook.
Refer to `credentials.py`.

In [None]:
# Local runs only: put your credentials in credentials.py so that it can export
# the environment variables a Watson Studio Pipeline would otherwise provide.
# The import is optional so the same cell also runs where no credentials.py exists.
try:
    from credentials import set_env_variables_for_credentials
except ImportError:
    # No credentials.py available; the required variables must already be set in the environment.
    print("credentials.py not found - using the environment variables that are already set.")
else:
    set_env_variables_for_credentials()

In [None]:
# Settings read from the environment; each one is None when its variable is unset.
CLOUD_API_KEY = os.environ.get("CLOUD_API_KEY")
DATA_FILENAME = os.environ.get("serialized_data_filename")

### DVC Pull and Deserialize Data

In [None]:
# TODO: Make pipeline param
# Git repository that holds the DVC tracking information; None when GIT_REPOSITORY is unset.
repo = os.environ.get("GIT_REPOSITORY")

In [None]:
# Fetch the dataset through the DVC tracking information stored in git.
# The repository itself contains the remote storage info and credentials.
# NOTE(review): unpickling executes code embedded in the payload - only use trusted repositories/files.
# NOTE(review): DATA_FILENAME is None when "serialized_data_filename" is unset, which would request "data/None".
raw_bytes = dvc.api.read(f"data/{DATA_FILENAME}", repo=repo, mode="rb")
data = pickle.loads(raw_bytes)

In [None]:
# Keep rows that contain NaN instead of dropping them altogether: every NaN becomes 0.
# A missing value can itself carry information, e.g. where there are no rivers
# (river discharge always = 0) or where it rarely rains (mostly 0).

data = data.fillna(0)
data

In [None]:
from sklearn.model_selection import train_test_split

# Separate the target column 'dis24' from the remaining columns of `data`,
# which serve as the input features.
y = data['dis24']
X = data.drop(columns='dis24')

# Hold out 20% of the rows for testing; the fixed random_state keeps the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

In [None]:
def serialize(obj, target_path):
    """Pickle ``obj`` to ``target_path`` without leaving a partial file behind.

    The object is first written to a temporary sibling file
    (``<target_path>.tmp``) and only moved onto ``target_path`` once pickling
    has succeeded. A failed run therefore neither truncates an existing file
    nor leaves a corrupt pickle at ``target_path``.

    Failures are reported on stdout and are not re-raised (best-effort).

    Args:
        obj: Any picklable object.
        target_path: Destination file. Its parent directory must already exist.

    Returns:
        None.
    """
    tmp_path = f"{target_path}.tmp"
    try:
        with open(tmp_path, 'wb') as _file:
            pickle.dump(obj, _file)
        # Replace the target only after the dump completed successfully.
        os.replace(tmp_path, target_path)
    except Exception as e:
        print(f"Could not serialize to '{target_path}': {type(e).__name__}: {e}")
        # Remove the incomplete temporary file, if it was created at all.
        if os.path.exists(tmp_path):
            os.remove(tmp_path)

In [None]:
train_target = "dvc-testing/data/train_package.pkl"

# Bundle the training features and targets so they end up in a single pickle file.
# NOTE(review): the directory dvc-testing/data must already exist when this cell runs.
train_package = {"X_train": X_train, "y_train": y_train}

serialize(train_package, train_target)

In [None]:
test_target = "dvc-testing/data/test_package.pkl"

# Bundle the held-out features and targets so they end up in a single pickle file.
# NOTE(review): the directory dvc-testing/data must already exist when this cell runs.
test_package = {"X_test": X_test, "y_test": y_test}

serialize(test_package, test_target)

###  Setup DVC Situation

Since we assume CPDaaS as environment, we will need to clone the dvc setup repository again.
Run the line shown below.

```
!git clone https://[GIT_TOKEN]@github.com/[GIT_REPOSITORY].git
```


In [None]:
# @hidden_cell
# Clone the repository named by GIT_REPOSITORY into the current working directory.
# NOTE(review): the following cells assume the checkout directory is called "dvc-testing" - confirm this matches the repository name.
!git clone $GIT_REPOSITORY

In [None]:
!cd dvc-testing && mkdir data

In [None]:
# Download the merged source dataset from the repository into the checkout's data directory.
!cd dvc-testing && dvc get $GIT_REPOSITORY data/era5-glofas-merged.pkl -o data/era5-glofas-merged.pkl

In [None]:
# Put the serialized train and test packages under DVC control.
!cd dvc-testing && dvc add data/train_package.pkl data/test_package.pkl

In [None]:
# Stage data/.gitignore and the two .dvc files for the packages in git.
!cd dvc-testing && git add data/.gitignore data/train_package.pkl.dvc data/test_package.pkl.dvc

In [None]:
# Author identity for the commit below.
# NOTE(review): --global writes to the user-wide git configuration, not only to this checkout - confirm this is intended.
!cd dvc-testing && git config --global user.email "ilias.ennmouri@ibm.com"
!cd dvc-testing && git config --global user.name "Ilias Ennmouri"

In [None]:
# Commit the staged tracking files.
!cd dvc-testing && git commit -m "New train test subsets"

In [None]:
# Push the data via DVC first; the git push only runs if the DVC push succeeded.
!cd dvc-testing && dvc push && git push

In [None]:
# TODO: Pass to pipeline params
# Locations of the serialized train and test packages, relative to the repository root.
train_package_path, test_package_path = (
    "data/train_package.pkl",
    "data/test_package.pkl",
)

In [None]:
import pandas as pd
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import DataLoader, TensorDataset
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder, StandardScaler
import numpy as np


In [None]:
# Column the model is supposed to predict.
target_column = 'dis24'

# Columns that serve as input features.
input_columns = [
    'time',
    'latitude',
    'longitude',
    'tp',
    'swvl1',
    'stl1',
    'surface',
    'valid_time',
]

In [None]:
# Parse the 'time' column into pandas datetime values.
parsed_time = pd.to_datetime(data['time'])
data['time'] = parsed_time

# Disabled alternative, kept for reference: encode the coordinates as categorical codes.
#data['latitude'] = data['latitude'].astype('category').cat.codes  # Encode coordinates as categorical codes
#data['longitude'] = data['longitude'].astype('category').cat.codes  # Encode coordinates as categorical codes

In [None]:
# Display the frame to inspect it after the 'time' conversion.
data

In [None]:
# A column with a single unique value (e.g. 'step') has no effect on training
# and is solely a waste of resources, so every such column is removed.
constant_columns = [col for col in data.columns if data[col].nunique(dropna=False) < 2]
for col in constant_columns:
    print(f"col '{col}' dropped because it bears no more than one unique value.")
data = data.drop(columns=constant_columns)

In [None]:
# Shell escape: install seaborn with pip of the python3 found on the PATH.
!python3 -m pip install seaborn

In [None]:
import seaborn as sns
import matplotlib.pyplot as plt

def generate_heatmap(X, y, descr='description', target='dis24', threshold=0.2):
    """Plot a correlation heatmap of features plus target and return the strongly correlated columns.

    Args:
        X: DataFrame with the input features. Must contain a 'time' column,
            whose values become the index of the frame used for the plot.
        y: Target values, one per row of ``X`` and in the same row order.
        descr: Label printed above the resulting correlations.
        target: Column name under which ``y`` is added (default 'dis24').
        threshold: Only columns whose absolute correlation with the target
            exceeds this value are returned (default 0.2).

    Returns:
        Series of absolute correlations with the target, restricted to the
        columns above ``threshold`` (the target itself is included).
    """
    # Copy X instead of rebuilding it from X.values: on a mixed-dtype frame
    # .values collapses everything to object dtype, which hides the numeric
    # columns from corr().
    df = X.copy()
    df.index = X['time'].values
    # Assign by position. Assigning the Series directly would align y's own
    # index against the new time index and yield NaN wherever they differ.
    df[target] = np.asarray(y)
    # Correlations are only defined for numeric data.
    numeric = df.select_dtypes(include=['number', 'bool'])
    plt.figure(figsize=(25, 25))
    cor = numeric.corr()
    sns.heatmap(cor, annot=True, cmap=plt.cm.Reds)
    plt.show()
    cor_predictand = cor[target].abs()
    feature_importance = cor_predictand[cor_predictand > threshold]
    print(descr)
    print(feature_importance)
    return feature_importance

In [None]:
# Plot the correlation heatmap for the features and the target.
# NOTE(review): X and y were derived from `data` before the 'time' conversion and the constant-column drop, so neither step is reflected here - confirm this is intended.
generate_heatmap(X, y)

In [None]:
# List the columns contained in X.
X.columns