# Exploratory Data Analysis

In [None]:
import os
import sys
import pandas as pd
import numpy as np
import datetime

from matplotlib import pyplot as plt

import pandas_datareader.data as web
import requests

import mlflow

from data_utils import get_train_test_split_for_stock
from config import *

# Raise the summarisation threshold to the maximum so NumPy prints arrays in full
# instead of abbreviating them with "...".
np.set_printoptions(threshold=sys.maxsize)

## MLflow Naming Convention and Meta-data Organization (Taxonomy)

```
# Multi-project, single MLflow instance approach used in the demo (there are other ways to organize)
# Experiment => <Project callsign>_<type of activity / phase>
    - SP = Stock Prediction
    - EXP = Experimentation
    - TDP = Training Data Pipeline
    - MT = Model Training
    - MM = Model Monitoring
    Examples:
    - SP_EXP_EDA
    - SP_EXP_Modelling
    - SP_EXP_HyperParam_Tuning
    - SP_Training_Data_Pipeline
    - SP_Model_Training
    - SP_Model_Monitoring
# Run name => model name / sub-activity
# Tags => other things we want to track

Standardized approach is key!
```

## Set experiment

In [None]:
# Select the experiment for this notebook (name follows the taxonomy described above).
mlflow.set_experiment('SP_EXP_EDA')
# The run is opened without a `with` block, so it stays active for the cells that
# follow and has to be closed explicitly.
mlflow.start_run(run_name="Data Preparation")

# Do not forget about mlflow.end_run()

## Get data

In [None]:
# Workaround to handle issue https://github.com/pydata/pandas-datareader/issues/868
# A requests session carrying a browser-like User-Agent header is handed to the reader.
USER_AGENT = {
    'User-Agent': ('Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko)'
                ' Chrome/91.0.4472.124 Safari/537.36')
    }
sesh = requests.Session()
sesh.headers.update(USER_AGENT)

# Date range of the price history to request.
start = datetime.datetime(2014, 1, 1)
end = datetime.datetime(2021, 12, 31)

# Request BTC-USD quotes from the 'yahoo' source through the customised session.
df = web.DataReader("BTC-USD", 'yahoo', start, end, session=sesh)
df

## Visualize Open values

In [None]:
# Plot the 'Open' column, save the figure as a PNG and attach it to the active MLflow run.
df['Open'].plot()
# savefig does not create missing directories; make sure "img/" exists first
# so a fresh checkout does not fail with FileNotFoundError.
os.makedirs("img", exist_ok=True)
plt.savefig("img/btc-usd.png", format="png", dpi=600)

mlflow.log_artifact("img/btc-usd.png")

## Data preparation for modelling

### Create labels (target value)

In [None]:
# Daily relative change between Open and Close: (Close - Open) / Open.
# The result is a fraction (it is not multiplied by 100).
df['Delta Pct'] = df['Close'].sub(df['Open']).div(df['Open'])
df

In [None]:
# Binary label: 1 where the daily change is strictly positive, 0 otherwise
# (a NaN change compares False and therefore maps to 0).
df['Going Up'] = (df['Delta Pct'] > 0.).astype('int64')
df

In [None]:
# Peek at the first 10 labels as a plain NumPy array.
df['Going Up'].to_numpy()[:10]

In [None]:
# We want t-1 : t-10 historical Going Ups and a current/to be predicted value = 11 in total
# WINDOW_SIZE is not defined in this notebook; presumably it comes from `from config import *`.
# The bare name is evaluated so the cell displays its value.
WINDOW_SIZE

In [None]:
def rolling_window(a, window):
    """
    Build every ordered, overlapping sequence of length 'window' along the
    last axis of np.array 'a'.

    Parameters:
        a      - np.array; windows are taken along its last axis
        window - number of consecutive values per sequence (1 <= window <= a.shape[-1])

    Returns:
        A read-only np.array view of shape a.shape[:-1] + (a.shape[-1] - window + 1, window).
        No data is copied: the rows overlap in memory, which is why the view is
        returned non-writeable. Call .copy() on the result if it must be modified.

    Raises:
        ValueError if 'window' is smaller than 1 or larger than the last axis of 'a'.

        e.g. Input: ( np.array([1, 2, 3, 4, 5, 6]), 4 )
             Output:
                     array([[1, 2, 3, 4],
                           [2, 3, 4, 5],
                           [3, 4, 5, 6]])
    """
    length = a.shape[-1]
    if window < 1 or window > length:
        raise ValueError(
            "window must be between 1 and {} (length of the last axis), got {}".format(length, window)
        )
    shape = a.shape[:-1] + (length - window + 1, window)
    # Re-use the stride of the last axis for the new window axis: stepping one
    # position inside a window moves one element forward in the source data.
    strides = a.strides + (a.strides[-1],)
    # writeable=False: a write through overlapping rows would silently change
    # several windows (and the source array) at once.
    return np.lib.stride_tricks.as_strided(a, shape=shape, strides=strides, writeable=False)

In [None]:
# t-10 | t-9 | ... | t-2 | t-1 | label (Going Up)
# Each row is one window of WINDOW_SIZE consecutive labels; the last element of a
# row is the value to predict, the preceding ones are its history.
training_dataset = rolling_window(df['Going Up'].to_numpy(), WINDOW_SIZE)
training_dataset[:8]

In [None]:
# Column names: lag columns oldest-first ("t-N" ... "t-1") followed by "target".
# The number of lags is derived from the window width instead of a hard-coded 10,
# so the names always match training_dataset when WINDOW_SIZE is changed.
cols = ["t-{}".format(lag) for lag in range(training_dataset.shape[1] - 1, 0, -1)] + ["target"]
# NOTE: from here on `df` is the windowed dataset, no longer the price table.
df = pd.DataFrame(training_dataset, columns=cols)
df.iloc[:8]

## Save data

In [None]:
# Persist the windowed dataset; index=False keeps the row index out of the CSV
# so the file contains only the lag columns and the target.
df.to_csv(PATH_TO_DATA_FILE, index=False)

## Log data and data transformation logic

In [None]:
# To be able to reproduce the EDA/modelling and to have a golden dataset for DRIFT assessment
mlflow.log_artifact(PATH_TO_DATA_FILE)

# Attach the notebook file as well, so the data transformation logic is stored with the run.
mlflow.log_artifact("010_EXP_EDA_and_data_preparation.ipynb")

## Train-Test Split
We want to stay consistent when comparing different models, so we keep a single copy of the data and a single piece of code that splits the data into independent and dependent features.

In [None]:
# Independent features: every column except the last one.
X = df.iloc[:, :-1]
X[:8]

In [None]:
# Dependent feature (target): the last column.
Y = df.iloc[:, -1]
Y[:8]

In [None]:
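# Reference copy of get_train_test_split_for_stock as defined in src/data_utils.py
# DO NOT RUN this cell: the `def` line below is commented out, and the notebook uses the function imported from data_utils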

import pandas as pd
from sklearn.model_selection import train_test_split

#def get_train_test_split_for_stock(data_file):
    """
    Takes... csv file
    Outputs... X_train, X_test, y_train, y_test split
    """
    
    data = pd.read_csv(data_file)
    
    X = data.iloc[:, :-1]
    Y = data.iloc[:, -1]
    
    X_train, X_test, y_train, y_test = train_test_split(X, Y, test_size=SPLIT_RATIO, random_state=RANDOM_STATE, stratify=Y)
    
    return X_train, X_test, y_train, y_test

In [None]:
# Split through the shared helper imported from data_utils, reading the CSV saved
# earlier, then display the shapes of the four resulting parts.
X_train, X_test, y_train, y_test = get_train_test_split_for_stock(PATH_TO_DATA_FILE)
X_train.shape, X_test.shape, y_train.shape, y_test.shape

In [None]:
# Record the split configuration with the run. These values are inputs to the
# run, not measured results, so they are logged as parameters rather than metrics.
mlflow.log_param("random_state", RANDOM_STATE)
mlflow.log_param("test_size", SPLIT_RATIO)
mlflow.set_tag("EDA", "Data preprocessing")

In [None]:
# Close the run that was opened with mlflow.start_run in the "Set experiment" cell.
mlflow.end_run()