# Predict the exchange rate

## Objectives

- Train an ML pipeline using hyperparameter optimization
- Use the best feature to predict exchange rate

## Input
- Jupyter_notebooks/outputs/dataset/collection/Bitcoin_Price_Data.csv

## Output
- Create feature is in a more suitable format for ML model to learn paterns from

# Change the working directory

## Import libraries

In [1]:
import os
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
sns.set_style('whitegrid')
from sklearn.pipeline import Pipeline

We need to change the working directory from its current folder to its parent folder
* We access the current directory with os.getcwd()

In [2]:
current_dir = os.getcwd()
current_dir

'/workspace/fifth-milestone-project-bitcoin/jupyter_notebooks'

We want to make the parent of the current directory the new current directory

In [3]:
current_dir = os.getcwd()
os.chdir(os.path.dirname(current_dir))
print()




Confirm the new current directory

In [4]:
current_dir = os.getcwd()
current_dir

'/workspace/fifth-milestone-project-bitcoin'

# Load data

In [5]:
import pandas as pd
df = pd.read_csv("/workspace/fifth-milestone-project-bitcoin/jupyter_notebooks/outputs/dataset/collection/Bitcoin_Price_Data.csv")
df.head

<bound method NDFrame.head of       Unnamed: 0        Date  Closing Price (USD)  24h Open (USD)  \
0              0  2014-03-14           124.654990      125.304660   
1              1  2014-03-15           126.455000      124.654990   
2              2  2014-03-16           109.584830      126.455000   
3              3  2014-03-17           119.674660      109.584830   
4              4  2014-03-18           122.338660      119.674660   
...          ...         ...                  ...             ...   
2782        2782  2021-10-25         49765.132082    49597.778891   
2783        2783  2021-10-26         50033.693137    49718.354353   
2784        2784  2021-10-27         47886.625255    49927.035067   
2785        2785  2021-10-28         45605.615754    46806.537852   
2786        2786  2021-10-29         43145.471291    46440.336570   

      24h High (USD)  24h Low (USD)  
0         125.751660     123.563490  
1         126.758500     124.633830  
2         126.665660      8

# ML pipeline with regression method

## Create ML pipeline

### Install xgboost

In [7]:
pip install xgboost

Collecting xgboost
  Downloading xgboost-1.7.3-py3-none-manylinux2014_x86_64.whl (193.6 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m193.6/193.6 MB[0m [31m9.5 MB/s[0m eta [36m0:00:00[0m00:01[0m00:01[0m
Installing collected packages: xgboost
Successfully installed xgboost-1.7.3
Note: you may need to restart the kernel to use updated packages.


In [8]:
from sklearn.pipeline import Pipeline

# Feature Engineering
from feature_engine.encoding import OrdinalEncoder
from feature_engine.selection import SmartCorrelatedSelection

# Feat Scaling
from sklearn.preprocessing import StandardScaler

# Feat Selection
from sklearn.feature_selection import SelectFromModel

# ML algorithms
from sklearn.tree import DecisionTreeRegressor
from xgboost import XGBRegressor
from sklearn.ensemble import GradientBoostingRegressor, RandomForestRegressor
from sklearn.linear_model import LinearRegression
from sklearn.ensemble import AdaBoostRegressor
from sklearn.ensemble import ExtraTreesRegressor


def PipelineOptimization(model):
    pipeline_base = Pipeline([

        ("OrdinalCategoricalEncoder", OrdinalEncoder(encoding_method='arbitrary',
                                                     variables=['Closing Price (USD)', '24h Open (USD)'])),


        ("SmartCorrelatedSelection", SmartCorrelatedSelection(variables=None,
         method="spearman", threshold=0.6, selection_method="variance")),

        ("feat_scaling", StandardScaler()),

        ("feat_selection",  SelectFromModel(model)),

        ("model", model),

    ])

    return pipeline_base

Hyperparameter optimisation with custom class

In [9]:
from sklearn.model_selection import GridSearchCV


class HyperparameterOptimizationSearch:

    def __init__(self, models, params):
        self.models = models
        self.params = params
        self.keys = models.keys()
        self.grid_searches = {}

    def fit(self, X, y, cv, n_jobs, verbose=1, scoring=None, refit=False):
        for key in self.keys:
            print(f"\nRunning GridSearchCV for {key} \n")
            model = PipelineOptimization(self.models[key])

            params = self.params[key]
            gs = GridSearchCV(model, params, cv=cv, n_jobs=n_jobs,
                              verbose=verbose, scoring=scoring)
            gs.fit(X, y)
            self.grid_searches[key] = gs

    def score_summary(self, sort_by='mean_score'):
        def row(key, scores, params):
            d = {
                'estimator': key,
                'min_score': min(scores),
                'max_score': max(scores),
                'mean_score': np.mean(scores),
                'std_score': np.std(scores),
            }
            return pd.Series({**params, **d})

        rows = []
        for k in self.grid_searches:
            params = self.grid_searches[k].cv_results_['params']
            scores = []
            for i in range(self.grid_searches[k].cv):
                key = "split{}_test_score".format(i)
                r = self.grid_searches[k].cv_results_[key]
                scores.append(r.reshape(len(params), 1))

            all_scores = np.hstack(scores)
            for p, s in zip(params, all_scores):
                rows.append((row(k, s, p)))

        df = pd.concat(rows, axis=1).T.sort_values([sort_by], ascending=False)

        columns = ['estimator', 'min_score',
                   'mean_score', 'max_score', 'std_score']
        columns = columns + [c for c in df.columns if c not in columns]

        return df[columns], self.grid_searches

Split database to train and test set

In [10]:
from sklearn.model_selection import train_test_split

df = (pd.read_csv("/workspace/fifth-milestone-project-bitcoin/jupyter_notebooks/outputs/dataset/collection/Bitcoin_Price_Data.csv"))

X_train, X_test,y_train, y_test = train_test_split(
                                    df.drop(['Closing Price (USD)'],axis=1),
                                    df['Closing Price (USD)'],
                                    test_size = 0.2,
                                    random_state = 0
                                    )

print(X_train.shape, y_train.shape, X_test.shape, y_test.shape)

(2229, 5) (2229,) (558, 5) (558,)


In [11]:
models_quick_search = {
    'LinearRegression': LinearRegression(),
    "DecisionTreeRegressor": DecisionTreeRegressor(random_state=0),
    "RandomForestRegressor": RandomForestRegressor(random_state=0),
    "ExtraTreesRegressor": ExtraTreesRegressor(random_state=0),
    "AdaBoostRegressor": AdaBoostRegressor(random_state=0),
    "GradientBoostingRegressor": GradientBoostingRegressor(random_state=0),
    "XGBRegressor": XGBRegressor(random_state=0),
}

params_quick_search = {
    'LinearRegression': {},
    "DecisionTreeRegressor": {},
    "RandomForestRegressor": {},
    "ExtraTreesRegressor": {},
    "AdaBoostRegressor": {},
    "GradientBoostingRegressor": {},
    "XGBRegressor": {},
}