# import libraries and data

In [1]:
from features import Dataframe
from utils import *
from sklearn.pipeline import Pipeline
from sklearn.feature_selection import SelectKBest, mutual_info_classif
from sklearn.model_selection import train_test_split, cross_val_score, GridSearchCV
from sklearn.preprocessing import RobustScaler
from sklearn.impute import SimpleImputer
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
import pandas as pd
import numpy as np

## create features and targets

In [2]:
df = Dataframe().get_features()

In [3]:
X = df.drop(columns=['predict', 'date', 'Value_classification'])
y = df['predict']

## create x data that has no target

In [4]:
new_X = Dataframe().get_x_to_predict()

In [5]:
new_X.head()

Unnamed: 0,date,current_price,current_price_sats,market_cap,reddit_post_48h,reddit_comment_48h,reddit_subscribers,reddit_active_accounts,public_interest_stats,Value,...,sats_change_2_weeks,price_change_2_days,price_change_1_week,price_change_2_weeks,percent_change_2_days,percent_change_1_week,percent_change_2_weeks,percent_sats_2_days,percent_sats_1_week,percent_sats_2_weeks
624,2022-05-05,16.292368,41043.849063,18086810000.0,1.0,13.111,39880.0,65.3,38454.0,27.0,...,-5075.092515,1.299615,-0.645908,-2.7961,0.086683,-0.038133,-0.146481,0.055773,-0.049306,-0.110044
625,2022-05-06,14.598846,39868.599811,16139790000.0,2.5,14.5,39883.0,52.428571,38454.0,22.0,...,-5029.04015,-0.128516,-2.384108,-3.585874,-0.008726,-0.140382,-0.197192,0.022216,-0.067011,-0.112011
626,2022-05-07,14.355718,39757.183449,15902170000.0,1.364,6.273,39881.0,58.583333,38454.0,23.0,...,-6299.140105,-1.93665,-1.818523,-3.962852,-0.118869,-0.112433,-0.21633,-0.031349,-0.050062,-0.13677
627,2022-05-08,13.800377,38810.570189,15342350000.0,0.846,6.154,39879.0,59.857143,38454.0,18.0,...,-8614.010277,-0.798469,-0.769511,-4.940943,-0.054694,-0.052815,-0.263639,-0.026538,0.005946,-0.181636
628,2022-05-09,13.229361,38851.622851,14693130000.0,0.6,5.1,39889.0,60.363636,38454.0,11.0,...,-7147.612935,-1.126357,-2.189413,-4.931293,-0.07846,-0.141997,-0.271537,-0.022777,-0.030164,-0.155385


In [28]:
new_X.columns

Index(['date', 'current_price', 'current_price_sats', 'market_cap',
       'reddit_post_48h', 'reddit_comment_48h', 'reddit_subscribers',
       'reddit_active_accounts', 'public_interest_stats', 'Value',
       'Value_classification', 'new_accounts', 'active_accounts', 'fees',
       'price_ema5', 'price_ema20', 'price_ema50', 'fear_ema5', 'fear_ema20',
       'fear_ema50', 'fear_change_2_days', 'fear_change_1_week',
       'fear_change_2_weeks', 'reddit_post_ema5', 'reddit_post_ema20',
       'reddit_post_ema50', 'reddit_change_2_days', 'reddit_change_1_week',
       'reddit_change_2_weeks', 'sats_ema5', 'sats_ema20', 'sats_ema50',
       'sats_change_2_days', 'sats_change_1_week', 'sats_change_2_weeks',
       'price_change_2_days', 'price_change_1_week', 'price_change_2_weeks',
       'percent_change_2_days', 'percent_change_1_week',
       'percent_change_2_weeks', 'percent_sats_2_days', 'percent_sats_1_week',
       'percent_sats_2_weeks'],
      dtype='object')

# create Pipelines

## determine initial value for k in SelectKBest

In [6]:
scaling_pipeline = Pipeline([
    ('imputer', SimpleImputer()),
    ('scaler', RobustScaler())])

In [7]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
Xk_train = scaling_pipeline.fit_transform(X_train)

In [8]:
grad_k, grad_score = Utils().find_best_features_gradient(Xk_train, y_train)
rand_k, rand_score = Utils().find_best_features_r_forest(Xk_train, y_train)

In [9]:
initial_grad_k = grad_k[np.argmax(grad_score)]
initial_rand_k = rand_k[np.argmax(rand_score)]

In [10]:
display(f"The initial value for k in the Gradient Boosting pipeline will be: {initial_grad_k}")
display(f"The initial value for k in the Random Forest pipeline will be: {initial_rand_k}")

'The initial value for k in the Gradient Boosting pipeline will be: 7'

'The initial value for k in the Random Forest pipeline will be: 13'

## create pipeline for Gradient Boosting Classifier

In [11]:
gradient_pipe = Pipeline([('imputer', SimpleImputer()),
                ('scaler', RobustScaler()), 
                ('feature_select', SelectKBest(score_func=mutual_info_classif, k=7)),
                ('clf_grad', GradientBoostingClassifier(learning_rate=0.1, loss='log_loss',
                                                       max_depth=10, min_samples_leaf=5))])

## create pipeline for Random Forest Classifier

In [12]:
random_pipe = Pipeline([('imputer', SimpleImputer(strategy='median')),
                ('scaler', RobustScaler()), 
                ('feature_select', SelectKBest(score_func=mutual_info_classif, k=13)),
                ('clf_rand', RandomForestClassifier(criterion='log_loss',
                                                   max_depth=10, min_samples_split=3))])

## calculate initial cross-validation scores

In [13]:
grad_initial_score = cross_val_score(gradient_pipe, X_train, y_train, cv=5).mean()
rand_initial_score = cross_val_score(random_pipe, X_train, y_train, cv=5).mean()

In [14]:
display(f"The initial score for the Gradient Boosting Pipeline is {grad_initial_score}.")
display(f"The initial score for the Random Forest Pipeline is {rand_initial_score}.")

'The initial score for the Gradient Boosting Pipeline is 0.8877171717171717.'

'The initial score for the Random Forest Pipeline is 0.8997575757575758.'

# Grid-Search to find the best parameters for each pipeline

## Grid-Search for Gradient Boosting Classifier

In [15]:
param_grid = {'imputer__strategy':['mean', 'median', 'most_frequent'],
             'clf_grad__learning_rate': [0.1, 0.15, 0.2, 0.3],
             'clf_grad__loss': ['log_loss', 'exponential'],
             'clf_grad__max_depth': [3, 10, 20, 50, 100],
              'clf_grad__min_samples_leaf': [1, 3, 5, 8, 10]}



In [16]:
search = GridSearchCV(gradient_pipe, param_grid=param_grid, cv=5, n_jobs=-1)

In [17]:
search.fit(X_train, y_train)

In [18]:
search.best_params_

{'clf_grad__learning_rate': 0.15,
 'clf_grad__loss': 'log_loss',
 'clf_grad__max_depth': 10,
 'clf_grad__min_samples_leaf': 3,
 'imputer__strategy': 'mean'}

In [19]:
search.best_score_

0.9057777777777778

At this stage, the best parameters for the Gradient Boosting Classifier are:

> 'learning_rate': 0.15,

> 'loss': 'log_loss',
 
> 'max_depth': 20,
 
> 'min_samples_leaf': 5

> 'strategy': 'median'

The best score is:
> 0.9017979797979798


## Grid-Search for Random Forest Classifier

In [20]:
param_grid_rand = {'imputer__strategy':['mean', 'median', 'most_frequent'],
             'clf_rand__criterion': ['gini', 'entropy', 'log_loss'],
             'clf_rand__max_depth': [3, 10, 20, 50, 100],
              'clf_rand__min_samples_split': [1, 3, 5, 8, 10]}

In [21]:
search_rand = GridSearchCV(random_pipe, param_grid=param_grid_rand, cv=5, n_jobs=-1)

In [22]:
search_rand.fit(X_train, y_train)

225 fits failed out of a total of 1125.
The score on these train-test partitions for these parameters will be set to nan.
If these failures are not expected, you can try to debug them by setting error_score='raise'.

Below are more details about the failures:
--------------------------------------------------------------------------------
225 fits failed with the following error:
Traceback (most recent call last):
  File "/home/inherentspice/.pyenv/versions/3.8.12/envs/lewagon/lib/python3.8/site-packages/sklearn/model_selection/_validation.py", line 686, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
  File "/home/inherentspice/.pyenv/versions/3.8.12/envs/lewagon/lib/python3.8/site-packages/sklearn/pipeline.py", line 382, in fit
    self._final_estimator.fit(Xt, y, **fit_params_last_step)
  File "/home/inherentspice/.pyenv/versions/3.8.12/envs/lewagon/lib/python3.8/site-packages/sklearn/ensemble/_forest.py", line 476, in fit
    trees = Parallel(
  File "/home/inhe

In [23]:
search_rand.best_params_

{'clf_rand__criterion': 'gini',
 'clf_rand__max_depth': 100,
 'clf_rand__min_samples_split': 8,
 'imputer__strategy': 'most_frequent'}

In [24]:
search_rand.best_score_

0.9117575757575758

At this stage, the best parameters for the Random Forest Classifier are:
> 'criterion: 'log_loss',

> 'max_depth: 10,

> 'min_samples_split': 3,

> 'strategy': 'median'

The best score is:

> 0.9137575757575759


# Random Forest to predict Test Set

In [25]:
random_pipe.fit(X_train, y_train)
predict = random_pipe.score(X_test, y_test)

In [26]:
predict

0.888

# Random Forest to predict unseen data

In [29]:
predict_unseen = random_pipe.predict(new_X.drop(columns=['date', 'Value_classification']))

In [30]:
predict_unseen

array([0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0])

In [33]:
df = pd.DataFrame(new_X['date'])

In [34]:
df['prediction'] = predict_unseen

In [35]:
df

Unnamed: 0,date,prediction
624,2022-05-05,0
625,2022-05-06,0
626,2022-05-07,0
627,2022-05-08,0
628,2022-05-09,0
629,2022-05-10,0
630,2022-05-11,0
631,2022-05-12,0
632,2022-05-13,0
633,2022-05-14,0
