# import libraries and data

In [58]:
from features import Dataframe
from utils import *
from sklearn.pipeline import Pipeline
from sklearn.feature_selection import SelectKBest, mutual_info_classif
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.preprocessing import RobustScaler
from sklearn.impute import SimpleImputer
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
import pandas as pd
import numpy as np

## create features and targets

In [16]:
# Instantiate the project's feature builder, then ask it for the feature table.
# The returned table still contains the 'predict' column used as the target below.
feature_source = Dataframe()
df = feature_source.get_features()

In [28]:
# Columns kept out of the feature matrix: the target itself plus two other
# columns that are not used as model inputs.
non_feature_cols = ['predict', 'date', 'Value_classification']

# Target vector and feature matrix, both taken from the same frame so rows stay aligned.
y = df['predict']
X = df.drop(columns=non_feature_cols)

## create feature data for rows that have no target yet (rows to predict)

In [21]:
# Use a fresh instance of the project's feature builder to fetch the rows
# that are meant to be predicted (no target attached).
prediction_source = Dataframe()
new_x = prediction_source.get_x_to_predict()

In [23]:
# Preview the first rows of the to-be-predicted feature table; as the cell's
# last expression it is rendered by the notebook.
new_x.head()

Unnamed: 0,date,current_price,current_price_sats,market_cap,reddit_post_48h,reddit_comment_48h,reddit_subscribers,reddit_active_accounts,public_interest_stats,Value,...,sats_change_2_weeks,price_change_2_days,price_change_1_week,price_change_2_weeks,percent_change_2_days,percent_change_1_week,percent_change_2_weeks,percent_sats_2_days,percent_sats_1_week,percent_sats_2_weeks
624,2022-05-05,16.292368,41043.849063,18086810000.0,1.0,13.111,39880.0,65.3,38454.0,27.0,...,-5075.092515,1.299615,-0.645908,-2.7961,0.086683,-0.038133,-0.146481,0.055773,-0.049306,-0.110044
625,2022-05-06,14.598846,39868.599811,16139790000.0,2.5,14.5,39883.0,52.428571,38454.0,22.0,...,-5029.04015,-0.128516,-2.384108,-3.585874,-0.008726,-0.140382,-0.197192,0.022216,-0.067011,-0.112011
626,2022-05-07,14.355718,39757.183449,15902170000.0,1.364,6.273,39881.0,58.583333,38454.0,23.0,...,-6299.140105,-1.93665,-1.818523,-3.962852,-0.118869,-0.112433,-0.21633,-0.031349,-0.050062,-0.13677
627,2022-05-08,13.800377,38810.570189,15342350000.0,0.846,6.154,39879.0,59.857143,38454.0,18.0,...,-8614.010277,-0.798469,-0.769511,-4.940943,-0.054694,-0.052815,-0.263639,-0.026538,0.005946,-0.181636
628,2022-05-09,13.229361,38851.622851,14693130000.0,0.6,5.1,39889.0,60.363636,38454.0,11.0,...,-7147.612935,-1.126357,-2.189413,-4.931293,-0.07846,-0.141997,-0.271537,-0.022777,-0.030164,-0.155385


# create Pipelines

## determine initial value for k in SelectKBest

In [51]:
# Preprocessing-only pipeline (no feature selection, no classifier):
# impute missing values first, then scale. Both steps use their default settings.
preprocessing_steps = [
    ('imputer', SimpleImputer()),
    ('scaler', RobustScaler()),
]
scaling_pipeline = Pipeline(steps=preprocessing_steps)

In [74]:
# Hold out 20% of the rows for testing; random_state pins the split so it is
# the same on every run.
# NOTE(review): the data carries a 'date' column, so a shuffled split may mix
# earlier and later rows between train and test -- confirm this is intended.
split_parts = train_test_split(X, y, test_size=0.2, random_state=42)
X_train, X_test, y_train, y_test = split_parts

# Fit the imputer/scaler on the training rows only and keep the transformed
# training matrix for the k search that follows.
Xk_train = scaling_pipeline.fit_transform(X_train)

In [69]:
# Run the project's k-search helpers, one per classifier family. Each call is
# unpacked into two values that are later indexed together (candidate k values
# and their scores -- presumably parallel sequences; verify against utils).
#
# The target passed in is y_train: Xk_train is the scaled version of X_train,
# so y_train is the label vector that is row-aligned with it.
grad_k, grad_score = Utils().find_best_features_gradient(Xk_train, y_train)
rand_k, rand_score = Utils().find_best_features_r_forest(Xk_train, y_train)

In [70]:
# Locate the position of the highest score for each classifier family.
best_grad_idx = np.argmax(grad_score)
best_rand_idx = np.argmax(rand_score)

# The k value stored at that position becomes the starting k for the
# corresponding pipeline.
initial_grad_k = grad_k[best_grad_idx]
initial_rand_k = rand_k[best_rand_idx]

In [71]:
# Build the two report lines first, then show them in the same order as before.
grad_k_message = f"The initial value for k in the Gradient Boosting pipeline will be: {initial_grad_k}"
rand_k_message = f"The initial value for k in the Random Forest pipeline will be: {initial_rand_k}"

display(grad_k_message)
display(rand_k_message)

'The initial value for k in the Gradient Boosting pipeline will be: 7'

'The initial value for k in the Random Forest pipeline will be: 13'

## create pipeline for Gradient Boosting Classifier

In [81]:
# End-to-end Gradient Boosting model:
# impute -> scale -> keep the k best features (mutual information) -> classify.
# k=7 matches the value reported by the k search earlier in this notebook.
gradient_steps = [
    ('imputer', SimpleImputer()),
    ('scaler', RobustScaler()),
    ('feature_select', SelectKBest(score_func=mutual_info_classif, k=7)),
    ('clf', GradientBoostingClassifier()),
]
gradient_pipe = Pipeline(steps=gradient_steps)

## create pipeline for Random Forest Classifier

In [83]:
# End-to-end Random Forest model:
# impute -> scale -> keep the k best features (mutual information) -> classify.
# k=13 matches the value reported by the k search earlier in this notebook.
random_forest_steps = [
    ('imputer', SimpleImputer()),
    ('scaler', RobustScaler()),
    ('feature_select', SelectKBest(score_func=mutual_info_classif, k=13)),
    ('clf', RandomForestClassifier()),
]
random_pipe = Pipeline(steps=random_forest_steps)

## calculate initial cross-validation scores

In [84]:
# 5-fold cross-validation on the training split, one set of per-fold scores
# per pipeline (Gradient Boosting is evaluated first, then Random Forest).
grad_fold_scores = cross_val_score(gradient_pipe, X_train, y_train, cv=5)
rand_fold_scores = cross_val_score(random_pipe, X_train, y_train, cv=5)

# Collapse each set of fold scores to its mean to get one baseline number per pipeline.
grad_initial_score = grad_fold_scores.mean()
rand_initial_score = rand_fold_scores.mean()

In [85]:
# Build the two baseline-score report lines, then show them in the same order as before.
grad_score_message = f"The initial score for the Gradient Boosting Pipeline is {grad_initial_score}."
rand_score_message = f"The initial score for the Random Forest Pipeline is {rand_initial_score}."

display(grad_score_message)
display(rand_score_message)

'The initial score for the Gradient Boosting Pipeline is 0.8737171717171718.'

'The initial score for the Random Forest Pipeline is 0.9057575757575757.'