In [1]:

from sklearn.model_selection import train_test_split
from sklearn.ensemble import AdaBoostClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import RepeatedStratifiedKFold
from sklearn import metrics
import pandas as pd
import numpy as np 
import math

from sklearn.model_selection import cross_val_score
from sklearn.model_selection import RepeatedStratifiedKFold
from sklearn.model_selection import KFold
from sklearn.model_selection import GridSearchCV

In [5]:
features_df = pd.read_csv("bitcoin_train.csv")
features_df.head()

Unnamed: 0.1,Unnamed: 0,Date,Open,High,Low,Close,Volume,Daily_Change,Daily_Change_Ind,MACD,...,Open_Gold,Daily_Change_Gold,Daily_Change_Perc_Gold,Increased_Gold,Close/Last_SP500,Open_SP500,Daily_Change_SP500,Daily_Change_Perc_SP500,Increased_SP500,label
0,793,2017-12-28,15864.099609,15888.400391,13937.299805,14606.5,12336499712,-1232.0,0.0,636.405515,...,1292.0,5.2,0.004025,1,2687.54,2686.1,1.44,0.000536,1,0.0
1,908,2018-06-14,6342.75,6707.140137,6334.459961,6675.350098,5138710016,325.450196,1.0,-383.20157,...,1303.1,5.2,0.00399,1,2782.49,2783.21,-0.72,-0.000259,0,0.0
2,224,2015-09-25,234.362,237.427002,233.684006,235.143997,22363600,0.61499,1.0,-1.731076,...,1151.0,-5.4,-0.004692,0,1931.34,1935.93,-4.59,-0.002371,0,1.0
3,1042,2018-12-26,3819.666748,3893.359619,3769.86377,3857.297607,5326547918,41.806884,1.0,-83.043066,...,1273.5,-0.5,-0.000393,0,2467.7,2363.12,104.58,0.044255,1,1.0
4,563,2017-01-31,920.958984,972.018982,920.958984,970.403015,164582000,50.020996,1.0,12.846431,...,1197.7,13.7,0.011439,1,2278.87,2274.02,4.85,0.002133,1,1.0


In [12]:
features_df.columns

feature_lst = ['Daily_Change', 'Daily_Change_Ind', 'MACD', 'PROC_3', 'PROC_5','PROC_10', 'wpr',\
                'sto_os', 'goog_trend_score', 'count', 'compound', 'retweets_count', 'likes_count', 'replies_count',\
                'compound_weighted_replies', 'compound_weighted_likes','compound_weighted_retweets',\
                'Daily_Change_Perc', 'Daily_Change_Gold', 'Daily_Change_Perc_Gold', 'Increased_Gold', \
                'Daily_Change_SP500', 'Daily_Change_Perc_SP500', 'Increased_SP500']

outcome = features_df.columns[-1]

print(feature_lst)
print(outcome)

['Daily_Change', 'Daily_Change_Ind', 'MACD', 'PROC_3', 'PROC_5', 'PROC_10', 'wpr', 'sto_os', 'goog_trend_score', 'count', 'compound', 'retweets_count', 'likes_count', 'replies_count', 'compound_weighted_replies', 'compound_weighted_likes', 'compound_weighted_retweets', 'Daily_Change_Perc', 'Daily_Change_Gold', 'Daily_Change_Perc_Gold', 'Increased_Gold', 'Daily_Change_SP500', 'Daily_Change_Perc_SP500', 'Increased_SP500']
label


In [13]:
X = features_df[feature_lst]
y = features_df[outcome]
print(X.shape, y.shape)

(1253, 24) (1253,)


In [14]:
X_train, X_valid, y_train, y_valid = train_test_split(X, y, random_state = 1234)

In [15]:
# Figure out what the tree depth should be (maximum score)
results_dict = {}
cx_validation = KFold(n_splits = 10, shuffle = True, random_state = 1)
for depth in range(1, 11):
    base_estimator = DecisionTreeClassifier(max_depth = depth, random_state = 1234)
    if base_estimator.fit(X,y).tree_.max_depth < depth:
        break
    accuracy_score = np.mean(cross_val_score(base_estimator, X, y, cv = cx_validation, scoring = 'accuracy', n_jobs = 1))
    print("Depth:", depth, ", Score:", accuracy_score)

    results_dict[depth] = accuracy_score

max_val = -1
max_depth_val = None
for key, val in results_dict.items():
    if val > max_val:
        max_val = val
        max_depth_val = key

print("Max depth should be:", max_depth_val)

    # https://educationalresearchtechniques.com/2019/01/02/adaboost-classification-in-python/
    # https://towardsdatascience.com/boosting-and-adaboost-clearly-explained-856e21152d3e
    # https://python-bloggers.com/2019/01/adaboost-classification-in-python/ # this one for 
    

Depth: 1 , Score: 0.6327555555555555
Depth: 2 , Score: 0.6455619047619047
Depth: 3 , Score: 0.6512507936507935
Depth: 4 , Score: 0.646431746031746
Depth: 5 , Score: 0.6440253968253967
Depth: 6 , Score: 0.6313523809523809
Depth: 7 , Score: 0.6184888888888889
Depth: 8 , Score: 0.6025269841269841
Depth: 9 , Score: 0.5969523809523809
Depth: 10 , Score: 0.6049015873015873
Max depth should be: 3


In [16]:
# Create adaboost classifer object
base_estimator = DecisionTreeClassifier(max_depth = max_depth_val, random_state = 1234)

adaboost = AdaBoostClassifier(base_estimator = base_estimator, n_estimators = 20,
                              learning_rate = 1, random_state = 1234)

# Train Adaboost Classifer
model = adaboost.fit(X_train, y_train)

# Predict the response for valid dataset
y_hat = model.predict(X_valid)

In [17]:
# Model Accuracy with chosen decision tree depth and randomly chosen hyperparameters
accuracy_score = metrics.accuracy_score(y_valid, y_hat)
mse = metrics.mean_squared_error(y_valid, y_hat)
confusion = metrics.confusion_matrix(y_valid, y_hat) 

print("Accuracy:", accuracy_score)
print("MSE:", mse)
print("CONFUSION:", confusion)


Accuracy: 0.589171974522293
MSE: 0.410828025477707
CONFUSION: [[ 75  61]
 [ 68 110]]


In [50]:
# ask irsa about this part
# new = model.predict(adaboost.feature_importances_)
# for i in range(len(feature_lst)):
#     print(f"{feature_lst[i]}: {feature_import[i]}")



ValueError: Expected 2D array, got 1D array instead:
array=[0.   0.   0.1  0.05 0.15 0.05 0.   0.05 0.05 0.   0.05 0.05 0.   0.
 0.1  0.05 0.   0.15 0.05 0.05 0.   0.05 0.   0.  ].
Reshape your data either using array.reshape(-1, 1) if your data has a single feature or array.reshape(1, -1) if it contains a single sample.

hyper parametertuning (number estimators and learning rate)


In [38]:
# hyper parametertuning (number estimators and learning rate)
ada = AdaBoostClassifier()
search_grid = {'n_estimators':[500,1000,2000], 'learning_rate':[.001,0.01,.1]}
search = GridSearchCV(estimator = ada, param_grid = search_grid, scoring = 'accuracy', n_jobs = 1, cv = cx_validation)

#https://python-bloggers.com/2019/01/adaboost-classification-in-python/

In [None]:
search.fit(X,y)
search.best_params_
print(search.best_params_)
search.best_score_
print(search.best_score_)
#Out[34]: 0.7425149700598802



# https://python-bloggers.com/2019/01/adaboost-classification-in-python/

In [196]:
score = np.mean(cross_val_score(ada, X, y, scoring='accuracy', cv = cx_validation, n_jobs = 1))
score


#https://python-bloggers.com/2019/01/adaboost-classification-in-python/

0.6472126984126984

In [18]:
# https://machinelearningmastery.com/adaboost-ensemble-in-python/

# alternative way to check n_estiamtors and learning_Rate
model = AdaBoostClassifier()
# define the grid of values to search
grid = dict()
grid['n_estimators'] = [10, 50, 100, 500, 1000]
grid['learning_rate'] = [0.001, 0.01, 0.1, 1.0]
# define the evaluation procedure
cv = RepeatedStratifiedKFold(n_splits=10, n_repeats=3, random_state=1)
# define the grid search procedure
grid_search = GridSearchCV(estimator=model, param_grid=grid, n_jobs=-1, cv=cv, scoring='accuracy')
# execute the grid search
grid_result = grid_search.fit(X, y)
# summarize the best score and configuration
print("Best: %f using %s" % (grid_result.best_score_, grid_result.best_params_))
# summarize all scores that were evaluated
means = grid_result.cv_results_['mean_test_score']
stds = grid_result.cv_results_['std_test_score']
params = grid_result.cv_results_['params']
for mean, stdev, param in zip(means, stds, params):
    print("%f (%f) with: %r" % (mean, stdev, param))

Best: 0.681812 using {'learning_rate': 0.01, 'n_estimators': 500}
0.640842 (0.034224) with: {'learning_rate': 0.001, 'n_estimators': 10}
0.644830 (0.031782) with: {'learning_rate': 0.001, 'n_estimators': 50}
0.648298 (0.031138) with: {'learning_rate': 0.001, 'n_estimators': 100}
0.670358 (0.042053) with: {'learning_rate': 0.001, 'n_estimators': 500}
0.677280 (0.037879) with: {'learning_rate': 0.001, 'n_estimators': 1000}
0.643767 (0.030720) with: {'learning_rate': 0.01, 'n_estimators': 10}
0.670093 (0.041589) with: {'learning_rate': 0.01, 'n_estimators': 50}
0.677280 (0.037879) with: {'learning_rate': 0.01, 'n_estimators': 100}
0.681812 (0.033829) with: {'learning_rate': 0.01, 'n_estimators': 500}
0.678356 (0.037315) with: {'learning_rate': 0.01, 'n_estimators': 1000}
0.679943 (0.039659) with: {'learning_rate': 0.1, 'n_estimators': 10}
0.680222 (0.034767) with: {'learning_rate': 0.1, 'n_estimators': 50}
0.679689 (0.037879) with: {'learning_rate': 0.1, 'n_estimators': 100}
0.674343 (0.0

Rerun adaboost with chosen hyperparameters

In [19]:
base_estimator = DecisionTreeClassifier(max_depth = max_depth_val, random_state = 1234)

chosen_learning_rate = list(grid_result.best_params_.values())[0]
chosen_n = list(grid_result.best_params_.values())[1]

print("chosen_n", chosen_n)
print("chosen_learing_rate", chosen_learning_rate)

adaboost = AdaBoostClassifier(base_estimator = base_estimator, n_estimators = chosen_n,
                              learning_rate = chosen_learning_rate, random_state = 1234)

# Train Adaboost Classifer
model = adaboost.fit(X_train, y_train)

# Predict the response for valid dataset
y_hat = model.predict(X_valid)

chosen_n 500
chosen_learing_rate 0.01


In [20]:
# Model Accuracy with chosen decision tree depth and chosen hyperparameters
accuracy_score = metrics.accuracy_score(y_valid, y_hat)
mse = metrics.mean_squared_error(y_valid, y_hat)
confusion = metrics.confusion_matrix(y_valid, y_hat) 

print("Accuracy:", accuracy_score)
print("MSE:", mse)
print("CONFUSION:", confusion)

Accuracy: 0.6401273885350318
MSE: 0.35987261146496813
CONFUSION: [[ 79  57]
 [ 56 122]]


In [None]:
''' other links
https://inria.github.io/scikit-learn-mooc/python_scripts/ensemble_adaboost.html
https://scikit-learn.org/stable/modules/generated/sklearn.ensemble.AdaBoostClassifier.html
 https://scikit-learn.org/stable/tutorial/statistical_inference/model_selection.html
 https://scikit-learn.org/stable/modules/generated/sklearn.model_selection.cross_val_score.html
 
 https://towardsdatascience.com/how-do-you-implement-adaboost-with-python-a76427b0fa7a
 https://towardsdatascience.com/machine-learning-part-17-boosting-algorithms-adaboost-in-python-d00faac6c464
 
 https://machinelearningmastery.com/adaboost-ensemble-in-python/
 https://machinelearningmastery.com/k-fold-cross-validation/

 https://medium.datadriveninvestor.com/understanding-adaboost-and-scikit-learns-algorithm-c8d8af5ace10
 https://blog.paperspace.com/adaboost-optimizer/
 https://www.datacamp.com/tutorial/adaboost-classifier-python
 https://educationalresearchtechniques.com/2019/01/02/adaboost-classification-in-python/

 consider trying standardization https://github.com/mehuls45/Heart-Disease-prediction-using-ML/blob/master/AdaBoost.ipynb

 # no need to normalize/standardize in ensemble methods 
 https://towardsdatascience.com/the-ultimate-guide-to-adaboost-random-forests-and-xgboost-7f9327061c4f

# Repeat for Dogecoin


In [None]:
features_df = pd.read_csv("bitcoin_train.csv")
features_df.head()