In [1]:
import pandas as pd
import numpy as np 
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, mean_squared_error

In [2]:
features_df = pd.read_csv("bitcoin_train.csv")
features_df.head()

Unnamed: 0.1,Unnamed: 0,Date,Open,High,Low,Close,Volume,Daily_Change,Daily_Change_Ind,MACD,...,Daily_Change_Perc_Gold,Increased_Gold,RSI_Gold,Close/Last_SP500,Open_SP500,Daily_Change_SP500,Daily_Change_Perc_SP500,Increased_SP500,RSI_SP500,label
0,268,2015-11-30,371.437012,382.363007,370.382996,377.321014,71701600,6.027008,1.0,10.296558,...,0.008807,1,33.968948,2080.41,2090.95,-10.54,-0.005041,0,57.384705,0.0
1,1016,2018-11-15,5736.149902,5774.819824,5358.379883,5648.029785,7032140000,-90.320313,0.0,-121.126607,...,0.002641,1,48.29623,2730.2,2693.52,36.68,0.013618,1,59.599184,0.0
2,1144,2019-05-23,7677.269043,7943.791504,7533.196777,7881.84668,24457107820,201.780274,1.0,625.729133,...,0.009777,1,53.162762,2822.24,2836.7,-14.46,-0.005097,0,41.547984,1.0
3,1082,2019-02-25,3807.002441,3913.707275,3807.002441,3882.696289,9318796067,72.268799,1.0,94.876946,...,-0.001802,0,58.979842,2796.11,2804.35,-8.24,-0.002938,0,45.563129,0.0
4,1149,2019-05-31,8320.286133,8586.65918,8172.550781,8574.501953,25365190957,255.029297,1.0,608.732748,...,0.013842,1,63.288965,2752.06,2766.15,-14.09,-0.005094,0,20.448755,0.0


In [3]:
features_df.isnull().sum()

Unnamed: 0                 0
Date                       0
Open                       0
High                       0
Low                        0
                          ..
Daily_Change_SP500         0
Daily_Change_Perc_SP500    0
Increased_SP500            0
RSI_SP500                  0
label                      0
Length: 65, dtype: int64

In [6]:
features_df.columns

Index(['Unnamed: 0', 'Date', 'Open', 'High', 'Low', 'Close', 'Volume',
       'Daily_Change', 'Daily_Change_Ind', 'MACD', 'PROC_3', 'PROC_5',
       'PROC_10', 'wpr', 'sto_os', 'goog_trend_score', 'count',
       'compound_times_retweets', 'likes_count', 'neg_times_retweets',
       'pos_times_retweets', 'replies_count', 'retweets_count', 'pos_weighted',
       'neg_weighted', 'compound_weighted', 'count_avg7', 'count_daily_diff',
       'count_weekly_diff', 'replies_count_avg7', 'replies_count_daily_diff',
       'replies_count_weekly_diff', 'retweets_count_avg7',
       'retweets_count_daily_diff', 'retweets_count_weekly_diff',
       'likes_count_avg7', 'likes_count_daily_diff', 'likes_count_weekly_diff',
       'compound_weighted_avg7', 'compound_weighted_daily_diff',
       'compound_weighted_weekly_diff', 'pos_weighted_avg7',
       'pos_weighted_daily_diff', 'pos_weighted_weekly_diff',
       'neg_weighted_avg7', 'neg_weighted_daily_diff',
       'neg_weighted_weekly_diff', 'Dai

In [11]:
# Creating X and y 

# features_lst = ['Daily_Change', 'Daily_Change_Ind', 'MACD', 'PROC_3', 'PROC_5','PROC_10', 'wpr',\
#                 'sto_os', 'goog_trend_score', 'count', 'compound', 'retweets_count', 'likes_count', 'replies_count',\
#                 'compound_weighted_replies', 'compound_weighted_likes','compound_weighted_retweets',\
#                 'Daily_Change_Perc', 'Daily_Change_Gold', 'Daily_Change_Perc_Gold', 'Increased_Gold', \
#                 'Daily_Change_SP500', 'Daily_Change_Perc_SP500', 'Increased_SP500', 'Weekly_Change', 'Weekly_Change_Perc' ]


features_lst = ['MACD', 'RSI', 'PROC_3', 'Daily_Change_Perc_SP500', 'Daily_Change_Perc_Gold', \
                'compound_weighted_avg7', 'likes_count_daily_diff', 'pos_weighted_avg7',\
                'retweets_count_weekly_diff','goog_trend_score', 'RSI_Gold', 'RSI_SP500']

y = features_df['label']


In [12]:
def predict_btc(rf_model, features):
    '''
    Fits a Random Forest model to predict whether the price of bitcoin will go up or down
    Inputs:
        rf_model: the random forest model
        features: list of features to use
    Returns: 
         yhat: predictions  
    '''
    
    X = features_df[features_lst]
    X_train, X_valid, y_train, y_valid = train_test_split(X, y, random_state = 42)
    
    rf.fit(X_train, y_train)

    yhat = rf.predict(X_valid)
    acc_score  = accuracy_score(y_valid, yhat)
    mse = mean_squared_error(y_valid, yhat)
    
    print('Fraction of correctly classified samples: ', acc_score)
    
    return (yhat, rf.feature_importances_)


In [13]:
rf = RandomForestClassifier()
yhat1, feature_import = predict_btc(rf, features_lst)

Fraction of correctly classified samples:  0.5192307692307693


In [14]:
for i in range(len(features_lst)):
    print(f"{features_lst[i]}: {feature_import[i]}")

MACD: 0.08456644239284376
RSI: 0.08116357955712812
PROC_3: 0.08563461601760264
Daily_Change_Perc_SP500: 0.08509464815761884
Daily_Change_Perc_Gold: 0.09051825239789148
compound_weighted_avg7: 0.08286262856039778
likes_count_daily_diff: 0.07996569960200124
pos_weighted_avg7: 0.08156778435152277
retweets_count_weekly_diff: 0.07895311466774188
goog_trend_score: 0.07507869878490928
RSI_Gold: 0.09000110972010887
RSI_SP500: 0.08459342579023327


In [15]:
param_grid = [{'n_estimators': [5, 10, 15, 20, 25, 30]}, {'max_features': ['sqrt', 'log2', None]},
              {'oob_score': [True, False]}]


X = features_df[features_lst]
grid_search_rf = GridSearchCV(rf, param_grid, cv = 5, scoring = 'accuracy')
grid_search_rf.fit(X, y)


GridSearchCV(cv=5, estimator=RandomForestClassifier(),
             param_grid=[{'n_estimators': [5, 10, 15, 20, 25, 30]},
                         {'max_features': ['sqrt', 'log2', None]},
                         {'oob_score': [True, False]}],
             scoring='accuracy')

In [16]:
best_hp = grid_search_rf.best_params_
best_hp

{'n_estimators': 30}

In [17]:
rf_log2 = RandomForestClassifier(n_estimators=30)
yhat_log2 = predict_btc(rf_log2, features_lst)

Fraction of correctly classified samples:  0.5224358974358975


In [24]:
dogecoin = pd.read_csv("dogecoin_train.csv")
dogecoin = dogecoin.rename(columns={'goog_trend': 'goog_trend_score'})

In [25]:
y2 = dogecoin['label']

In [26]:
def predict_doge(rf_model, features):
    '''
    Fits a Random Forest model to predict whether the price of bitcoin will go up or down
    Inputs:
        rf_model: the random forest model
        features: list of features to use
    Returns: 
         yhat: predictions  
    '''
    
    X = dogecoin[features_lst]
    X_train, X_valid, y_train, y_valid = train_test_split(X, y2, random_state = 42)
    
    rf.fit(X_train, y_train)

    yhat = rf.predict(X_valid)
    acc_score  = accuracy_score(y_valid, yhat)
    mse = mean_squared_error(y_valid, yhat)
    
    print('Fraction of correctly classified samples: ', acc_score)
    
    return (yhat, rf.feature_importances_)

In [27]:
rf2 = RandomForestClassifier()
yhat1, feature_import = predict_doge(rf2, features_lst)

Fraction of correctly classified samples:  0.8048780487804879


In [28]:
for i in range(len(features_lst)):
    print(f"{features_lst[i]}: {feature_import[i]}")

MACD: 0.18915503988915888
PROC_5: 0.10319450747669909
wpr: 0.1073144930965813
goog_trend_score: 0.10563574647896565
compound_weighted_retweets: 0.06183551305406923
Daily_Change_Perc: 0.10077242515622381
Weekly_Change_Perc: 0.1018444474009364
Daily_Change_Perc_Gold: 0.11530183014496793
Daily_Change_Perc_SP500: 0.1149459973023978
