In [3]:
import pandas as pd
import numpy as np

In [4]:
# Read the bitcoin and dogecoin training CSVs (paths are relative to the
# notebook's working directory).
bitcoin, dogecoin = map(
    pd.read_csv,
    ("../models/bitcoin_train.csv", "../models/dogecoin_train.csv"),
)

In [5]:
# Show the dtype of every column in the bitcoin training frame.
bitcoin.dtypes

Unnamed: 0                   int64
Date                        object
Open                       float64
High                       float64
Low                        float64
                            ...   
Open_SP500                 float64
Daily_Change_SP500         float64
Daily_Change_Perc_SP500    float64
Increased_SP500              int64
label                      float64
Length: 62, dtype: object

In [8]:
# Candidate feature columns, grouped by family. Order is significant for any
# code that zips this list against positional model outputs.
feature_list = [
    # technical indicators
    'MACD', 'PROC_3', 'PROC_5', 'PROC_10', 'wpr',
    # search interest and daily weighted sentiment
    'goog_trend_score',
    'compound_weighted', 'neg_weighted', 'pos_weighted',
    # tweet volume / engagement: 7-day average, daily diff, weekly diff
    'count_avg7', 'count_daily_diff', 'count_weekly_diff',
    'replies_count_avg7', 'replies_count_daily_diff', 'replies_count_weekly_diff',
    'retweets_count_avg7', 'retweets_count_daily_diff', 'retweets_count_weekly_diff',
    'likes_count_avg7', 'likes_count_daily_diff', 'likes_count_weekly_diff',
    # weighted sentiment: 7-day average, daily diff, weekly diff
    'compound_weighted_avg7', 'compound_weighted_daily_diff', 'compound_weighted_weekly_diff',
    'pos_weighted_avg7', 'pos_weighted_daily_diff', 'pos_weighted_weekly_diff',
    'neg_weighted_avg7', 'neg_weighted_daily_diff', 'neg_weighted_weekly_diff',
    # price changes
    'Daily_Change_Perc', 'Weekly_Change_Perc',
    'Daily_Change_Perc_Gold', 'Daily_Change_Perc_SP500',
    'RSI',
]

In [57]:
# Price/market-derived feature columns.
# NOTE(review): 'Weekly_Change_Perc' is in feature_list but in neither subset
# below -- confirm the omission is intentional.
financial_features = [
    'MACD', 'PROC_3', 'PROC_5', 'PROC_10', 'wpr', 'RSI',
    'Daily_Change_Perc', 'Daily_Change_Perc_Gold', 'Daily_Change_Perc_SP500',
]

# Search-trend, tweet-engagement and sentiment feature columns.
social_features = [
    'goog_trend_score',
    'compound_weighted', 'neg_weighted', 'pos_weighted',
    'count_avg7', 'count_daily_diff', 'count_weekly_diff',
    'replies_count_avg7', 'replies_count_daily_diff', 'replies_count_weekly_diff',
    'retweets_count_avg7', 'retweets_count_daily_diff', 'retweets_count_weekly_diff',
    'likes_count_avg7', 'likes_count_daily_diff', 'likes_count_weekly_diff',
    'compound_weighted_avg7', 'compound_weighted_daily_diff', 'compound_weighted_weekly_diff',
    'pos_weighted_avg7', 'pos_weighted_daily_diff', 'pos_weighted_weekly_diff',
    'neg_weighted_avg7', 'neg_weighted_daily_diff', 'neg_weighted_weekly_diff',
]

In [50]:
def get_corr_matrices(df, features, y, strong_threshold=0.7, moderate_threshold=0.5):
    """Build a highlighted feature-correlation matrix and a ranked label-correlation table.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame containing every column named in ``features`` plus column ``y``.
    features : list of str
        Columns to correlate with each other and with ``y``.
    y : str
        Name of the target column.
    strong_threshold : float, default 0.7
        Off-diagonal cells whose absolute correlation exceeds this value are
        highlighted red.
    moderate_threshold : float, default 0.5
        Off-diagonal cells whose absolute correlation exceeds this value (but
        not ``strong_threshold``) are highlighted yellow.

    Returns
    -------
    feature_corr : pandas Styler
        ``df[features].corr()`` wrapped in a Styler; diagonal cells are gray.
    label_corr : pandas.DataFrame
        Columns ``Feature`` and ``Corr`` (absolute correlation with ``y``),
        sorted by ``Corr`` descending. Undefined (NaN) correlations are placed
        last, and ties keep the order of ``features``.
    """
    corr_matrix = df[features].corr()

    def color_corr(row):
        # Called once per matrix row (axis=1); row.name is that row's feature,
        # so a matching column label marks the diagonal cell.
        styles = []
        for col, value in row.items():
            if col == row.name:
                styles.append("background: gray")
            elif abs(value) > strong_threshold:
                styles.append("background: red")
            elif abs(value) > moderate_threshold:
                styles.append("background: yellow")
            else:
                # NaN correlations land here: comparisons with NaN are False.
                styles.append("")
        return styles

    feature_corr = corr_matrix.style.apply(color_corr, axis=1)

    label_corr = pd.DataFrame(
        {
            "Feature": list(features),
            "Corr": [np.abs(df[name].corr(df[y])) for name in features],
        },
        columns=["Feature", "Corr"],
    )
    # A plain list.sort keyed on floats gives an arbitrary order as soon as one
    # key is NaN (every comparison with NaN is False). sort_values has defined
    # NaN placement, and mergesort keeps ties in their original order.
    label_corr = label_corr.sort_values(
        "Corr", ascending=False, kind="mergesort", na_position="last"
    ).reset_index(drop=True)

    return feature_corr, label_corr

In [51]:
# Correlations among the financial features, and of each with the label, on the bitcoin frame.
bit_feat_corr, bit_label_corr = get_corr_matrices(
    df=bitcoin,
    features=financial_features,
    y='label',
)

In [52]:
# Display the highlighted feature-vs-feature correlation matrix (financial features).
bit_feat_corr

Unnamed: 0,MACD,PROC_3,PROC_5,PROC_10,wpr,RSI,Daily_Change_Perc,Daily_Change_Perc_Gold,Daily_Change_Perc_SP500
MACD,1.0,0.192118,0.262594,0.475475,0.26693,0.539179,0.071249,-0.035243,0.036954
PROC_3,0.192118,1.0,0.818099,0.585475,0.627979,0.510222,0.590778,0.0528,0.058325
PROC_5,0.262594,0.818099,1.0,0.73315,0.677458,0.607987,0.485714,0.049506,0.079375
PROC_10,0.475475,0.585475,0.73315,1.0,0.691064,0.744366,0.342134,0.036876,0.092178
wpr,0.26693,0.627979,0.677458,0.691064,1.0,0.706751,0.544168,0.03295,0.140663
RSI,0.539179,0.510222,0.607987,0.744366,0.706751,1.0,0.313829,0.00544,0.060114
Daily_Change_Perc,0.071249,0.590778,0.485714,0.342134,0.544168,0.313829,1.0,0.074008,0.154932
Daily_Change_Perc_Gold,-0.035243,0.0528,0.049506,0.036876,0.03295,0.00544,0.074008,1.0,-0.031496
Daily_Change_Perc_SP500,0.036954,0.058325,0.079375,0.092178,0.140663,0.060114,0.154932,-0.031496,1.0


In [53]:
# Display the financial features ranked by absolute correlation with the label.
bit_label_corr

Unnamed: 0,Feature,Corr
0,RSI,0.07695
1,Daily_Change_Perc_SP500,0.046922
2,wpr,0.030434
3,Daily_Change_Perc_Gold,0.027668
4,PROC_3,0.014206
5,MACD,0.01154
6,Daily_Change_Perc,0.009212
7,PROC_10,0.007456
8,PROC_5,2.9e-05


In [13]:
from sklearn.ensemble import RandomForestClassifier

In [54]:
# Fit a random forest on the financial features and read off its feature
# importances. The forest is randomised, so random_state is pinned: without it
# the importances (and their ranking) change on every run.
rf = RandomForestClassifier(random_state=42)
rf.fit(bitcoin[financial_features], bitcoin['label'])
# One importance per column, in the same order as financial_features.
importances = rf.feature_importances_.tolist()

In [55]:
# Pair each financial feature name with its importance from the fitted forest.
importances_df = pd.DataFrame(
    {"feature": financial_features, "importance": importances}
)

In [56]:
# Show the financial features from most to least important.
importances_df.sort_values("importance", ascending=False)

Unnamed: 0,feature,importance
0,MACD,0.119547
4,wpr,0.115768
5,RSI,0.113797
8,Daily_Change_Perc_SP500,0.113767
3,PROC_10,0.113371
7,Daily_Change_Perc_Gold,0.11009
1,PROC_3,0.107322
2,PROC_5,0.105995
6,Daily_Change_Perc,0.100344


In [58]:
# Correlations among the social features, and of each with the label, on the bitcoin frame.
social_feat_corr, social_label_corr = get_corr_matrices(
    df=bitcoin,
    features=social_features,
    y='label',
)

In [59]:
# Display the highlighted feature-vs-feature correlation matrix (social features).
social_feat_corr

Unnamed: 0,goog_trend_score,compound_weighted,neg_weighted,pos_weighted,count_avg7,count_daily_diff,count_weekly_diff,replies_count_avg7,replies_count_daily_diff,replies_count_weekly_diff,retweets_count_avg7,retweets_count_daily_diff,retweets_count_weekly_diff,likes_count_avg7,likes_count_daily_diff,likes_count_weekly_diff,compound_weighted_avg7,compound_weighted_daily_diff,compound_weighted_weekly_diff,pos_weighted_avg7,pos_weighted_daily_diff,pos_weighted_weekly_diff,neg_weighted_avg7,neg_weighted_daily_diff,neg_weighted_weekly_diff
goog_trend_score,1.0,-0.041943,0.055054,0.010403,-0.052225,-0.039668,0.045811,-0.006421,0.012692,0.02287,-0.111865,-0.001881,-0.004534,-0.021241,-0.01837,0.032418,-0.058428,-0.006804,-0.006155,0.011959,-0.00411,0.005402,0.096297,0.00403,0.010132
compound_weighted,-0.041943,1.0,-0.328813,0.797707,0.329402,0.017838,0.034359,0.312077,0.104524,0.156385,0.200886,0.062327,0.096774,0.271348,0.01653,0.039358,0.640802,0.573675,0.770885,0.527272,0.496602,0.635548,-0.031085,-0.238234,-0.339927
neg_weighted,0.055054,-0.328813,1.0,-0.02285,0.010932,0.018867,0.025202,0.004956,-0.053448,-0.063092,-0.041211,-0.042876,-0.055553,0.023092,0.006036,0.008249,-0.01145,-0.295412,-0.418799,0.035452,-0.016126,-0.046912,0.391505,0.635889,0.881287
pos_weighted,0.010403,0.797707,-0.02285,1.0,0.045445,0.006012,-0.010756,0.095327,0.049549,0.081841,-0.016437,0.008241,0.01815,0.059051,-0.003703,0.000256,0.38592,0.555659,0.718863,0.522976,0.700535,0.875081,0.045009,-0.02086,-0.047848
count_avg7,-0.052225,0.329402,0.010932,0.045445,1.0,0.039529,0.22748,0.721731,0.032794,0.101779,0.856672,0.047855,0.127611,0.896588,0.060431,0.204306,0.523257,-0.015568,-0.005084,0.119825,-0.017031,-0.014776,0.012393,0.017651,0.005467
count_daily_diff,-0.039668,0.017838,0.018867,0.006012,0.039529,1.0,0.687525,0.045776,0.243756,0.168951,0.045724,0.43166,0.341045,0.077554,0.805941,0.639359,0.017621,0.031175,0.008615,0.006835,0.017127,0.003166,0.026003,-0.005841,0.007066
count_weekly_diff,0.045811,0.034359,0.025202,-0.010756,0.22748,0.687525,1.0,0.137953,0.162615,0.185396,0.193317,0.268158,0.376007,0.302374,0.546735,0.785364,0.047113,0.024137,0.005665,-0.007793,0.007131,-0.008182,0.063694,-0.00801,-0.005434
replies_count_avg7,-0.006421,0.312077,0.004956,0.095327,0.721731,0.045776,0.137953,1.0,0.000843,0.083295,0.809603,0.020235,0.093668,0.825993,0.043193,0.142201,0.489457,-0.01285,0.000393,0.210668,-0.014609,-0.007894,0.013735,0.009566,-0.001689
replies_count_daily_diff,0.012692,0.104524,-0.053448,0.049549,0.032794,0.243756,0.162615,0.000843,1.0,0.788655,0.000409,0.65621,0.54556,0.061116,0.47569,0.391367,0.007377,0.139681,0.130028,0.015151,0.049532,0.04947,0.012435,-0.069326,-0.064235
replies_count_weekly_diff,0.02287,0.156385,-0.063092,0.081841,0.101779,0.168951,0.185396,0.083295,0.788655,1.0,0.069676,0.512151,0.675214,0.140514,0.362946,0.466224,0.05007,0.102576,0.162157,0.041543,0.030818,0.072332,0.011841,-0.0427,-0.07437


In [60]:
# Display the social features ranked by absolute correlation with the label.
social_label_corr

Unnamed: 0,Feature,Corr
0,compound_weighted_avg7,0.085825
1,pos_weighted_avg7,0.085565
2,compound_weighted,0.069644
3,pos_weighted,0.066354
4,likes_count_daily_diff,0.061732
5,likes_count_weekly_diff,0.060719
6,retweets_count_weekly_diff,0.051222
7,retweets_count_daily_diff,0.043428
8,replies_count_avg7,0.039634
9,neg_weighted_avg7,0.038582


In [61]:
# Fit a random forest on the social features and rank them by importance.
# The forest is randomised, so random_state is pinned: without it the
# importances (and their ranking) change on every run.
rf2 = RandomForestClassifier(random_state=42)
rf2.fit(bitcoin[social_features], bitcoin['label'])
# One importance per column, in the same order as social_features.
importances2 = rf2.feature_importances_.tolist()

importances_df2 = pd.DataFrame(list(zip(social_features, importances2)), columns=["feature", "importance"])
importances_df2.sort_values(by=['importance'], ascending=False)

Unnamed: 0,feature,importance
22,neg_weighted_avg7,0.047877
19,pos_weighted_avg7,0.046708
1,compound_weighted,0.045447
3,pos_weighted,0.044054
9,replies_count_weekly_diff,0.042574
14,likes_count_daily_diff,0.041734
20,pos_weighted_daily_diff,0.041717
6,count_weekly_diff,0.041443
12,retweets_count_weekly_diff,0.04138
16,compound_weighted_avg7,0.041081


In [62]:
# Shortlist of features to try in modelling.
# The comma after 'retweets_count_weekly_diff' matters: without it Python
# concatenates the adjacent string literals into one column name that does not exist.
features_to_try = [
    'MACD', 'RSI', 'PROC_3', 'Daily_Change_Perc_SP500', 'Daily_Change_Perc_Gold',
    'compound_weighted_avg7', 'likes_count_daily_diff', 'pos_weighted_avg7',
    'retweets_count_weekly_diff',
    'goog_trend_score',
]