In [1]:
import os
import math
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn import metrics
import itertools

FEATURE CATEGORIES: <br> 
    stock measures <br> 
    google trends score <br> 
    tweets <br> 
    S&P 500 <br> 
    Gold <br> 

In [2]:
data = pd.read_csv("bitcoin_train.csv")

financial_features_final = ['RSI', 'RSI_Gold', 'RSI_SP500', 'Daily_Change_Perc_SP500',
                            'Daily_Change_Perc_Gold','wpr', 'MACD', 'PROC_3', 'streak']

sentiment_features_final = ['compound_weighted_avg7', 'compound_weighted', 'likes_count_daily_diff',
                            'retweets_count_weekly_diff', 'likes_count_avg7', 'goog_trend_score','neg_weighted',
                            'neg_weighted_avg7', 'pos_weighted_weekly_diff']

train_cols = data.columns

for i,c in enumerate(train_cols):
    print(i, c)

0 Unnamed: 0
1 Date
2 Open
3 High
4 Low
5 Close
6 Volume
7 Daily_Change
8 Daily_Change_Ind
9 MACD
10 PROC_3
11 PROC_5
12 PROC_10
13 wpr
14 sto_os
15 goog_trend_score
16 count
17 compound_times_retweets
18 likes_count
19 neg_times_retweets
20 pos_times_retweets
21 replies_count
22 retweets_count
23 pos_weighted
24 neg_weighted
25 compound_weighted
26 count_avg7
27 count_daily_diff
28 count_weekly_diff
29 replies_count_avg7
30 replies_count_daily_diff
31 replies_count_weekly_diff
32 retweets_count_avg7
33 retweets_count_daily_diff
34 retweets_count_weekly_diff
35 likes_count_avg7
36 likes_count_daily_diff
37 likes_count_weekly_diff
38 compound_weighted_avg7
39 compound_weighted_daily_diff
40 compound_weighted_weekly_diff
41 pos_weighted_avg7
42 pos_weighted_daily_diff
43 pos_weighted_weekly_diff
44 neg_weighted_avg7
45 neg_weighted_daily_diff
46 neg_weighted_weekly_diff
47 Daily_Change_Perc
48 RSI
49 Weekly_Change
50 Weekly_Change_Perc
51 streak
52 Close/Last_Gold
53 Open_Gold
54 Daily_C

In [3]:
features_lst = ['MACD', 'RSI', 'PROC_3', 'Daily_Change_Perc_SP500', 'Daily_Change_Perc_Gold', \
                'compound_weighted_avg7', 'likes_count_daily_diff', 'pos_weighted_avg7',\
                'retweets_count_weekly_diff','goog_trend_score']


train_label = [train_cols.get_loc('label')]

def find_features(feature_names):
    train_col_numbs = []
    for f in feature_names:
        train_col_numbs.append(train_cols.get_loc(f))
    return train_col_numbs

train_col_numbs = find_features(features_lst)
print(train_col_numbs)
print(train_label)

[9, 48, 10, 61, 55, 38, 36, 41, 34, 15]
[64]


In [4]:
from sklearn.metrics import accuracy_score

def train_model(dataset, train_cols, train_label, max_iters=None):
    # get train data
    X = pd.read_csv(dataset,usecols=train_cols)
    y = pd.read_csv(dataset,usecols=train_label)
    
    X_train, X_test, y_train, y_test = train_test_split(X, y.values.ravel(), random_state = 1234)
    #y_train = y_train.reshape(-1)
    
    # set up model
    lr = LogisticRegression(random_state=1234)
    if max_iters:
        lr.max_iter=max_iters
    
    #fit and predict model
    lr.fit(X_train, y_train)
    y_hat=lr.predict(X_test)
    confusion = metrics.confusion_matrix(y_test, y_hat)
    score = accuracy_score(y_test,y_hat)
    return lr, score, confusion

In [5]:
model, acc, confusion = train_model("bitcoin_train.csv", train_col_numbs, train_label, max_iters=50)
print(acc)
print(confusion)

0.4935897435897436
[[ 23 118]
 [ 40 131]]


In [6]:
coefs = {}
for i, f in enumerate(features_lst):
    coefs[f] = model.coef_[0, i]
for k, v in coefs.items():
    print(k, v)

MACD -0.00017473137327951124
RSI 2.589862960151024e-05
PROC_3 -0.007965572807570029
Daily_Change_Perc_SP500 6.6833472129563466e-06
Daily_Change_Perc_Gold 8.554491737389992e-06
compound_weighted_avg7 -0.00010513510567183508
likes_count_daily_diff -2.2541270316481435e-05
pos_weighted_avg7 0.010491148844596093
retweets_count_weekly_diff -4.5019438669025727e-07
goog_trend_score -4.586396439621075e-06


In [7]:
sorted_coefs = [abs(v) for v in coefs.values()]
sorted_coefs.sort()
sorted_coefs

[4.5019438669025727e-07,
 4.586396439621075e-06,
 6.6833472129563466e-06,
 8.554491737389992e-06,
 2.2541270316481435e-05,
 2.589862960151024e-05,
 0.00010513510567183508,
 0.00017473137327951124,
 0.007965572807570029,
 0.010491148844596093]

In [8]:
# Most valuable features
[0.00017473137327951124, 0.007965572807570029, 0.010491148844596093]
val_features = ['MACD', 'PROC_3', 'pos_weighted_avg7']

TEST ON MOST VALUABLE FEATURES

In [9]:
val_lst = find_features(val_features)
train_model("bitcoin_train.csv", val_features, train_label, max_iters=50)

(LogisticRegression(max_iter=50, random_state=1234),
 0.5384615384615384,
 array([[  3, 138],
        [  6, 165]]))

CHANGE ITERATIONS

In [10]:
models = []
accuracies = []

model1, acc1, _ = train_model("bitcoin_train.csv", val_lst, train_label, max_iters=50)
model2, acc2, _ = train_model("bitcoin_train.csv", val_lst, train_label, max_iters=100) #default
model3, acc3, _ = train_model("bitcoin_train.csv", val_lst, train_label, max_iters=200)
model4, acc4, _ = train_model("bitcoin_train.csv", val_lst, train_label, max_iters=500)

print(model1, acc1)
print(model2, acc2)
print(model3, acc3)
print(model4, acc4)

LogisticRegression(max_iter=50, random_state=1234) 0.5384615384615384
LogisticRegression(random_state=1234) 0.5384615384615384
LogisticRegression(max_iter=200, random_state=1234) 0.5384615384615384
LogisticRegression(max_iter=500, random_state=1234) 0.5384615384615384


Changing iterations doesn't affect accuracy

Test ONLY on S&P 500, gold, google trends score

In [11]:
naive_features = ['Daily_Change_Perc_SP500', 'Daily_Change_Perc_Gold', 'goog_trend_score']
naive_list = find_features(naive_features)

model, acc, _ = train_model("bitcoin_train.csv", naive_list, train_label)
print(acc)
print(model.coef_)

0.5448717948717948
[[-0.00485931 -0.02508352 -0.26845533]]


Combining naive features with the valuable features

In [12]:
combined_features = ['MACD', 'PROC_3', 'pos_weighted_avg7', 'goog_trend_score']
combined_list = find_features(combined_features)

model, acc, _ = train_model("bitcoin_train.csv", combined_list, train_label)
print(acc)
print(model.coef_)

0.5256410256410257
[[-8.27416463e-07  9.39521530e-01 -4.76420843e-03 -7.52875656e-01]]


Accuracy gets slightly worse--maybe features don't play nice together

**TEST ON TEST DATA**

Test using valuable features

In [None]:
def test_model(dataset, model):
    

In [13]:
val_features = ['MACD', 'PROC_3', 'pos_weighted_avg7']

model, acc, confusion = train_model("bitcoin_test.csv", val_lst, train_label)
print(model, acc)

LogisticRegression(random_state=1234) 0.6025641025641025


Now test using naive features

In [14]:
model, acc, _ = train_model("bitcoin_test.csv", naive_list, train_label)
print(acc)
print(model.coef_)

0.5769230769230769
[[-0.00667858 -0.09949942 -0.0497938 ]]


Add google trend score to combined features

In [15]:
combined_features = ['MACD', 'PROC_3', 'pos_weighted_avg7', 'goog_trend_score']

model, acc, confusion = train_model("bitcoin_test.csv", combined_list, train_label)
print(model, acc)
print(confusion)

LogisticRegression(random_state=1234) 0.5897435897435898
[[ 2 29]
 [ 3 44]]


**DOGECOIN**

In [16]:
doge = pd.read_csv("dogecoin_train.csv")

financial_features_final = ['RSI', 'RSI_Gold', 'RSI_SP500', 'Daily_Change_Perc_SP500',
                            'Daily_Change_Perc_Gold','wpr', 'MACD', 'PROC_3', 'streak']

sentiment_features_final = ['compound_weighted_avg7', 'compound_weighted', 'likes_count_daily_diff',
                            'retweets_count_weekly_diff', 'likes_count_avg7', 'goog_trend_score','neg_weighted',
                            'neg_weighted_avg7', 'pos_weighted_weekly_diff']

train_cols = doge.columns

for i,c in enumerate(train_cols):
    print(i, c)

0 Unnamed: 0
1 Date
2 Open
3 High
4 Low
5 Close
6 Adj Close**
7 Volume
8 Daily_Change
9 Daily_Change_Ind
10 MACD
11 PROC_3
12 PROC_5
13 PROC_10
14 wpr
15 sto_os
16 Daily_Change_Perc
17 Weekly_Change
18 Weekly_Change_Perc
19 RSI
20 streak
21 Close/Last_Gold
22 Open_Gold
23 Daily_Change_Gold
24 Daily_Change_Perc_Gold
25 Increased_Gold
26 RSI_Gold
27 Close/Last_SP500
28 Open_SP500
29 Daily_Change_SP500
30 Daily_Change_Perc_SP500
31 Increased_SP500
32 RSI_SP500
33 label
34 count
35 compound
36 compound_times_retweets
37 likes_count
38 neg
39 neg_times_retweets
40 neu
41 pos
42 pos_times_retweets
43 replies_count
44 retweets_count
45 pos_weighted
46 neg_weighted
47 compound_weighted
48 count_avg7
49 count_daily_diff
50 count_weekly_diff
51 replies_count_avg7
52 replies_count_daily_diff
53 replies_count_weekly_diff
54 retweets_count_avg7
55 retweets_count_daily_diff
56 retweets_count_weekly_diff
57 likes_count_avg7
58 likes_count_daily_diff
59 likes_count_weekly_diff
60 compound_weighted_avg

In [17]:
features_lst = ['MACD', 'RSI', 'PROC_3', 'Daily_Change_Perc_SP500', 'Daily_Change_Perc_Gold', \
                'compound_weighted_avg7', 'likes_count_daily_diff', 'pos_weighted_avg7',\
                'retweets_count_weekly_diff','goog_trend_score']


train_label = [train_cols.get_loc('label')]

def find_features(feature_names):
    train_col_numbs = []
    for f in feature_names:
        train_col_numbs.append(train_cols.get_loc(f))
    return train_col_numbs

train_col_numbs = find_features(features_lst)
print(train_col_numbs)
print(train_label)

[10, 19, 11, 30, 24, 60, 58, 63, 56, 69]
[33]


In [24]:
model, acc, confusion = train_model("dogecoin_train.csv", train_col_numbs, train_label)
print(model, acc)
print(confusion)

LogisticRegression(random_state=1234) 0.7957317073170732
[[261   0]
 [ 67   0]]


In [19]:
coefs = {}
for i, f in enumerate(features_lst):
    coefs[f] = model.coef_[0, i]
for k, v in coefs.items():
    print(k, v)

MACD 0.0031047691550734184
RSI 0.38956348631646504
PROC_3 0.008051315813397153
Daily_Change_Perc_SP500 -0.006235015978670364
Daily_Change_Perc_Gold -0.019189454915207394
compound_weighted_avg7 1.72714443205048e-05
likes_count_daily_diff 3.5562640522676276e-05
pos_weighted_avg7 0.7244792490857764
retweets_count_weekly_diff 0.22816809383926726
goog_trend_score -0.01497118460960242


In [20]:
sorted_coefs = [abs(v) for v in coefs.values()]
sorted_coefs.sort()
sorted_coefs

[1.72714443205048e-05,
 3.5562640522676276e-05,
 0.0031047691550734184,
 0.006235015978670364,
 0.008051315813397153,
 0.01497118460960242,
 0.019189454915207394,
 0.22816809383926726,
 0.38956348631646504,
 0.7244792490857764]

In [30]:
val_features = ['Daily_Change_Perc_Gold','retweets_count_weekly_diff','RSI','pos_weighted_avg7']
val_lst = find_features(val_features)

model, acc, confusion = train_model("dogecoin_train.csv", val_lst, train_label)
print(model, acc)
print(confusion)
print(model.coef_)

LogisticRegression(random_state=1234) 0.7957317073170732
[[261   0]
 [ 67   0]]
[[ 8.44922163e-03 -6.43347621e-02  8.61743833e-05  1.79862961e+00]]


In [32]:
naive_features = ['Daily_Change_Perc_SP500', 'Daily_Change_Perc_Gold', 'goog_trend_score']
naive_list = find_features(naive_features)

model, acc, _ = train_model("dogecoin_train.csv", naive_list, train_label)
print(acc)
print(model.coef_)

0.7957317073170732
[[-0.00098927 -0.00468049 -0.01579694]]


In [33]:
combined_features = ['pos_weighted_avg7','Daily_Change_Perc_SP500', 'Daily_Change_Perc_Gold',
                     'goog_trend_score', 'retweets_count_weekly_diff']
combined_list = find_features(combined_features)

model, acc, _ = train_model("dogecoin_train.csv", combined_list, train_label)
print(acc)
print(model.coef_)

0.7957317073170732
[[-4.12839355e-02 -1.07760812e-01  7.95115212e-05  1.57403286e+00
  -1.52002624e-02]]


**Now use dogecoin_test**

In [37]:
model, acc1, confusion = train_model("dogecoin_test.csv", val_lst, train_label)
print(model, acc)
print(confusion)
print(model.coef_)

LogisticRegression(random_state=1234) 0.7926829268292683
[[65  0]
 [17  0]]
[[ 7.09983370e-03  2.57564798e-02 -3.96334996e-05  1.62269423e-01]]


In [39]:
model, acc, _ = train_model("dogecoin_test.csv", naive_list, train_label)
print(acc)
print(model.coef_)

0.7926829268292683
[[ 0.08155698  0.00316606 -0.01763218]]


In [40]:
model, acc, _ = train_model("dogecoin_test.csv", combined_list, train_label)
print(acc)
print(model.coef_)

0.7926829268292683
[[ 9.02343049e-03  8.58443133e-04  3.08654851e-05  2.10983820e-02
  -1.76877178e-02]]
