# BTC Predictor

In [1367]:
%matplotlib inline
%reload_ext autoreload
%autoreload 2

In [1368]:
from fastai.structured import *
from fastai.column_data import *
np.set_printoptions(threshold=50, edgeitems=20)
from ta import *

PATH='data/stock/'

## Stock Predictor Lib


In [1369]:
def cleanData(df):
    """Return a copy of ``df`` with infinities and NaNs back-filled.

    Every ``+inf``/``-inf`` is first turned into NaN, then each NaN is
    replaced by the next valid value further down the same column.

    Args:
        df: pandas DataFrame to clean. It is not modified.

    Returns:
        A new DataFrame. NaNs at the very bottom of a column (nothing
        below them to copy from) are left as NaN.
    """
    df = df.replace([np.inf, -np.inf], np.nan)
    # bfill() returns a new frame; the result must be kept, otherwise the
    # call has no effect and the NaNs survive.
    df = df.bfill()
    return df

In [1370]:
def calculateAccuracy(df):
    """Summarise per-class hit rates of ``predicted`` against ``action``.

    Overall accuracy is deliberately not reported: it does not give an
    accurate representation when one class dominates.

    Args:
        df: DataFrame with an ``action`` column (1 = buy, 0 = sell) and a
            ``predicted`` column holding the model output for the same rows.

    Returns:
        dict with keys:
            ``buyAccuracy``  - share of ``action == 1`` rows predicted as 1,
            ``sellAccuracy`` - share of ``action == 0`` rows predicted as 0,
            ``F1Score``      - plain mean of the two rates above (the key name
                               is kept for compatibility; it is not the
                               harmonic-mean F1),
            ``totalBuyActions`` / ``successfulBuyPredictions`` - raw counts.
        A rate is NaN when its class has no rows in ``df`` (previously this
        raised ZeroDivisionError), and ``F1Score`` is then NaN as well.
    """
    isBuy = df.action == 1
    isSell = df.action == 0
    isHit = df.action == df.predicted

    totalBuyActions = int(isBuy.sum())
    totalSellActions = int(isSell.sum())
    successfulBuyPredictions = int((isBuy & isHit).sum())
    successfulSellPredictions = int((isSell & isHit).sum())

    # Guard the divisions: a slice without any buy (or sell) rows has no
    # defined hit rate for that class.
    undefined = float('nan')
    buyAccuracy = successfulBuyPredictions / totalBuyActions if totalBuyActions else undefined
    sellAccuracy = successfulSellPredictions / totalSellActions if totalSellActions else undefined

    result = {
        'F1Score': (buyAccuracy + sellAccuracy) / 2,
        'buyAccuracy': buyAccuracy,
        'sellAccuracy': sellAccuracy,
        'totalBuyActions': totalBuyActions,
        'successfulBuyPredictions': successfulBuyPredictions
    }
    return result
            
def calculateNetProfit(dataFrame, startAmount):
    """Replay the predictions as an all-in/all-out trading simulation.

    The simulation starts with ``startAmount`` in cash on the first row
    (that row's prediction is not acted on). On each later row:

    * ``predicted == 1`` while holding cash: the whole balance is converted
      to the asset at that row's ``Close`` (counted as a buy);
    * ``predicted == 0`` while holding the asset: the whole position is
      converted back to cash at that row's ``Close`` (counted as a sell);
    * otherwise the current position is carried forward unchanged.

    No fees or slippage are applied.

    Args:
        dataFrame: DataFrame with ``Close`` and ``predicted`` columns, in
            chronological row order. Any index type is accepted; rows are
            walked by position. The input is not modified.
        startAmount: initial cash balance.

    Returns:
        ``(df, result)`` where ``df`` is a copy of the input with two extra
        columns - ``buyAmount`` (cash available to buy with) and
        ``sellAmount`` (units of the asset available to sell) - and
        ``result`` is a dict with the start/end close, start/end amount,
        strategy and buy-and-hold percentage returns, their difference, and
        the number of buys and sells.

    Raises:
        IndexError: if ``dataFrame`` has no rows.
    """
    # Work on a copy so the caller's frame does not silently grow columns.
    df = dataFrame.copy()
    rowCount = len(df)
    predicted = df['predicted'].values
    close = df['Close'].values

    # Float arrays from the start: the balances are fractional.
    cash = np.zeros(rowCount)
    coins = np.zeros(rowCount)
    totalBuys = 0
    totalSells = 0

    cash[0] = startAmount
    # Walk by position rather than by index label: "label - 1" is only
    # meaningful for a contiguous integer index, and the prediction frames
    # built in this notebook are indexed by timestamp.
    for pos in range(1, rowCount):
        prevCash = cash[pos - 1]
        prevCoins = coins[pos - 1]
        if predicted[pos] == 1:
            if prevCash > 0:
                coins[pos] = prevCash / close[pos]
                totalBuys += 1
            elif prevCash == 0:
                coins[pos] = prevCoins
        elif predicted[pos] == 0:
            if prevCoins > 0:
                cash[pos] = prevCoins * close[pos]
                totalSells += 1
            elif prevCoins == 0:
                cash[pos] = prevCash

    df['buyAmount'] = cash
    df['sellAmount'] = coins

    startClose = df.Close.iloc[0]
    endClose = df.Close.iloc[-1]
    endBuyAmount = df.buyAmount.iloc[-1]
    endSellAmount = df.sellAmount.iloc[-1]
    # If the run ends while holding the asset, value it at the last close.
    endAmount = endBuyAmount if (endBuyAmount > 0) else (endSellAmount * endClose)

    buyAndHoldPercentIncrease = ((endClose - startClose)/startClose) * 100
    percentIncrease = ((endAmount - startAmount)/startAmount) * 100
    percentDifference = percentIncrease - buyAndHoldPercentIncrease

    result = {
        'startClose': startClose,
        'endClose': endClose,
        'startAmount': startAmount,
        'endAmount': endAmount,
        'buyAndHoldPercentIncrease':round(buyAndHoldPercentIncrease,3),
        'percentIncrease':round(percentIncrease,3),
        'percentDifference':round(percentDifference,3),
        'totalBuys':totalBuys,
        'totalSells':totalSells
    }
    return df,result


In [1371]:
# Usage: conflateTime(df, '5T') aggregates the bars into 5-minute bars.
def conflateTime(df, timeFrame):
    """Resample OHLCV bars to a coarser time frame.

    Args:
        df: DataFrame with a datetime-like index and ``Open``, ``High``,
            ``Low``, ``Close`` and ``Volume`` columns. It is not modified.
        timeFrame: pandas offset alias for the target bar size (e.g. ``'5T'``).

    Returns:
        A new DataFrame with one row per ``timeFrame`` bucket (left-closed,
        left-labelled): first open, highest high, lowest low, last close and
        summed volume.
    """
    ohlc_dict = {
        'Open': 'first',
        'High': 'max',
        'Low': 'min',
        'Close': 'last',
        'Volume': 'sum'
    }
    # resample() no longer accepts how=; aggregate explicitly, and return the
    # aggregated frame instead of the untouched input.
    return df.resample(timeFrame, closed='left', label='left').agg(ohlc_dict)

## Config


In [1372]:
# Notebook-wide configuration: display limits plus every tunable used below.
pd.set_option('display.max_columns', 100)
pd.set_option('display.max_rows', 100)

# Number of rows in the rolling Close window used to build the `action` label.
lookahead = 30
# Multiplier on Close that the rolling max must exceed for action = 1
# (1.005 means +0.5%).
percentIncrease = 1.005
# Number of most recent rows of the raw CSV that are kept.
recordsCount = 260000
# The test set is the last `testRecordsCount` rows and the training frame the
# first `trainRecordsCount` rows of the kept data.
testRecordsCount = 10000
trainRecordsCount = 250000
# Fraction of the training frame used for fitting; the rows after it form the
# validation set.
trainRatio = 0.9
# Learning rate passed to every m.fit(...) call.
lr = 1e-3
# Passed as the third positional argument of md.get_learner
# (presumably the embedding dropout - confirm against the fastai version in use).
dropout = 0.09
# Name used by m.save / m.load.
modelName = 'btcBinaryClassificationModel'
# Name of the timestamp column; it is parsed to datetimes and later set as the index.
index='Timestamp'
# Name of the dependent (target) column.
dep = 'action'

## Create datasets

In [1373]:
table_names = ['btc-bitstamp-2012-01-01_to_2018-01-08']

In [1374]:
tables = [pd.read_csv(f'{PATH}{fname}.csv', low_memory=False) for fname in table_names]

In [1375]:
from IPython.display import HTML

In [1376]:
for t in tables: display(t.head())

Unnamed: 0,Timestamp,Open,High,Low,Close,Volume_(BTC),Volume_(Currency),Weighted_Price
0,1325317920,4.39,4.39,4.39,4.39,0.455581,2.0,4.39
1,1325317980,4.39,4.39,4.39,4.39,0.455581,2.0,4.39
2,1325318040,4.39,4.39,4.39,4.39,0.455581,2.0,4.39
3,1325318100,4.39,4.39,4.39,4.39,0.455581,2.0,4.39
4,1325318160,4.39,4.39,4.39,4.39,0.455581,2.0,4.39


The following displays summarized aggregate information for each table, across every field.

In [1377]:
# for t in tables: display(DataFrameSummary(t).summary())

In [1378]:
train= tables[0]

In [1379]:
len(train)

3161057

In [1380]:
# trim to x records for now
# TODO: remove this
# train = train.tail(1000000)
train = train.tail(recordsCount)
len(train)

260000

In [1381]:
train.reset_index(inplace=True)
train.to_feather(f'{PATH}train')

## Data Cleaning

In [1382]:
train = pd.read_feather(f'{PATH}train')

In [1383]:
#convert to date objects
train[index] = pd.to_datetime(train[index], unit='s')
train.head()

Unnamed: 0,index,Timestamp,Open,High,Low,Close,Volume_(BTC),Volume_(Currency),Weighted_Price
0,2901057,2017-07-11 10:41:00,2343.46,2343.46,2335.02,2335.02,4.843332,11318.864938,2336.99982
1,2901058,2017-07-11 10:42:00,2343.31,2343.31,2335.04,2335.65,10.780731,25189.064903,2336.489413
2,2901059,2017-07-11 10:43:00,2335.65,2342.8,2335.02,2335.02,12.944384,30230.352996,2335.403051
3,2901060,2017-07-11 10:44:00,2335.0,2335.01,2335.0,2335.01,1.550654,3620.792818,2335.009887
4,2901061,2017-07-11 10:45:00,2336.0,2339.35,2336.0,2336.01,7.062206,16498.913514,2336.226666


SET DEPENDENT VARIABLE ACTION

In [1384]:
# Keep the BTC-denominated volume under the shorter name `Volume` and discard
# the columns that are not used as features. Keyword `columns=` is used because
# the positional axis argument of drop() is not accepted by current pandas.
train = (
    train
    .rename(columns={'Volume_(BTC)': 'Volume'})
    .drop(columns=['Volume_(Currency)', 'Weighted_Price'])
)
train.head()

Unnamed: 0,index,Timestamp,Open,High,Low,Close,Volume
0,2901057,2017-07-11 10:41:00,2343.46,2343.46,2335.02,2335.02,4.843332
1,2901058,2017-07-11 10:42:00,2343.31,2343.31,2335.04,2335.65,10.780731
2,2901059,2017-07-11 10:43:00,2335.65,2342.8,2335.02,2335.02,12.944384
3,2901060,2017-07-11 10:44:00,2335.0,2335.01,2335.0,2335.01,1.550654
4,2901061,2017-07-11 10:45:00,2336.0,2339.35,2336.0,2336.01,7.062206


## Feature Engineering

In [None]:
# add technical analysis
train = add_all_ta_features(train, "Open", "High", "Low", "Close", "Volume", fillna=True)
train = cleanData(train)
len(train)

In [None]:
# Label a row 1 (buy) when the highest Close over the NEXT `lookahead` rows
# exceeds the current Close by the `percentIncrease` factor, otherwise 0.
# rolling() windows are trailing (they cover the current row and the rows
# before it), so the rolling max is shifted back by `lookahead` rows to make
# the window at row i cover rows i+1 .. i+lookahead.
futureMax = train['Close'].rolling(window=lookahead).max().shift(-lookahead)
# The last `lookahead` rows have no complete future window: futureMax is NaN
# there, the comparison is False, and they are labelled 0.
train['action'] = (futureMax > percentIncrease * train['Close']).astype(int)

# target count by category (1 = buy, 0 = no buy)
len(train[train.action==1]),len(train[train.action==0])

Time modifications

In [None]:
# add all date time values
add_datepart(train, index, drop=False)
train['hour'] = train[index].dt.hour;
train['minute'] = train[index].dt.minute;
len(train)

## Split validation and test sets

In [None]:
# todo: make this into a percentage instead of hardcoding the test set
# todo: create function
# The head and tail slices must not overlap, otherwise test rows leak into
# the training data.
assert trainRecordsCount + testRecordsCount <= len(train), 'train and test slices overlap'
# reset_index() returns a fresh frame, which avoids mutating a slice of
# `train` in place.
test = train.tail(testRecordsCount).reset_index()
train = train.head(trainRecordsCount).reset_index()
len(train),len(test)

In [None]:
train.to_feather(f'{PATH}train')
test.to_feather(f'{PATH}test')

## Create features

In [None]:
train = pd.read_feather(f'{PATH}train')
test = pd.read_feather(f'{PATH}test')

In [None]:
train.tail(50).T.head(100)

In [None]:
# display(DataFrameSummary(train).summary())
# break break break now

Now that we've engineered all our features, we need to convert to input compatible with a neural network.

This includes converting categorical variables into contiguous integers or one-hot encodings, normalizing continuous features to standard normal, etc...

In [None]:
train.head()

Identify categorical vs continuous variables

In [None]:
cat_vars = ['TimestampYear', 'TimestampMonth', 'TimestampWeek', 'TimestampDay', 'hour','minute', 'TimestampDayofweek',
'TimestampDayofyear','TimestampIs_month_end', 'TimestampIs_month_start', 'TimestampIs_quarter_end',
'TimestampIs_quarter_start','TimestampIs_year_end', 'TimestampIs_year_start']

# techincal_indicators = ['volume_adi','volume_obv','volume_obvm','volume_cmf','volume_nvi','volatility_bbh',
# 'volatility_bbl','volatility_atr','volatility_bbm','trend_mass_index','trend_macd','trend_macd_signal',
# 'trend_kst','trend_kst_sig','trend_kst_diff','trend_macd_diff','trend_ema_fast','trend_ema_slow','trend_adx',
# 'trend_adx_pos','trend_adx_neg','trend_adx_ind','trend_ichimoku_a','trend_ichimoku_b','momentum_rsi','momentum_mfi',
# 'momentum_tsi','momentum_uo','momentum_stoch','momentum_stoch_signal','momentum_wr','momentum_ao']

contin_vars = ['Open', 'Close','High', 'Low', 'Volume', 'TimestampElapsed',
'volume_adi','volume_obv','volume_obvm','volume_cmf','volume_fi','volume_em','volume_vpt','volume_nvi',
'volatility_atr','volatility_bbh','volatility_bbl','volatility_bbm','volatility_bbhi','volatility_bbli',
'volatility_kcc','volatility_kch','volatility_kcl','volatility_kchi','volatility_kcli','volatility_dch',
'volatility_dcl','volatility_dchi','volatility_dcli','trend_macd','trend_macd_signal','trend_macd_diff',
'trend_ema_fast','trend_ema_slow','trend_adx','trend_adx_pos','trend_adx_neg','trend_adx_ind','trend_vortex_ind_pos',
'trend_vortex_ind_neg','trend_vortex_diff','trend_trix','trend_mass_index','trend_cci','trend_dpo','trend_kst',
'trend_kst_sig','trend_kst_diff','trend_ichimoku_a','trend_ichimoku_b','momentum_rsi','momentum_mfi','momentum_tsi',
'momentum_uo','momentum_stoch','momentum_stoch_signal','momentum_wr','momentum_ao']
# 'others_dr','others_cr'

# contin_vars = [base_vars+techincal_indicators]

n = len(train); n

test = test.set_index(index)
train = train.set_index(index)

len(contin_vars)

In [None]:
train = train[cat_vars+contin_vars+[dep]].copy()
# , index

In [None]:
# test[dep] = 0 
test = test[cat_vars+contin_vars+[dep]].copy()
# , index


In [None]:
for v in cat_vars: train[v] = train[v].astype('category').cat.as_ordered()
#     todo: maybe change dep variable to category here for multiclass option

In [None]:
apply_cats(test, train)
# test

In [None]:
for v in contin_vars:
    train[v] = train[v].astype('float32')
    test[v] = test[v].astype('float32')

We can now process our data...

In [None]:
df, y, nas, mapper = proc_df(train, dep, do_scale=True)

In [None]:
y.shape

In [None]:
df_test, _, nas, mapper = proc_df(test, dep, do_scale=True, mapper=mapper, na_dict=nas)
train.head(30).T.head(70)

In [None]:
nas={}

In [None]:
df.head(2)

In [None]:
df_test.head(2)

Take the last x% of rows as our validation set.

In [None]:
train_size = int(n * trainRatio); train_size
val_idx = list(range(train_size, len(df)))
#val_idx = list(range(0, len(df)-train_size))
#val_idx = get_cv_idxs(n, val_pct=0.1)

In [None]:
len(val_idx)

## DL

We're ready to put together our models.

We can create a ModelData object directly from our data frame. `is_reg` is set to False to turn this into a classification problem (rather than a regression). `is_multi` is set to False because the target `action` is a single binary label (1 = buy, 0 = sell), not the three-way BUY/HOLD/SELL labelling.

In [None]:
md = ColumnarModelData.from_data_frame(PATH, val_idx, df, y.astype('int'), cat_flds=cat_vars, bs=64,
                                      is_reg=False,is_multi=False,test_df=df_test)

Some categorical variables have a lot more levels than others.

In [None]:
cat_sz = [(c, len(train[c].cat.categories)+1) for c in cat_vars]

In [None]:
cat_sz

We use the *cardinality* of each variable (that is, its number of unique values) to decide how large to make its *embeddings*. Each level will be associated with a vector with length defined as below.

In [None]:
emb_szs = [(c, min(50, (c+1)//2)) for _,c in cat_sz]

In [None]:
emb_szs

Check if cuda is available

In [None]:
torch.cuda.is_available()

In [None]:
len(df.columns)-len(cat_vars)

In [None]:
m = md.get_learner(emb_szs, len(df.columns)-len(cat_vars),dropout, 2, [100,50], [0.03,0.06],None,True)

In [None]:
m

In [None]:
m.lr_find()
m.sched.plot(100)

In [None]:
m.fit(lr, 3)

In [None]:
m.fit(lr, 5, cycle_len=1)

In [None]:
m.fit(lr, 3, cycle_len=4, cycle_mult=2 )

In [None]:
m.save(modelName)

In [None]:
m.load(modelName)

## Validation

In [None]:
(x,y1)=m.predict_with_targs()

Predicted vs Validation

In [None]:
(np.argmax(x,axis=1),y1)

In [None]:
y1.shape

In [None]:
val = train.iloc[val_idx]
val[[dep]]
valpred = pd.DataFrame({'Close':val.Close,'index':val.index, 'action':val.action, 'predicted':np.argmax(x,axis=1)})[['Close','index', 'action','predicted']]
valpred.tail(100)

Calculate the percent accuracy on the validation set

In [None]:
calculateAccuracy(valpred)

In [None]:
newdf,result = calculateNetProfit(valpred, 10000)
result

In [None]:
newdf.head(10)

In [None]:
newdf.plot(x='index', y=['Close'], style='o')

In [None]:
newdf.tail(10)

## Test

In [None]:
np.argmax(m.predict(True), axis =1)

In [None]:
testPred = pd.DataFrame({'Timestamp':test.index, 'Close':test.Close, 'action':test.action, 'predicted':np.argmax(m.predict(True), axis =1)})[['Close','Timestamp', 'action', 'predicted']]
testPred.head(10)

Calculate the percent accuracy on the test set

In [None]:
calculateAccuracy(testPred)

In [None]:
newdf,result = calculateNetProfit(testPred, 10000)
result

In [None]:
newdf.head(10)

In [None]:
newdf.tail(10)

In [None]:
newdf.plot(x='Timestamp', y=['Close'], style='o',figsize=(10,5), grid=True)

In [None]:
# csv_fn=f'{PATH}/tmp/sub4.csv'
# sub.to_csv(csv_fn, index=False)
# FileLink(csv_fn)

## Random Forest

In [None]:
from sklearn.ensemble import RandomForestRegressor

In [None]:
((val,trn), (y_val,y_trn)) = split_by_idx(val_idx, df.values, y)

In [None]:
m = RandomForestRegressor(n_estimators=40, max_features=0.99, min_samples_leaf=2,
                          n_jobs=-1, oob_score=True)
m.fit(trn, y_trn);

In [None]:
def PredtoClass(a):
    """Threshold numeric scores at 0.5 into 0/1 class labels.

    Args:
        a: indexable sequence of numeric scores that supports ``len``.

    Returns:
        list of ints, one per score: 0 where the score is below 0.5,
        1 otherwise.
    """
    return [0 if a[pos] < .5 else 1 for pos in range(len(a))]
def accuracy(preds, y_val):
    """Share of rows whose thresholded prediction equals the 0/1 target.

    Args:
        preds: numeric scores, turned into 0/1 labels by ``PredtoClass``.
        y_val: true 0/1 labels; must support ``list - y_val`` element-wise
            (e.g. a NumPy array) and ``len``.

    Returns:
        Sum of ``1 - |predicted - actual|`` over all rows divided by the
        number of rows.
    """
    predicted_labels = PredtoClass(preds)
    # Each row contributes 1 on a match and 0 on a mismatch for 0/1 labels.
    misses = abs(predicted_labels - y_val)
    return sum(1 - misses) / len(y_val)

Accuracy on the validation set using a Random Forest Regressor

In [None]:
preds = m.predict(val)
m.score(trn, y_trn), m.score(val, y_val), m.oob_score_, accuracy(preds, y_val)

In [None]:
preds_test = m.predict(df_test.values)

In [None]:
sub = pd.DataFrame({'Timestamp':test.index, 'action':PredtoClass(preds_test)})[['Timestamp', 'action']]
sub.head(10)

In [None]:
# csv_fn=f'{PATH}/tmp/RFsub5.csv'
# sub.to_csv(csv_fn, index=False)
# FileLink(csv_fn)