## Using SVM Models on crypto price trend prediction

In [36]:
# Preliminary code needed for importing from parent directory
import os,sys,inspect
currentdir = os.path.dirname(os.path.abspath(inspect.getfile(inspect.currentframe())))
parentdir = os.path.dirname(currentdir)
sys.path.insert(0,parentdir) 

# Import data API
from data import series

import numpy as np

# This classifier first converts the target values into {-1, 1} and then treats the problem as a regression task
# (multi-output regression in the multiclass case).
from sklearn.linear_model import RidgeClassifier
from sklearn.linear_model import LogisticRegression
from sklearn import svm
from sklearn.model_selection import cross_val_score, GridSearchCV, cross_validate, train_test_split
from sklearn.metrics import accuracy_score, classification_report
from sklearn.svm import SVC
from sklearn.preprocessing import StandardScaler, normalize

from sklearn.model_selection import train_test_split

from mpl_toolkits.mplot3d import Axes3D



import matplotlib.pyplot as plt

from bs4 import BeautifulSoup
import datetime
import json
import numpy as np
import pandas as pd
import requests
import time
import warnings
warnings.simplefilter('ignore')

import talib as ta
from talib import MA_Type

from sklearn import preprocessing
from sklearn.model_selection import train_test_split
from sklearn import metrics
from sklearn import svm

import matplotlib.pyplot as plt
from matplotlib.dates import DateFormatter
import matplotlib.ticker as ticker



ModuleNotFoundError: No module named 'plotly'

### Data Import Functions

In [6]:
 def gen_X_y(symbol: str, timeframe: str):
    # Create a series for BTC-USDT pair on 1h candles
    # btc = series.DataSeries('BTCUSDT', '1h')
    btc = series.DataSeries(symbol, timeframe)
    data = btc.getData()
    #data
    # print(data.keys())

    price_close = data['close']
    # print(len(price_close))
    
    prev5 = np.concatenate([
            # pivot timeframe
            price_close[np.newaxis, 5:],
            # previous 5 timeframes
            price_close[np.newaxis, 4:-1], # 1 frame ago
            price_close[np.newaxis, 3:-2], # 2 frame ago
            price_close[np.newaxis, 2:-3], # 3 frame ago
            price_close[np.newaxis, 1:-4], # 4 frame ago
            price_close[np.newaxis, :-5],  # 5 frame ago
        ],
        axis = 0
    )

    # Generate truth values (y)
    y = prev5[0, :] > np.amin(prev5[1:, :], axis = 0)
#     print('timeframe:', timeframe)
#     print('number of times where trend is up: ', y[y == True].shape)
#     print('number of times where trend is down: ', y[y == False].shape)
    
    btc.addIndicator('RSI', data['close'], 30) # 30-timeframe RSI

    btc.addIndicator('EMA', data['close'], 30) # 30-timeframe EMA
    # btc1h.addIndicator('EMA', btc1h.getData()['close'], 50) # 50-timeframe EMA

    ## MFI: https://www.investopedia.com/terms/m/mfi.asp
    btc.addIndicator('MFI', data['high'], data['low'], data['close'], data['volume'], 10) # 10-timeframe MFI

    ## MACD: https://www.investopedia.com/terms/m/macd.asp
    btc.addIndicator('MACD', data['close'], 12, 26) # fast = 12, slow = 26

    indicators = btc.getIndicators()
    #for indicator in indicators.keys():
        #print(indicator)
      
    time_cut = 50

    # Each technical indicator consists one column of X.
    X = np.concatenate(
        (
            indicators['RSI'][np.newaxis, time_cut:].T,
            indicators['EMA'][np.newaxis, time_cut:].T,
            indicators['MFI'][np.newaxis, time_cut:].T
        ),
        axis = 1
    )
    
   # X_1 =my_data.loc[:, my_data.columns != 'y']
    # print('shape of X:', X.shape)
    # print('shape of y:', y.shape)

    y_truncate = y[(time_cut - 5):]
    
    return (X, y_truncate)

### Hour timeframe data, correcting imbalanced data, creating train and test sets

In [7]:
X_h, y_h = gen_X_y('BTCUSDT', '1h')


# Split train/test sets
X_train_h, X_test_h, y_train_h, y_test_h = train_test_split(X_h, y_h)
print('X, y generated from 1h-scale data')
print('train set size:', y_train_h.shape)
print('test set size:', y_test_h.shape)



print('hour scale price data')
print('True count:', len(y_h[y_h == True]))
print('False count:', len(y_h[y_h == False]))

indices = np.where(y_h == True)[0]
if len(y_h[y_h == True]) > len(y_h[y_h == False]):
    indices = np.random.choice(indices, size = len(y_h[y_h == False]), replace = False)
else:
    raise NotImplementedError
    
# Combine the `True` indices with `False` indices
indices = np.concatenate((indices, np.where(y_h == False)[0]))
seed =1234
X_train_h_even, X_test_h_even, y_train_h_even, y_test_h_even = train_test_split(X_h[indices], y_h[indices])
print('Train set True/False ratio:', len(np.where(y_train_h_even == True)[0]), '/', len(np.where(y_train_h_even == False)[0]))
print('Test set True/False ratio:', len(np.where(y_test_h_even == True)[0]), '/', len(np.where(y_test_h_even == False)[0]))

X, y generated from 1h-scale data
train set size: (22967,)
test set size: (7656,)
hour scale price data
True count: 24621
False count: 6002
Train set True/False ratio: 4492 / 4511
Test set True/False ratio: 1510 / 1491


### Day timeframe data, correcting imbalanced data, creating train and test sets

In [9]:
X_d, y_d = gen_X_y('BTCUSDT', '1d')

# Split train/test sets
X_train_d, X_test_d, y_train_d, y_test_d = train_test_split(X_d, y_d)
print('X, y generated from 1d-scale data')
print('train set size:', y_train_d.shape)
print('test set size:', y_test_d.shape)

print('day scale price data')
print('True count:', len(y_d[y_d == True]))
print('False count:', len(y_d[y_d == False]))

indices = np.where(y_d == True)[0]
if len(y_d[y_d == True]) > len(y_d[y_d == False]):
    indices = np.random.choice(indices, size = len(y_d[y_d == False]), replace = False)
else:
    raise NotImplementedError
    
# Combine the `True` indices with `False` indices
indices = np.concatenate((indices, np.where(y_d == False)[0]))
seed =1234
X_train_d_even, X_test_d_even, y_train_d_even, y_test_d_even = train_test_split(X_d[indices], y_d[indices])
print('Total dataset size (train + test):', indices.shape[0])
print('Train set True/False ratio:', len(np.where(y_train_d_even == True)[0]), '/', len(np.where(y_train_d_even == False)[0]))
print('Test set True/False ratio:', len(np.where(y_test_d_even == True)[0]), '/', len(np.where(y_test_d_even == False)[0]))


X, y generated from 1d-scale data
train set size: (924,)
test set size: (309,)
day scale price data
True count: 1000
False count: 233
Total dataset size (train + test): 466
Train set True/False ratio: 173 / 176
Test set True/False ratio: 60 / 57


## Starting the SVM Modelling

In [10]:

# Scale all the x variables 
scaled_x_train = StandardScaler().fit_transform(X_train_d_even)
scaled_x_test=StandardScaler().fit_transform(X_test_d_even)



In [11]:
# running the svm model with no parameter tuning 
svmmodel = SVC(gamma='auto')
svmmodel.fit(X_train_d_even, y_train_d_even)
y_predict_train = svmmodel.predict(scaled_x_train)
y_predict_test = svmmodel.predict(scaled_x_test)

In [12]:
# check the train-set accuracy 
train_accuracy = accuracy_score(y_train_d_even,y_predict_train)
train_accuracy

0.504297994269341

In [13]:
# check the test-set accuracy 
test_accuracy = accuracy_score(y_test_d_even,y_predict_test)
test_accuracy

0.48717948717948717

In [26]:
# Parameter Tuning using different Kernel and different values of C

svm_parameters = {'kernel':('linear', 'rbf'), 'C':[0.001,0.01, 0.1, 1.0,10.0, 100.0,1000.0]}
        # ADD CODE HERE
svc = SVC(gamma='auto')
svm_cv = GridSearchCV(svc,svm_parameters,n_jobs=-1,return_train_score=True)
svm_cv.fit(scaled_x_train,y_train_d_even)


GridSearchCV(estimator=SVC(gamma='auto'), n_jobs=-1,
             param_grid={'C': [0.001, 0.01, 0.1, 1.0, 10.0, 100.0, 1000.0],
                         'kernel': ('linear', 'rbf')},
             return_train_score=True)

In [27]:
# best score for the svm model 
best_score= svm_cv.best_score_
best_score

0.676231884057971

In [28]:
y_predict_train = svm_cv.predict(scaled_x_train)
y_predict_test = svm_cv.predict(scaled_x_test)

In [29]:
train_accuracy = accuracy_score(y_train_d_even, y_predict_train)
test_accuracy = accuracy_score(y_test_d_even, y_predict_test)


In [30]:
print(train_accuracy)
print(test_accuracy)

0.7106017191977078
0.6410256410256411


In [31]:
rank_test_score = svm_cv.cv_results_['rank_test_score']
rank_test_score

array([12, 13,  9, 13,  2,  3,  4,  1,  4,  8,  4, 10,  4, 11])

In [32]:
mean_test_score = svm_cv.cv_results_['mean_test_score']
mean_test_score

array([0.51585921, 0.50430642, 0.66761905, 0.50430642, 0.67619048,
       0.67333333, 0.67329193, 0.67623188, 0.67329193, 0.6705176 ,
       0.67329193, 0.63602484, 0.67329193, 0.61022774])

In [33]:
svm_cv.cv_results_

# lin 0.01, rbf 0.01, lin 0.1, rbf 0.1, lin 1.0, rbf 1.0 
#0.66761905, 0.50430642, 0.67619048, 0.67333333, 0.67329193, 0.67623188



{'mean_fit_time': array([0.00979605, 0.00800042, 0.00879974, 0.0075994 , 0.00559921,
        0.01180129, 0.00760055, 0.00731792, 0.01360145, 0.00840025,
        0.08836265, 0.02435942, 0.47980084, 0.21229291]),
 'std_fit_time': array([0.00247779, 0.00063295, 0.0044463 , 0.00079914, 0.00185566,
        0.0059809 , 0.00320005, 0.00077481, 0.00265347, 0.0010194 ,
        0.02559945, 0.00238946, 0.09183802, 0.07576667]),
 'mean_score_time': array([0.00280128, 0.00280099, 0.00139999, 0.00240097, 0.00280228,
        0.00219946, 0.00140109, 0.00182891, 0.00199962, 0.0015996 ,
        0.00760002, 0.00179963, 0.00406995, 0.00140061]),
 'std_score_time': array([0.00074746, 0.00075026, 0.00049008, 0.00049059, 0.00098024,
        0.00040028, 0.00049035, 0.00050956, 0.00063196, 0.0004908 ,
        0.01072528, 0.00097973, 0.00538178, 0.00048831]),
 'param_C': masked_array(data=[0.001, 0.001, 0.01, 0.01, 0.1, 0.1, 1.0, 1.0, 10.0,
                    10.0, 100.0, 100.0, 1000.0, 1000.0],
              