In [None]:
# Install TA-Lib without compiling it: fetch prebuilt conda-forge packages and
# unpack them directly into the system / Python library directories.
#
# Step 1 - the native TA-Lib C library (libta-lib 0.4.0, linux-64 build).
# `curl -L` follows redirects and streams the .tar.bz2 to `tar xj`, which
# extracts only the archive's `lib` member into /usr/lib/x86_64-linux-gnu/;
# `--strip-components=1` removes the leading `lib/` path component.
# NOTE(review): the archive is piped straight into tar with no checksum
# verification.
url = 'https://anaconda.org/conda-forge/libta-lib/0.4.0/download/linux-64/libta-lib-0.4.0-h166bdaf_1.tar.bz2'
!curl -L $url | tar xj -C /usr/lib/x86_64-linux-gnu/ lib --strip-components=1
# Step 2 - the Python wrapper (ta-lib 0.4.19, py310 build).  Only the `talib`
# package directory is extracted, with its three leading path components
# (lib/python3.10/site-packages) stripped, into the Python 3.10 dist-packages
# directory.  Both the build tag and the target path hard-code Python 3.10.
url = 'https://anaconda.org/conda-forge/ta-lib/0.4.19/download/linux-64/ta-lib-0.4.19-py310hde88566_4.tar.bz2'
!curl -L $url | tar xj -C /usr/local/lib/python3.10/dist-packages/ lib/python3.10/site-packages/talib --strip-components=3


  % Total    % Received % Xferd  Average Speed   Time    Time     Time  Current
                                 Dload  Upload   Total   Spent    Left  Speed
100  4457    0  4457    0     0   5185      0 --:--:-- --:--:-- --:--:--  5182
100  517k  100  517k    0     0   192k      0  0:00:02  0:00:02 --:--:--  323k
  % Total    % Received % Xferd  Average Speed   Time    Time     Time  Current
                                 Dload  Upload   Total   Spent    Left  Speed
100  4497    0  4497    0     0   3901      0 --:--:--  0:00:01 --:--:--  3903
100  392k  100  392k    0     0   141k      0  0:00:02  0:00:02 --:--:--  271k


In [None]:
import pandas as pd
import numpy as np
import talib
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, classification_report
from sklearn import svm


# Load the raw price data.  The indicator code below reads the 'Open', 'High',
# 'Low' and 'Close' columns of this file.
df = pd.read_csv('/content/sample_data/DATA2.csv')
#df['Date'] = pd.to_datetime(df['Date'])

# Define categories

overlap_studies = ['BBAND WIDTH', 'BBAND UPPER SIGNAL', 'BBAND LOWER SIGNAL', 'DEMA', 'EMA', 'HTRENDLINE', 'KMAM', 'MIDPOINT', 'MIDPRICE', 'SAR', 'SAREXT', 'SMA3', 'SMA5', 'SMA10', 'SMA20', 'TEMA', 'TRIMA', 'WMA']
momentum_indicators = ['ADX14', 'ADX20', 'ADXR', 'APO', 'AROONOSC', 'BOP', 'CCI3', 'CCI5', 'CCI10', 'CCI14', 'CMO', 'DX', 'MACD', 'MACDSIGNAL', 'MACDHIST', 'MINUS_DI', 'MINUS_DM', 'MOM1', 'MOM3', 'MOM5', 'MOM10', 'PLUS_DI', 'PLUS_DM', 'PPO', 'ROC', 'ROCP', 'ROCR', 'ROCR100', 'RSI5', 'RSI10', 'RSI14', 'SLOWK', 'SLOWD', 'FASTK', 'FASTD', 'TRIX','ULTOSC', 'WILLR']
volatility_indicators = ['ATR', 'NATR', 'TRANGE']
pattern_recognition = ['CDL2CROWS', 'CDL3BLACKCROWS', 'CDL3INSIDE', 'CDL3LINESTRIKE', 'CDL3OUTSIDE', 'CDL3STARSINSOUTH', 'CDL3WHITESOLDIERS', 'CDLABANDONEDBABY', 'CDLADVANCEBLOCK', 'CDLBELTHOLD', 'CDLBREAKAWAY', 'CDLCLOSINGMARUBOZU', 'CDLCONCEALBABYSWALL', 'CDLCOUNTERATTACK', 'CDLDARKCLOUDCOVER', 'CDLDOJI', 'CDLDOJISTAR', 'CDLDRAGONFLYDOJI', 'CDLENGULFING', 'CDLEVENINGDOJISTAR', 'CDLEVENINGSTAR', 'CDLGAPSIDESIDEWHITE', 'CDLGRAVESTONEDOJI', 'CDLHAMMER', 'CDLHANGINGMAN', 'CDLHARAMI', 'CDLHARAMICROSS', 'CDLHIGHWAVE', 'CDLHIKKAKE', 'CDLHIKKAKEMOD', 'CDLHOMINGPIGEON', 'CDLIDENTICAL3CROWS', 'CDLINNECK', 'CDLINVERTEDHAMMER', 'CDLKICKING', 'CDLKICKINGBYLENGTH', 'CDLLADDERBOTTOM', 'CDLLONGLEGGEDDOJI', 'CDLLONGLINE', 'CDLMARUBOZU', 'CDLMATCHINGLOW', 'CDLMATHOLD', 'CDLMORNINGDOJISTAR', 'CDLMORNINGSTAR', 'CDLONNECK', 'CDLPIERCING', 'CDLRICKSHAWMAN', 'CDLRISEFALL3METHODS', 'CDLSEPARATINGLINES', 'CDLSHOOTINGSTAR', 'CDLSHORTLINE', 'CDLSPINNINGTOP', 'CDLSTALLEDPATTERN', 'CDLSTICKSANDWICH', 'CDLTAKURI', 'CDLTASUKIGAP', 'CDLTHRUSTING', 'CDLTRISTAR', 'CDLUNIQUE3RIVER', 'CDLUPSIDEGAP2CROWS', 'CDLXSIDEGAP3METHODS']
cycle_indicators = ['HT DCPERIOD', 'HT DCPHASE', 'TRENDMODE']

# Combine all indicators into one list
all_indicators = overlap_studies + momentum_indicators + volatility_indicators + pattern_recognition + cycle_indicators

# Every indicator is collected in this dict (insertion order == column order)
# and attached to `df` with a single concat further down.  Inserting ~120
# columns one at a time fragments the frame and makes pandas emit a
# PerformanceWarning for each insert.
features = {}
open_, high, low, close = df['Open'], df['High'], df['Low'], df['Close']

# OVERLAP STUDIES

# BBAND WIDTH: band width relative to the middle band, plus 0/1 flags for the
# close being above the upper band / below the lower band.
upper_band, middle_band, lower_band = talib.BBANDS(close, timeperiod=20, nbdevup=2, nbdevdn=2, matype=0)
features['BBAND WIDTH'] = (upper_band - lower_band) / middle_band
features['BBAND UPPER SIGNAL'] = (close > upper_band).astype(int)
features['BBAND LOWER SIGNAL'] = (close < lower_band).astype(int)

features['DEMA'] = talib.DEMA(close, timeperiod=30)
features['EMA'] = talib.EMA(close, timeperiod=20)
features['HTRENDLINE'] = talib.HT_TRENDLINE(close)
# The column is named 'KMAM' but holds talib.KAMA; the name is kept because
# the indicator lists above refer to it.
features['KMAM'] = talib.KAMA(close, timeperiod=30)
features['MIDPOINT'] = talib.MIDPOINT(close, timeperiod=14)
features['MIDPRICE'] = talib.MIDPRICE(high, low, timeperiod=14)
features['SAR'] = talib.SAR(high, low, acceleration=0.02, maximum=0.2)
features['SAREXT'] = talib.SAREXT(high, low)

# SMA3, SMA5, SMA10, SMA20, TEMA, TRIMA AND WMA.
for period in (3, 5, 10, 20):
    features[f'SMA{period}'] = talib.SMA(close, timeperiod=period)
features['TEMA'] = talib.TEMA(close, timeperiod=30)
features['TRIMA'] = talib.TRIMA(close, timeperiod=30)
features['WMA'] = talib.WMA(close, timeperiod=30)

# MOMENTUM INDICATORS

# ADX14, ADX20, ADXR, APO, AROONOSC, BOP, CCI3, CCI5, CCI10, CCI14, CMO, DX, MACD, MACD SIGNAL and MACDHIST
for period in (14, 20):
    features[f'ADX{period}'] = talib.ADX(high, low, close, timeperiod=period)
features['ADXR'] = talib.ADXR(high, low, close, timeperiod=14)
features['APO'] = talib.APO(close, fastperiod=12, slowperiod=26, matype=0)
features['AROONOSC'] = talib.AROONOSC(high, low, timeperiod=14)
features['BOP'] = talib.BOP(open_, high, low, close)
for period in (3, 5, 10, 14):
    features[f'CCI{period}'] = talib.CCI(high, low, close, timeperiod=period)
features['CMO'] = talib.CMO(close, timeperiod=14)
features['DX'] = talib.DX(high, low, close, timeperiod=14)
macd, signal, hist = talib.MACD(close, fastperiod=12, slowperiod=26, signalperiod=9)
features['MACD'] = macd
features['MACDSIGNAL'] = signal
features['MACDHIST'] = hist

# MINUS_DI, MINUS_DM, MOM1, MOM3, MOM5, MOM10, PLUS_DI, PLUS_DM, PPO, ROC, ROCP, ROCR, ROCR100, RSI5, RSI10, RSI14, SLOWK, SLOWD, FASTK, FASTD, TRIX, ULTOSC and WILLR
features['MINUS_DI'] = talib.MINUS_DI(high, low, close, timeperiod=14)
features['MINUS_DM'] = talib.MINUS_DM(high, low, timeperiod=14)
for period in (1, 3, 5, 10):
    features[f'MOM{period}'] = talib.MOM(close, timeperiod=period)
features['PLUS_DI'] = talib.PLUS_DI(high, low, close, timeperiod=14)
features['PLUS_DM'] = talib.PLUS_DM(high, low, timeperiod=14)
features['PPO'] = talib.PPO(close, fastperiod=12, slowperiod=26, matype=0)
features['ROC'] = talib.ROC(close, timeperiod=10)
features['ROCP'] = talib.ROCP(close, timeperiod=10)
features['ROCR'] = talib.ROCR(close, timeperiod=10)
features['ROCR100'] = talib.ROCR100(close, timeperiod=10)
for period in (5, 10, 14):
    features[f'RSI{period}'] = talib.RSI(close, timeperiod=period)
slowk, slowd = talib.STOCH(high, low, close, fastk_period=5, slowk_period=3, slowd_period=3)
features['SLOWK'] = slowk
features['SLOWD'] = slowd
fastk, fastd = talib.STOCHF(high, low, close, fastk_period=5, fastd_period=3)
features['FASTK'] = fastk
features['FASTD'] = fastd
features['TRIX'] = talib.TRIX(close, timeperiod=30)
features['ULTOSC'] = talib.ULTOSC(high, low, close, timeperiod1=7, timeperiod2=14, timeperiod3=28)
features['WILLR'] = talib.WILLR(high, low, close, timeperiod=14)

# VOLATILITY INDICATORS

# ATR, NATR AND TRANGE
features['ATR'] = talib.ATR(high, low, close, timeperiod=14)
features['NATR'] = talib.NATR(high, low, close, timeperiod=14)
features['TRANGE'] = talib.TRANGE(high, low, close)

# PATTERN RECOGNITION

# Each name in `pattern_recognition` is also the name of the talib function
# that computes it; all of them take (open, high, low, close).
for indicator in pattern_recognition:
    features[indicator] = getattr(talib, indicator)(open_, high, low, close)

# CYCLE INDICATORS

# HT DCPERIOD, HT DCPHASE and TRENDMODE
features['HT DCPERIOD'] = talib.HT_DCPERIOD(close)
features['HT DCPHASE'] = talib.HT_DCPHASE(close)
features['TRENDMODE'] = talib.HT_TRENDMODE(close)

# Fail loudly if the category lists and the computations above drift apart.
missing = [name for name in all_indicators if name not in features]
if missing:
    raise KeyError(f'indicators listed but never computed: {missing}')

# Attach all indicator columns at once.  Any pre-existing column with the same
# name is replaced, matching what per-column assignment would have done.
df = pd.concat(
    [df.drop(columns=list(features), errors='ignore'),
     pd.DataFrame(features, index=df.index)],
    axis=1,
)

# Drop every row that has a NaN in any column (e.g. indicator warm-up rows).
df = df.dropna()

#feature dimension reduction

from sklearn.decomposition import PCA
import numpy as np

# The number of principal components is derived from the column names so the
# two can never disagree.  Previously PCA was asked for 2 components while
# three names ('A', 'B', 'C') were supplied, which makes the DataFrame
# constructor raise a shape-mismatch ValueError.
pca_columns = ['A', 'B', 'C']
pca = PCA(n_components=len(pca_columns))
# NOTE(review): the PCA is fitted on the whole data set, including rows that
# are later used as test windows - confirm that this is acceptable.
pca.fit(df[all_indicators])
# Reuse df's index for the components.  dropna() above removed rows, so a
# default 0..n-1 index would make the label-aligned concat below pair each row
# with the components of a *different* (later) row.
df2 = pd.DataFrame(pca.transform(df[all_indicators]), columns=pca_columns, index=df.index)
# From here on the model features are the principal components only.
all_indicators = list(pca_columns)
df = pd.concat([df, df2], axis=1)

import warnings
warnings.filterwarnings("ignore")

df = df.dropna()

#PNL code
def pnl(df):
    """Return the summed profit of all closed round trips in ``df``.

    The rows are walked in order.  Each trade stakes 100 units of cash and the
    position is limited to -1 (short), 0 (flat) or +1 (long):

    * ``Signal == 1`` while not already long moves the position up by one and
      divides the running balance by ``Close`` (a buy).
    * ``Signal == -1`` while not already short moves the position down by one
      and multiplies the running balance by ``Close`` (a sell).
    * Whenever the position is flat after processing a row, ``balance - 100``
      is added to the total and the balance is reset to 100.

    A position that is still open after the last row is not counted.

    Args:
        df: DataFrame with a ``Signal`` column (1, -1 or anything else for
            "no action") and a ``Close`` column.

    Returns:
        The accumulated profit; ``0`` if ``df`` has no rows.
    """
    pos = 0
    profit = 0
    bt = 100
    # Walk the two columns positionally.  Looking values up by index label for
    # every row is slow and breaks when the index contains duplicate labels
    # (the lookup then returns a Series instead of a scalar).
    for signal, close in zip(df['Signal'].to_numpy(), df['Close'].to_numpy()):
        if pos < 1 and signal == 1:
            pos += 1
            bt /= close
        elif pos > -1 and signal == -1:
            pos -= 1
            bt *= close
        if pos == 0:
            # Flat: realise the result of the round trip and restake 100.
            profit += bt - 100
            bt = 100
    return profit

# Profit of each walk-forward test window, and the matching accuracy scores.
profit_percentage = []
accuracy_scores = []

#I am only training with indices for now, it's training on roughly 8 month's data on predict next 2 month's
#I'll learn how to parse datetime to do the above
window = 150000
train_len = int(0.8 * window)  # rows used for fitting in each window
step = int(0.2 * window)       # rows used for testing; also the stride


def _label_moves(close, threshold):
    """Label each row by the return from its close to the next row's close.

    Returns an array with 1 where that return exceeds ``threshold``, -1 where
    it is below ``-threshold`` and 0 otherwise.  The last row has no next
    close, so it is dropped and the result is one element shorter than
    ``close``.
    """
    # shift(-1) / close - 1 also yields a real return for the first row;
    # shift(-1).pct_change() leaves that row NaN and therefore always 0.
    next_return = close.shift(-1) / close - 1
    labels = np.where(next_return > threshold, 1,
                      np.where(next_return < -threshold, -1, 0))
    return labels[:-1]


for i in range(0, df.shape[0], step):
    # Each slice carries one extra trailing row: it only supplies the "next
    # close" for the last label and is dropped from the feature matrix.
    train_df = df.iloc[i:i + 1 + train_len]
    test_df = df.iloc[i + train_len:i + 1 + window]
    X_train = train_df[all_indicators + ['Close']].iloc[:-1]
    X_test = test_df[all_indicators + ['Close']].iloc[:-1]
    # Near the end of the data a window has no usable train or test rows.
    # Skip it explicitly rather than swallowing every exception.
    if X_train.empty or X_test.empty:
        continue

    # Threshold: 1% of the largest single-row return in the training window.
    fac = X_train.Close.pct_change().max()
    y_train = _label_moves(train_df.Close, 0.01 * fac)
    y_test = _label_moves(test_df.Close, 0.01 * fac)

    model = DecisionTreeClassifier(criterion="entropy")  # You can try RandomForestClassifier as well
    #model = svm.SVC(C=10, kernel='linear')
    model.fit(X_train, y_train)

    predictions = model.predict(X_test)
    # assign() returns a new frame, so no write goes through a slice of df.
    X_test = X_test.assign(Signal=predictions)
    accuracy_scores.append(accuracy_score(y_test, predictions))
    profit_percentage.append(pnl(X_test))


print(sum(profit_percentage))

  df[indicator] = result
  df[indicator] = result
  df[indicator] = result
  df[indicator] = result
  df[indicator] = result
  df[indicator] = result
  df[indicator] = result
  df[indicator] = result
  df[indicator] = result
  df[indicator] = result
  df[indicator] = result
  df[indicator] = result
  df[indicator] = result
  df[indicator] = result
  df[indicator] = result
  df[indicator] = result
  df[indicator] = result
  df[indicator] = result
  df[indicator] = result
  df[indicator] = result
  df[indicator] = result
  df[indicator] = result
  df['HT DCPERIOD'] = talib.HT_DCPERIOD(df['Close'])
  df['HT DCPHASE'] = talib.HT_DCPHASE(df['Close'])
  df['TRENDMODE'] = talib.HT_TRENDMODE(df['Close'])


1278.4970763181784
