### ------------------------------------------------------------------------------------------------------------

In [None]:
import os

# Colab-only bootstrap: the presence of the COLAB_GPU environment variable is
# used as the "running on Google Colab" signal.
if 'COLAB_GPU' in os.environ:
    # Fetch the project and move its Python modules and data directory into the
    # working directory (the following cells import `system` / `datablock`).
    !git clone https://github.com/impulsecorp/PickStocks.git
    !mv PickStocks/*.py .
    !mv PickStocks/data .
    # Install the project's requirements and AutoGluon (quiet mode).
    !pip install -U -qq -r PickStocks/requirements.txt
    !pip install -qq autogluon
    # Pillow is pinned to 9.0.0.
    # NOTE(review): presumably a compatibility workaround - confirm it is still needed.
    !pip install Pillow==9.0.0

In [None]:
import warnings
warnings.filterwarnings("ignore")
import system
from system import *
# Small hack to prevent a Colab error: if the first import of `datablock`
# raises, the import is attempted a second time.
# Only ordinary exceptions trigger the retry, so KeyboardInterrupt/SystemExit
# still propagate instead of being swallowed by a bare `except:`.
try:
    from datablock import *
except Exception:
    from datablock import *
from autogluon.tabular import TabularDataset, TabularPredictor
seed

In [None]:
# Load numpy/matplotlib names into the interactive namespace and render figures inline.
# NOTE(review): later cells call bare `plot(...)`; presumably that name comes from this magic - confirm.
%pylab inline

### Setup

In [None]:
# Load SPY price data at the 'D' period (presumably daily bars - confirm in get_data)
# and run it through the project's procdata_lite preprocessing helper.
data_timeperiod = 'D'
data = get_data('SPY', period=data_timeperiod, nrows=None)
data = procdata_lite(data)

In [None]:
# for inspection
print(data.shape)
data.head()

In [None]:
system.train_set_end = 0.75 # fraction of the rows (0..1) where the training set ends (1.0 means all data is training set)
system.val_set_end = 1.0 # fraction of the rows (0..1) where the validation set ends (1.0 means no test set)
# The flags below are read by the manual training cells further down:
# balance_data gates SMOTE oversampling, scale_data gates StandardScaler fitting.
system.balance_data = 1
system.scale_data = 1

### ------------------------------------------------------------------------------------------------------------

### Base test

In [None]:
# Train classifier on train data
# Baseline model: train_classifier returns a (classifier, scaler) pair.
clf, scaler = train_classifier(LogisticRegression, data)

In [None]:
# Backtest the baseline with qbacktest's default settings
# (presumably the validation split - confirm in qbacktest).
equity, pf, trades = qbacktest(clf, scaler, data)

In [None]:
# Peek at the first rows of the trade list returned by the backtest.
trades.head()

### ------------------------------------------------------------------------------------------------------------

### XGBClassifier

In [None]:
# Train XGBClassifier classifier on train data
# (default hyperparameters; the tuned variant lives in the "XGBClassifier + HPO" section)
clf, scaler = train_classifier(XGBClassifier, data)

In [None]:
# Test on val data
equity, pf, trades = qbacktest(clf, scaler, data)

In [None]:
# Peek at the first rows of the trade list returned by the backtest.
trades.head()

### LGBMClassifier

In [None]:
# Train LGBMClassifier classifier on train data
# (no extra keyword arguments are passed, so the helper's/model's defaults apply)
clf, scaler = train_classifier(LGBMClassifier, data)

In [None]:
# Test on val data
equity, pf, trades = qbacktest(clf, scaler, data)

In [None]:
# Peek at the first rows of the trade list returned by the backtest.
trades.head()

### RandomForestClassifier

In [None]:
# Train RandomForestClassifier classifier on train data
# n_jobs=-1 is passed through train_classifier (presumably forwarded to the model
# constructor, where it means "use all CPU cores" - confirm).
clf, scaler = train_classifier(RandomForestClassifier, data, n_jobs=-1)

In [None]:
# Test on val data
equity, pf, trades = qbacktest(clf, scaler, data)

In [None]:
# Peek at the first rows of the trade list returned by the backtest.
trades.head()

### XGBClassifier + HPO

In [None]:
# Tune XGBClassifier on the training slice with optimize_model, then train the
# final model through train_classifier using the best hyperparameters found.
n_train_rows = int(data.shape[0] * system.train_set_end)
X_train, y_train = get_clean_Xy(data.iloc[:n_train_rows])

# Same optional preprocessing as controlled by the system flags: scaling, then balancing
scaler = StandardScaler()
if system.scale_data:
    X_train = scaler.fit_transform(X_train)
if system.balance_data:
    # SMOTE oversampling evens out the class counts in the training data
    sm = SMOTE(random_state=newseed())
    X_train, y_train = sm.fit_resample(X_train, y_train)

# hyperopt search space: q-uniform for the stepped parameters, uniform for the continuous ones
xgb_search_space = {
    "max_depth": hp.quniform("max_depth", 2, 12, 1),
    "learning_rate": hp.uniform("learning_rate", 0.001, 0.2),
    "n_estimators": hp.quniform("n_estimators", 5, 1000, 1),
    "min_child_weight": hp.quniform("min_child_weight", 1, 10, 1),
    "gamma": hp.uniform("gamma", 0, 1),
    "subsample": hp.uniform("subsample", 0.1, 1),
    "colsample_bytree": hp.uniform("colsample_bytree", 0.1, 1),
    "reg_alpha": hp.uniform("reg_alpha", 0, 1),
    "reg_lambda": hp.uniform("reg_lambda", 0, 1),
}
best_hyperparams = optimize_model(XGBClassifier(), 'XGBClassifier', xgb_search_space,
                                  X_train, y_train, max_evals=10)

# train_classifier returns its own scaler, which replaces the one created above
clf, scaler = train_classifier(XGBClassifier, data, **best_hyperparams)

In [None]:
# Test on val data
# Backtest the tuned XGBClassifier with qbacktest's default settings.
equity, pf, trades = qbacktest(clf, scaler, data)

In [None]:
# Peek at the first rows of the trade list returned by the backtest.
trades.head()

### LogisticRegression Ensemble

In [None]:
# Train LogisticRegression ensemble on train data
# ensemble_size=100 is passed to train_clf_ensemble (presumably the number of member models - confirm).
clf, scaler = train_clf_ensemble(LogisticRegression, data, ensemble_size=100)

In [None]:
# Test on val data
equity, pf, trades = qbacktest(clf, scaler, data)

In [None]:
# Peek at the first rows of the trade list returned by the backtest.
trades.head()

### AutoGluon

In [None]:
# Train AutoGluon on train data
# The preprocessing is done by hand here (instead of via train_classifier) because
# TabularPredictor is fitted on a single frame that holds features and label together.
X_train, y_train = get_clean_Xy(data.iloc[0:int(data.shape[0] * system.train_set_end)])
scaler = StandardScaler()
if system.scale_data:
    X_train = scaler.fit_transform(X_train)
if system.balance_data:
    # Apply SMOTE oversampling to balance the training data
    sm = SMOTE(random_state=newseed())
    X_train, y_train = sm.fit_resample(X_train, y_train)
xtd = pd.DataFrame(X_train)
# Attach the labels positionally. Assigning a pandas Series aligns on the index,
# and after scaling `xtd` has a fresh RangeIndex while `y_train` may still carry
# the original index (when balancing is off) - that would yield NaN/misaligned labels.
xtd['target'] = np.asarray(y_train)
clf = TabularPredictor(label='target').fit(xtd)

In [None]:
# Test on val data
# `clf` is the AutoGluon TabularPredictor and `scaler` the StandardScaler from the training cell.
equity, pf, trades = qbacktest(clf, scaler, data)

In [None]:
# Peek at the first rows of the trade list returned by the backtest.
trades.head()

### ------------------------------------------------------------------------------------------------------------

### Neural Network Training

In [None]:
# Build the training slice; it is only used below to read the number of feature columns.
X_train, y_train = get_clean_Xy(data.iloc[0:int(data.shape[0] * system.train_set_end)])

In [None]:
# Train the PyTorch wrapper: input_dim is the feature count of X_train, hidden_dim is fixed at 32.
clf, scaler = train_classifier(PyTorchClassifierWrapper, data, input_dim=X_train.shape[1], hidden_dim=32)

In [None]:
# Test on val data
equity, pf, trades = qbacktest(clf, scaler, data)

In [None]:
# Peek at the first rows of the trade list returned by the backtest.
trades.head()

### ------------------------------------------------------------------------------------------------------------

### Big Move - search for best threshold

In [None]:
# Three-way split for this section: train on the first 50% of rows, validate up to 75%,
# leaving the remainder as a test set.
system.train_set_end = 0.5 # fraction of the rows (0..1) where the training set ends (1.0 means all data is training set)
system.val_set_end = 0.75 # fraction of the rows (0..1) where the validation set ends (1.0 means no test set)

In [None]:
# Train classifier on train data
clf, scaler = train_classifier(LogisticRegression, data)

In [None]:
# Backtest with default settings; the resulting `trades` feed the threshold search below.
equity, pf, trades = qbacktest(clf, scaler, data)

In [None]:
feature_name = 'last move'

# Sweep 100 candidate minimum values between 0 and 5 for the feature
# (use_abs=True is passed to the filter) and record, per candidate,
# the profit factor and the number of trades left after filtering
levels = np.linspace(0.0, 5.0, 100)
pfs, nts = [], []
for level in tqdm(levels):
    kept = filter_trades_by_feature(trades, data, featformat(feature_name), min_value=level, use_abs=True)
    pf, ntrades = compute_stats(data, kept)
    pfs.append(pf)
    nts.append(len(ntrades))

In [None]:
# Plot the optimization/search results
# x axis: candidate threshold, y axis: profit factor obtained with that threshold
plt.plot(levels, pfs);
plt.xlabel(feature_name);
plt.ylabel('Profit Factor');

In [None]:
# Tabulate the sweep: one row per candidate level (the index), with the
# number of surviving trades and the resulting profit factor as columns
res = pd.DataFrame(np.column_stack([nts, pfs]),
                   index=np.array(levels),
                   columns=['num trades', 'profit factor'])
res

In [None]:
# Hard-coded band for the feature filter.
# NOTE(review): presumably picked by eye from the table above - confirm after re-running the search.
best_min_move = 1.0
best_max_move = 1.5

In [None]:
# Base test without the filter
# skip_val=1, skip_test=0 switches the backtest away from the default evaluation
# (presumably: skip validation, evaluate the test split - confirm in qbacktest).
equity, pf, trades = qbacktest(clf, scaler, data, skip_val=1, skip_test=0)

In [None]:
# Test with the filter
# Same backtest as above with quiet=1, then keep only the trades inside the chosen band.
equity, pf, trades = qbacktest(clf, scaler, data, quiet=1, skip_val=1, skip_test=0)
# filter stats
pf, ntrades = compute_stats(data, filter_trades_by_feature(trades, data, featformat(feature_name), min_value=best_min_move,
                                                           max_value=best_max_move,
                                                           use_abs=True))
print(f'Profit factor: {get_profit_factor(ntrades):.5f}, Winners: {get_winner_pct(ntrades):.2f}%, Trades: {len(ntrades)}')

In [None]:
# Cumulative profit of the filtered trades.
plot(ntrades['profit'].cumsum());

In [None]:
# First 20 filtered trades.
ntrades[0:20]

### ------------------------------------------------------------------------------------------------------------

### Day Of Week - Search for best day of week to trade

In [None]:
# Three-way split for this section: train on the first 50% of rows, validate up to 75%,
# leaving the remainder as a test set.
system.train_set_end = 0.5 # fraction of the rows (0..1) where the training set ends (1.0 means all data is training set)
system.val_set_end = 0.75 # fraction of the rows (0..1) where the validation set ends (1.0 means no test set)

In [None]:
# Train classifier on train data
clf, scaler = train_classifier(LogisticRegression, data)

In [None]:
# Backtest with default settings; the resulting `trades` feed the day-of-week search below.
equity, pf, trades = qbacktest(clf, scaler, data)

In [None]:
feature_name = 'day'

# Evaluate each candidate value 0..4 of the feature separately (exact match) and
# record the profit factor and trade count of the trades that match it.
# NOTE(review): 0..4 presumably encodes Monday..Friday - confirm the feature's encoding.
levels = [0,1,2,3,4]
pfs, nts = [], []
for day_value in tqdm(levels):
    matching = filter_trades_by_feature(trades, data, featformat(feature_name), exact_value=day_value)
    pf, ntrades = compute_stats(data, matching)
    pfs.append(pf)
    nts.append(len(ntrades))

In [None]:
# Plot the optimization/search results
# x axis: candidate feature value, y axis: profit factor of the trades matching it
plt.plot(levels, pfs);
plt.xlabel(feature_name);
plt.ylabel('Profit Factor');

In [None]:
# Tabulate the search: one row per candidate value (the index), with the
# number of matching trades and the resulting profit factor as columns
res = pd.DataFrame(np.column_stack([nts, pfs]),
                   index=np.array(levels),
                   columns=['num trades', 'profit factor'])
res

In [None]:
# Hard-coded feature value used for the filter.
# NOTE(review): presumably picked by eye from the table above - confirm after re-running the search.
best_day = 1

In [None]:
# Base test without the filter
# skip_val=1, skip_test=0 switches the backtest away from the default evaluation
# (presumably: skip validation, evaluate the test split - confirm in qbacktest).
equity, pf, trades = qbacktest(clf, scaler, data, skip_val=1, skip_test=0)

In [None]:
# Test with the filter
# Same backtest as above with quiet=1, then keep only the trades whose feature equals best_day.
equity, pf, trades = qbacktest(clf, scaler, data, quiet=1, skip_val=1, skip_test=0)
# filter stats
pf, ntrades = compute_stats(data, filter_trades_by_feature(trades, data, featformat(feature_name), exact_value=best_day))
print(f'Profit factor: {get_profit_factor(ntrades):.5f}, Winners: {get_winner_pct(ntrades):.2f}%, Trades: {len(ntrades)}')

In [None]:
# Cumulative profit of the filtered trades.
plot(ntrades['profit'].cumsum());

In [None]:
# First 20 filtered trades.
ntrades[0:20]

### ------------------------------------------------------------------------------------------------------------

### Feature Matrix

In [None]:
# Three-way split for this section: train on the first 50% of rows, validate up to 75%,
# leaving the remainder as a test set.
system.train_set_end = 0.5 # fraction of the rows (0..1) where the training set ends (1.0 means all data is training set)
system.val_set_end = 0.75 # fraction of the rows (0..1) where the validation set ends (1.0 means no test set)

In [None]:
# Train classifier on train data
clf, scaler = train_classifier(LogisticRegression, data)

In [None]:
# Backtest with default settings.
equity, pf, trades = qbacktest(clf, scaler, data)

In [None]:
# Collect the feature names: every column of `data` whose label contains 'X',
# converted with featdeformat (featformat is applied again wherever the column is looked up).
feature_names = [featdeformat(x) for x in data.filter(like='X')]

In [None]:
# (min, max) of every feature column over the whole of `data`,
# in the same order as feature_names
feature_ranges = [
    (np.min(values), np.max(values))
    for values in (data[featformat(name)].values for name in feature_names)
]

In [None]:
# Bin edges per feature: num_bins evenly spaced edges from the feature's min to
# its max (so num_bins - 1 bins), stacked into one array with a row per feature
num_bins = 10 + 1
feat_bins = np.array([np.linspace(lo, hi, num_bins) for lo, hi in feature_ranges])

In [None]:
# Train classifier on train data
# NOTE(review): this repeats the training already done at the start of this section.
clf, scaler = train_classifier(LogisticRegression, data)

In [None]:
# Backtest with default settings; `base_trades` is the trade list every bin filter below is applied to.
equity, pf, base_trades = qbacktest(clf, scaler, data)

In [None]:
# Peek at the first rows of the unfiltered trade list.
base_trades.head()

### Search for the best bins for each feature

In [None]:
# Flat, parallel result lists: entry k describes the k-th (feature, bin) pair
# that produced a usable result.
pf_matrix = []  # profit factor of the trades inside the bin
nt_matrix = []  # number of trades inside the bin
wn_matrix = []  # winner percentage (get_winner_pct) of those trades

# (row, col) into feat_bins: row = feature index, col = index of the bin's upper edge
coords = []

for row_idx, (fname, bins) in enumerate(zip(tqdm(feature_names), feat_bins)):
    for col_idx in range(1, len(bins)):
        # Neighbouring edges form one bin; keep them ordered as (low, high)
        lo, hi = bins[col_idx-1], bins[col_idx]
        if lo > hi:
            lo, hi = hi, lo
        pf, ntrades = compute_stats(data, filter_trades_by_feature(base_trades, data, featformat(fname), min_value=lo, max_value=hi))
        # Skip bins with a profit factor of -1 or without trades
        # (NOTE(review): -1 is presumably compute_stats' "undefined" sentinel - confirm)
        if (pf != -1) and (len(ntrades) > 0):
            pf_matrix.append(pf)
            nt_matrix.append(len(ntrades))
            wn_matrix.append(get_winner_pct(ntrades))
            coords.append((row_idx, col_idx))

### Compute the PF matrix, take the top N

In [None]:
# Bundle the parallel result lists into (pf, num trades, winner %, coords) records
# and order them by element 2 of each record, largest first.
# NOTE(review): element 2 is the winner percentage, while the section heading talks
# about the profit factor - confirm which ranking is intended.
zpd = list(zip(pf_matrix, nt_matrix, wn_matrix, coords))
zpd.sort(key=lambda rec: rec[2], reverse=True)

In [None]:
# Selection limits: keep at most N records whose trade count lies in
# [min_trades, max_trades] and whose profit factor is at least min_pf.
N = 10
min_pf = 1.0
min_trades = 50
max_trades = 1000
# The first N records (in zpd order) that pass the limits
top_pfs = []
top_nts = []
top_wns = []
top_coords = []
# The loop variable is deliberately not named `coords`: that would overwrite the
# notebook-level `coords` list with a single (row, col) tuple and silently break
# the ranking cell if it is re-run afterwards.
for pf, nt, wn, coord in zpd:
    if (min_trades <= nt <= max_trades) and (pf >= min_pf):
        top_pfs.append(pf)
        top_nts.append(nt)
        top_wns.append(wn)
        top_coords.append(coord)
        if len(top_coords) >= N:
            break
pd.DataFrame(data=list(zip(top_pfs, top_nts, top_wns)), columns=['PF', 'Trades', ' % Winners'])

In [None]:
# Description of the discovered best bins - the feature names and the bin boundaries
# (r, c) index feat_bins: r is the feature row, c the bin's upper edge, c-1 its lower edge.
# Only names and edges are printed, so no trade statistics are recomputed here.
for r, c in top_coords:
    print(feature_names[r], f'[{feat_bins[r,c-1]:.5f} .. {feat_bins[r,c]:.5f}]')

### Combine all good strategies with OR into one big strategy and check the performance on training data

In [None]:
# OR-combination of the selected bins: gather the trades that pass each bin filter,
# concatenate them, drop duplicated rows and restore index order.
# NOTE(review): drop_duplicates() compares column values only (not the index) -
# confirm that two distinct trades can never have identical rows.
per_bin_trades = []
for r, c in top_coords:
    in_bin = filter_trades_by_feature(base_trades, data, featformat(feature_names[r]),
                                      min_value=feat_bins[r][c-1], max_value=feat_bins[r][c])
    per_bin_trades.append(compute_stats(data, in_bin)[1])
alltrades = pd.concat(per_bin_trades, axis=0).drop_duplicates().sort_index()
plt.plot(alltrades['profit'].cumsum());
print(f'Profit factor: {get_profit_factor(alltrades):.5f}, Winners: {get_winner_pct(alltrades):.2f}%, Trades: {len(alltrades)}')

In [None]:
# Peek at the first rows of the combined trade list.
alltrades.head()

In [None]:
# Base test without the filter
# Retrain, then backtest with skip_val=1, skip_test=0 (presumably: skip validation,
# evaluate the test split - confirm in qbacktest). `test_trades` is filtered in the next cell.
clf, scaler = train_classifier(LogisticRegression, data)
equity, _, test_trades = qbacktest(clf, scaler, data, skip_val=1, skip_test=0)

In [None]:
# Test with the filter
# Apply every selected (feature, bin) filter to test_trades, then merge the
# per-bin results: concatenate, drop duplicated rows, restore index order
filtered_parts = [
    compute_stats(data, filter_trades_by_feature(test_trades, data, featformat(feature_names[r]),
                                                 min_value=feat_bins[r,c-1], max_value=feat_bins[r,c]))[1]
    for r, c in top_coords
]
alltrades = pd.concat(filtered_parts, axis=0).drop_duplicates().sort_index()
plt.plot(alltrades['profit'].cumsum());
print(f'Profit factor: {get_profit_factor(alltrades):.5f}, Winners: {get_winner_pct(alltrades):.2f}%, Trades: {len(alltrades)}')

In [None]:
# First 20 rows of the combined trade list.
alltrades[0:20]

### ------------------------------------------------------------------------------------------------------------

### Pred Prob - Search for best min_confidence

In [None]:
# Three-way split for this section: train on the first 50% of rows, validate up to 75%,
# leaving the remainder as a test set.
system.train_set_end = 0.5 # fraction of the rows (0..1) where the training set ends (1.0 means all data is training set)
system.val_set_end = 0.75 # fraction of the rows (0..1) where the validation set ends (1.0 means no test set)

In [None]:
# Train LogisticRegression ensemble on train data
clf, scaler = train_clf_ensemble(LogisticRegression, data, ensemble_size=100)

In [None]:
# Backtest with default settings; the resulting `trades` feed the confidence search below.
equity, pf, trades = qbacktest(clf, scaler, data)

In [None]:
# Largest confidence among the trades, where confidence = 2 * |0.5 - pred|
# (assumes `pred` is a probability in [0, 1], which maps confidence to [0, 1] - TODO confirm).
max_conf_seen = np.max(np.abs(0.5-trades['pred'].values)*2.0)
max_conf_seen

In [None]:
# Sweep 100 candidate minimum confidences from 0 up to the largest confidence
# seen in `trades`; for each one record the profit factor and the number of
# trades that remain after the confidence filter
levels = np.linspace(0.0, max_conf_seen, 100)
pfs, nts = [], []
for conf_level in tqdm(levels):
    confident = filter_trades_by_confidence(trades, min_conf=conf_level)
    pf, ntrades = compute_stats(data, confident)
    pfs.append(pf)
    nts.append(len(ntrades))

In [None]:
# Plot the optimization/search results
# x axis: candidate minimum confidence, y axis: profit factor obtained with it
plt.plot(levels, pfs);
plt.xlabel('Confidence');
plt.ylabel('Profit Factor');

In [None]:
# Tabulate the sweep: one row per candidate confidence (the index), with the
# number of surviving trades and the resulting profit factor as columns
res = pd.DataFrame(np.column_stack([nts, pfs]),
                   index=np.array(levels),
                   columns=['num trades', 'profit factor'])
res

In [None]:
# Hard-coded confidence band used for the filter.
# NOTE(review): presumably picked by eye from the table above - confirm after re-running the search.
best_min_confidence = 0.15
best_max_confidence = 0.35

In [None]:
# Base test without the filter
# The confidence band above was tuned on the predictions of the LogisticRegression
# ensemble trained at the start of this section, so that same `clf`/`scaler` pair is
# evaluated here. Retraining a single LogisticRegression at this point would apply
# the tuned thresholds to a different model's probabilities.
# skip_val=1, skip_test=0 presumably evaluates the test split - confirm in qbacktest.
equity, pf, trades = qbacktest(clf, scaler, data, skip_val=1, skip_test=0)

In [None]:
# Test with the filter
# Keep only the trades from the base test whose confidence lies inside the chosen band.
# filter stats
pf, ntrades = compute_stats(data, filter_trades_by_confidence(trades, min_conf=best_min_confidence, max_conf=best_max_confidence))
print(f'Profit factor: {get_profit_factor(ntrades):.5f}, Winners: {get_winner_pct(ntrades):.2f}%, Trades: {len(ntrades)}')

In [None]:
# Cumulative profit of the filtered trades.
plot(ntrades['profit'].cumsum());

In [None]:
# First 20 filtered trades.
ntrades[0:20]

### ------------------------------------------------------------------------------------------------------------

### X in row - Search for best value

In [None]:
# Three-way split for this section: train on the first 50% of rows, validate up to 75%,
# leaving the remainder as a test set.
system.train_set_end = 0.5 # fraction of the rows (0..1) where the training set ends (1.0 means all data is training set)
system.val_set_end = 0.75 # fraction of the rows (0..1) where the validation set ends (1.0 means no test set)

In [None]:
feature_name = 'times in row'

# Train and backtest (default settings) first, as the other search sections do.
# Without this the search below would run on whatever `trades` the previous
# section left behind - its test-set backtest - so the "best" values would be
# selected on the very data they are evaluated on afterwards.
clf, scaler = train_classifier(LogisticRegression, data)
equity, pf, trades = qbacktest(clf, scaler, data)

# Compute the profit factor for every candidate value
# Each candidate 0..10 is evaluated separately (exact match on the feature);
# the profit factor and the number of matching trades are recorded.
levels = [0,1,2,3,4,5,6,7,8,9,10]
pfs = []
nts = []
for level in tqdm(levels):
    pf, ntrades = compute_stats(data, filter_trades_by_feature(trades, data, featformat(feature_name), exact_value=level))
    pfs.append(pf)
    nts.append(len(ntrades))

In [None]:
# Plot the optimization/search results
# x axis: candidate feature value, y axis: profit factor of the trades matching it
plt.plot(levels, pfs);
plt.xlabel(feature_name);
plt.ylabel('Profit Factor');

In [None]:
# Tabulate the search: one row per candidate value (the index), with the
# number of matching trades and the resulting profit factor as columns
res = pd.DataFrame(np.column_stack([nts, pfs]),
                   index=np.array(levels),
                   columns=['num trades', 'profit factor'])
res

In [None]:
# Hard-coded range of feature values used for the filter.
# NOTE(review): presumably picked by eye from the table above - confirm after re-running the search.
min_best_x = 7
max_best_x = 9

In [None]:
# Base test without the filter
# skip_val=1, skip_test=0 switches the backtest away from the default evaluation
# (presumably: skip validation, evaluate the test split - confirm in qbacktest).
equity, pf, trades = qbacktest(clf, scaler, data, skip_val=1, skip_test=0)

In [None]:
# Test with the filter
# Same backtest as above with quiet=1, then keep only the trades whose feature lies in the chosen range.
equity, pf, trades = qbacktest(clf, scaler, data, quiet=1, skip_val=1, skip_test=0)
# filter stats
pf, ntrades = compute_stats(data, filter_trades_by_feature(trades, data, featformat(feature_name), min_value=min_best_x, max_value=max_best_x))
print(f'Profit factor: {get_profit_factor(ntrades):.5f}, Winners: {get_winner_pct(ntrades):.2f}%, Trades: {len(ntrades)}')

In [None]:
# Cumulative profit of the filtered trades.
plot(ntrades['profit'].cumsum());

In [None]:
# First 20 filtered trades.
ntrades[0:20]

### ------------------------------------------------------------------------------------------------------------