### IS
Goal: Walk through all steps up to the pivot date, then continue with day-by-day trading.

In [1]:
# Import standard libraries
%matplotlib inline

import pandas as pd
import numpy as np
import datetime
from dateutil.relativedelta import relativedelta
import matplotlib.pylab as plt
from pandas.tseries.offsets import BDay
import os
import os.path
import pickle
import random
import json

from sklearn.model_selection import StratifiedShuffleSplit, TimeSeriesSplit
from sklearn import svm
from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier, GradientBoostingClassifier
from sklearn.discriminant_analysis import QuadraticDiscriminantAnalysis
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import confusion_matrix, accuracy_score, precision_score, recall_score, f1_score

In [2]:
# Import custom libraries
from Code.lib.plot_utils import PlotUtility
from Code.lib.time_utils import TimeUtility
from Code.lib.retrieve_data import DataRetrieve, ComputeTarget
from Code.lib.retrieve_system_info import TradingSystemUtility
from Code.lib.candle_indicators import CandleIndicators
from Code.lib.transformers import Transformers
from Code.lib.ta_momentum_studies import TALibMomentumStudies
from Code.lib.model_utils import ModelUtility, TimeSeriesSplitImproved
from Code.lib.feature_generator import FeatureGenerator
from Code.utilities.stat_tests import stationarity_tests
from Code.lib.config import current_feature, feature_dict
from Code.models import models_utils
from Code.lib.model_algos import AlgoUtility

# One shared instance of each project helper; the cells below call these
# objects rather than constructing their own.
plotIt = PlotUtility()              # plotting (correlation matrix)
timeUtil = TimeUtility()            # IS/OOS date splitting
ct = ComputeTarget()                # target column creation
candle_ind = CandleIndicators()
dSet = DataRetrieve()               # data loading, JSON save/load, column dropping
sysUtil = TradingSystemUtility()    # system dict / system directory lookup
taLibMomSt = TALibMomentumStudies()
transf = Transformers()             # normalisation
modelUtil = ModelUtility()          # classification prep, model fit/test
featureGen = FeatureGenerator()     # feature generation from input_dict
modelAlgo = AlgoUtility()           # classifier factories

#### Establish and save system metadata
If new issue  
    Create system name  
    Create system dict  
Define IS-OOS, pivot date parameters  

Alternative  
Identify the system name if the system already exists.  
Otherwise, if the system name is blank, a new system is created.  
Read the system dict; it will contain entries if a previous system exists.

In [3]:
# Name of the system to work on; the next cell treats "" as a request to create a new system.
system_name = "TLT-Long-system-6571-V1"

In [4]:
# An empty system_name means "create a new system"; any other value names an
# existing system whose saved dict is loaded from its directory.
if system_name == "":
    print("New system")
    # set some defaults for now
    issue, direction, ver_num = "TLT", "Long", 1
    system_dict = sysUtil.get_system_dict(system_name, issue, direction, ver_num)

    # IS/OOS split settings, stored in the dict next to the system identity
    pivotDate = str(datetime.date(2019, 1, 3))
    is_oos_ratio, oos_months, segments = 4, 4, 1
    system_dict.update({'pivotDate': pivotDate,
                        'is_oos_ratio': is_oos_ratio,
                        'oos_months': oos_months,
                        'segments': segments})

    # The name held in the dict replaces the blank one and selects the directory
    system_name = system_dict['system_name']
    system_directory = sysUtil.get_system_dir(system_name)
    dSet.save_json('system_dict.json', system_directory, system_dict)
else:
    print("Existing system")
    system_directory = sysUtil.get_system_dir(system_name)
    print(system_directory)
    file_name = 'system_dict.json'
    system_dict = dSet.load_json(system_directory, file_name)

print(system_dict)

Existing system
C:\Users\kruegkj\Documents\GitHub\QuantTradingSys\Code\notebooks\TLT-Long-system-6571-V1
{'direction': 'Long', 'is_oos_ratio': 4, 'issue': 'TLT', 'oos_months': 4, 'pivotDate': '2019-01-03', 'segments': 1, 'system_name': 'TLT-Long-system-6571-V1', 'ver_num': 1}


### At some point, need to address time parameters
Load from system dict or declare
Am I re-calculating params from function call or saving/retrieving from system dict?

In [6]:
# Set IS-OOS parameters
# Everything is read back from system_dict so this cell behaves the same for
# a newly created system and for one loaded from disk (the load path does not
# define is_oos_ratio / oos_months / segments anywhere else).
# `datetime` is deliberately left bound to the module imported at the top of
# the notebook; rebinding it to the class would break datetime.date(...) in
# the system-setup cell on a re-run.
pivotDate = datetime.datetime.strptime(system_dict['pivotDate'], '%Y-%m-%d')
is_oos_ratio = system_dict['is_oos_ratio']
oos_months = system_dict['oos_months']
segments = system_dict['segments']
print(pivotDate)

2019-01-03 00:00:00


In [8]:
# Take the issue symbol from the system dict and retrieve its data series into df.
issue = system_dict['issue']
df = dSet.read_issue_data(issue)

Successfully retrieved data series for TLT


In [11]:
# Set date range and target
# Use the full span of the loaded data: first and last entries of the Date
# column, taken by position so the lookup does not depend on index labels.
lastRow = df.shape[0]
dataLoadStartDate = df.Date.iloc[0]
dataLoadEndDate = df.Date.iloc[-1]
dataSet = dSet.set_date_range(df, dataLoadStartDate, dataLoadEndDate)
# Resolve any NA's for now: forward-fill into a new frame instead of the
# deprecated fillna(method='ffill', inplace=True)
dataSet = dataSet.ffill()

#set beLong level
beLongThreshold = 0.000
dataSet = ct.setTarget(dataSet, "Long", beLongThreshold)

In [12]:
# Quick review of loaded data: display the last three rows
dataSet.tail(3)

Unnamed: 0,Open,High,Low,Close,AdjClose,Volume,gainAhead,beLong
2019-02-12,121.66,121.75,121.2,121.555,121.555,7939591.0,-0.003743,-1
2019-02-13,121.15,121.39,120.91,121.1,121.1,4757681.0,0.005477,1
2019-02-14,122.14,122.1957,121.65,121.7633,121.7633,2496779.0,0.0,-1


### Create features
Features will be normalized.

In [None]:
# Feature specifications in generation order: (feature name, parameters).
feature_specs = [
    ('PPO', [2, 5]),
    ('RSI', [2]),
    ('CMO', [5]),
    ('CCI', [10]),
    ('UltimateOscillator', [10, 20, 30]),
    ('ROC', [10]),
    ('Lag', ['Close', 3]),
    ('Lag', ['Close', 5]),
    ('ChaikinADOSC', [4, 10]),
    ('kaufman_AMA', [4]),
]

# Keys are 'f1'..'f10'; every entry carries the same ['Normalized', 20]
# transform setting, built fresh per entry so no list object is shared.
input_dict = {
    'f%d' % position: {'fname': fname,
                       'params': params,
                       'transform': ['Normalized', 20]}
    for position, (fname, params) in enumerate(feature_specs, start=1)
}


In [None]:
# Run the feature generator over dataSet using the specs in input_dict; the result is kept as dataSet2.
# NOTE(review): not visible here whether generate_features also modifies dataSet in place - confirm.
dataSet2 = featureGen.generate_features(dataSet, input_dict)

In [None]:
# Normalise Volume on the feature frame. Passing dataSet2 (not dataSet) keeps
# the result of the feature-generation cell instead of overwriting it with a
# frame derived from the pre-feature data.
dataSet2 = transf.normalizer(dataSet2, 'Volume', 50)

In [None]:
# Display the last five rows of dataSet2
dataSet2.tail(5)

In [None]:
# Persist the feature specification next to the system dict. save_json is a
# DataRetrieve method taking (file name, directory, dict), as in the
# system-setup cell; the bare name is not defined in this notebook.
dSet.save_json('input_dict.json', system_directory, input_dict)

### Save data locally
Save the data to a pickle file in the system directory.  
Name format: `raw-features-<system_name>.pkl`

In [None]:
# Pickle the feature data set into the system directory
print("====Saving dataSet====\n")
print(system_directory, system_name, sep="\n")
file_title = "".join(["raw-features-", system_name, ".pkl"])
file_name = os.path.join(system_directory, file_title)
dataSet2.to_pickle(file_name)

### Examine correlation of features
Improve this to identify and drop features with a correlation value higher than 0.3. Look at other methods to identify features and their contribution.

In [None]:
# OHLC, volume and target columns are excluded for a cleaner correlation analysis
to_drop = ['Open','High','Low', 'gainAhead', 'Close', 'beLong', 'Volume', 'AdjClose']
# Features already marked 'Drop' in feature_dict come first, then the fixed list
col_vals = [name for name, status in feature_dict.items() if status == 'Drop'] + to_drop
mmData = dSet.drop_columns(dataSet2, col_vals)

plotIt.correlation_matrix(mmData)

#### Examine and drop features with corr value > 0.85

In [None]:
# Create correlation matrix. Absolute values are used so that strongly
# negative correlations are treated as redundant too.
corr_matrix = mmData.corr().abs()

# Select upper triangle of correlation matrix (k=1 excludes the diagonal).
# The builtin bool replaces np.bool, which has been removed from NumPy.
upper_mask = np.triu(np.ones(corr_matrix.shape, dtype=bool), k=1)
upper = corr_matrix.where(upper_mask)

# Find index of feature columns with correlation greater than 0.85
to_drop = [column for column in upper.columns if (upper[column] > 0.85).any()]
print('Column(s) to drop: %s' % to_drop)

In [None]:
# Flag every highly correlated column as 'Drop' in feature_dict; nothing is
# changed or printed when the list is empty.
if to_drop:
    feature_dict.update(dict.fromkeys(to_drop, 'Drop'))
    print(feature_dict)

### Save feature_dict to json

In [None]:
# Persist the updated feature dict in the system directory. save_json is a
# DataRetrieve method taking (file name, directory, dict); the bare name is
# not defined in this notebook.
dSet.save_json('feature_dict.json', system_directory, feature_dict)

# PROGRESS SO FAR

### (Placeholder) Examine feature importance of remaining features

### Load processing dates for IS and OOS; set start date for model analysis

In [None]:
# Compute the IS/OOS date split and unpack its first six entries by position
isOosDates = timeUtil.is_oos_data_split(issue, pivotDate, is_oos_ratio, oos_months, segments)
(dataLoadStartDate,
 is_start_date,
 oos_start_date,
 is_months,
 is_end_date,
 oos_end_date) = (isOosDates[position] for position in range(6))

# The model window starts at the IS start date and spans is_months months
modelStartDate = is_start_date
modelEndDate = modelStartDate + relativedelta(months=is_months)
print("Issue: %s" % issue)
print("IS Start date: %s  IS End date: %s" % (modelStartDate, modelEndDate))

#### Prep data sets for classification

In [None]:
model_results = []
# Slice the model window once; the modelling frame and the EV frame are
# independent copies of it.
is_window = dataSet2.loc[modelStartDate:modelEndDate]
mmData = is_window.copy()
# EV related
evData = is_window.copy()

# Price/volume columns are removed here; beLong stays in the frame
to_drop = ['Open','High','Low', 'gainAhead', 'Close', 'Volume', 'AdjClose']
col_vals = [name for name, status in feature_dict.items() if status == 'Drop'] + to_drop
mmData = dSet.drop_columns(mmData, col_vals)
nrows = mmData.shape[0]

### Prepare for classification

In [None]:
######################
# ML section
######################
# Settings and splitters for the train-test split
iterations = 100
tscv = TimeSeriesSplit(n_splits=10)
tscvi = TimeSeriesSplitImproved(n_splits=8)

# Results accumulate here across the model loop
model_results = []

# Split the modelling frame into the inputs passed to model_and_test
dX, dy = modelUtil.prepare_for_classification(mmData)

In [None]:
# Print the first ten rows of the modelling frame
print(mmData.head(10))

### Make predictions with models

In [None]:
# Candidate classifiers keyed by short name; the hyper-parameters are passed
# straight to the AlgoUtility factory methods.
to_model = {"RF": modelAlgo.setRFClass(min_samples_split=20,
                                       n_estimators=200,
                                       max_features=None
                                       ),
            "KNN": modelAlgo.setKNNClass(n_neighbors=5),
            "SVM": modelAlgo.setSVMClass(),
            "AdaBoost": modelAlgo.setAdaBoostClass(learning_rate=0.2,
                                                   n_estimators=500
                                                  ),
            "GTB": modelAlgo.setGTBClass(learning_rate=0.05,
                                         subsample=0.5,
                                         max_depth=6,
                                         n_estimators=10
                                        ),
            "QDA": modelAlgo.setQDAClass()}
for modelname, model in to_model.items():
    # Run metadata handed to model_and_test with each model
    info_dict = {'issue':issue,
                 'modelStartDate':modelStartDate,
                 'modelEndDate':modelEndDate,
                 'modelname':modelname,
                 'nrows':nrows,
                 'system_name':system_name
                }
    print(modelname)
    print(model)

    # model_results is passed in and returned, so it accumulates across models
    model_results, fit_model = modelUtil.model_and_test(dX,
                                                        dy,
                                                        model,
                                                        model_results,
                                                        tscvi,
                                                        info_dict,
                                                        evData
                                                       )

    # save the fitted model in the system directory
    print("====Saving model====\n")
    file_title = "fit-model-" + modelname + "-IS-" + system_name + ".sav"
    file_name = os.path.join(system_directory, file_title)
    # The context manager closes the file even if pickling raises
    with open(file_name, 'wb') as model_file:
        pickle.dump(fit_model, model_file)
    print(model_results)

### Save results

In [None]:
## loop ended, print results
# Column order for the report: run identity first, then IS and OOS metrics
result_columns = [
    'Issue', 'StartDate', 'EndDate', 'Model', 'Rows', 'beLongCount', 'Features',
    'IS-Accuracy', 'IS-Precision', 'IS-RMC', 'IS-RF', 'IS-NPV', 'IS-MCC', 'IS-EV',
    'OOS-Accuracy', 'OOS-Precision', 'OOS-RMC', 'OOS-RF', 'OOS-NPV', 'OOS-MCC', 'OOS-EV',
]
df = pd.DataFrame(model_results)
df = df.loc[:, result_columns]
print(df)

In [None]:
## Save results
# Imported here so `datetime` is the module in this cell even if an earlier
# cell rebound the name to the datetime class.
import datetime
run_date = datetime.datetime.now().strftime("%Y-%m-%d")
# File prefix: system name, data start date, pivot date and today's date
dirext = (system_name
          + '_start_' + dataLoadStartDate.strftime("%Y-%m-%d")
          + '_end_' + pivotDate.strftime("%Y-%m-%d")
          + '_' + run_date)
print(dirext)
filename = dirext + "IS_model_results.csv"
# os.path.join replaces the hard-coded "\\" so the path is valid on any OS
df.to_csv(os.path.join(system_directory, filename), encoding='utf-8', index=False)

In [None]:
# Save best model
# The best model is chosen manually for now. save_json is a DataRetrieve
# method taking (file name, directory, dict); the bare name is not defined in
# this notebook.
system_dict['best_model'] = "SVM"
dSet.save_json('system_dict.json', system_directory, system_dict)

### Current State
1. All models saved
2. Need to manually select best performing model
3. What is the metric for best performing model?
4. Do I need to add MAE, MAPE, etc. or other? How?

Files from analysis:
1. feature_dict
2. system_dict
3. model results (`*IS_model_results.csv`)
4. saved models
5. raw data with features


# Move to the next sheet...to simulate starting the next phase