In [1]:
##################################
# IMPORTING BASE AND API LIBRARIES
##################################
from collections import Counter
import pandas as pd
from pandas import read_csv
import pandas_datareader as pdr
import datetime
# from pandas.tools.plotting import scatter_matrix
import numpy as np
from numpy import set_printoptions
import seaborn as sns
from matplotlib import pyplot
from pathlib import Path

# Import the main functionality from the SimFin Python API.
import simfin as sf
# Import names used for easy access to SimFin's data-columns.
from simfin.names import *

In [4]:
##############################
# IMPORTING MODELING LIBRARIES
##############################
from sklearn import utils
from sklearn import preprocessing
from scipy.stats import uniform

# Pre-processing
from sklearn.preprocessing import MinMaxScaler
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import Normalizer
from sklearn.preprocessing import Binarizer

# Feature selection
from sklearn.feature_selection import SelectKBest
from sklearn.feature_selection import chi2
from sklearn.decomposition import PCA
from sklearn.ensemble import ExtraTreesClassifier

# Resample, model eval, & metrics and enhancements 
from sklearn.model_selection import train_test_split
from sklearn.model_selection import KFold
from sklearn.model_selection import LeaveOneOut
from sklearn.model_selection import ShuffleSplit
from sklearn.model_selection import cross_val_score
from sklearn.metrics import classification_report
from sklearn.metrics import confusion_matrix
from sklearn.metrics import accuracy_score
from imblearn.over_sampling import SMOTE
from imblearn.under_sampling import ClusterCentroids
from imblearn.metrics import classification_report_imbalanced
from imblearn.combine import SMOTEENN
from imblearn.metrics import classification_report_imbalanced
from imblearn.ensemble import BalancedRandomForestClassifier
from sklearn.metrics import precision_recall_curve
from sklearn.metrics import mean_squared_error

# Regression models
from sklearn.linear_model import LinearRegression, Ridge, Lasso, ElasticNet
from sklearn.neighbors import KNeighborsRegressor
from sklearn.tree import DecisionTreeRegressor
from sklearn.svm import SVR
from sklearn.ensemble import GradientBoostingRegressor

# Classification models
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis
from sklearn.naive_bayes import GaussianNB
from sklearn.svm import SVC

# Ensemble for further improvements
from sklearn.ensemble import BaggingClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import ExtraTreesClassifier
from sklearn.ensemble import AdaBoostClassifier
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.ensemble import VotingClassifier

# Performance tuning
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import RandomizedSearchCV

# Pipeline
from sklearn.pipeline import Pipeline
from sklearn.pipeline import FeatureUnion

# Finalize model with Pickle and joblib
import pickle
from pickle import dump
from pickle import load
# The joblib imports below rebind the names dump/load; call pickle.dump /
# pickle.load explicitly when the pickle versions are wanted.
try:
    from sklearn.externals.joblib import dump
    from sklearn.externals.joblib import load
except ImportError:
    # Newer scikit-learn releases no longer ship sklearn.externals.joblib.
    from joblib import dump
    from joblib import load

Using TensorFlow backend.


In [5]:
sf.__version__

'0.3.0'

In [25]:
# Keep SimFin downloads in the project-relative Data/ folder (the same folder
# the XLK holdings CSV is read from) rather than a per-user absolute path, so
# the notebook runs on any machine when started from the project directory.
DATA_DIR = Path('Data')
sf.set_data_dir(str(DATA_DIR))

In [26]:
sf.load_api_key(default_key='free')

In [27]:
sns.set_style("whitegrid")

In [6]:
%%time
# Load every SimFin dataset used by the cells below, all for the US market.
market = 'us'

# TTM Income Statements.
df_income_ttm = sf.load_income(variant='ttm', market=market)

# Quarterly Income Statements.
df_income_qrt = sf.load_income(variant='quarterly', market=market)

# TTM Balance Sheets.
df_balance_ttm = sf.load_balance(variant='ttm', market=market)

# TTM Cash-Flow Statements.
df_cashflow_ttm = sf.load_cashflow(variant='ttm', market=market)

# Quarterly Cash-Flow Statements.
df_cashflow_qrt = sf.load_cashflow(variant='quarterly', market=market)

# Share-Prices: the daily variant first, then the "latest" variant.
# NOTE(review): refresh_days presumably is the maximum age in days of the cached
# file before it is downloaded again (0 = always download) -- confirm in SimFin docs.
df_prices = sf.load_shareprices(variant='daily', market=market)
df_prices_latest = sf.load_shareprices(variant='latest', market=market,
                                       refresh_days=30)

Dataset "us-income-ttm" not on disk.
- Downloading ... 100.0%
- Extracting zip-file ... Done!
- Loading from disk ... Done!
Dataset "us-income-quarterly" not on disk.
- Downloading ... 100.0%
- Extracting zip-file ... Done!
- Loading from disk ... Done!
Dataset "us-balance-ttm" not on disk.
- Downloading ... 100.0%
- Extracting zip-file ... Done!
- Loading from disk ... Done!
Dataset "us-cashflow-ttm" not on disk.
- Downloading ... 100.0%
- Extracting zip-file ... Done!
- Loading from disk ... Done!
Dataset "us-cashflow-quarterly" not on disk.
- Downloading ... 100.0%
- Extracting zip-file ... Done!
- Loading from disk ... Done!
Dataset "us-shareprices-latest" not on disk.
- Downloading ... 100.0%
- Extracting zip-file ... Done!
- Loading from disk ... Done!
Wall time: 19.7 s


In [81]:
tickers = ['MSFT','AAPL','V','MA','INTC','CSCO','ADBE','CRM','NVDA','ACN','AVGO','PYPL','ORCL','IBM','TXN','QCOM','FIS','ADP','INTU','FISV','GPN','AMAT','MU','NOW','ADI','AMD','ADSK','LRCX','CTSH','APH','TEL','HPQ','PAYX','MSI','FLT','KLAC','MCHP','XLNX','GLW','ANSS','HPE','SNPS','VRSN','CDW','KEYS','CDNS','SWKS','MXIM','FTNT','NLOK','NTAP','WDC','IT','AKAM','BR','CTXS','STX','QRVO','LDOS','JKHY','WU','ANET','DXC','JNPR','FFIV','FLIR','XRX','IPGP','ADS']


def _select_tickers(df, tickers):
    """Return a copy of the rows of ``df`` whose first index level is in ``tickers``.

    A boolean mask is used instead of ``df.loc[tickers]`` so that a requested
    ticker with no rows in ``df`` is simply absent from the result; label-list
    lookups can raise ``KeyError`` for missing labels, depending on the pandas
    version.
    """
    keep = df.index.get_level_values(0).isin(tickers)
    return df.loc[keep].copy()


# Restrict every dataset to the selected tickers.
df_income_qrt = _select_tickers(df_income_qrt, tickers)
df_income_ttm = _select_tickers(df_income_ttm, tickers)
df_balance_ttm = _select_tickers(df_balance_ttm, tickers)
df_cashflow_ttm = _select_tickers(df_cashflow_ttm, tickers)
df_cashflow_qrt = _select_tickers(df_cashflow_qrt, tickers)
df_prices_latest = _select_tickers(df_prices_latest, tickers)

In [84]:
df_prices_latest.tail()

Unnamed: 0_level_0,Unnamed: 1_level_0,SimFinId,Open,Low,High,Close,Adj. Close,Dividend,Volume
Ticker,Date,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
VRSN,2019-11-21,378234,192.34,188.85,193.21,189.15,189.15,,442761
WDC,2019-11-21,250582,48.38,47.62,48.99,47.84,47.84,,3755582
WU,2019-11-21,378242,26.89,26.865,27.07,27.03,27.03,,5336647
XLNX,2019-11-21,396684,91.5,90.35,91.9,90.46,90.46,,2408613
XRX,2019-11-21,378247,38.44,38.24,38.95,38.69,38.69,,1670458


In [85]:
# Calculate moving average for all stocks in DataFrame.
# df_mavg = df_prices_latest.groupby(tickers, group_keys=False).rolling(window=200).mean()

In [8]:
%%time
# Keyword arguments shared by all variants of each signal family, so the calls
# below differ only in the one extra keyword that defines the variant.
fin_inputs = dict(df_prices=df_prices_latest,
                  df_income_ttm=df_income_ttm,
                  df_balance_ttm=df_balance_ttm,
                  fill_method='ffill')
growth_inputs = dict(df_prices=df_prices_latest,
                     df_income_ttm=df_income_ttm,
                     df_income_qrt=df_income_qrt,
                     df_cashflow_ttm=df_cashflow_ttm,
                     df_cashflow_qrt=df_cashflow_qrt,
                     fill_method='ffill')
val_inputs = dict(df_prices=df_prices_latest,
                  df_income_ttm=df_income_ttm,
                  df_balance_ttm=df_balance_ttm,
                  df_cashflow_ttm=df_cashflow_ttm,
                  fill_method='ffill')

# NOTE(review): the previews of the *_2017 / *_2018 frames further down show the
# same rows (all dated 2019-11-21) as the frames computed without date_index, so
# date_index='12/31/...' does not appear to select an as-of date. Presumably it
# names the report-date column in SimFin -- confirm and rework these variants.

# Financial signals: default, "2017", "2018" and 2-year-average variants.
df_fin_signals = sf.fin_signals(**fin_inputs)
df_fin_signals_2017 = sf.fin_signals(date_index='12/31/2017', **fin_inputs)
df_fin_signals_2018 = sf.fin_signals(date_index='12/31/2018', **fin_inputs)
df_fin_signals_2y = sf.fin_signals(func=sf.avg_ttm_2y, **fin_inputs)

# Growth signals, same four variants.
df_growth_signals = sf.growth_signals(**growth_inputs)
df_growth_signals_2017 = sf.growth_signals(date_index='12/31/2017', **growth_inputs)
df_growth_signals_2018 = sf.growth_signals(date_index='12/31/2018', **growth_inputs)
df_growth_signals_2y = sf.growth_signals(func=sf.avg_ttm_2y, **growth_inputs)

# Valuation signals, same four variants.
df_val_signals = sf.val_signals(**val_inputs)
df_val_signals_2017 = sf.val_signals(date_index='12/31/2017', **val_inputs)
df_val_signals_2018 = sf.val_signals(date_index='12/31/2018', **val_inputs)
df_val_signals_2y = sf.val_signals(func=sf.avg_ttm_2y, **val_inputs)

# Price signals are computed from the daily price history loaded as df_prices;
# sf.val_signals cannot be called without its data frames.
# NOTE(review): df_prices covers the whole market (it is not restricted to
# `tickers`), and despite the `_2y` name no 2-year averaging is requested
# here -- confirm what this frame is meant to contain.
df_price_signals_2y = sf.price_signals(df_prices=df_prices)

Wall time: 8.66 s


In [9]:
# Join the financial, growth and valuation signals side by side.
dfs = [df_fin_signals, df_growth_signals, df_val_signals]
df_signals = pd.concat(dfs, axis='columns')

# Preview the first rows.
df_signals.head()

Unnamed: 0_level_0,Unnamed: 1_level_0,Current Ratio,Debt Ratio,Gross Profit Margin,Net Profit Margin,Return on Assets,Return on Equity,Earnings Growth,Earnings Growth QOQ,Earnings Growth YOY,FCF Growth,...,Dividend Yield,Earnings Yield,FCF Yield,Market-Cap,P/Book,P/E,P/FCF,P/NCAV,P/NetNet,P/Sales
Ticker,Date,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1
AAPL,2019-11-21,1.540126,0.319178,0.378178,0.212381,0.151086,0.515703,-0.071811,0.362605,-0.03108,-0.081487,...,0.011591,0.045364,0.048352,1218062000000.0,13.46103,22.043972,20.681569,-14.294989,-9.49952,4.68172
ACN,2019-11-21,1.396741,0.000761,0.308103,0.110589,0.195472,0.445622,0.177148,-0.095308,0.098009,0.11331,...,0.014571,0.037352,0.047113,127947300000.0,8.628944,26.772194,21.225697,261.965953,-25.020021,2.960714
ADBE,2019-11-21,0.966122,0.211714,0.860408,0.280772,0.179113,0.310624,0.427639,-0.005896,0.156352,0.304345,...,,0.018077,0.025757,148363700000.0,15.029521,55.319561,38.824748,-32.967247,-27.469483,15.5322
ADI,2019-11-21,1.001142,0.263837,0.677884,0.247099,0.073887,0.139854,0.076706,-0.015119,-0.12568,-0.063434,...,0.018626,0.037372,0.049213,40626470000.0,3.445773,26.758188,20.319983,-5.229067,-4.878525,6.611917
ADP,2019-11-21,1.010534,0.053154,0.429114,0.165169,0.063335,0.506497,0.382937,0.224816,0.152355,0.21941,...,0.017989,0.031964,0.032489,74139640000.0,13.830217,31.28519,30.779941,-26.05505,-2.903202,5.167354


In [13]:
# Join the 2017-labelled financial, growth and valuation signals side by side.
dfs = [df_fin_signals_2017, df_growth_signals_2017, df_val_signals_2017]
df_signals_2017 = pd.concat(dfs, axis='columns')

# Preview the first rows.
df_signals_2017.head()

Unnamed: 0_level_0,Unnamed: 1_level_0,Current Ratio,Debt Ratio,Gross Profit Margin,Net Profit Margin,Return on Assets,Return on Equity,Earnings Growth,Earnings Growth QOQ,Earnings Growth YOY,FCF Growth,...,Dividend Yield,Earnings Yield,FCF Yield,Market-Cap,P/Book,P/E,P/FCF,P/NCAV,P/NetNet,P/Sales
Ticker,Date,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1
AAPL,2019-11-21,1.540126,0.319178,0.378178,0.212381,0.151086,0.515703,-0.071811,0.362605,-0.03108,-0.081487,...,0.011591,0.045364,0.048352,1218062000000.0,13.46103,22.043972,20.681569,-14.294989,-9.49952,4.68172
ACN,2019-11-21,1.396741,0.000761,0.308103,0.110589,0.195472,0.445622,0.177148,-0.095308,0.098009,0.11331,...,0.014571,0.037352,0.047113,127947300000.0,8.628944,26.772194,21.225697,261.965953,-25.020021,2.960714
ADBE,2019-11-21,0.966122,0.211714,0.860408,0.280772,0.179113,0.310624,0.427639,-0.005896,0.156352,0.304345,...,,0.018077,0.025757,148363700000.0,15.029521,55.319561,38.824748,-32.967247,-27.469483,15.5322
ADI,2019-11-21,1.001142,0.263837,0.677884,0.247099,0.073887,0.139854,0.076706,-0.015119,-0.12568,-0.063434,...,0.018626,0.037372,0.049213,40626470000.0,3.445773,26.758188,20.319983,-5.229067,-4.878525,6.611917
ADP,2019-11-21,1.010534,0.053154,0.429114,0.165169,0.063335,0.506497,0.382937,0.224816,0.152355,0.21941,...,0.017989,0.031964,0.032489,74139640000.0,13.830217,31.28519,30.779941,-26.05505,-2.903202,5.167354


In [11]:
# Join the 2018-labelled financial, growth and valuation signals side by side.
dfs = [df_fin_signals_2018, df_growth_signals_2018, df_val_signals_2018]
df_signals_2018 = pd.concat(dfs, axis='columns')

# Preview the first rows.
df_signals_2018.head()

Unnamed: 0_level_0,Unnamed: 1_level_0,Current Ratio,Debt Ratio,Gross Profit Margin,Net Profit Margin,Return on Assets,Return on Equity,Earnings Growth,Earnings Growth QOQ,Earnings Growth YOY,FCF Growth,...,Dividend Yield,Earnings Yield,FCF Yield,Market-Cap,P/Book,P/E,P/FCF,P/NCAV,P/NetNet,P/Sales
Ticker,Date,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1
AAPL,2019-11-21,1.540126,0.319178,0.378178,0.212381,0.151086,0.515703,-0.071811,0.362605,-0.03108,-0.081487,...,0.011591,0.045364,0.048352,1218062000000.0,13.46103,22.043972,20.681569,-14.294989,-9.49952,4.68172
ACN,2019-11-21,1.396741,0.000761,0.308103,0.110589,0.195472,0.445622,0.177148,-0.095308,0.098009,0.11331,...,0.014571,0.037352,0.047113,127947300000.0,8.628944,26.772194,21.225697,261.965953,-25.020021,2.960714
ADBE,2019-11-21,0.966122,0.211714,0.860408,0.280772,0.179113,0.310624,0.427639,-0.005896,0.156352,0.304345,...,,0.018077,0.025757,148363700000.0,15.029521,55.319561,38.824748,-32.967247,-27.469483,15.5322
ADI,2019-11-21,1.001142,0.263837,0.677884,0.247099,0.073887,0.139854,0.076706,-0.015119,-0.12568,-0.063434,...,0.018626,0.037372,0.049213,40626470000.0,3.445773,26.758188,20.319983,-5.229067,-4.878525,6.611917
ADP,2019-11-21,1.010534,0.053154,0.429114,0.165169,0.063335,0.506497,0.382937,0.224816,0.152355,0.21941,...,0.017989,0.031964,0.032489,74139640000.0,13.830217,31.28519,30.779941,-26.05505,-2.903202,5.167354


In [12]:
# Join the 2-year-average financial, growth and valuation signals side by side.
dfs = [df_fin_signals_2y, df_growth_signals_2y, df_val_signals_2y]
df_signals_2y = pd.concat(dfs, axis='columns')

# Preview the first rows.
df_signals_2y.head()

Unnamed: 0_level_0,Unnamed: 1_level_0,Current Ratio,Debt Ratio,Gross Profit Margin,Net Profit Margin,Return on Assets,Return on Equity,Earnings Growth,Earnings Growth QOQ,Earnings Growth YOY,FCF Growth,...,Dividend Yield,Earnings Yield,FCF Yield,Market-Cap,P/Book,P/E,P/FCF,P/NCAV,P/NetNet,P/Sales
Ticker,Date,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1
AAPL,2019-11-21,1.336526,0.316104,0.380807,0.218261,0.15485,0.479904,0.079707,0.29442,0.143644,0.078496,...,0.011424,0.047119,0.050497,1218062000000.0,12.326376,21.22299,19.803144,-11.466916,-8.089859,4.633448
ACN,2019-11-21,1.367494,0.000892,0.303594,0.104088,0.187201,0.431865,0.177795,-0.054124,0.101056,0.162651,...,0.013963,0.034542,0.044715,127947300000.0,10.014552,28.950569,22.363759,732.223757,-24.668217,3.016969
ADBE,2019-11-21,1.540728,0.154678,0.863177,0.262391,0.161561,0.279087,0.429241,0.078327,0.309864,0.304944,...,,0.015369,0.022752,148363700000.0,16.034539,65.06433,43.9525,-88.338562,-61.876131,17.200776
ADI,2019-11-21,1.29099,0.291424,0.676234,0.238277,0.070335,0.140981,0.5817,0.03803,2.444183,0.839257,...,0.017814,0.036041,0.050879,40626470000.0,3.587888,27.746543,19.654383,-5.295009,-4.944634,6.611328
ADP,2019-11-21,1.021085,0.074492,0.422348,0.145716,0.055694,0.469974,0.173548,2.201547,0.188635,0.253922,...,0.016498,0.027539,0.029566,74139640000.0,14.769589,36.312702,33.82283,-26.349051,-2.689996,5.310958


In [None]:
# Add index returns

In [28]:
# Location of the XLK holdings CSV, relative to the notebook's working directory.
xlk_ticker_path = Path('Data/XLK_All_Holdings.csv')
# Read the holdings table into a DataFrame.
xlk_tickerlist = pd.read_csv(xlk_ticker_path)

In [29]:
# Holdings-file columns that are not needed once the tickers are merged with prices.
unused_holding_columns = ["Weight", "Shares Held", "Local Currency", "Name", "SEDOL", "Identifier"]
xlk_tickerlist = xlk_tickerlist.drop(unused_holding_columns, axis='columns')

In [30]:
df_allprices = sf.load(dataset='shareprices', variant='daily', market='us')

Dataset "us-shareprices-daily" not on disk.
- Downloading ... 100.0%
- Extracting zip-file ... Done!
- Loading from disk ... Done!


In [31]:
# Attach the holdings information to every daily price row of the same ticker.
XLK_tickers = df_allprices.merge(xlk_tickerlist, on='Ticker')

In [32]:
# Only the closing price is used below; discard the other price fields.
unused_price_columns = ["Open", "Low", "High", "Adj. Close", "Volume", 'Dividend']
XLK_tickers = XLK_tickers.drop(unused_price_columns, axis='columns')

In [38]:
# Keep only the rows whose Date compares greater than '2012'.
XLK_tickers = XLK_tickers.loc[XLK_tickers['Date'] > '2012']
# Preview the filtered rows (only the last expression of a cell is displayed).
XLK_tickers.head()

Unnamed: 0,Ticker,SimFinId,Date,Close,Sector
1260,ADBE,14099,2012-01-03,28.57,Software
1261,ADBE,14099,2012-01-04,28.28,Software
1262,ADBE,14099,2012-01-05,28.48,Software
1263,ADBE,14099,2012-01-06,28.72,Software
1264,ADBE,14099,2012-01-09,28.53,Software


In [34]:
# Order rows by ticker and date so each percentage change compares consecutive
# rows of the same ticker.
XLK_tickers_returns = XLK_tickers.sort_values(by=['Ticker', 'Date']).reset_index(drop=True)
close_by_ticker = XLK_tickers_returns.groupby('Ticker')['Close']
# Row-over-row change of the close, expressed in percent.
XLK_tickers_returns['Return'] = close_by_ticker.pct_change() * 100
XLK_tickers_returns.head()

Unnamed: 0,Ticker,SimFinId,Date,Close,Sector,Return
0,AAPL,111052,2012-01-03,58.7471,Technology Hardware Storage & Peripherals,
1,AAPL,111052,2012-01-04,59.0629,Technology Hardware Storage & Peripherals,0.537558
2,AAPL,111052,2012-01-05,59.7186,Technology Hardware Storage & Peripherals,1.110172
3,AAPL,111052,2012-01-06,60.3429,Technology Hardware Storage & Peripherals,1.045403
4,AAPL,111052,2012-01-09,60.2471,Technology Hardware Storage & Peripherals,-0.158759


In [35]:
# Parse the dates once, then derive the calendar year from the parsed values.
parsed_dates = pd.to_datetime(XLK_tickers_returns['Date'])
XLK_tickers_returns['Date'] = parsed_dates
XLK_tickers_returns['Year'] = parsed_dates.dt.year

In [36]:
XLK_tickers_returns.head()

Unnamed: 0,Ticker,SimFinId,Date,Close,Sector,Return,Year
0,AAPL,111052,2012-01-03,58.7471,Technology Hardware Storage & Peripherals,,2012
1,AAPL,111052,2012-01-04,59.0629,Technology Hardware Storage & Peripherals,0.537558,2012
2,AAPL,111052,2012-01-05,59.7186,Technology Hardware Storage & Peripherals,1.110172,2012
3,AAPL,111052,2012-01-06,60.3429,Technology Hardware Storage & Peripherals,1.045403,2012
4,AAPL,111052,2012-01-09,60.2471,Technology Hardware Storage & Peripherals,-0.158759,2012


In [39]:
# Sum of the daily percentage returns per ticker and calendar year, kept in a
# variable so later cells can use it.
# NOTE(review): summing simple daily returns only approximates the yearly
# return; compounding would be exact -- confirm which one is wanted.
XLK_yearly_returns = XLK_tickers_returns.groupby(['Ticker', 'Year'])['Return'].sum()
XLK_tickers_returns.head()

Unnamed: 0,Ticker,SimFinId,Date,Close,Sector,Return,Year
0,AAPL,111052,2012-01-03,58.7471,Technology Hardware Storage & Peripherals,,2012
1,AAPL,111052,2012-01-04,59.0629,Technology Hardware Storage & Peripherals,0.537558,2012
2,AAPL,111052,2012-01-05,59.7186,Technology Hardware Storage & Peripherals,1.110172,2012
3,AAPL,111052,2012-01-06,60.3429,Technology Hardware Storage & Peripherals,1.045403,2012
4,AAPL,111052,2012-01-09,60.2471,Technology Hardware Storage & Peripherals,-0.158759,2012


In [3]:
# Start and end of the price-download window (1 Oct 2018 to 1 Oct 2019).
start_sp = datetime.datetime(year=2018, month=10, day=1)
end_sp = datetime.datetime(year=2019, month=10, day=1)

In [4]:
# Symbols to download: XLK first (kept as its own column when the prices are
# melted below), followed by the individual tickers.
ticker_symbols_list = ['XLK','MSFT','AAPL','V','MA','INTC','CSCO','ADBE','CRM','NVDA','ACN','AVGO','PYPL','ORCL','IBM','TXN','QCOM','FIS','ADP','INTU','FISV','GPN','AMAT','MU','NOW','ADI','AMD','ADSK','LRCX','CTSH','APH','TEL','HPQ','PAYX','MSI','FLT','KLAC','MCHP','XLNX','GLW','ANSS','HPE','SNPS','VRSN','CDW','KEYS','CDNS','SWKS','MXIM','FTNT','NLOK','NTAP','WDC','IT','AKAM','BR','CTXS','STX','QRVO','LDOS','JKHY','WU','ANET','DXC','JNPR','FFIV','FLIR','XRX','IPGP','ADS']

# df_income_qrt = df_income_qrt.loc[tickers].copy()
# TODO: create a yearly dataframe with all features, returns (2), nlp, 3 years (2017-2018 on 2019)

In [5]:
# Download price data for every symbol between start_sp and end_sp and keep
# only the 'Close' part (one column per symbol, as the preview below shows).
close_price_df = pdr.get_data_yahoo(ticker_symbols_list, start_sp, end_sp)['Close']

In [6]:
close_price_df.head()

Symbols,XLK,MSFT,AAPL,V,MA,INTC,CSCO,ADBE,CRM,NVDA,...,JKHY,WU,ANET,DXC,JNPR,FFIV,FLIR,XRX,IPGP,ADS
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
2018-10-01,75.690002,115.610001,227.259995,150.789993,223.770004,46.450001,48.869999,275.48999,159.860001,289.359985,...,159.039993,19.01,259.640015,93.889999,29.92,190.880005,61.490002,27.08,155.619995,237.839996
2018-10-02,75.699997,115.150002,229.279999,149.669998,222.369995,48.099998,49.009998,272.0,157.259995,286.480011,...,159.119995,18.889999,256.149994,93.43,29.83,187.509995,61.130001,27.030001,155.5,233.539993
2018-10-03,75.93,115.169998,232.070007,149.369995,223.130005,48.759998,49.139999,270.51001,159.100006,286.730011,...,159.059998,19.08,261.429993,93.669998,29.75,186.580002,61.099998,27.200001,157.419998,237.0
2018-10-04,74.559998,112.790001,227.990005,146.759995,216.149994,48.130001,48.380001,263.709991,154.899994,279.290009,...,157.800003,18.6,257.73999,92.910004,29.370001,182.979996,60.290001,27.110001,154.020004,235.580002
2018-10-05,73.620003,112.129997,224.289993,145.360001,213.259995,47.029999,48.130001,263.220001,155.070007,269.859985,...,158.660004,18.17,251.270004,90.989998,29.139999,182.660004,59.959999,26.200001,132.759995,231.330002


In [7]:
close_price_df.index

DatetimeIndex(['2018-10-01', '2018-10-02', '2018-10-03', '2018-10-04',
               '2018-10-05', '2018-10-08', '2018-10-09', '2018-10-10',
               '2018-10-11', '2018-10-12',
               ...
               '2019-09-18', '2019-09-19', '2019-09-20', '2019-09-23',
               '2019-09-24', '2019-09-25', '2019-09-26', '2019-09-27',
               '2019-09-30', '2019-10-01'],
              dtype='datetime64[ns]', name='Date', length=252, freq=None)

In [8]:
# Turn the Date index into an ordinary column (the index is kept, not dropped).
close_price_df = close_price_df.reset_index()

In [9]:
close_price_df.head()

Symbols,Date,XLK,MSFT,AAPL,V,MA,INTC,CSCO,ADBE,CRM,...,JKHY,WU,ANET,DXC,JNPR,FFIV,FLIR,XRX,IPGP,ADS
0,2018-10-01,75.690002,115.610001,227.259995,150.789993,223.770004,46.450001,48.869999,275.48999,159.860001,...,159.039993,19.01,259.640015,93.889999,29.92,190.880005,61.490002,27.08,155.619995,237.839996
1,2018-10-02,75.699997,115.150002,229.279999,149.669998,222.369995,48.099998,49.009998,272.0,157.259995,...,159.119995,18.889999,256.149994,93.43,29.83,187.509995,61.130001,27.030001,155.5,233.539993
2,2018-10-03,75.93,115.169998,232.070007,149.369995,223.130005,48.759998,49.139999,270.51001,159.100006,...,159.059998,19.08,261.429993,93.669998,29.75,186.580002,61.099998,27.200001,157.419998,237.0
3,2018-10-04,74.559998,112.790001,227.990005,146.759995,216.149994,48.130001,48.380001,263.709991,154.899994,...,157.800003,18.6,257.73999,92.910004,29.370001,182.979996,60.290001,27.110001,154.020004,235.580002
4,2018-10-05,73.620003,112.129997,224.289993,145.360001,213.259995,47.029999,48.130001,263.220001,155.070007,...,158.660004,18.17,251.270004,90.989998,29.139999,182.660004,59.959999,26.200001,132.759995,231.330002


In [11]:
# Reshape from one column per symbol to one row per (Date, symbol); Date and
# the XLK price stay as identifier columns on every row.
close_price_df_melt = pd.melt(
    close_price_df,
    id_vars=["Date", "XLK"],
    var_name="TickerSymbol",
    value_name="Price",
)


In [12]:
close_price_df_melt.head()

Unnamed: 0,Date,XLK,TickerSymbol,Price
0,2018-10-01,75.690002,MSFT,115.610001
1,2018-10-02,75.699997,MSFT,115.150002
2,2018-10-03,75.93,MSFT,115.169998
3,2018-10-04,74.559998,MSFT,112.790001
4,2018-10-05,73.620003,MSFT,112.129997


In [13]:
# Select year end returns

In [14]:
# Calendar month (1-12) of every row's Date.
close_price_df_melt['month'] = pd.to_datetime(close_price_df_melt['Date']).dt.month

In [15]:
# Merge with main dataframe and others

In [16]:
close_price_df_melt.head()

Unnamed: 0,Date,XLK,TickerSymbol,Price,month
0,2018-10-01,75.690002,MSFT,115.610001,10
1,2018-10-02,75.699997,MSFT,115.150002,10
2,2018-10-03,75.93,MSFT,115.169998,10
3,2018-10-04,74.559998,MSFT,112.790001,10
4,2018-10-05,73.620003,MSFT,112.129997,10


In [17]:
# Select December values


In [21]:
# Boolean selector for the December rows, then the rows themselves.
is_december = close_price_df_melt['month'].eq(12)
close_price_df_dec = close_price_df_melt.loc[is_december]

In [22]:
close_price_df_dec.head()

Unnamed: 0,Date,XLK,TickerSymbol,Price,month
44,2018-12-03,69.440002,MSFT,112.089996,12
45,2018-12-04,66.809998,MSFT,108.519997,12
46,2018-12-06,66.959999,MSFT,109.190002,12
47,2018-12-07,64.599998,MSFT,104.82,12
48,2018-12-10,65.489998,MSFT,107.589996,12


In [20]:
# Create features

In [None]:
# Add features

In [96]:
# xlf_df = pd.DataFrame()
# if xlk_df['month'] == 12:
#     xlf_df['XLK'].append()

# xlf_df.head()

# Combine return data frames together

# Add additional features

In [None]:
# NLP, FRED

# Create target and  features

In [None]:
array = df_signals_2y.values

In [90]:
len(array)

66

In [92]:
names = df_signals_2y.columns

In [154]:
# Target: the first column of the signal array, all rows.
Y = array[:, 0]

In [155]:
# Features: every column after the first, all rows.
X = array[:, 1:]

In [157]:
# np.savetxt("foo.csv", array, delimiter=",")
X.shape

(66, 24)

In [188]:
# True for the rows of X that contain no NaN in any feature.
mask = ~np.isnan(X).any(axis=1)

In [189]:
# Drop the incomplete rows from features and target together so they stay aligned.
X, Y = X[mask], Y[mask]

# Feature selection or engineering

In [None]:
# Feature selection - Univariate, recursive feature elimination, PCA and feature importance

In [None]:
test = SelectKBest(score_func=chi2, k=4)

In [None]:
# fit = test.fit(X, Y)

In [173]:
pca = PCA(n_components=3)

In [175]:
# fit = pca.fit(X)

In [176]:
model = ExtraTreesClassifier()

In [179]:
# model.fit(X,Y)

In [None]:
# print(model.feature_importance_)

# Pre-processing: Scale, standardize, normalize or binarize

In [None]:
# Replace the target values with integer class labels.
# NOTE(review): Y is sliced from a numeric signal column earlier in the
# notebook, so presumably almost every distinct value becomes its own class --
# confirm that a classification target is really intended here.
lab_enc = preprocessing.LabelEncoder()
Y = lab_enc.fit_transform(Y)

In [None]:
scaler = MinMaxScaler(feature_range=(0,1))

In [None]:
rescaledX = scaler.fit_transform(X)

# Create train-test harness

In [16]:
# Create test-train split: Basic, K-fold, leave out, repeated

In [202]:
test_size = 0.33  # fraction of the rows held out for testing
seed = 7
# Seeded split so the same rows land in train/test on every run.
X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size=test_size, random_state=seed)
model = LogisticRegression()
model.fit(X_train, Y_train)
# Mean accuracy on the held-out rows.
result = model.score(X_test, Y_test)
# The % formatting must be applied to the string inside print(), not to print's return value.
print("Accuracy: %.3f%%" % (result * 100.0))



# Regression algorithms

In [None]:
# Regression metrics: mean absolute error, mean squared error, R2

In [None]:
# Evaluate algorithms on train - Reg (5), k-means, classification and regression trees, support vector machines

In [None]:
# Test options and evaluation metric
# Cross-validation settings: fold count, random seed and scoring metric.
num_folds, seed = 10, 7
scoring = 'neg_mean_squared_error'

In [None]:
# Spot-check several regression algorithms with their default settings.
models = [
    ('LR', LinearRegression()),
    ('LASSO', Lasso()),
    ('EN', ElasticNet()),
    ('KNN', KNeighborsRegressor()),
    ('CART', DecisionTreeRegressor()),
    ('SVR', SVR()),
]
# Evaluate each model in turn; results/names feed the comparison plot.
results = []
names = []
for name, model in models:
    # shuffle=True makes random_state take effect; recent scikit-learn rejects
    # a random_state that is passed without shuffling.
    kfold = KFold(n_splits=num_folds, shuffle=True, random_state=seed)
    cv_results = cross_val_score(model, X_train, Y_train, cv=kfold, scoring=scoring)
    results.append(cv_results)
    names.append(name)
    msg = "%s: %f (%f)" % (name, cv_results.mean(), cv_results.std())
    print(msg)

In [None]:
# Box plot of the cross-validation scores, one box per algorithm.
fig = pyplot.figure()
fig.suptitle('Algorithm Comparison')
ax = fig.add_subplot(1, 1, 1)
ax.boxplot(results)
ax.set_xticklabels(names)
pyplot.show()

In [None]:
# Pipeline: standardize the features, then fit linear discriminant analysis.
estimators = []
estimators.append(('standardize', StandardScaler()))
estimators.append(('lda', LinearDiscriminantAnalysis()))
model = Pipeline(estimators)
# Evaluate the pipeline with seeded, shuffled 10-fold cross-validation.
# shuffle=True makes random_state take effect; recent scikit-learn rejects a
# random_state that is passed without shuffling.
kfold = KFold(n_splits=10, shuffle=True, random_state=7)
results = cross_val_score(model, X, Y, cv=kfold)
print(results.mean())

# Classification algorithms

In [None]:
# Classification metrics: classification accuracy, logarithmic loss, AUC, confusion matrix, classification report
# shuffle=True makes random_state take effect; recent scikit-learn rejects a
# random_state that is passed without shuffling.
kfold = KFold(n_splits=10, shuffle=True, random_state=7)
model = LogisticRegression()
scoring = 'accuracy'
# Mean and standard deviation of the accuracy over the folds.
results = cross_val_score(model, X, Y, cv=kfold, scoring=scoring)
print(f"Accuracy: {results.mean()} and {results.std()}")

In [211]:
# Classification: Logistic, LDA, k-nearest neighbors, Naive Bayes, classification and regression trees, & SVM

In [None]:
# SVM for classification
# shuffle=True makes random_state take effect; recent scikit-learn rejects a
# random_state that is passed without shuffling.
kfold = KFold(n_splits=10, shuffle=True, random_state=7)

model = SVC()
# Default scoring of the estimator, averaged over the folds.
results = cross_val_score(model, X, Y, cv=kfold)
print(results.mean())

In [None]:
# SVM Regression
num_folds = 10
# shuffle=True makes random_state take effect; recent scikit-learn rejects a
# random_state that is passed without shuffling.
kfold = KFold(n_splits=num_folds, shuffle=True, random_state=7)
model = SVR()
scoring = 'neg_mean_squared_error'
# Mean of the per-fold scores for the chosen metric.
results = cross_val_score(model, X, Y, cv=kfold, scoring=scoring)
print(results.mean())

In [None]:
# Spot-check several classification algorithms with their default settings.
models = [
    ('LR', LogisticRegression()),
    ('LDA', LinearDiscriminantAnalysis()),
    ('KNN', KNeighborsClassifier()),
    ('CART', DecisionTreeClassifier()),
    ('NB', GaussianNB()),
    ('SVM', SVC()),
]
# Evaluate each model in turn with the same seeded folds.
results = []
names = []
scoring = 'accuracy'
for name, model in models:
    # shuffle=True makes random_state take effect; recent scikit-learn rejects
    # a random_state that is passed without shuffling.
    kfold = KFold(n_splits=10, shuffle=True, random_state=7)
    cv_results = cross_val_score(model, X, Y, cv=kfold, scoring=scoring)
    results.append(cv_results)
    names.append(name)
    msg = "%s: %f (%f)" % (name, cv_results.mean(), cv_results.std())
    print(msg)

# Building Pipelines to avoid leakage

In [None]:
# Create a pipeline that standardizes the data then creates a model.
# Inside cross_val_score the scaler is re-fitted on each training fold only.
estimators = []
estimators.append(('standardize', StandardScaler()))
estimators.append(('lda', LinearDiscriminantAnalysis()))
model = Pipeline(estimators)
# Evaluate the pipeline.
# shuffle=True makes random_state take effect; recent scikit-learn rejects a
# random_state that is passed without shuffling.
kfold = KFold(n_splits=10, shuffle=True, random_state=7)
results = cross_val_score(model, X, Y, cv=kfold)
print(results.mean())

In [None]:
# Adding feature extraction plus pre-processing and model into a pipeline

In [None]:
# Create a pipeline that extracts features from the data then creates a model
# load data
# NOTE(review): this cell reads a separate CSV and rebinds names, array, X and
# Y, so every later cell works on this data set instead of the signal data --
# confirm that is intended.
filename = 'pima-indians-diabetes.data.csv'
names = ['preg', 'plas', 'pres', 'skin', 'test', 'mass', 'pedi', 'age', 'class']
dataframe = read_csv(filename, names=names)
array = dataframe.values
X = array[:,0:8]   # first eight columns are the features
Y = array[:,8]     # ninth column ('class') is the target
# create feature union: 3 principal components next to the 6 best-scoring columns
features = []
features.append(('pca', PCA(n_components=3)))
features.append(('select_best', SelectKBest(k=6)))
feature_union = FeatureUnion(features)
# create pipeline
estimators = []
estimators.append(('feature_union', feature_union))
estimators.append(('logistic', LogisticRegression()))
model = Pipeline(estimators)
# evaluate pipeline
# shuffle=True makes random_state take effect; recent scikit-learn rejects a
# random_state that is passed without shuffling.
kfold = KFold(n_splits=10, shuffle=True, random_state=7)
results = cross_val_score(model, X, Y, cv=kfold)
print(results.mean())

# Improve performance with Ensemble models

In [None]:
# bagging (bag decision trees, random forest, extra trees), boosting (adaBoost, gradient), voting (combine multiple)

# Improve performance with Algorithm tuning

In [None]:
# Using grid search and random search

In [None]:
# Grid Search for Algorithm Tuning
# Candidate regularization strengths; numpy is imported under the alias np.
alphas = np.array([1,0.1,0.01,0.001,0.0001,0])
param_grid = dict(alpha=alphas)
model = Ridge()
# Try every alpha with cross-validation and keep the best one.
grid = GridSearchCV(estimator=model, param_grid=param_grid)
grid.fit(X, Y)
print(grid.best_score_)
print(grid.best_estimator_.alpha)

# Finalize model

In [None]:
# prepare the model
# The scaler is fitted on the training rows only, so no information from the
# held-out rows reaches the model.
scaler = StandardScaler().fit(X_train)
rescaledX = scaler.transform(X_train)
# GradientBoostingRegressor comes from sklearn.ensemble (imported with the
# other regression models at the top of the notebook).
model = GradientBoostingRegressor(random_state=seed, n_estimators=400)
model.fit(rescaledX, Y_train)

In [None]:
# transform the validation dataset
# The held-out rows produced by train_test_split are named X_test / Y_test in
# this notebook; scale them with the scaler fitted on the training rows.
rescaledValidationX = scaler.transform(X_test)
predictions = model.predict(rescaledValidationX)
print(mean_squared_error(Y_test, predictions))

# Save and load machine learning models - pickle, joblib

In [None]:
X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size=0.33, random_state=7)
# Fit the model on the training split (the 33% is the held-out test share)
model = LogisticRegression()
model.fit(X_train, Y_train)
# save the model to disk; the with-block closes the file even if dumping fails
filename = 'finalized_model.sav'
with open(filename, 'wb') as model_file:
    # pickle is called through its module because the bare dump/load names
    # are rebound by the joblib imports.
    pickle.dump(model, model_file)

# load the model from disk
# NOTE: unpickling can execute arbitrary code; only load files written by this notebook.
with open(filename, 'rb') as model_file:
    loaded_model = pickle.load(model_file)
result = loaded_model.score(X_test, Y_test)
print(result)

In [None]:
# Using joblib
X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size=0.33, random_state=7)
# Fit the model on the training split (the 33% is the held-out test share)
model = LogisticRegression()
model.fit(X_train, Y_train)
# save the model to disk
# NOTE(review): dump/load are imported from both pickle and joblib at the top of
# the notebook; the later joblib import is the one bound here.
filename = 'finalized_model.sav'
dump(model, filename)

# load the model from disk and score it on the held-out rows
loaded_model = load(filename)
result = loaded_model.score(X_test, Y_test)
print(result)

# Load and check predictions

In [None]:
# Load the model and make the predictions from the held-out test features.
# NOTE(review): the stored model must have been trained on the same feature
# columns as X_test -- confirm against whatever produced the .joblib file.
model = load('../Resources/random_forest_model.joblib')
# The held-out features are named X_test (upper case) throughout this notebook.
predictions = model.predict(X_test)
predictions

In [None]:
# Put the held-out targets and the model's predictions side by side.
# `results` holds cross-validation scores from earlier cells, so a fresh
# DataFrame is built here rather than adding a column to it.
# NOTE(review): assumes `predictions` was computed from X_test so both columns
# have the same length -- confirm.
results = pd.DataFrame({"Actual": Y_test, "Predicted Value": predictions})
results