In [1]:
##################################
# IMPORTING BASE AND API LIBRARIES
##################################
from collections import Counter
import pandas as pd
from pandas import read_csv
import pandas_datareader as pdr
import datetime
# from pandas.tools.plotting import scatter_matrix
import numpy as np
from numpy import set_printoptions
import seaborn as sns
from matplotlib import pyplot
from pathlib import Path

# Import the main functionality from the SimFin Python API.
import simfin as sf
# Import names used for easy access to SimFin's data-columns.
from simfin.names import *

In [2]:
##############################
# IMPORTING MODELING LIBRARIES
##############################
from sklearn import utils
from sklearn import preprocessing
from scipy.stats import uniform

# Pre-processing
from sklearn.preprocessing import MinMaxScaler
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import Normalizer
from sklearn.preprocessing import Binarizer

# Feature selection
from sklearn.feature_selection import SelectKBest
from sklearn.feature_selection import chi2
from sklearn.decomposition import PCA
from sklearn.ensemble import ExtraTreesClassifier

# Resample, model eval, & metrics and enhancements 
from sklearn.model_selection import train_test_split
from sklearn.model_selection import KFold
from sklearn.model_selection import LeaveOneOut
from sklearn.model_selection import ShuffleSplit
from sklearn.model_selection import cross_val_score
from sklearn.metrics import classification_report
from sklearn.metrics import confusion_matrix
from sklearn.metrics import accuracy_score
from imblearn.over_sampling import SMOTE
from imblearn.under_sampling import ClusterCentroids
from imblearn.metrics import classification_report_imbalanced
from imblearn.combine import SMOTEENN
from imblearn.metrics import classification_report_imbalanced
from imblearn.ensemble import BalancedRandomForestClassifier
from sklearn.metrics import precision_recall_curve

# Regression models
from sklearn.linear_model import LinearRegression, Ridge, Lasso, ElasticNet
from sklearn.neighbors import KNeighborsRegressor
from sklearn.tree import DecisionTreeRegressor
from sklearn.svm import SVR

# Classification models
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis
from sklearn.naive_bayes import GaussianNB
from sklearn.svm import SVC

# Ensemble for further improvements
from sklearn.ensemble import BaggingClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import ExtraTreesClassifier
from sklearn.ensemble import AdaBoostClassifier
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.ensemble import VotingClassifier

# Performance tuning
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import RandomizedSearchCV

# Pipeline
from sklearn.pipeline import Pipeline
from sklearn.pipeline import FeatureUnion

# Finalize model with Pickle and joblib
from pickle import dump
from pickle import load
from sklearn.externals.joblib import dump
from sklearn.externals.joblib import load

Using TensorFlow backend.


# Getting SimFin data

In [3]:
sf.__version__

'0.3.0'

In [4]:
data_path = r'C:\Users\Roland.Ferrao\Desktop\Data'
sf.set_data_dir(data_path)

In [5]:
sf.load_api_key(default_key='free')

In [6]:
sns.set_style("whitegrid")

In [7]:
%%time
# Data for USA.
market = 'us'

# TTM Income Statements.
df_income_ttm = sf.load_income(variant='ttm', market=market)

# Quarterly Income Statements.
df_income_qrt = sf.load_income(variant='quarterly', market=market)

# TTM Balance Sheets.
df_balance_ttm = sf.load_balance(variant='ttm', market=market)

# TTM Cash-Flow Statements.
df_cashflow_ttm = sf.load_cashflow(variant='ttm', market=market)

# Quarterly Cash-Flow Statements.
df_cashflow_qrt = sf.load_cashflow(variant='quarterly', market=market)

# Latest Share-Prices.
# Use refresh_days=0 to always download the latest share-prices.
df_prices = sf.load_shareprices(variant='daily', market=market)
df_prices_latest = sf.load_shareprices(variant='latest', market=market,
                                       refresh_days=30)

Dataset "us-income-ttm" on disk (0 days old).
- Loading from disk ... Done!
Dataset "us-income-quarterly" on disk (0 days old).
- Loading from disk ... Done!
Dataset "us-balance-ttm" on disk (0 days old).
- Loading from disk ... Done!
Dataset "us-cashflow-ttm" on disk (0 days old).
- Loading from disk ... Done!
Dataset "us-cashflow-quarterly" on disk (0 days old).
- Loading from disk ... Done!
Dataset "us-shareprices-daily" on disk (0 days old).
- Loading from disk ... Done!
Dataset "us-shareprices-latest" on disk (0 days old).
- Loading from disk ... Done!
Wall time: 50.9 s


In [8]:
tickers = ['MSFT','AAPL','V','MA','INTC','CSCO','ADBE','CRM','NVDA','ACN','AVGO','PYPL','ORCL','IBM','TXN','QCOM','FIS','ADP','INTU','FISV','GPN','AMAT','MU','NOW','ADI','AMD','ADSK','LRCX','CTSH','APH','TEL','HPQ','PAYX','MSI','FLT','KLAC','MCHP','XLNX','GLW','ANSS','HPE','SNPS','VRSN','CDW','KEYS','CDNS','SWKS','MXIM','FTNT','NLOK','NTAP','WDC','IT','AKAM','BR','CTXS','STX','QRVO','LDOS','JKHY','WU','ANET','DXC','JNPR','FFIV','FLIR','XRX','IPGP','ADS']

df_income_qrt = df_income_qrt.loc[tickers].copy()
df_income_ttm = df_income_ttm.loc[tickers].copy()
df_balance_ttm = df_balance_ttm.loc[tickers].copy()
df_cashflow_ttm = df_cashflow_ttm.loc[tickers].copy()
df_cashflow_qrt = df_cashflow_qrt.loc[tickers].copy()
# df_prices_latest = df_prices_latest.loc[tickers].copy()

In [9]:
# df_prices_latest.tail()

In [18]:
# Getting signals for last day of every year
df_price_list = []
df_fin_signals_list = []
df_growth_signals_list = []
df_val_signals_list = []

# setting the price and signals data frames (not including latest year)
shareprice_last_dates = ["2011-12-30", "2012-12-31", "2013-12-31", "2014-12-31", "2015-12-31","2016-12-30", "2017-12-29", "2018-12-31"]

for i in range(len(shareprice_last_dates)):
    df_price_list.append(df_prices.loc[(df_prices.index.get_level_values('Date') == shareprice_last_dates[i])])
    df_fin_signals_list.append(sf.fin_signals(df_prices=df_price_list[i],
                                              df_income_ttm=df_income_ttm,
                                              df_balance_ttm=df_balance_ttm,
                                              fill_method='ffill'))
    df_growth_signals_list.append(sf.growth_signals(df_prices=df_price_list[i],
                                                    df_income_ttm=df_income_ttm,
                                                    df_income_qrt=df_income_qrt,
                                                    df_cashflow_ttm=df_cashflow_ttm,
                                                    df_cashflow_qrt=df_cashflow_qrt,
                                                    fill_method='ffill'))
    df_val_signals_list.append(sf.val_signals(df_prices=df_price_list[i],
                                              df_income_ttm=df_income_ttm,
                                              df_balance_ttm=df_balance_ttm,
                                              df_cashflow_ttm=df_cashflow_ttm,
                                              fill_method='ffill'))

In [19]:
# Latest year signals
df_fin_signals = \
    sf.fin_signals(df_prices=df_prices_latest,
                   df_income_ttm=df_income_ttm,
                   df_balance_ttm=df_balance_ttm,
                   fill_method='ffill')

df_growth_signals = \
    sf.growth_signals(df_prices=df_prices_latest,
                      df_income_ttm=df_income_ttm,
                      df_income_qrt=df_income_qrt,
                      df_cashflow_ttm=df_cashflow_ttm,
                      df_cashflow_qrt=df_cashflow_qrt,
                      fill_method='ffill')

df_val_signals = \
    sf.val_signals(df_prices=df_prices_latest,
                   df_income_ttm=df_income_ttm,
                   df_balance_ttm=df_balance_ttm,
                   df_cashflow_ttm=df_cashflow_ttm,
                   fill_method='ffill')




In [20]:
# Combine the DataFrames (for latest)
dfs = [df_fin_signals, df_growth_signals, df_val_signals]
df_signals = pd.concat(dfs, axis=1)
df_signals = df_signals.loc[tickers].copy()
# Show the result.
print("Signals")
df_signals.shape

Signals


(66, 25)

In [21]:
df_signals.tail()

Unnamed: 0_level_0,Unnamed: 1_level_0,Current Ratio,Debt Ratio,Gross Profit Margin,Net Profit Margin,Return on Assets,Return on Equity,Earnings Growth,Earnings Growth QOQ,Earnings Growth YOY,FCF Growth,...,Dividend Yield,Earnings Yield,FCF Yield,Market-Cap,P/Book,P/E,P/FCF,P/NCAV,P/NetNet,P/Sales
Ticker,Date,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1
VRSN,2019-11-21,1.389877,0.932381,0.841861,0.479427,0.198045,-0.462193,0.273902,0.323322,0.771687,0.008291,...,,0.025106,0.02839,23201330000.0,-16.74613,39.831358,35.224082,-11.700445,-11.490243,19.096231
WDC,2019-11-21,2.220854,0.399014,0.226447,-0.045507,-0.025791,-0.065389,-2.117037,-0.660929,-1.260582,-0.767373,...,0.041806,-0.053976,0.056553,13969280000.0,1.401553,-18.526897,17.682633,-1.762463,-1.342748,0.843097
WU,2019-11-21,0.170419,,0.409506,0.1524,0.092283,-1.733618,-2.529169,0.016779,-1.189223,-0.136746,...,0.02782,0.069359,0.039267,12282430000.0,-39.646327,14.417692,25.466374,-1.473915,-1.473915,2.197254
XLNX,2019-11-21,6.440925,0.32145,0.668651,0.287627,0.185031,0.389569,0.6222,-0.059911,0.052297,0.194007,...,0.015921,0.04099,0.043573,23236370000.0,8.619135,24.396368,22.949954,24.878154,38.02364,7.017049
XRX,2019-11-21,1.444171,0.35162,0.41292,0.036724,0.022639,0.065553,0.851282,0.539326,-1.825301,-35.65625,...,0.029315,0.019644,-0.003224,9926693000.0,1.802559,50.906119,-310.209166,-1.905681,-1.294097,0.967043


In [22]:
# Combine the DataFrames (for the previous years.)
dfs_list = [[df_fin_signals_list[i], df_growth_signals_list[i], df_val_signals_list[i]] for i in range(len(df_fin_signals_list))]
df_signals_list = [pd.concat(e, axis=1) for e in dfs_list]
df_signals_list = [e.loc[tickers].copy() for e in df_signals_list]

In [23]:
# Show an example.
print("2011 Signals")
df_signals_list[7].head()

2011 Signals


Unnamed: 0_level_0,Unnamed: 1_level_0,Current Ratio,Debt Ratio,Gross Profit Margin,Net Profit Margin,Return on Assets,Return on Equity,Earnings Growth,Earnings Growth QOQ,Earnings Growth YOY,FCF Growth,...,Dividend Yield,Earnings Yield,FCF Yield,Market-Cap,P/Book,P/E,P/FCF,P/NCAV,P/NetNet,P/Sales
Ticker,Date,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1
AAPL,2018-12-31,1.300555,0.306995,0.382085,0.227172,0.146096,0.423905,0.176269,0.413451,-0.004984,0.159609,...,0.018022,0.076829,0.080115,773553100000.0,6.56154,13.015986,12.482098,-6.726607,-5.04442,2.956871
ACN,2018-12-31,1.330578,0.000922,0.30099,0.099894,0.183292,0.425632,0.181418,0.238164,0.134436,0.265707,...,0.019383,0.04565,0.059567,92243970000.0,7.065489,21.90565,16.787882,-835.270835,-16.641346,2.188236
ADBE,2018-12-31,1.129248,0.165626,0.867664,0.286907,0.178237,0.306243,0.529424,0.017934,0.352291,0.377703,...,,0.023002,0.033451,112632000000.0,12.030616,43.474267,29.894513,-24.756849,-21.697877,12.473079
ADI,2018-12-31,1.502076,0.30967,0.682687,0.241162,0.070735,0.147166,1.056258,0.044638,0.245405,1.407814,...,0.021855,0.04647,0.067975,32180930000.0,2.92859,21.519486,14.711382,-4.378886,-4.088294,5.189684
ADP,2018-12-31,1.045737,0.082675,0.422163,0.130347,0.040505,0.458969,0.040602,0.104472,0.194011,0.59772,...,0.020114,0.031232,0.040513,57771470000.0,12.126928,32.018773,24.683389,-24.481512,-1.973189,4.173546


In [24]:
df_signals_list[7].shape

(66, 25)

Output of the previous section are `df_signals` (Signals for latest year) `df_signals_list` (Signals for previous years)



# Getting Data from Yahoo Finance

In [25]:
start_sp = datetime.datetime(2011, 1, 1)
end_sp = datetime.datetime(2019, 12, 31)

In [26]:
ticker_symbols_list = ['XLK','MSFT','AAPL','V','MA','INTC','CSCO','ADBE','CRM','NVDA','ACN','AVGO','PYPL','ORCL','IBM','TXN','QCOM','FIS','ADP','INTU','FISV','GPN','AMAT','MU','NOW','ADI','AMD','ADSK','LRCX','CTSH','APH','TEL','HPQ','PAYX','MSI','FLT','KLAC','MCHP','XLNX','GLW','ANSS','HPE','SNPS','VRSN','CDW','KEYS','CDNS','SWKS','MXIM','FTNT','NLOK','NTAP','WDC','IT','AKAM','BR','CTXS','STX','QRVO','LDOS','JKHY','WU','ANET','DXC','JNPR','FFIV','FLIR','XRX','IPGP','ADS']
# df_income_qrt = df_income_qrt.loc[tickers].copy()
# create a yearly dataframe with all features, returns (2), nlp, 3 years (2017-2018 on 2019)

In [27]:
close_price_df = pdr.get_data_yahoo(ticker_symbols_list, start_sp, end_sp)['Close']

In [28]:
close_price_df.shape

(2253, 70)

In [29]:
# Yearly returns
multpl_stock_yearly_returns = close_price_df.resample('Y').ffill().pct_change()
multpl_stock_yearly_returns.head()

Symbols,AAPL,ACN,ADBE,ADI,ADP,ADS,ADSK,AKAM,AMAT,AMD,...,SWKS,TEL,TXN,V,VRSN,WDC,WU,XLK,XLNX,XRX
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
2011-12-31,,,,,,,,,,,...,,,,,,,,,,
2012-12-31,0.314,0.249296,0.332862,0.175517,0.054064,0.394068,0.165513,0.267348,0.068161,-0.555556,...,0.251541,0.204804,0.061147,0.492958,0.086786,0.372859,-0.254655,0.133595,0.118528,-0.143216
2013-12-31,0.054212,0.236391,0.589172,0.210889,0.419287,0.816317,0.42348,0.153263,0.545455,0.6125,...,0.406897,0.484644,0.421496,0.469059,0.539928,0.974582,0.26745,0.238822,0.280535,0.784458
2014-12-31,0.377241,0.086232,0.214095,0.090124,0.175228,0.087932,0.193561,0.334464,0.409502,-0.310077,...,1.545868,0.147705,0.217718,0.177474,-0.046504,0.319428,0.038261,0.156967,-0.057273,0.138866
2015-12-31,-0.046385,0.170082,0.29216,-0.003602,0.016193,-0.033141,0.014485,-0.164072,-0.250803,0.074906,...,0.056663,0.021502,0.025061,0.183066,0.532632,-0.457543,0.0,0.035792,0.085008,-0.233045


In [30]:
# Melting yearly returns
multpl_stock_yearly_returns = multpl_stock_yearly_returns.reset_index(drop = False)
multpl_stock_yearly_returns_melt = multpl_stock_yearly_returns.melt(id_vars=["Date", "XLK"], 
        var_name="TickerSymbol", 
        value_name="YearlyReturn")

In [31]:
multpl_stock_yearly_returns_melt.head()

Unnamed: 0,Date,XLK,TickerSymbol,YearlyReturn
0,2011-12-31,,AAPL,
1,2012-12-31,0.133595,AAPL,0.314
2,2013-12-31,0.238822,AAPL,0.054212
3,2014-12-31,0.156967,AAPL,0.377241
4,2015-12-31,0.035792,AAPL,-0.046385


In [32]:
# changing XLK returns column name
multpl_stock_yearly_returns_melt.rename(columns = {'XLK':'XLK_Return'}, inplace = True)


In [33]:
multpl_stock_yearly_returns_melt.head()

Unnamed: 0,Date,XLK_Return,TickerSymbol,YearlyReturn
0,2011-12-31,,AAPL,
1,2012-12-31,0.133595,AAPL,0.314
2,2013-12-31,0.238822,AAPL,0.054212
3,2014-12-31,0.156967,AAPL,0.377241
4,2015-12-31,0.035792,AAPL,-0.046385


In [34]:
threshold = 0.05 
delta_value = threshold * multpl_stock_yearly_returns_melt['YearlyReturn']
win = np.where(multpl_stock_yearly_returns_melt['XLK_Return'] - multpl_stock_yearly_returns_melt['YearlyReturn'] >= delta_value, 1, 0)

In [35]:
multpl_stock_yearly_returns_melt['win'] = win

In [36]:
multpl_stock_yearly_returns_melt = multpl_stock_yearly_returns_melt.sort_values(by=['Date','TickerSymbol'])
multpl_stock_yearly_returns_melt.head()

Unnamed: 0,Date,XLK_Return,TickerSymbol,YearlyReturn,win
0,2011-12-31,,AAPL,,0
9,2011-12-31,,ACN,,0
18,2011-12-31,,ADBE,,0
27,2011-12-31,,ADI,,0
36,2011-12-31,,ADP,,0


In [37]:
multpl_stock_yearly_returns_melt = multpl_stock_yearly_returns_melt.reset_index(drop = True)
multpl_stock_yearly_returns_melt.head()

Unnamed: 0,Date,XLK_Return,TickerSymbol,YearlyReturn,win
0,2011-12-31,,AAPL,,0
1,2011-12-31,,ACN,,0
2,2011-12-31,,ADBE,,0
3,2011-12-31,,ADI,,0
4,2011-12-31,,ADP,,0


In [38]:
for e in df_signals_list:
    print(e.shape)

(59, 25)
(60, 25)
(61, 25)
(63, 25)
(66, 25)
(66, 25)
(66, 25)
(66, 25)


In [39]:
# appending signals dfs
all_years_signals = df_signals_list[0]
for i in range(1, len(df_signals_list)):
    all_years_signals = all_years_signals.append(df_signals_list[i])

In [40]:
all_years_signals.tail()

Unnamed: 0_level_0,Unnamed: 1_level_0,Current Ratio,Debt Ratio,Gross Profit Margin,Net Profit Margin,Return on Assets,Return on Equity,Earnings Growth,Earnings Growth QOQ,Earnings Growth YOY,FCF Growth,...,Dividend Yield,Earnings Yield,FCF Yield,Market-Cap,P/Book,P/E,P/FCF,P/NCAV,P/NetNet,P/Sales
Ticker,Date,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1
VRSN,2018-12-31,1.389877,0.932381,0.841861,0.479427,0.198045,-0.462193,0.273902,0.323322,0.771687,0.008291,...,,0.032024,0.036212,18189400000.0,-13.128647,31.227027,27.61501,-9.172927,-9.008132,14.971081
WDC,2018-12-31,2.23954,0.379899,0.334537,0.043371,0.028184,0.07461,1.178756,-1.953033,-0.408262,-0.401879,...,0.052583,0.074953,0.192863,11220400000.0,1.028262,13.34173,5.185025,-1.540205,-1.120443,0.578639
WU,2018-12-31,0.170419,,0.409506,0.1524,0.092283,-1.733618,-2.529169,0.016779,-1.189223,-0.136746,...,0.044079,0.109893,0.062216,7752064000.0,-25.022802,9.099735,16.073116,-0.930263,-0.930263,1.386798
XLNX,2018-12-31,4.378726,0.32145,0.695999,0.280082,0.159104,0.344472,0.641032,0.109628,19.03851,0.154313,...,0.016716,0.037596,0.04448,21666160000.0,8.215609,26.598902,22.481802,14.893054,18.625178,7.449869
XRX,2018-12-31,1.444171,0.35162,0.41292,0.036724,0.022639,0.065553,0.851282,0.539326,-1.825301,-35.65625,...,0.057398,0.038463,-0.006312,5069823000.0,0.920614,25.999093,-158.431975,-0.973281,-0.660929,0.493894


In [41]:
all_years_signals = all_years_signals.reset_index(drop = True)

In [42]:
all_years_signals.tail()

Unnamed: 0,Current Ratio,Debt Ratio,Gross Profit Margin,Net Profit Margin,Return on Assets,Return on Equity,Earnings Growth,Earnings Growth QOQ,Earnings Growth YOY,FCF Growth,...,Dividend Yield,Earnings Yield,FCF Yield,Market-Cap,P/Book,P/E,P/FCF,P/NCAV,P/NetNet,P/Sales
502,1.389877,0.932381,0.841861,0.479427,0.198045,-0.462193,0.273902,0.323322,0.771687,0.008291,...,,0.032024,0.036212,18189400000.0,-13.128647,31.227027,27.61501,-9.172927,-9.008132,14.971081
503,2.23954,0.379899,0.334537,0.043371,0.028184,0.07461,1.178756,-1.953033,-0.408262,-0.401879,...,0.052583,0.074953,0.192863,11220400000.0,1.028262,13.34173,5.185025,-1.540205,-1.120443,0.578639
504,0.170419,,0.409506,0.1524,0.092283,-1.733618,-2.529169,0.016779,-1.189223,-0.136746,...,0.044079,0.109893,0.062216,7752064000.0,-25.022802,9.099735,16.073116,-0.930263,-0.930263,1.386798
505,4.378726,0.32145,0.695999,0.280082,0.159104,0.344472,0.641032,0.109628,19.03851,0.154313,...,0.016716,0.037596,0.04448,21666160000.0,8.215609,26.598902,22.481802,14.893054,18.625178,7.449869
506,1.444171,0.35162,0.41292,0.036724,0.022639,0.065553,0.851282,0.539326,-1.825301,-35.65625,...,0.057398,0.038463,-0.006312,5069823000.0,0.920614,25.999093,-158.431975,-0.973281,-0.660929,0.493894


In [43]:
print(multpl_stock_yearly_returns_melt.shape)
print(all_years_signals.shape)

(621, 5)
(507, 25)


In [44]:
# Separating latest data
# multpl_stock_yearly_returns_melt['Date'] = pd.to_datetime(multpl_stock_yearly_returns_melt['Date'])
yearly_returns_latest_df = multpl_stock_yearly_returns_melt[multpl_stock_yearly_returns_melt['Date'].dt.year == 2019]
yearly_returns_df = multpl_stock_yearly_returns_melt[multpl_stock_yearly_returns_melt['Date'].dt.year != 2019]

In [45]:
print(yearly_returns_df.shape)
print(yearly_returns_latest_df.shape)

(552, 5)
(69, 5)


In [46]:
yearly_returns_df = yearly_returns_df.drop(columns = ["TickerSymbol", "Date"])
yearly_returns_latest_df = yearly_returns_latest_df.drop(columns = ["TickerSymbol", "Date"])

In [47]:
yearly_returns_df = yearly_returns_df.reset_index(drop = True)
yearly_returns_latest_df = yearly_returns_latest_df.reset_index(drop = True)

In [48]:
print(yearly_returns_df.shape)
print(all_years_signals.shape)

(552, 3)
(507, 25)


In [49]:
final_df = pd.concat([yearly_returns_df, all_years_signals], axis=1)

In [50]:
final_df.head()

Unnamed: 0,XLK_Return,YearlyReturn,win,Current Ratio,Debt Ratio,Gross Profit Margin,Net Profit Margin,Return on Assets,Return on Equity,Earnings Growth,...,Dividend Yield,Earnings Yield,FCF Yield,Market-Cap,P/Book,P/E,P/FCF,P/NCAV,P/NetNet,P/Sales
0,,,0,1.608438,,0.404789,0.239466,0.344785,0.542403,0.849854,...,,0.068334,0.079288,379340900000.0,4.951262,14.633938,12.612326,72.504003,-40.401624,3.504337
1,,,0,1.45116,0.000417,0.305433,0.083792,0.185335,0.720317,0.27505,...,0.020256,0.060548,0.08653,39390880000.0,9.336138,16.515754,11.556717,-498.985125,-11.31613,1.383893
2,,,0,3.360781,0.17855,0.894718,0.22789,0.11677,0.182498,0.95864,...,,0.064662,0.081273,14351120000.0,2.5793,15.464932,12.304213,26.556668,68.349719,3.524304
3,,,0,8.354869,0.167949,0.663658,0.289777,0.200376,0.271085,0.218106,...,0.025536,0.078649,0.070501,11028680000.0,2.905793,12.714734,14.184201,3.797591,4.378039,3.684432
4,,,0,1.094687,,0.416986,0.126393,0.0414,0.222072,0.060209,...,0.026109,0.047539,0.056627,26891590000.0,4.467634,21.035345,17.659302,59.363324,-1.357796,2.658716


In [51]:
final_df.shape

(552, 28)

In [52]:
df_signals = df_signals.reset_index(drop = True)

In [53]:
print(yearly_returns_latest_df.shape)
print(df_signals.shape)

(69, 3)
(66, 25)


In [54]:
df_signals.head()

Unnamed: 0,Current Ratio,Debt Ratio,Gross Profit Margin,Net Profit Margin,Return on Assets,Return on Equity,Earnings Growth,Earnings Growth QOQ,Earnings Growth YOY,FCF Growth,...,Dividend Yield,Earnings Yield,FCF Yield,Market-Cap,P/Book,P/E,P/FCF,P/NCAV,P/NetNet,P/Sales
0,1.540126,0.319178,0.378178,0.212381,0.151086,0.515703,-0.071811,0.362605,-0.03108,-0.081487,...,0.011591,0.045364,0.048352,1218062000000.0,13.46103,22.043972,20.681569,-14.294989,-9.49952,4.68172
1,1.396741,0.000761,0.308103,0.110589,0.195472,0.445622,0.177148,-0.095308,0.098009,0.11331,...,0.014571,0.037352,0.047113,127947300000.0,8.628944,26.772194,21.225697,261.965953,-25.020021,2.960714
2,0.966122,0.211714,0.860408,0.280772,0.179113,0.310624,0.427639,-0.005896,0.156352,0.304345,...,,0.018077,0.025757,148363700000.0,15.029521,55.319561,38.824748,-32.967247,-27.469483,15.5322
3,1.001142,0.263837,0.677884,0.247099,0.073887,0.139854,0.076706,-0.015119,-0.12568,-0.063434,...,0.018626,0.037372,0.049213,40626470000.0,3.445773,26.758188,20.319983,-5.229067,-4.878525,6.611917
4,1.010534,0.053154,0.429114,0.165169,0.063335,0.506497,0.382937,0.224816,0.152355,0.21941,...,0.017989,0.031964,0.032489,74139640000.0,13.830217,31.28519,30.779941,-26.05505,-2.903202,5.167354


In [55]:
final_test_df= pd.concat([yearly_returns_latest_df, df_signals], axis=1)
final_test_df.shape
# df_signals.head()

(69, 28)

Output of the previous section are `final_df` (for train data) `final_test_df` (for test data), both including `win` column

In [56]:
# final_df.isnull().sum()

In [None]:
# df_signals_list[5].isnull().sum()

# Add additional features

In [None]:
# NLP, FRED

# Create target and  features

In [None]:
# final_df = final_df.drop(columns = ["Ticker", "Date"])

In [57]:
print(final_df.shape)
print(final_test_df.shape)

(552, 28)
(69, 28)


In [58]:
names = list(final_df.columns)
names[0], names[2] = names[2], names[0]
final_df = final_df.reindex(names, axis=1)

In [59]:
final_df.head()

Unnamed: 0,win,YearlyReturn,XLK_Return,Current Ratio,Debt Ratio,Gross Profit Margin,Net Profit Margin,Return on Assets,Return on Equity,Earnings Growth,...,Dividend Yield,Earnings Yield,FCF Yield,Market-Cap,P/Book,P/E,P/FCF,P/NCAV,P/NetNet,P/Sales
0,0,,,1.608438,,0.404789,0.239466,0.344785,0.542403,0.849854,...,,0.068334,0.079288,379340900000.0,4.951262,14.633938,12.612326,72.504003,-40.401624,3.504337
1,0,,,1.45116,0.000417,0.305433,0.083792,0.185335,0.720317,0.27505,...,0.020256,0.060548,0.08653,39390880000.0,9.336138,16.515754,11.556717,-498.985125,-11.31613,1.383893
2,0,,,3.360781,0.17855,0.894718,0.22789,0.11677,0.182498,0.95864,...,,0.064662,0.081273,14351120000.0,2.5793,15.464932,12.304213,26.556668,68.349719,3.524304
3,0,,,8.354869,0.167949,0.663658,0.289777,0.200376,0.271085,0.218106,...,0.025536,0.078649,0.070501,11028680000.0,2.905793,12.714734,14.184201,3.797591,4.378039,3.684432
4,0,,,1.094687,,0.416986,0.126393,0.0414,0.222072,0.060209,...,0.026109,0.047539,0.056627,26891590000.0,4.467634,21.035345,17.659302,59.363324,-1.357796,2.658716


In [60]:
train_array = final_df.values
test_array = final_test_df.values

In [61]:
Y_train = train_array[0:,0]
Y_test = test_array[0:,0]

In [62]:
X_train = train_array[0:,1:]
X_test = test_array[0:,1:]

In [63]:
# np.savetxt("foo.csv", array, delimiter=",")
print(X_train.shape)
print(X_test.shape)

(552, 27)
(69, 27)


In [82]:
# mask = ~np.any(np.isnan(X), axis=1)

In [83]:
# X = X[mask]
# Y = Y[mask]

In [84]:
# X_train.fillna(0.)
# Y_train.fillna(0.)

# Feature selection or engineering

In [None]:
# Feature selection - Univariate, recursive feature elimination, PCA and feature importance

In [None]:
test = SelectKBest(score_func=chi2, k=4)

In [None]:
# fit = test.fit(X, Y)

In [None]:
pca = PCA(n_components=3)

In [None]:
# fit = pca.fit(X)

In [None]:
model = ExtraTreesClassifier()

In [None]:
# model.fit(X,Y)

In [None]:
# print(model.feature_importance_)

# Pre-processing: Scale, standardize, normalize or binarize

In [None]:
lab_enc = preprocessing.LabelEncoder()
Y = lab_enc.fit_transform(Y)

In [None]:
scaler = MinMaxScaler(feature_range=(0,1))

In [None]:
rescaledX = scaler.fit_transform(X)

# Create train-test harness

In [None]:
# Create test-train split: Basic, K-fold, leave out, repeated

In [17]:
# test_size = 0.33 # validation size
# seed = 7
# X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size=test_size,random_state=seed)
# model = LogisticRegression()
# model.fit(X_train, Y_train)
# result = model.score(X_test, Y_test)
# print("Accuracy: %.3f%%") % (result*100.0)

# Regression algorithms

In [None]:
# Regression metrics: mean absolute error, mean squared error, R2

In [None]:
# Evaluate algorithms on train - Reg (5), k-means, classification and regression trees, suppot vector machines

In [None]:
# Test options and evaluation metric
num_folds = 10
seed = 7
scoring = 'neg_mean_squared_error'

In [None]:
# Spot-Check Algorithms
models = []
models.append(('LR', LinearRegression()))
models.append(('LASSO', Lasso()))
models.append(('EN', ElasticNet()))
models.append(('KNN', KNeighborsRegressor()))
models.append(('CART', DecisionTreeRegressor()))
models.append(('SVR', SVR()))
# evaluate each model in turn
results = []
names = []
for name, model in models:
    kfold = KFold(n_splits=num_folds, random_state=seed)
    cv_results = cross_val_score(model, X_train, Y_train, cv=kfold, scoring=scoring)
    results.append(cv_results)
    names.append(name)
    msg = "%s: %f (%f)" % (name, cv_results.mean(), cv_results.std())
    print(msg)

In [None]:
# boxplot algorithm comparison
fig = pyplot.figure()
fig.suptitle('Algorithm Comparison')
ax = fig.add_subplot(111)
pyplot.boxplot(results)
ax.set_xticklabels(names)
pyplot.show()

# Classification algorithms

In [None]:
# Classification metrics: classification accuracy, logarithmic loss, AUC, confusion matrix, classification report
kfold = KFold(n_splits=10, random_state=7)
model = LogisticRegression()
scoring = 'accuracy'
results = cross_val_score(model, X, Y, cv=kfold, scoring=scoring)
print(f"Accuracy: {results.mean()} and {results.std()}")

In [None]:
# Classification: Logisitc, LDA, k-nearest neighbors, Naive bayes, classification and regression trees, & SVM

In [None]:
# SVM for classification
kfold = KFold(n_splits=10, random_state=7)

model = SVC()
results = cross_val_score(model, X, Y, cv=kfold)
print(results.mean())

In [None]:
# SVM Regression
num_folds = 10
kfold = KFold(n_splits=10, random_state=7)
model = SVR()
scoring = 'neg_mean_squared_error'
results = cross_val_score(model, X, Y, cv=kfold, scoring=scoring)
print(results.mean())

In [None]:
models = []
models.append(('LR', LogisticRegression()))
models.append(('LDA', LinearDiscriminantAnalysis()))
models.append(('KNN', KNeighborsClassifier()))
models.append(('CART', DecisionTreeClassifier()))
models.append(('NB', GaussianNB()))
models.append(('SVM', SVC()))
# evaluate each model in turn
results = []
names = []
scoring = 'accuracy'
for name, model in models:
    kfold = KFold(n_splits=10, random_state=7)
    cv_results = cross_val_score(model, X, Y, cv=kfold, scoring=scoring)
    results.append(cv_results)
    names.append(name)
    msg = "%s: %f (%f)" % (name, cv_results.mean(), cv_results.std())
    print(msg)

# Building Pipelines to avoid leakage

In [None]:
# Create a pipeline that standardizes the data then creates a model
estimators = []
estimators.append(('standardize', StandardScaler()))
estimators.append(('lda', LinearDiscriminantAnalysis()))
model = Pipeline(estimators)
# evaluate pipeline
kfold = KFold(n_splits=10, random_state=7)
results = cross_val_score(model, X, Y, cv=kfold)
print(results.mean())

In [None]:
# Adding feature extraction plus pre-processing and model into a pipeline

# Improve performance with Ensemble models

In [None]:
# bagging (bag decision trees, random forest, extra trees), boosting (adaBoost, gradient), voting (combine multiple)

# Improve performance with Algorithm tunning

In [None]:
# Using grid search and random search

In [None]:
# Grid Search for Algorithm Tuning
alphas = numpy.array([1,0.1,0.01,0.001,0.0001,0])
param_grid = dict(alpha=alphas)
model = Ridge()
grid = GridSearchCV(estimator=model, param_grid=param_grid)
grid.fit(X, Y)
print(grid.best_score_)
print(grid.best_estimator_.alpha)

# Finalize model

In [None]:
# prepare the model
scaler = StandardScaler().fit(X_train)
rescaledX = scaler.transform(X_train)
model = GradientBoostingRegressor(random_state=seed, n_estimators=400)
model.fit(rescaledX, Y_train)

In [None]:
# transform the validation dataset
rescaledValidationX = scaler.transform(X_validation)
predictions = model.predict(rescaledValidationX)
print(mean_squared_error(Y_validation, predictions))

# Save and load machine learning models - pickle, joblib

In [None]:
X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size=0.33, random_state=7)
# Fit the model on 33%
model = LogisticRegression()
model.fit(X_train, Y_train)
# save the model to disk
filename = 'finalized_model.sav'
dump(model, open(filename, 'wb'))

# load the model from disk
loaded_model = load(open(filename, 'rb'))
result = loaded_model.score(X_test, Y_test)
print(result)

In [None]:
# Using joblib
X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size=0.33, random_state=7)
# Fit the model on 33%
model = LogisticRegression()
model.fit(X_train, Y_train)
# save the model to disk
filename = 'finalized_model.sav'
dump(model, filename)

# load the model from disk
loaded_model = load(filename)
result = loaded_model.score(X_test, Y_test)
print(result)

# Load and check predictions

In [None]:
# Load the model and make the predictions from the x test dataset
model = load('finalized_model.sav')
predictions = model.predict(X_test)
predictions

In [None]:
# Add predicted results to DataFrame
# results["Predicted Value"] = predictions
results