In [1]:
# Import all liquid stocks of 2015 (features X)
import numpy as np
import csv
# Read the whole CSV inside a context manager so the file handle is closed
# deterministically; newline='' lets the csv module do its own newline handling.
with open('data/x2015_noFinance.csv', newline='') as x_file:
    data = list(csv.reader(x_file))
feature_data = np.asarray(data)  # csv.reader yields text, so every cell is a string here
# Ticker (column 0) plus the hand-picked feature columns; row 0 holds the header names
selected_features = feature_data[:, [0, 6, 7, 8, 9, 10, 12, 13, 15, 16, 17, 18, 19, 20]]
print('Selected Feature Names: ', '\n', selected_features[0:1, :], ' ...')
# X with selected features and labels & blanks removed
x_data_features = selected_features[1:, :]  # drop the header row (basic slice: a view, not a copy)
# Blank cells are replaced so the later float conversion cannot fail on ''.
# The array holds strings, so what is stored is the text form of 0.0.
x_data_features[x_data_features == ''] = 0.0
x_data = x_data_features
print('First few Stocks with Features: ', '\n', x_data[0:3, :], ' ...')
print('\n', np.size(x_data[:, 0]), 'Stocks by', np.size(x_data[0, :]), 'Features')

Selected Feature Names:  
 [['Ticker' 'total yield' 'ms debt' 'roic v2' 'book price' 'sic' 'opInc'
  'fcf yield' 'comStockEq' 'ms cem' 'total return' 'spitz roic' 'ms ce'
  'momentum']]  ...
First few Stocks with Features:  
 [['DD' '0.03' '11356000156.93' '0.19' '0.16' '16417999988.64'
  '2892999992.35' '0.03' '9514000033.31' '5531999915.68' '0.09' '0.18'
  '4452000010.73' '0.93']
 ['DOW' '0.04' '21291999674.27' '0.13' '0.42' '39294999803.92'
  '5032000045.36' '0.03' '25035000219.44' '7032000010.63' '0.1' '0.13'
  '7032000010.63' '0.94']
 ['MON' '0.09' '9039999988.11' '0.2' '0.1' '11898000085.39' '2375000012.38'
  '0.04' '4534000078.38' '1735999970.61' '0.1' '0.2' '1676000016.95' '0.88']]  ...

 1436 Stocks by 14 Features


In [3]:
# Import best performing stocks of 2015 (y = 1)
import csv
# Context manager closes the file handle; newline='' is what the csv module expects.
with open('data/y201501_noFinancials.csv', newline='') as y_file:
    data = list(csv.reader(y_file))
y_data = np.asarray(data[1:])  # skip the header row
# Find X and Y tickers
x_tickers = x_data[:, 0]
y_tickers = y_data[:, 0]
# Format Y to y = 1 (positive) and y = 0 (negative) examples
# A stock is positive when its ticker also appears in the best-performers file.
true_false_mask = np.in1d(x_tickers, y_tickers)
y_mask = np.where(true_false_mask, 1, 0)
# Place data set into input (X) and output (Y) variables
x_strings = x_data[:, 1:]  # take off tickers, as they can't be tensor'd
# Builtin float replaces the np.float alias, which was deprecated in NumPy 1.20
# and removed in 1.24; the resulting dtype (float64) is unchanged.
X = x_strings.astype(float)  # convert strings to float
Y = y_mask        # Y uses the 0, 1 to show negative and positive examples
np.set_printoptions(precision=3, suppress=True)
print('First few X Training Examples with', np.size(X[0, :]), 'Selected Features to Numbers: \n', X[0:3, :], ' ...')
print('First few Positive Examples: ', '\n', y_data[0:3, 0:5], ' ...')
print('\nPositive Example Count: ', np.size(x_tickers[np.nonzero(y_mask)]))
# print('First few Y values: ', '\n', Y[0:10].reshape(10,1), ' ...')

First few X Training Examples with 13 Selected Features to Numbers: 
 [[  3.000e-02   1.136e+10   1.900e-01   1.600e-01   1.642e+10   2.893e+09
    3.000e-02   9.514e+09   5.532e+09   9.000e-02   1.800e-01   4.452e+09
    9.300e-01]
 [  4.000e-02   2.129e+10   1.300e-01   4.200e-01   3.929e+10   5.032e+09
    3.000e-02   2.504e+10   7.032e+09   1.000e-01   1.300e-01   7.032e+09
    9.400e-01]
 [  9.000e-02   9.040e+09   2.000e-01   1.000e-01   1.190e+10   2.375e+09
    4.000e-02   4.534e+09   1.736e+09   1.000e-01   2.000e-01   1.676e+09
    8.800e-01]]  ...
First few Positive Examples:  
 [['AAPL' 'Apple, Inc.' 'NASD' 'Technology' 'Computer Hardware']
 ['ABAX' 'ABAXIS, Inc.' 'NASD' 'Health Care' 'Medical Supplies']
 ['ABC' 'AmerisourceBergen Corp.' 'NYSE' 'Consumer Staples'
  'Drug Retailers']]  ...

Positive Example Count:  119


In [5]:
# Standardize the X data
from sklearn.preprocessing import StandardScaler

np.set_printoptions(precision=3, suppress=True)

# Fit the scaler on X, then apply it to the same matrix.
scaler = StandardScaler()
scaler.fit(X)
rescaledX = scaler.transform(X)

# summarize transformed data: show the first standardized row
print(rescaledX[:5][0])

[-0.424  0.662 -0.071 -0.563  0.475  0.7   -0.003  0.467  0.648 -0.024
 -0.031  1.144  0.587]


In [6]:
# Control X vs rescaledX
# Switch point: rebinding X here makes every later cell that reads X operate on
# the standardized matrix. The unscaled values are no longer reachable through
# X after this line; skip or comment out this cell to run the later cells on
# the raw values instead.
X = rescaledX

In [7]:
# Feature Extraction with Low Variance Removal
from sklearn.feature_selection import VarianceThreshold
# feature extraction
# NOTE(review): X is the standardized matrix at this point, so every
# non-constant column has variance close to 1 and a 0.16 cutoff cannot remove
# anything (the saved output keeps all 13 features). TODO confirm whether this
# filter was meant to run on the unscaled data.
variance_sel = VarianceThreshold(threshold=(.8 * (1 - .8)))
variance_fit = variance_sel.fit(X)  # fit() returns the selector itself
# Reuse the fitted selector instead of fitting a second time via fit_transform().
variance_after_features = variance_fit.transform(X)
# summarize selected features
initial_feature_labels = selected_features[0, 1:]  # header row without the ticker column
np.set_printoptions(precision=3, suppress=True)
# Pass the label row through the same selector so the surviving names line up
# with the surviving columns.
variance_labels = variance_fit.transform(initial_feature_labels.reshape(1, -1))
print('Initial Features: \n', X[0:3, :])
print('Selected Features: \n', variance_after_features[0:3, :])
print('\nInitial Feature Names: \n', initial_feature_labels)
print('\nSelected Feature Names: \n', variance_labels)

Initial Features: 
 [[-0.424  0.662 -0.071 -0.563  0.475  0.7   -0.003  0.467  0.648 -0.024
  -0.031  1.144  0.587]
 [-0.233  1.565 -0.122 -0.059  1.662  1.459 -0.003  1.775  0.874 -0.024
  -0.053  1.942  0.662]
 [ 0.72   0.452 -0.062 -0.68   0.24   0.516  0.048  0.048  0.074 -0.024
  -0.022  0.286  0.211]]
Selected Features: 
 [[-0.424  0.662 -0.071 -0.563  0.475  0.7   -0.003  0.467  0.648 -0.024
  -0.031  1.144  0.587]
 [-0.233  1.565 -0.122 -0.059  1.662  1.459 -0.003  1.775  0.874 -0.024
  -0.053  1.942  0.662]
 [ 0.72   0.452 -0.062 -0.68   0.24   0.516  0.048  0.048  0.074 -0.024
  -0.022  0.286  0.211]]

Initial Feature Names: 
 ['total yield' 'ms debt' 'roic v2' 'book price' 'sic' 'opInc' 'fcf yield'
 'comStockEq' 'ms cem' 'total return' 'spitz roic' 'ms ce' 'momentum']

Selected Feature Names: 
 [['total yield' 'ms debt' 'roic v2' 'book price' 'sic' 'opInc' 'fcf yield'
  'comStockEq' 'ms cem' 'total return' 'spitz roic' 'ms ce' 'momentum']]


In [8]:
# Feature Extraction with Univariate Statistical Tests (ANOVA F-value f_classif for classification)
from sklearn.feature_selection import SelectKBest, f_classif

np.set_printoptions(precision=3, suppress=True)

# Header row without the ticker column: one name per column of X.
initial_feature_labels = selected_features[0, 1:]

# Score each feature against Y with the ANOVA F-test and keep the top four.
SelectKBest_features = SelectKBest(score_func=f_classif, k=4)
SelectKBest_fit = SelectKBest_features.fit(X, Y)

# Apply the fitted selector to the data, and to the label row (as a 1 x n
# matrix) so the surviving names match the surviving columns.
SelectKBest_output_features = SelectKBest_fit.transform(X)
SelectKBest_labels = SelectKBest_fit.transform(initial_feature_labels[np.newaxis, :])

# summarize selected features
print('Initial Features: \n', X[:3])
print('Selected Features: \n', SelectKBest_output_features[:3])
print('Fit SCores: \n', SelectKBest_fit.scores_)
print('\nInitial Feature Names: \n', initial_feature_labels)
print('\nSelected Feature Names: \n', SelectKBest_labels)

Initial Features: 
 [[-0.424  0.662 -0.071 -0.563  0.475  0.7   -0.003  0.467  0.648 -0.024
  -0.031  1.144  0.587]
 [-0.233  1.565 -0.122 -0.059  1.662  1.459 -0.003  1.775  0.874 -0.024
  -0.053  1.942  0.662]
 [ 0.72   0.452 -0.062 -0.68   0.24   0.516  0.048  0.048  0.074 -0.024
  -0.022  0.286  0.211]]
Selected Features: 
 [[-0.563  0.475  0.7    0.467]
 [-0.059  1.662  1.459  1.775]
 [-0.68   0.24   0.516  0.048]]
Fit SCores: 
 [  0.289   2.462   0.002   5.819   3.778  14.872   0.549   4.115   3.08
   0.143   0.004   1.471   0.001]

Initial Feature Names: 
 ['total yield' 'ms debt' 'roic v2' 'book price' 'sic' 'opInc' 'fcf yield'
 'comStockEq' 'ms cem' 'total return' 'spitz roic' 'ms ce' 'momentum']

Selected Feature Names: 
 [['book price' 'sic' 'opInc' 'comStockEq']]


In [9]:
# Feature Extraction with Univariate Statistical Tests (Chi-squared for classification)
from sklearn.feature_selection import SelectKBest, chi2

np.set_printoptions(precision=3, suppress=True)

# Header row without the ticker column: one name per column of X.
initial_feature_labels = selected_features[0, 1:]

# NOTE(review): non_z is not read anywhere else in this cell; kept in case a
# later cell uses it.
non_z = np.where(X < 0)

# chi2 rejects negative inputs, so score on a copy of X with every negative
# entry replaced by 0.0 (X itself is left untouched).
x_no_neg = np.where(X < 0, 0.0, X)

# feature extraction
chi_select_features = SelectKBest(score_func=chi2, k=4)
chi_fit = chi_select_features.fit(x_no_neg, Y)

# The selector was fitted on the clipped copy, but the columns shown below are
# taken from the unclipped X, so negative values can still appear in them.
chi_fit_output_features = chi_fit.transform(X)
chi_labels = chi_fit.transform(initial_feature_labels[np.newaxis, :])

# summarize selected features
print('Initial Features: \n', X[:3])
print('Selected Features: \n', chi_fit_output_features[:3])
print('Chi-Fit Scores: \n', chi_fit.scores_)
print('\nInitial Feature Names: \n', initial_feature_labels)
print('\nSelected Feature Names: \n', chi_labels)

Initial Features: 
 [[-0.424  0.662 -0.071 -0.563  0.475  0.7   -0.003  0.467  0.648 -0.024
  -0.031  1.144  0.587]
 [-0.233  1.565 -0.122 -0.059  1.662  1.459 -0.003  1.775  0.874 -0.024
  -0.053  1.942  0.662]
 [ 0.72   0.452 -0.062 -0.68   0.24   0.516  0.048  0.048  0.074 -0.024
  -0.022  0.286  0.211]]
Selected Features: 
 [[ 0.475  0.7    0.467  0.648]
 [ 1.662  1.459  1.775  0.874]
 [ 0.24   0.516  0.048  0.074]]
Chi-Fit Scores: 
 [  0.054   7.766   0.519  11.59   13.342  55.703   2.128  16.068  19.073
   3.746   0.949   7.101   0.371]

Initial Feature Names: 
 ['total yield' 'ms debt' 'roic v2' 'book price' 'sic' 'opInc' 'fcf yield'
 'comStockEq' 'ms cem' 'total return' 'spitz roic' 'ms ce' 'momentum']

Selected Feature Names: 
 [['sic' 'opInc' 'comStockEq' 'ms cem']]


In [11]:
# Feature Extraction with RFE
from sklearn.feature_selection import RFE
from sklearn.linear_model import LogisticRegression
# feature extraction
# Recursive feature elimination around a logistic-regression estimator,
# stopping once three features remain.
rfe_model = LogisticRegression()
# n_features_to_select is passed by keyword: newer scikit-learn releases make it
# keyword-only and reject the positional form, while older ones accept both.
rfe = RFE(rfe_model, n_features_to_select=3)
rfe_fit = rfe.fit(X, Y)
# summarize selected features
initial_feature_labels = selected_features[0,1:]  # header row without the ticker column
np.set_printoptions(precision=3, suppress=True)
# Pass the label row through the fitted selector so the surviving names line up
# with the surviving columns.
rfe_labels = rfe_fit.transform(initial_feature_labels.reshape(1, -1))
rfe_fit_output_features = rfe_fit.transform(X)
# print('Selected Features: ', rfe_fit.support_)
print('Initial Features: \n', X[0:3, :])
print('Selected Features: \n', rfe_fit_output_features[0:3, :])
# ranking_ holds 1 for every selected feature; larger numbers were eliminated earlier
print('Feature Ranking: ', rfe_fit.ranking_)
print('\nInitial Feature Names: \n', initial_feature_labels)
# print('\nNumber of Features Chosen: ', rfe_fit.n_features_)
print('\n', rfe_fit.n_features_, 'Selected Feature Labels: \n', rfe_labels[0])

Initial Features: 
 [[-0.424  0.662 -0.071 -0.563  0.475  0.7   -0.003  0.467  0.648 -0.024
  -0.031  1.144  0.587]
 [-0.233  1.565 -0.122 -0.059  1.662  1.459 -0.003  1.775  0.874 -0.024
  -0.053  1.942  0.662]
 [ 0.72   0.452 -0.062 -0.68   0.24   0.516  0.048  0.048  0.074 -0.024
  -0.022  0.286  0.211]]
Selected Features: 
 [[-0.563  0.7   -0.024]
 [-0.059  1.459 -0.024]
 [-0.68   0.516 -0.024]]
Feature Ranking:  [10  3 11  1  8  1  2  9  6  1  7  5  4]

Initial Feature Names: 
 ['total yield' 'ms debt' 'roic v2' 'book price' 'sic' 'opInc' 'fcf yield'
 'comStockEq' 'ms cem' 'total return' 'spitz roic' 'ms ce' 'momentum']

 3 Selected Feature Labels: 
 ['book price' 'opInc' 'total return']


In [12]:
# Feature Importance with Extra Trees Classifier
from sklearn.ensemble import ExtraTreesClassifier
# load data
tree_X = X
# feature extraction
initial_feature_labels = selected_features[0, 1:]  # header row without the ticker column
print('Initial Feature Names: \n', initial_feature_labels)
# Extra-trees builds randomized trees, so without a fixed seed the importances
# change on every run. Pinning random_state makes the printed vector
# reproducible; the value itself is arbitrary.
tree_model = ExtraTreesClassifier(random_state=42)
tree_model.fit(tree_X, Y)
# One importance per feature, in the same order as the names printed above
print(tree_model.feature_importances_)

Initial Feature Names: 
 ['total yield' 'ms debt' 'roic v2' 'book price' 'sic' 'opInc' 'fcf yield'
 'comStockEq' 'ms cem' 'total return' 'spitz roic' 'ms ce' 'momentum']
[ 0.082  0.077  0.068  0.076  0.079  0.092  0.063  0.079  0.073  0.067
  0.086  0.084  0.075]
