In [62]:
from itertools import combinations

# Import libraries and modules
import numpy as np
import pandas as pd
from matplotlib import pyplot as plt
from sklearn.feature_selection import RFE, RFECV
from sklearn.linear_model import Perceptron
from sklearn.model_selection import KFold, StratifiedKFold, RepeatedStratifiedKFold
from sklearn.model_selection import cross_val_score
from sklearn.pipeline import Pipeline
from sklearn.tree import DecisionTreeClassifier

In [14]:
# Load the data
train = pd.read_csv('stocks.csv')
test = pd.read_csv('sp500.csv')

In [15]:
# Convert all None values to NaN
train = train.replace('None', np.nan)
test = test.replace('None', np.nan)

# Convert all - values to NaN
train = train.replace('-', np.nan)
test = test.replace('-', np.nan)

# Remove any null values from the training set
train.dropna(inplace=True)
test.dropna(inplace=True)

# Move the beta column to the end of the testing set
test = test[
    ['stock_return_100', 'market_return_100', 'symbol', 'sentiment', 'market_cap', 'ebitda', 'pe_ratio', 'peg_ratio',
     'book_value',
     'dividend_per_share', 'dividend_yield', 'eps', 'revenue_per_share', 'profit_margin',
     'operating_margin', 'return_on_assets', 'return_on_equity', 'revenue', 'gross_profit',
     'diluted_eps', 'quarterly_earnings_growth', 'quarterly_revenue_growth', 'trailing_pe',
     'forward_pe', 'price_to_sales_ratio', 'price_to_book_ratio', 'ev_to_revenue', 'ev_to_ebitda',
     'beta']]

# Replace the boolean values in "outperformed" with 1 and 0
train['outperformed'].replace(True, 1, inplace=True)
train['outperformed'].replace(False, 0, inplace=True)

# Add a column to the test set that indicates whether the stock outperformed the stock market
test['outperformed'] = np.where(test['stock_return_100'] > test['market_return_100'], 1, 0)

# Cast the columns to the correct data types
train = train.astype(
    {'market_cap': float, 'beta': float, 'ebitda': float, 'pe_ratio': float, 'peg_ratio': float, 'book_value': float,
     'trailing_pe': float, 'forward_pe': float, 'price_to_sales_ratio': float, 'dividend_per_share': float,
     'dividend_yield': float,
     'eps': float, 'price_to_book_ratio': float, 'ev_to_revenue': float, 'ev_to_ebitda': float})
test = test.astype({'beta': float, 'ebitda': float, 'pe_ratio': float, 'peg_ratio': float, 'book_value': float,
                    'trailing_pe': float, 'forward_pe': float, 'price_to_sales_ratio': float,
                    'price_to_book_ratio': float, 'ev_to_revenue': float, 'ev_to_ebitda': float})

In [16]:
# Remove any stocks from the training set that are also in the test set
train = train[~train['symbol'].isin(test['symbol'])]

train

Unnamed: 0.1,Unnamed: 0,symbol,stock_return_100,market_return_100,sentiment,market_cap,ebitda,pe_ratio,peg_ratio,book_value,...,quarterly_earnings_growth,quarterly_revenue_growth,trailing_pe,forward_pe,price_to_sales_ratio,price_to_book_ratio,ev_to_revenue,ev_to_ebitda,beta,outperformed
23,23,ABB,0.204827,0.093607,0.225050,6.393383e+10,4.477000e+09,25.980,0.276,1.713,...,-0.546,0.034,25.980,18.98,1.934,5.030,2.207,10.460,1.0240,1
30,30,ABEV,-0.057495,0.093607,0.161613,4.484881e+10,2.234425e+10,16.380,1.976,5.210,...,0.378,0.031,16.380,16.81,0.559,2.649,0.530,1.962,0.6630,0
31,31,ABG,0.275212,0.093607,0.150953,4.302144e+09,1.341600e+09,4.445,0.314,134.610,...,1.491,0.396,4.445,5.51,0.262,1.486,0.513,5.790,1.1410,1
46,46,ACA,0.079896,0.093607,0.274245,2.874350e+09,3.141000e+08,11.700,6.720,45.130,...,15.770,-0.041,11.700,27.62,1.186,1.368,1.492,10.910,0.5470,0
76,76,ACHC,-0.121089,0.093607,0.161630,6.541778e+09,5.640130e+08,23.640,1.320,31.280,...,-0.138,0.138,23.640,26.04,3.053,2.826,3.640,16.320,1.3150,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
7781,7781,ZM,0.025056,0.093607,0.140008,2.070159e+10,3.277500e+08,200.860,10.950,21.120,...,-0.856,0.043,200.860,20.53,4.721,3.719,3.759,23.410,-0.2190,0
7788,7788,ZTO,0.539079,0.093607,0.184920,2.325894e+10,1.040703e+10,23.400,23.960,66.760,...,0.197,0.071,23.400,18.45,0.626,2.903,0.594,2.599,0.0137,1
7790,7790,ZUMZ,-0.109240,0.093607,0.042960,3.716670e+08,5.122600e+07,17.180,0.820,20.900,...,-0.651,-0.192,17.180,12.21,0.398,1.162,0.542,6.240,1.5010,0
7798,7798,ZWS,-0.056729,0.093607,0.039737,3.754832e+09,2.132000e+08,56.210,1.084,9.130,...,-0.528,0.465,56.210,21.69,3.290,2.664,5.330,39.840,1.0740,0


In [17]:
# Drop the returns from both sets
train.drop(['stock_return_100', 'market_return_100'], axis=1, inplace=True)
test.drop(['stock_return_100', 'market_return_100'], axis=1, inplace=True)

# Drop the "Unnamed: 0" column from the training set
train.drop('Unnamed: 0', axis=1, inplace=True)

In [24]:
# Make the symbols the index
train.set_index('symbol', inplace=True)
test.set_index('symbol', inplace=True)

test

Unnamed: 0_level_0,sentiment,market_cap,ebitda,pe_ratio,peg_ratio,book_value,dividend_per_share,dividend_yield,eps,revenue_per_share,...,quarterly_earnings_growth,quarterly_revenue_growth,trailing_pe,forward_pe,price_to_sales_ratio,price_to_book_ratio,ev_to_revenue,ev_to_ebitda,beta,outperformed
symbol,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
MMM,0.098151,5.796493e+10,7.424000e+09,10.26,2.314,26.80,5.96,0.0574,10.19,60.48,...,-0.578,-0.062,10.26,11.92,1.946,4.950,2.377,8.52,0.963,0
AOS,0.088535,1.039050e+10,2.909000e+08,45.60,1.812,11.56,1.14,0.0174,1.51,24.25,...,-0.134,-0.060,45.60,18.12,2.276,5.150,2.331,12.14,1.256,1
ABT,0.174237,1.765583e+11,1.210300e+10,25.98,18.760,21.11,1.92,0.0201,3.91,24.90,...,-0.472,-0.120,25.98,25.13,4.202,5.440,4.452,15.20,0.666,0
ABBV,0.155819,2.823330e+11,3.108900e+10,24.14,1.279,9.75,5.71,0.0369,6.63,32.78,...,-0.388,0.016,24.14,14.39,4.989,18.230,6.040,13.80,0.587,0
ACN,0.232305,1.806623e+11,1.069960e+10,26.32,2.975,37.64,4.33,0.0157,10.86,100.00,...,-0.059,0.051,26.32,25.58,2.878,8.690,3.043,18.53,1.242,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
XEL,0.151972,3.707343e+10,4.942000e+09,21.27,3.209,30.34,1.95,0.0308,3.17,27.99,...,0.186,0.208,21.27,21.10,2.645,2.380,4.394,13.13,0.421,0
XYL,0.204479,1.896192e+10,7.410000e+08,53.54,2.644,19.38,1.20,0.0126,1.96,30.64,...,0.322,0.138,53.54,36.36,3.735,6.430,4.106,33.11,1.068,0
ZBRA,0.163402,1.634670e+10,1.140000e+09,36.11,1.633,53.12,0.00,0.0000,8.63,110.73,...,0.009,0.025,36.11,13.95,2.256,5.080,2.744,14.61,1.663,1
ZBH,0.250696,2.701055e+10,2.246400e+09,92.46,1.078,57.51,0.96,0.0075,1.38,33.11,...,0.331,0.027,92.46,18.28,3.309,2.198,4.083,20.87,1.005,1


In [74]:
# Create an instance of Perceptron as the base estimator
model = Perceptron()

rfecv = RFECV(estimator=model, step=1, cv=10, scoring='accuracy')

# Get the training data
X = train[train.columns[:-1]]

# Get the target data
y = train.iloc[:, -1]

# Create a pipeline
pipeline = Pipeline([('Feature Selection', rfecv), ('Model', model)])
cv = RepeatedStratifiedKFold(n_splits=10, n_repeats=5, random_state=36851234)
n_scores = cross_val_score(pipeline, X, y, scoring='accuracy', cv=cv, n_jobs=-1)
np.mean(n_scores)

# Fit the model
pipeline.fit(X, y)

# Get the test data
X_test = test[test.columns[:-1]]

# Get the target data
y_test = test.iloc[:, -1]

# Get the predictions
predictions = pipeline.predict(X_test)

# Get the accuracy
accuracy = np.mean(predictions == y_test)

print(accuracy)
print("Optimum number of features: %d" % rfecv.n_features_)

0.604221635883905
Optimum number of features: 1


In [75]:
# Create an instance of Decision Tree as the base estimator
model = DecisionTreeClassifier()

rfecv = RFECV(estimator=model, step=1, cv=10, scoring='accuracy')

# Get the training data
X = train[train.columns[:-1]]

# Get the target data
y = train.iloc[:, -1]

# Create a pipeline
pipeline = Pipeline([('Feature Selection', rfecv), ('Model', model)])
cv = RepeatedStratifiedKFold(n_splits=10, n_repeats=5, random_state=36851234)
n_scores = cross_val_score(pipeline, X, y, scoring='accuracy', cv=cv, n_jobs=-1)
np.mean(n_scores)

# Fit the model
pipeline.fit(X, y)

# Get the test data
X_test = test[test.columns[:-1]]

# Get the target data
y_test = test.iloc[:, -1]

# Get the predictions
predictions = pipeline.predict(X_test)

# Get the accuracy
accuracy = np.mean(predictions == y_test)

print(accuracy)
print("Optimum number of features: %d" % rfecv.n_features_)

0.5197889182058048
Optimum number of features: 26
