# Stock Prediction with SVM 

### Read company financials from sqlite database into Pandas Dataframe

In [1]:
# Import Python Pandas, Numpy and SQL toolkit
import pandas as pd
import sqlalchemy
from sqlalchemy.orm import Session
from sqlalchemy import create_engine
import numpy as np

In [2]:
# Create engine using the `stock_data.sqlite` database file
engine = create_engine("sqlite:///stock_data.sqlite")

In [3]:
#Read all the data from the cmpny_financials table onto a data frame. Check size of the dataframe with shape()
sql = "select * from cmpny_financials"
company_data = pd.read_sql(sql, engine)
company_data.shape

(18732, 44)

In [4]:
company_data.head()

Unnamed: 0,id_cmpny_financials,id_cmpny,ticker,datekey,de,pe1,ps1,pb,netmargin,marketcap,...,payoutratio,tangibles,tbvps,workingcapital,price,sp_price,price_change,sp_price_change,diff,status
0,1,1.0,MMM,2008-05-02,1.153,15.048,2.221,4.449,0.153,55321780000.0,...,0.357,21217000000.0,30.031,4625000000.0,78.55,1413.900024,0.0,0.0,0.0,underperform
1,2,1.0,MMM,2008-08-01,1.183,13.261,1.925,3.912,0.14,49034180000.0,...,0.37,20590000000.0,29.326,3552000000.0,70.15,1260.310059,-10.693826,-10.862859,0.169033,outperform
2,3,1.0,MMM,2008-10-31,1.263,11.952,1.722,3.652,0.151,44557010000.0,...,0.35,20773000000.0,29.868,3780000000.0,64.3,968.75,-18.141311,-31.48384,13.342529,outperform
3,4,1.0,MMM,2009-02-17,1.586,9.58,1.312,3.33,0.097,32899600000.0,...,0.641,18396000000.0,26.549,3759000000.0,47.42,789.169983,-39.630808,-44.184881,4.554072,outperform
4,5,1.0,MMM,2009-05-01,1.458,13.429,1.68,4.128,0.102,40190940000.0,...,0.68,17403000000.0,25.094,3919000000.0,57.88,877.52002,-26.314449,-37.936204,11.621755,outperform


In [5]:
#Delete index columns and columns with no data
del company_data["id_cmpny_financials"]
del company_data["id_cmpny"]

In [6]:
company_data = company_data[:5000]

In [7]:
#Fill empty entries & NaN to 0
company_data = company_data.fillna(0)

In [8]:
company_data.head()

Unnamed: 0,ticker,datekey,de,pe1,ps1,pb,netmargin,marketcap,ev,evebitda,...,payoutratio,tangibles,tbvps,workingcapital,price,sp_price,price_change,sp_price_change,diff,status
0,MMM,2008-05-02,1.153,15.048,2.221,4.449,0.153,55321780000.0,58776780000.0,8.701,...,0.357,21217000000.0,30.031,4625000000.0,78.55,1413.900024,0.0,0.0,0.0,underperform
1,MMM,2008-08-01,1.183,13.261,1.925,3.912,0.14,49034180000.0,53495180000.0,7.867,...,0.37,20590000000.0,29.326,3552000000.0,70.15,1260.310059,-10.693826,-10.862859,0.169033,outperform
2,MMM,2008-10-31,1.263,11.952,1.722,3.652,0.151,44557010000.0,49353010000.0,7.146,...,0.35,20773000000.0,29.868,3780000000.0,64.3,968.75,-18.141311,-31.48384,13.342529,outperform
3,MMM,2009-02-17,1.586,9.58,1.312,3.33,0.097,32899600000.0,37768600000.0,5.887,...,0.641,18396000000.0,26.549,3759000000.0,47.42,789.169983,-39.630808,-44.184881,4.554072,outperform
4,MMM,2009-05-01,1.458,13.429,1.68,4.128,0.102,40190940000.0,44592940000.0,7.812,...,0.68,17403000000.0,25.094,3919000000.0,57.88,877.52002,-26.314449,-37.936204,11.621755,outperform


### Use sklearn SVM classifier to predict stock performance based on company financials

In [9]:
# import sklearn dependencies
import matplotlib.pyplot as plt
from sklearn.neighbors import KNeighborsClassifier
from sklearn import svm, preprocessing

In [10]:
#set status column as target
target = (company_data["status"]
            .replace("underperform", 0)
            .replace("outperform", 1))
target_names = ["underperform", "outperform"]

In [11]:
#Remove columns that are not company financials. This will be the features on which the model will be trained
data = (company_data.drop("price", axis=1)
            .drop("sp_price", axis=1)
            .drop("price_change", axis=1)
            .drop("sp_price_change", axis=1)
            .drop("diff", axis=1)
            .drop("status", axis=1)
            .drop("ticker", axis=1)
            .drop("datekey", axis=1))
feature_names = data.columns
data.head()

Unnamed: 0,de,pe1,ps1,pb,netmargin,marketcap,ev,evebitda,revenueusd,gp,...,ncfdebt,ncfcommon,ncfdiv,ncfx,ncf,sps,payoutratio,tangibles,tbvps,workingcapital
0,1.153,15.048,2.221,4.449,0.153,55321780000.0,58776780000.0,8.701,6463000000.0,3127000000.0,...,1122000000.0,-431000000.0,-353000000.0,-17000000.0,831000000.0,35.369,0.357,21217000000.0,30.031,4625000000.0
1,1.183,13.261,1.925,3.912,0.14,49034180000.0,53495180000.0,7.867,6739000000.0,3229000000.0,...,-809000000.0,-434000000.0,-351000000.0,12000000.0,-1180000000.0,36.441,0.37,20590000000.0,29.326,3552000000.0
2,1.263,11.952,1.722,3.652,0.151,44557010000.0,49353010000.0,7.146,6558000000.0,3126000000.0,...,1181000000.0,-475000000.0,-348000000.0,-147000000.0,693000000.0,37.334,0.35,20773000000.0,29.868,3780000000.0
3,1.586,9.58,1.312,3.33,0.097,32899600000.0,37768600000.0,5.887,5509000000.0,2408000000.0,...,-457000000.0,-2000000.0,-346000000.0,-263000000.0,-391000000.0,36.14,0.641,18396000000.0,26.549,3759000000.0
4,1.458,13.429,1.68,4.128,0.102,40190940000.0,44592940000.0,7.812,5089000000.0,2317000000.0,...,-598000000.0,34000000.0,-354000000.0,13000000.0,-217000000.0,34.456,0.68,17403000000.0,25.094,3919000000.0


In [12]:
# There are some columns with value in billions. This has to be scaled inorder for the model to complete training
# Ignore the warning.
data = preprocessing.scale(data)



In [13]:
# Randomly split the data into training & testing data
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(data, target, random_state=42)

In [14]:
from sklearn.ensemble import RandomForestClassifier
# Create a random forest classifier
rf = RandomForestClassifier(n_estimators=200)

In [15]:
# Create the GridSearch estimator along with a parameter object containing the values to adjust
from sklearn.model_selection import GridSearchCV
param_grid = { 
    'n_estimators': [200, 500, 700, 1000],
    'max_features': ['auto', 'sqrt', 'log2']
}

CV_rfc = GridSearchCV(estimator=rf, param_grid=param_grid, cv= 5)

In [16]:
CV_rfc.fit(X_train, y_train)

GridSearchCV(cv=5, error_score='raise',
       estimator=RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=None, max_features='auto', max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, n_estimators=200, n_jobs=1,
            oob_score=False, random_state=None, verbose=0,
            warm_start=False),
       fit_params=None, iid=True, n_jobs=1,
       param_grid={'n_estimators': [200, 500, 700, 1000], 'max_features': ['auto', 'sqrt', 'log2']},
       pre_dispatch='2*n_jobs', refit=True, return_train_score=True,
       scoring=None, verbose=0)

In [18]:
print(CV_rfc.best_params_)

{'max_features': 'sqrt', 'n_estimators': 700}


In [None]:
# Model Accuracy
print('Test Acc: %.3f' % rf.score(X_train, y_train))

In [None]:
# Model Accuracy
print('Test Acc: %.3f' % rf.score(X_test, y_test))

In [None]:
# Random Forests in sklearn will automatically calculate feature importance
importances = rf.feature_importances_
importances

In [None]:
sorted(zip(rf.feature_importances_, feature_names), reverse=True)