In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import (train_test_split,
                                     RepeatedStratifiedKFold,
                                     cross_val_score)

# Feature Selection
from sklearn.feature_selection import RFE

# ML libs
from sklearn.model_selection import train_test_split
from sklearn import metrics                                   

# Models
from sklearn.linear_model import (LinearRegression, 
                                  Ridge, 
                                  BayesianRidge)

from sklearn.ensemble import (GradientBoostingRegressor,
                              RandomForestRegressor)

from sklearn.neighbors import KNeighborsRegressor

from sklearn.tree import DecisionTreeRegressor

from sklearn.svm import SVR
# Viz libs
import matplotlib.pyplot as plt 

from etl_resources import sqlite_connection

In [2]:
def base_data():
    """Read the full ``training_clean`` table into a DataFrame.

    Returns:
        pandas.DataFrame: the result of ``select * from training_clean``
        executed over the object returned by ``sqlite_connection()``.
    """
    # NOTE(review): the connection is not closed here; confirm whether the
    # object returned by sqlite_connection() needs explicit cleanup.
    query = 'select * from training_clean'
    return pd.read_sql(query, con=sqlite_connection())

In [3]:
def residual_plot(y_pred, residuals):
    """Scatter ``residuals`` against ``y_pred`` and display the figure.

    Args:
        y_pred: values placed on the x-axis (axis is labelled "price").
        residuals: values placed on the y-axis.

    Returns:
        None. The plot is drawn through the pyplot interface and shown
        with ``plt.show()``.
    """
    plt.scatter(y_pred, residuals)
    plt.axhline(0, color='red')  # horizontal reference line at zero

    plt.xlabel("price")
    plt.ylabel("residuals")
    plt.title("residual plot")

    plt.show()

In [4]:
def select_features(df):
    """Split a frame into a feature frame and the ``close`` target array.

    Args:
        df: DataFrame that contains a ``close`` column.

    Returns:
        tuple: ``(X, Y)`` where ``X`` is ``df`` restricted to every column
        not named in the exclusion list below (original column order kept),
        and ``Y`` is ``df['close'].values``.
    """
    # Columns that are never used as model inputs, including the three
    # ``close*`` columns.
    excluded = frozenset((
        'date', 'index', 'quarter', 'year', 'ticker',
        'fiscaldateending', 'close', 'close_pct', 'close_val',
    ))
    kept = [name for name in df.columns if name not in excluded]

    inputs = df[kept]
    target = df['close'].values
    return inputs, target

In [5]:
def profile_data():
    """Build a profile report of the ``training`` table and write it to HTML.

    Reads ``select * from training`` through ``sqlite_connection()``, wraps
    the frame in a ``ProfileReport`` titled 'training profile', and saves it
    to ``../data/profiles/pre-training.html``. Returns None.
    """
    # NOTE(review): ProfileReport is not imported in any cell visible here;
    # unless it is brought into scope elsewhere this call raises NameError
    # on a fresh kernel.
    training_df = pd.read_sql('''select * from training''', con=sqlite_connection())

    report = ProfileReport(training_df, title='training profile')
    report.to_file('../data/profiles/pre-training.html')

In [6]:
# Column names used to subset the loaded frame (``df[cols]``) before the
# correlation matrix is computed. Order is preserved from the original list
# because it determines the row/column order of the exported matrix.
cols = [
    'totalassets',
    'inventory',
    'currentnetreceivables',
    'totalnoncurrentassets',
    'propertyplantequipment',
    'accumulateddepreciationamortizationppe',
    'intangibleassetsexcludinggoodwill',
    'goodwill',
    'othercurrentassets',
    'othernoncurrrentassets',
    'totalcurrentliabilities',
    'currentaccountspayable',
    'deferredrevenue',
    'shorttermdebt',
    'totalnoncurrentliabilities',
    'capitalleaseobligations',
    'longtermdebtnoncurrent',
    'shortlongtermdebttotal',
    'othernoncurrentliabilities',
    'totalshareholderequity',
    'treasurystock',
    'retainedearnings',
    'commonstock',
    'commonstocksharesoutstanding',
    'totalassets_pct',
    'cashandcashequivalentsatcarryingvalue_pct',
    'inventory_pct',
    'currentnetreceivables_pct',
    'propertyplantequipment_pct',
    'accumulateddepreciationamortizationppe_pct',
    'intangibleassetsexcludinggoodwill_pct',
    'goodwill_pct',
    'currentaccountspayable_pct',
    'deferredrevenue_pct',
    'currentdebt_pct',
    'shorttermdebt_pct',
    'totalnoncurrentliabilities_pct',
    'longtermdebtnoncurrent_pct',
    'shortlongtermdebttotal_pct',
    'othercurrentliabilities_pct',
    'othernoncurrentliabilities_pct',
    'totalshareholderequity_pct',
    'commonstocksharesoutstanding_pct',
    'totalassets_val',
    'currentnetreceivables_val',
    'propertyplantequipment_val',
    'investments_val',
    'shortterminvestments_val',
    'totalcurrentliabilities_val',
    'currentdebt_val',
    'shorttermdebt_val',
    'totalshareholderequity_val',
    'treasurystock_val',
    'retainedearnings_val',
    'commonstock_val',
    'commonstocksharesoutstanding_val',
    'paymentsforoperatingactivities',
    'changeinoperatingliabilities',
    'changeinoperatingassets',
    'depreciationdepletionandamortization',
    'capitalexpenditures',
    'cashflowfrominvestment',
    'paymentsforrepurchaseofcommonstock',
    'dividendpayout',
    'dividendpayoutcommonstock',
    'dividendpayoutpreferredstock',
    'changeinexchangerate',
    'netincome',
    'proceedsfromoperatingactivities_pct',
    'changeinoperatingliabilities_pct',
    'changeinoperatingassets_pct',
    'changeinreceivables_pct',
    'profitloss_pct',
    'dividendpayout_pct',
    'dividendpayoutcommonstock_pct',
    'proceedsfromoperatingactivities_val',
    'dividendpayoutcommonstock_val',
    'changeinexchangerate_val',
    'totalrevenue',
    'costofgoodsandservicessold',
    'operatingincome',
    'sellinggeneralandadministrative',
    'researchanddevelopment',
    'operatingexpenses',
    'investmentincomenet',
    'noninterestincome',
    'othernonoperatingincome',
    'depreciation',
    'depreciationandamortization',
    'incometaxexpense',
    'interestanddebtexpense',
    'netincomefromcontinuingoperations',
    'ebit',
    'ebitda',
    'grossprofit_pct',
    'totalrevenue_pct',
    'investmentincomenet_pct',
    'interestexpense_pct',
    'noninterestincome_pct',
    'incometaxexpense_pct',
    'interestanddebtexpense_pct',
    'ebitda_pct',
    'researchanddevelopment_val',
    'investmentincomenet_val',
    'incomebeforetax_val',
    'interestanddebtexpense_val',
    'netincomefromcontinuingoperations_val',
    'ebit_val',
    'ebitda_val',
    'cpi_perc_change',
    'cpi_val_change',
    'cpi_value',
    'effr_val_change',
    'effr_value',
    'gdp_perc_change',
    'gdp_val_change',
    'gdp_value',
    'retail_perc_change',
    'retail_val_change',
    'retail_value',
    'unemp_perc_change',
    'unemp_value',
]

In [None]:
-- DDL for the "training_clean" table, the table that base_data() reads with
-- `select * from training_clean`.
-- NOTE(review): this cell holds SQL rather than Python and carries no
-- execution count (In [None]); presumably it was pasted for reference only.
-- Consider moving it to a markdown cell so Restart & Run All does not hit it.
-- NOTE(review): several names in the notebook's `cols` list (e.g. 'goodwill',
-- 'netincome', 'ebitda', 'cpi_value') do not appear in the column list below;
-- confirm this schema matches the current database.
CREATE TABLE "training_clean" (
"index" INTEGER,
  "date" TEXT,
  "quarter" INTEGER,
  "year" INTEGER,
  "ticker" TEXT,
  "close" REAL,
  "close_pct" REAL,
  "close_val" REAL,
  "fiscaldateending" TEXT,
  "totalassets" REAL,
  "inventory" REAL,
  "currentnetreceivables" REAL,
  "totalnoncurrentassets" REAL,
  "propertyplantequipment" REAL,
  "accumulateddepreciationamortizationppe" REAL,
  "intangibleassets" REAL,
  "longterminvestments" REAL,
  "shortterminvestments" REAL,
  "othercurrentassets" REAL,
  "othernoncurrrentassets" REAL,
  "totalcurrentliabilities" REAL,
  "currentaccountspayable" REAL,
  "deferredrevenue" REAL,
  "currentdebt" REAL,
  "totalnoncurrentliabilities" REAL,
  "capitalleaseobligations" REAL,
  "longtermdebt" REAL,
  "currentlongtermdebt" REAL,
  "longtermdebtnoncurrent" REAL,
  "othernoncurrentliabilities" REAL,
  "totalshareholderequity" REAL,
  "treasurystock" REAL,
  "retainedearnings" REAL,
  "commonstock" REAL,
  "commonstocksharesoutstanding" REAL,
  "totalassets_pct" REAL,
  "totalcurrentassets_pct" REAL,
  "cashandcashequivalentsatcarryingvalue_pct" REAL,
  "cashandshortterminvestments_pct" REAL,
  "inventory_pct" REAL,
  "currentnetreceivables_pct" REAL,
  "totalnoncurrentassets_pct" REAL,
  "propertyplantequipment_pct" REAL,
  "accumulateddepreciationamortizationppe_pct" REAL,
  "intangibleassets_pct" REAL,
  "intangibleassetsexcludinggoodwill_pct" REAL,
  "goodwill_pct" REAL,
  "investments_pct" REAL,
  "longterminvestments_pct" REAL,
  "shortterminvestments_pct" REAL,
  "othercurrentassets_pct" REAL,
  "othernoncurrrentassets_pct" REAL,
  "totalliabilities_pct" REAL,
  "totalcurrentliabilities_pct" REAL,
  "currentaccountspayable_pct" REAL,
  "deferredrevenue_pct" REAL,
  "currentdebt_pct" REAL,
  "shorttermdebt_pct" REAL,
  "totalnoncurrentliabilities_pct" REAL,
  "capitalleaseobligations_pct" REAL,
  "longtermdebt_pct" REAL,
  "currentlongtermdebt_pct" REAL,
  "longtermdebtnoncurrent_pct" REAL,
  "shortlongtermdebttotal_pct" REAL,
  "othercurrentliabilities_pct" REAL,
  "othernoncurrentliabilities_pct" REAL,
  "totalshareholderequity_pct" REAL,
  "treasurystock_pct" REAL,
  "retainedearnings_pct" REAL,
  "commonstock_pct" REAL,
  "commonstocksharesoutstanding_pct" REAL,
  "totalassets_val" REAL,
  "totalcurrentassets_val" REAL,
  "cashandcashequivalentsatcarryingvalue_val" REAL,
  "inventory_val" REAL,
  "currentnetreceivables_val" REAL,
  "totalnoncurrentassets_val" REAL,
  "propertyplantequipment_val" REAL,
  "accumulateddepreciationamortizationppe_val" REAL,
  "intangibleassets_val" REAL,
  "goodwill_val" REAL,
  "investments_val" REAL,
  "longterminvestments_val" REAL,
  "shortterminvestments_val" REAL,
  "othercurrentassets_val" REAL,
  "othernoncurrrentassets_val" REAL,
  "totalcurrentliabilities_val" REAL,
  "currentaccountspayable_val" REAL,
  "deferredrevenue_val" REAL,
  "currentdebt_val" REAL,
  "shorttermdebt_val" REAL,
  "totalnoncurrentliabilities_val" REAL,
  "capitalleaseobligations_val" REAL,
  "longtermdebt_val" REAL,
  "currentlongtermdebt_val" REAL,
  "longtermdebtnoncurrent_val" REAL,
  "shortlongtermdebttotal_val" REAL,
  "othercurrentliabilities_val" REAL,
  "othernoncurrentliabilities_val" REAL,
  "totalshareholderequity_val" REAL,
  "treasurystock_val" REAL,
  "retainedearnings_val" REAL,
  "commonstock_val" REAL,
  "commonstocksharesoutstanding_val" REAL,
  "operatingcashflow" REAL,
  "paymentsforoperatingactivities" REAL,
  "proceedsfromoperatingactivities" REAL,
  "changeinoperatingliabilities" REAL,
  "changeinoperatingassets" REAL,
  "depreciationdepletionandamortization" REAL,
  "capitalexpenditures" REAL,
  "changeinreceivables" REAL,
  "changeininventory" REAL,
  "profitloss" REAL,
  "cashflowfrominvestment" REAL,
  "proceedsfromrepaymentsofshorttermdebt" REAL,
  "paymentsforrepurchaseofcommonstock" REAL,
  "paymentsforrepurchaseofpreferredstock" REAL,
  "dividendpayout" REAL,
  "dividendpayoutpreferredstock" REAL,
  "proceedsfromissuanceofcommonstock" REAL,
  "proceedsfromissuanceoflongtermdebtandcapitalsecuritiesnet" REAL,
  "proceedsfromissuanceofpreferredstock" REAL,
  "proceedsfromsaleoftreasurystock" REAL,
  "changeincashandcashequivalents" REAL,
  "changeinexchangerate" REAL,
  "operatingcashflow_pct" REAL,
  "paymentsforoperatingactivities_pct" REAL,
  "proceedsfromoperatingactivities_pct" REAL,
  "changeinoperatingliabilities_pct" REAL,
  "changeinoperatingassets_pct" REAL,
  "depreciationdepletionandamortization_pct" REAL,
  "capitalexpenditures_pct" REAL,
  "changeinreceivables_pct" REAL,
  "changeininventory_pct" REAL,
  "profitloss_pct" REAL,
  "cashflowfrominvestment_pct" REAL,
  "cashflowfromfinancing_pct" REAL,
  "proceedsfromrepaymentsofshorttermdebt_pct" REAL,
  "paymentsforrepurchaseofcommonstock_pct" REAL,
  "paymentsforrepurchaseofequity_pct" REAL,
  "paymentsforrepurchaseofpreferredstock_pct" REAL,
  "dividendpayout_pct" REAL,
  "dividendpayoutpreferredstock_pct" REAL,
  "proceedsfromissuanceofcommonstock_pct" REAL,
  "proceedsfromissuanceoflongtermdebtandcapitalsecuritiesnet_pct" REAL,
  "proceedsfromissuanceofpreferredstock_pct" REAL,
  "proceedsfromrepurchaseofequity_pct" REAL,
  "proceedsfromsaleoftreasurystock_pct" REAL,
  "changeincashandcashequivalents_pct" REAL,
  "changeinexchangerate_pct" REAL,
  "netincome_pct" REAL,
  "operatingcashflow_val" REAL,
  "paymentsforoperatingactivities_val" REAL,
  "proceedsfromoperatingactivities_val" REAL,
  "changeinoperatingliabilities_val" REAL,
  "changeinoperatingassets_val" REAL,
  "depreciationdepletionandamortization_val" REAL,
  "capitalexpenditures_val" REAL,
  "changeininventory_val" REAL,
  "profitloss_val" REAL,
  "cashflowfrominvestment_val" REAL,
  "cashflowfromfinancing_val" REAL,
  "paymentsforrepurchaseofcommonstock_val" REAL,
  "paymentsforrepurchaseofpreferredstock_val" REAL,
  "dividendpayout_val" REAL,
  "dividendpayoutcommonstock_val" REAL,
  "dividendpayoutpreferredstock_val" REAL,
  "proceedsfromissuanceofcommonstock_val" REAL,
  "proceedsfromissuanceoflongtermdebtandcapitalsecuritiesnet_val" REAL,
  "proceedsfromissuanceofpreferredstock_val" REAL,
  "proceedsfromrepurchaseofequity_val" REAL,
  "proceedsfromsaleoftreasurystock_val" REAL,
  "changeincashandcashequivalents_val" REAL,
  "changeinexchangerate_val" REAL,
  "grossprofit" REAL,
  "totalrevenue" REAL,
  "operatingincome" REAL,
  "sellinggeneralandadministrative" REAL,
  "researchanddevelopment" REAL,
  "operatingexpenses" REAL,
  "investmentincomenet" REAL,
  "noninterestincome" REAL,
  "othernonoperatingincome" REAL,
  "depreciation" REAL,
  "depreciationandamortization" REAL,
  "incometaxexpense" REAL,
  "grossprofit_pct" REAL,
  "totalrevenue_pct" REAL,
  "costofrevenue_pct" REAL,
  "costofgoodsandservicessold_pct" REAL,
  "operatingincome_pct" REAL,
  "sellinggeneralandadministrative_pct" REAL,
  "researchanddevelopment_pct" REAL,
  "operatingexpenses_pct" REAL,
  "investmentincomenet_pct" REAL,
  "netinterestincome_pct" REAL,
  "interestincome_pct" REAL,
  "othernonoperatingincome_pct" REAL,
  "depreciation_pct" REAL,
  "incomebeforetax_pct" REAL,
  "incometaxexpense_pct" REAL,
  "interestanddebtexpense_pct" REAL,
  "comprehensiveincomenetoftax_pct" REAL,
  "grossprofit_val" REAL,
  "totalrevenue_val" REAL,
  "costofrevenue_val" REAL,
  "costofgoodsandservicessold_val" REAL,
  "operatingincome_val" REAL,
  "sellinggeneralandadministrative_val" REAL,
  "researchanddevelopment_val" REAL,
  "operatingexpenses_val" REAL,
  "investmentincomenet_val" REAL,
  "netinterestincome_val" REAL,
  "interestincome_val" REAL,
  "interestexpense_val" REAL,
  "noninterestincome_val" REAL,
  "othernonoperatingincome_val" REAL,
  "depreciation_val" REAL,
  "depreciationandamortization_val" REAL,
  "incomebeforetax_val" REAL,
  "incometaxexpense_val" REAL,
  "interestanddebtexpense_val" REAL,
  "netincomefromcontinuingoperations_val" REAL,
  "ebit_val" REAL,
  "cpi_perc_change" REAL,
  "effr_perc_change" REAL,
  "effr_value" REAL,
  "gdp_perc_change" REAL,
  "gdp_value" REAL,
  "retail_perc_change" REAL,
  "retail_value" REAL,
  "unemp_perc_change" REAL,
  "unemp_value" REAL
)

In [12]:
# Load the cleaned training table, keep the curated columns from `cols`, and
# export their pairwise correlation matrix to 'correlation.csv'.
df = base_data()

# Selecting with a list of labels raises KeyError as soon as one label is
# absent from the frame. The schema pasted in this notebook lacks several
# names that `cols` lists, so select only the columns that are present
# (in `cols` order) and report the rest instead of aborting the cell.
available = set(df.columns)
missing_cols = [name for name in cols if name not in available]
if missing_cols:
    print(f"{len(missing_cols)} column(s) listed in `cols` are not in "
          f"training_clean and were skipped: {missing_cols}")

df = df[[name for name in cols if name in available]]
df.corr().to_csv('correlation.csv')