In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import (train_test_split,
                                     RepeatedStratifiedKFold,
                                     cross_val_score)

# Feature Selection
from sklearn.feature_selection import RFE

# ML libs
from sklearn.model_selection import train_test_split
from sklearn import metrics                                   

# Models
from sklearn.linear_model import (LinearRegression, 
                                  Ridge, 
                                  BayesianRidge)

from sklearn.ensemble import (GradientBoostingRegressor,
                              RandomForestRegressor)

from sklearn.neighbors import KNeighborsRegressor

from sklearn.tree import DecisionTreeRegressor

from sklearn.svm import SVR
# Viz libs
import matplotlib.pyplot as plt 

from etl_resources import sqlite_connection

In [2]:
def base_data():
    """Load the full ``training_clean`` table into a pandas DataFrame.

    The connection object comes from the project helper
    ``sqlite_connection()`` and is handed straight to ``pd.read_sql``.

    Returns:
        pandas.DataFrame: the result of ``select * from training_clean``.
    """
    # NOTE(review): the connection is not closed in this function — confirm
    # whether `sqlite_connection()` manages its own lifetime.
    return pd.read_sql('select * from training_clean', con=sqlite_connection())

In [3]:
def residual_plot(y_pred, residuals):
    """Draw a scatter plot of residuals against predictions and show it.

    Args:
        y_pred: values plotted on the x axis (labelled "price").
        residuals: values plotted on the y axis (labelled "residuals").

    A red horizontal line is drawn at y = 0 as a visual reference.
    The function returns nothing; the figure is displayed with ``plt.show()``.
    """
    plt.scatter(y_pred, residuals)

    # Decorate the axes that the scatter call just drew on.
    axes = plt.gca()
    axes.set_title("residual plot")
    axes.set_xlabel("price")
    axes.set_ylabel("residuals")
    axes.axhline(0, color='red')

    plt.show()

In [4]:
def select_features(df):
    """Split a frame into a feature matrix and the ``close`` target.

    Every column whose name is not in the exclusion set below is kept as a
    feature, in the order it appears in ``df``.

    Args:
        df: pandas DataFrame that contains at least a ``close`` column.

    Returns:
        tuple: ``(X, Y)`` where ``X`` is the DataFrame of retained columns and
        ``Y`` is ``df['close'].values``.
    """
    # Column names that are never used as model inputs (includes the target
    # `close` and its derived `close_pct` / `close_val` columns).
    excluded = {
        'date', 'index', 'quarter', 'year', 'ticker',
        'fiscaldateending', 'close', 'close_pct', 'close_val',
    }

    kept = []
    for column in df.columns:
        if column in excluded:
            continue
        kept.append(column)

    X = df[kept]
    Y = df['close'].values

    return X, Y

In [5]:
def profile_data():
    """Build a profiling report for the ``training`` table and save it as HTML.

    Reads ``select * from training`` through the connection returned by the
    project helper ``sqlite_connection()``, wraps the resulting DataFrame in a
    ``ProfileReport`` titled 'training profile', and writes the report to
    ``../data/profiles/pre-training.html``. Returns nothing.
    """
    # NOTE(review): `ProfileReport` is not imported in any cell visible here;
    # on a fresh kernel this call raises NameError unless it is imported
    # elsewhere — confirm and add the import to the first cell.
    training = pd.read_sql('''select * from training''', con=sqlite_connection())

    report = ProfileReport(training, title='training profile')
    report.to_file('../data/profiles/pre-training.html')

In [6]:
# Column names used to subset the training frame before computing the
# correlation matrix. Order is preserved from the original two-line literal.
# NOTE(review): 'othernoncurrrentassets' is spelled with three r's — confirm
# that this matches the actual column name in the table.
cols = [
    'totalassets',
    'inventory',
    'currentnetreceivables',
    'totalnoncurrentassets',
    'propertyplantequipment',
    'accumulateddepreciationamortizationppe',
    'intangibleassetsexcludinggoodwill',
    'goodwill',
    'othercurrentassets',
    'othernoncurrrentassets',
    'totalcurrentliabilities',
    'currentaccountspayable',
    'deferredrevenue',
    'shorttermdebt',
    'totalnoncurrentliabilities',
    'capitalleaseobligations',
    'longtermdebtnoncurrent',
    'shortlongtermdebttotal',
    'othernoncurrentliabilities',
    'totalshareholderequity',
    'treasurystock',
    'retainedearnings',
    'commonstock',
    'commonstocksharesoutstanding',
    'totalassets_pct',
    'cashandcashequivalentsatcarryingvalue_pct',
    'inventory_pct',
    'currentnetreceivables_pct',
    'propertyplantequipment_pct',
    'accumulateddepreciationamortizationppe_pct',
    'intangibleassetsexcludinggoodwill_pct',
    'goodwill_pct',
    'currentaccountspayable_pct',
    'deferredrevenue_pct',
    'currentdebt_pct',
    'shorttermdebt_pct',
    'totalnoncurrentliabilities_pct',
    'longtermdebtnoncurrent_pct',
    'shortlongtermdebttotal_pct',
    'othercurrentliabilities_pct',
    'othernoncurrentliabilities_pct',
    'totalshareholderequity_pct',
    'commonstocksharesoutstanding_pct',
    'totalassets_val',
    'currentnetreceivables_val',
    'propertyplantequipment_val',
    'investments_val',
    'shortterminvestments_val',
    'totalcurrentliabilities_val',
    'currentdebt_val',
    'shorttermdebt_val',
    'totalshareholderequity_val',
    'treasurystock_val',
    'retainedearnings_val',
    'commonstock_val',
    'commonstocksharesoutstanding_val',
    'paymentsforoperatingactivities',
    'changeinoperatingliabilities',
    'changeinoperatingassets',
    'depreciationdepletionandamortization',
    'capitalexpenditures',
    'cashflowfrominvestment',
    'paymentsforrepurchaseofcommonstock',
    'dividendpayout',
    'dividendpayoutcommonstock',
    'dividendpayoutpreferredstock',
    'changeinexchangerate',
    'netincome',
    'proceedsfromoperatingactivities_pct',
    'changeinoperatingliabilities_pct',
    'changeinoperatingassets_pct',
    'changeinreceivables_pct',
    'profitloss_pct',
    'dividendpayout_pct',
    'dividendpayoutcommonstock_pct',
    'proceedsfromoperatingactivities_val',
    'dividendpayoutcommonstock_val',
    'changeinexchangerate_val',
    'totalrevenue',
    'costofgoodsandservicessold',
    'operatingincome',
    'sellinggeneralandadministrative',
    'researchanddevelopment',
    'operatingexpenses',
    'investmentincomenet',
    'noninterestincome',
    'othernonoperatingincome',
    'depreciation',
    'depreciationandamortization',
    'incometaxexpense',
    'interestanddebtexpense',
    'netincomefromcontinuingoperations',
    'ebit',
    'ebitda',
    'grossprofit_pct',
    'totalrevenue_pct',
    'investmentincomenet_pct',
    'interestexpense_pct',
    'noninterestincome_pct',
    'incometaxexpense_pct',
    'interestanddebtexpense_pct',
    'ebitda_pct',
    'researchanddevelopment_val',
    'investmentincomenet_val',
    'incomebeforetax_val',
    'interestanddebtexpense_val',
    'netincomefromcontinuingoperations_val',
    'ebit_val',
    'ebitda_val',
    'cpi_perc_change',
    'cpi_val_change',
    'cpi_value',
    'effr_val_change',
    'effr_value',
    'gdp_perc_change',
    'gdp_val_change',
    'gdp_value',
    'retail_perc_change',
    'retail_val_change',
    'retail_value',
    'unemp_perc_change',
    'unemp_value',
]

In [12]:
# Load the cleaned training table and keep only the columns listed in `cols`.
df = base_data()[cols]

# Write the pairwise correlation matrix of those columns to the working directory.
df.corr().to_csv('correlation.csv')