In [None]:
# Install the following packages if they are not already installed on your machine.
!pip install pandas_datareader
!pip install yfinance
# !pip install probscale

In [None]:
import pandas as pd      
import numpy as np
import math

# Import data visualization and plotting libraries
import matplotlib
import matplotlib.pyplot as plt
import pylab
import seaborn as sns
sns.set_style('whitegrid')
plt.style.use("seaborn")

# Statistical analysis libraries
import scipy.stats as stats
import statistics as st
from scipy.stats import norm
import statsmodels.api as sm
import statsmodels.formula.api as smf
from statsmodels.graphics.gofplots import qqplot
import pylab as py
import probscale

# Packages to be used to extract NYSE stock data and create/manipulate datetime objects
import yfinance as yf
import pandas_datareader
from pandas_datareader.data import DataReader
from pandas_datareader import data as pdr
import datetime
from datetime import date

In [None]:
def dens_histograms(df, company_list=[], company_name=[], sp500_stocks=False, num_cols=2, num_rows=3, part_num=''):
    %matplotlib inline
    plt.style.use("seaborn")
    
    if (sp500_stocks==False):
        fig, axes = plt.subplots(nrows=num_rows, ncols=num_cols, figsize=(20,20))
        fig.delaxes(axes[2,1]) # deletes the 6th plot instance (since we are computing an odd number of plots)

        colors = ['navy','firebrick','darkviolet','brown','forestgreen']
        for i, company in enumerate(company_list):
            ax = axes[int(i/num_cols), i%num_cols]
            sns.distplot(df[company], hist=True, fit=norm, ax=ax, color=colors[i], 
                         hist_kws={"alpha": 0.5})
            ax.set_xlabel('Daily Log-Return', fontsize=14)
            ax.set_ylabel('Frequency', fontsize=14)
            ax.set_title(company, fontsize=18) 

        plt.subplots_adjust(left=0.1,bottom=0.1,right=0.9,top=0.9,wspace=0.15,hspace=0.21)
        plt.savefig('figures/part'+part_num+'_nonSP500_histograms.png',bbox_inches='tight',pad_inches=0.1)
        plt.show()
    
    else:
        fig, axes = plt.subplots(nrows=num_rows, ncols=num_cols, figsize=(20,15))

        colors = ['navy','firebrick','darkviolet','forestgreen']
        for i, company in enumerate(company_list):
            ax = axes[int(i/num_cols), i%num_cols]
            sns.distplot(company['Daily Log-Return'], hist=True, fit=norm, ax=ax, color=colors[i], 
                         hist_kws={"alpha": 0.5})
            ax.set_xlabel('Daily Log-Return', fontsize=14)
            ax.set_ylabel('Frequency', fontsize=14)
            ax.set_title(company_name[i], fontsize=18) 

        plt.subplots_adjust(left=0.1,bottom=0.1,right=0.9,top=0.9,wspace=0.15,hspace=0.21)
        plt.savefig('figures/part'+part_num+'_sp500_stocks_histograms.png',bbox_inches='tight',pad_inches=0.1)
        plt.show()

In [None]:
def norm_qqplot(df, company_list=[], mod_list=[], resid_bool=False, num_cols=2, num_rows=3, part_num=''):
    %matplotlib inline
    plt.style.use("seaborn")
    
    if(resid_bool == False):
        fig, axes = plt.subplots(nrows=num_rows, ncols=num_cols, figsize=(20,20))
        axes.flatten()
        fig.delaxes(axes[2,1]) 
        for i, company in enumerate(company_list):
            ax = axes[int(i/num_cols), i%num_cols]
            qqplot(df[company], dist=norm, fit=True, line ='q',ax=ax)
            ax.set_title('Normal Q-Q Plot for '+ company + ' Log-Returns', fontsize=16)
            sns.despine()
        plt.subplots_adjust(left=0.1,bottom=0.1,right=0.9,top=0.9,wspace=0.15,hspace=0.22)
        plt.savefig('figures/part'+part_num+'_stock_qqplots.png')
        plt.show()
    else:
        fig, axes = plt.subplots(nrows=num_rows, ncols=num_cols, figsize=(20,15))
        axes.flatten() 
        for i, model in enumerate(model_list):
            ax = axes[int(i/num_cols), i%num_cols]
            qqplot(model.resid, dist=norm, fit=True, line ='q',ax=ax)
            ax.set_title('Normal Q-Q Plot for '+ company_list[i] + ' ~ SP500 Residuals', fontsize=16)
            sns.despine()
        plt.subplots_adjust(left=0.1,bottom=0.1,right=0.9,top=0.9,wspace=0.15,hspace=0.22)
        plt.savefig('figures/part'+part_num+'_residual_qqplots.png')
        plt.show()

In [None]:
def lr_plots(df, mod_list, com_list, num_cols=2,num_rows=2,part_num='', fig_size=(20,14), time_bool=False):
    %matplotlib inline
    plt.style.use("seaborn")
    
    fig, axes = plt.subplots(nrows=num_rows, ncols=num_cols, figsize=fig_size)
    if (time_bool==False):
        for i, model in enumerate(mod_list):
            ax = axes[int(i/num_cols), i%num_cols]
            intercept = model.params[0]
            slope=model.params[1]
            x = df.SP500
            y = df[com_list[i]]
            ax.plot(x, y, 'o', label='original data')
            ax.plot(x, intercept + slope*x, 'k', label='fitted line')
            ax.set_title(com_list[i] + ' ~ SP500',fontsize=16)
            ax.legend()

        plt.savefig('figures/part'+part_num+'_lr_plots.png')
        plt.show()
    else:
        for i, model in enumerate(mod_list):
            ax = axes[int(i/num_cols), i%num_cols]
            intercept = model.params[0]
            slope=model.params[1]
            x = df.SP500
            y = df[com_list[i]]
            ax.plot(x, y, 'o', label='original data')
            ax.plot(x, intercept + slope*x, 'k', label='fitted line')
            ax.set_title(com_list[i] + ' ~ time',fontsize=16)
            ax.legend()

        plt.savefig('figures/part'+part_num+'_time_lr_plots.png')
        plt.show()

In [None]:
def one_way_anova(df, com_list):
    """Print a one-way ANOVA of every listed column against ``df.SP500``.

    For each label in ``com_list`` this runs ``stats.f_oneway`` on
    ``df[label]`` and ``df.SP500`` and prints the resulting F-statistic and
    p-value. Nothing is returned.
    """
    print('One-Way ANOVA tests the equality of two population means \n')
    for ticker in com_list:
        result = stats.f_oneway(df[ticker], df.SP500)
        # Each tuple is one printed line; fields are joined by print's default separator.
        report_lines = (
            ('One-Way ANOVA Test for', ticker, 'and SP500'),
            ('=====================================',),
            ('F-statistic: ', result.statistic),
            ('p-value:     ', result.pvalue, '\n\n'),
        )
        for fields in report_lines:
            print(*fields)

In [None]:
def eq_means_t_test(df,com_list):
    """Print an independent two-sample t-test of every listed column against ``df.SP500``.

    For each label in ``com_list`` this runs ``stats.ttest_ind`` on
    ``df[label]`` and ``df.SP500`` and prints the resulting t-statistic and
    p-value. Nothing is returned.
    """
    print('This also tests the equality of two population means \n')
    for ticker in com_list:
        result = stats.ttest_ind(df[ticker], df.SP500)
        # Each tuple is one printed line; fields are joined by print's default separator.
        report_lines = (
            ('T-test for Means of', ticker, 'and SP500'),
            ('==================================',),
            ('t-statistic: ', result.statistic),
            ('p-value:     ', result.pvalue, '\n\n'),
        )
        for fields in report_lines:
            print(*fields)

In [None]:
def resid_vs_fit(df, mod_list, com_list, num_cols=2,num_rows=2,part_num=''):
    %matplotlib inline
    plt.style.use("seaborn")
    
    fig, axes = plt.subplots(nrows=num_rows, ncols=num_cols, figsize=(25,15))
    for i, model in enumerate(mod_list):
        ax = axes[int(i/num_cols), i%num_cols]
        y_hat = model.fittedvalues
        sns.set(rc = {'figure.figsize':(25,15)})
        res_plot = sns.residplot(x=y_hat, y=com_list[i], data=df, lowess=True, 
                             scatter_kws={'alpha': 0.65}, 
                             line_kws={'color': 'red', 'lw': 1, 'alpha': 0.8},ax=ax)

        res_plot.set_title('Residuals vs Fitted (' + com_list[i] + ' ~ SP500)',fontsize=18)
        res_plot.set_xlabel('Fitted values',fontsize=16)
        res_plot.set_ylabel('Residuals',fontsize=16)
        plt.subplots_adjust(left=0.1,bottom=0.1,right=0.9,top=0.9,wspace=0.15,hspace=0.22)
        
    plt.savefig('figures/part'+part_num+'_residual_plots.png')
    plt.show()

In [None]:
def paired_data_t_test(df,com_list):
    """Print a paired-sample t-test of every listed column against ``df.SP500``.

    For each label in ``com_list`` this runs ``stats.ttest_rel`` on
    ``df[label]`` and ``df.SP500`` and prints the resulting t-statistic and
    p-value. Nothing is returned.
    """
    print('This tests the equality of means from paired data \n')
    for ticker in com_list:
        result = stats.ttest_rel(df[ticker], df.SP500)
        # Each tuple is one printed line; fields are joined by print's default separator.
        report_lines = (
            ('T-test of Means within Paired data for', ticker, 'and SP500'),
            ('===========================================================',),
            ('t-statistic: ', result.statistic),
            ('p-value:     ', result.pvalue, '\n\n'),
        )
        for fields in report_lines:
            print(*fields)

In [None]:
def runs_test(df, com_list):
    """Print a one-sample runs test for every listed column of ``df``.

    For each label in ``com_list`` this calls ``runstest_1samp`` on
    ``df[label]`` and prints the first two elements of its result as the
    test statistic and p-value. Nothing is returned.
    """
    # `runstest_1samp` is not among the imports in the notebook's import cell;
    # importing it here keeps the function usable on a fresh kernel.
    from statsmodels.sandbox.stats.runs import runstest_1samp

    for ticker in com_list:
        rt_results = runstest_1samp(df[ticker])
        print('Runs test for', ticker)
        print('================================')
        print('test stat: ', rt_results[0])
        print('p-value:   ', rt_results[1], '\n\n')