# Project Functions

### Description

Below are the different functions used in the analysis.

In [2]:
# Imported necessary packages
from scipy import stats
from scipy.stats import skew
from scipy.stats import kurtosis
import math
import seaborn as sns 
sns.set()
import matplotlib.pyplot as plt
import matplotlib.patches as mpatches
import random as rnd
import numpy as np

### Confidence Interval Function

In [None]:
# T_test confidence intervals
def get_95_ci(array_1, array_2):
    """Return a 95% confidence interval for the difference of two sample means.

    The interval is ``(mean_1 - mean_2) +/- 1.96 * SE`` where ``SE`` is the
    unpooled standard error ``sqrt(var_1 / n_1 + var_2 / n_2)``.  Each variance
    is whatever the input's own ``.var()`` method returns.

    Parameters
    ----------
    array_1, array_2
        Objects exposing ``.shape``, ``.mean()`` and ``.var()``.

    Returns
    -------
    str
        Both bounds rounded to two decimals, formatted as ``'(lower & upper)'``.
    """
    z_critical = 1.96  # multiplier used for the two-sided 95% interval

    # Squared standard error contributed by each sample.
    sq_err_1 = array_1.var() / array_1.shape[0]
    sq_err_2 = array_2.var() / array_2.shape[0]
    half_width = z_critical * math.sqrt(sq_err_1 + sq_err_2)

    centre = array_1.mean() - array_2.mean()
    lower_bound = centre - half_width
    upper_bound = centre + half_width
    return f"({round(lower_bound, 2)!s} & {round(upper_bound, 2)!s})"

### Scatter Subplot With Pearson Correlation Coefficient Function

In [None]:
# Correlations and scatter plots for six columns (one y value and five X values)
def series_stats_gen(column, string1, column2, string2, column3, string3, column4, string4,
                     column5, string5, column6, string6, last_xlim=(1979, 2019)):
    """Draw five regression scatter plots of ``column`` against five other columns.

    Each panel is titled with the Pearson correlation coefficient between
    ``column`` (the shared y variable) and that panel's x variable.

    Parameters
    ----------
    column : values plotted on the shared y axis (must expose ``.min()``/``.max()``).
    string1 : y-axis label, shown on the first panel only.
    column2 .. column6 : x values for panels one to five.
    string2 .. string6 : x-axis labels for panels one to five.
    last_xlim : ``(low, high)`` x-limits applied to the fifth panel, or ``None``
        to leave that panel auto-scaled.  Defaults to ``(1979, 2019)``.

    Returns
    -------
    Whatever ``plt.show()`` returns.
    """
    # Scatter plots (sub plots)
    print('\n')
    sns.set_style("whitegrid")
    fig, axes = plt.subplots(ncols=5, sharey=True, figsize=(20, 4))

    panels = [(column2, string2), (column3, string3), (column4, string4),
              (column5, string5), (column6, string6)]

    for position, (ax, (x_values, x_label)) in enumerate(zip(axes, panels)):
        # x and y are passed by keyword: recent seaborn releases no longer
        # accept them positionally.
        sns.regplot(x=x_values, y=column, ax=ax).grid(False)
        correlation = stats.pearsonr(column, x_values)[0]
        ax.set_title('Correlation: {} \n'.format(round(correlation, 2)))
        ax.set_xlabel('\n' + x_label)
        if position == 0:
            # The y axis is shared, so limits and label are set once here.
            ax.set_ylim(column.min(), column.max())
            ax.set_ylabel(string1 + '\n')
        else:
            ax.set_ylabel('')

    # Zoom in on the data
    if last_xlim is not None:
        axes[-1].set_xlim(*last_xlim)
    return plt.show()

### Descriptive Stats for One Column Function

In [None]:
# Descriptive stats for individual columns
def series_stats_gen1(column, string1):
    """Print a table of descriptive statistics for a single column.

    Parameters
    ----------
    column : object exposing ``mean``, ``var``, ``std``, ``median``, ``mode``,
        ``min``, ``max``, ``sum`` and ``count`` methods
        (presumably a pandas Series - confirm against callers).
    string1 : title printed above the table.

    Returns
    -------
    None.  All output goes to stdout.
    """
    print('\n')
    print('Statistics for: {}'.format(string1))

    # When several values tie for the mode, report their average.
    modal_values = column.mode()
    averaged_mode = round(sum(modal_values) / len(modal_values), 2)

    n_obs = column.count()
    variance = column.var()
    lowest = column.min()
    highest = column.max()

    summary = {
        'Mean:': round(column.mean(), 2),
        'Standard Error:': round(math.sqrt(variance / n_obs), 2),
        'Median:': round(column.median(), 2),
        'Mode:': averaged_mode,
        'Std:': round(column.std(), 2),
        'Variance:': round(variance, 2),
        'Kurtosis:': round(kurtosis(column), 2),
        'Skewness:': round(skew(column), 2),
        'Range:': round(highest - lowest, 2),
        'Min:': round(lowest, 2),
        'Max:': round(highest, 2),
        'Sum:': round(column.sum(), 2),
        'Count:': round(n_obs, 2),
    }

    # Label left-aligned in 20 columns, value right-aligned with dot leaders.
    for caption, figure in summary.items():
        print('{:20} {:.>20}'.format(caption, figure))
    print("")
    return None

### T-test With Supporting Bar Plots and Error Bars Function

In [None]:
# Descriptive stats, t-tests, confidence intervals, and bar plots for two columns
def series_stats_gen2(string, column, string2, column2, t, length, width,
                      ylabel='S&P 500 Index Closing Prices'):
    """Print side-by-side statistics and a Welch t-test, then draw a bar plot.

    ``column`` is labelled "Treatment" and ``column2`` "Control".  The function
    prints a decade legend, a two-column table of descriptive statistics, the
    difference in means with its Welch t-statistic, p-value and the interval
    returned by ``get_95_ci``, and finally shows a bar chart of the two means
    with ``1.96 * standard error`` error bars.

    Parameters
    ----------
    string, string2 : labels for the treatment / control series (printed in
        the header and used in the plot legend).
    column, column2 : the two samples; they must expose ``.name`` and the usual
        summary methods (presumably pandas Series - confirm against callers).
    t : plot title.
    length, width : figure height and width passed to ``figsize=(width, length)``.
    ylabel : y-axis label of the bar plot.

    Returns
    -------
    Whatever ``plt.show()`` returns.
    """
    def describe(series):
        """Return ``(label, rounded value)`` pairs for one series."""
        modes = series.mode()
        return [
            ('Mean:', round(series.mean(), 2)),
            ('Standard Error:', round(math.sqrt(series.var() / series.count()), 2)),
            ('Median:', round(series.median(), 2)),
            # Ties for the mode are averaged into a single number.
            ('Mode:', round(sum(modes) / len(modes), 2)),
            ('Std:', round(series.std(), 2)),
            ('Variance:', round(series.var(), 2)),
            ('Kurtosis:', round(kurtosis(series), 2)),
            ('Skewness:', round(skew(series), 2)),
            ('Range:', round(series.max() - series.min(), 2)),
            ('Min:', round(series.min(), 2)),
            ('Max:', round(series.max(), 2)),
            ('Sum:', round(series.sum(), 2)),
            ('Count:', round(series.count(), 2)),
        ]

    # Legend
    print('\n')
    legend = [('Legend',''),('',''),('Decade1','years >= 1979 & < 1989'),('Decade2','years >= 1989 & < 1999'),
            ('Decade3','years >= 1999 & < 2009'),('Decade4','years >= 2009 & < 2019')]

    for label, value in legend:
        print(f"{label:{30}} {value:.{30}}")

    print('\n')
    print(string, column.name, 'Treatment       ', string2, column2.name, 'Control')

    # Side-by-side descriptive statistics
    for (label, value), (label2, value2) in zip(describe(column), describe(column2)):
        indented_label2 = '   ' + label2
        print(f"{label:{20}} {value:.>{20}} {indented_label2:{23}} {value2:.>{20}}")

    print('\n')

    diff = round(column.mean()-column2.mean(),2)
    # Run Welch's test once so the reported t-statistic and p-value belong to
    # the same test (previously the p-value came from the pooled-variance test).
    welch = stats.ttest_ind(column, column2, equal_var=False)
    t_stat = round(welch[0], 2)
    p_value = round(welch[1], 2)
    interval_text = get_95_ci(column, column2)

    test_summary = [('Difference in Means:', diff), ('T-Stat:', t_stat),
                    ('P-Value:', p_value), ('95% CI (two-tail):', interval_text)]

    for label, value in test_summary:
        print(f"{label:{20}} {value:.>{20}}")

    print('\n')

    # Bar plot with error bars
    means = [column.mean(), column2.mean()]
    half_widths = [round(math.sqrt(series.var() / series.count()) * 1.96, 2)
                   for series in (column, column2)]
    # Row 0 holds the lower error sizes and row 1 the upper ones; the bars
    # are symmetric, so both rows are the same.
    error_bars = [tuple(half_widths), tuple(half_widths)]

    plt.figure(figsize=(width, length))
    sns.set_style('white')
    blue_patch = mpatches.Patch(color='b', label=string)
    orange_patch = mpatches.Patch(color='darkorange', label=string2)
    plt.legend(handles=[blue_patch,orange_patch])
    plt.bar([0,1], means, yerr=error_bars, alpha=1, align='center',color=("b","darkorange"))
    plt.xticks(range(len(means)), ['Treatment','Control'])
    plt.ylabel(ylabel)
    plt.title(t)
    return plt.show()


### Descriptive Statistics for Five Columns Function

In [None]:
# Descriptive statistics for five columns
def series_stats_gen3(column, column2, column3, column4, column5):
    """Print descriptive statistics for five columns in two tables.

    The first table shows ``column``, ``column2`` and ``column3`` side by side;
    the second shows ``column4`` and ``column5``.  Every input must expose
    ``.name`` plus the summary methods used below (presumably pandas Series -
    confirm against callers).  Nothing is returned.
    """
    def summarise(series):
        """Return the ``(label, rounded value)`` rows for one series."""
        modal_values = series.mode()
        return [
            ('Mean:', round(series.mean(), 2)),
            ('Standard Error:', round(math.sqrt(series.var() / series.count()), 2)),
            ('Median:', round(series.median(), 2)),
            # Several modes are averaged into one figure.
            ('Mode:', round(sum(modal_values) / len(modal_values), 2)),
            ('Std:', round(series.std(), 2)),
            ('Variance:', round(series.var(), 2)),
            ('Kurtosis:', round(kurtosis(series), 2)),
            ('Skewness:', round(skew(series), 2)),
            ('Range:', round(series.max() - series.min(), 2)),
            ('Min:', round(series.min(), 2)),
            ('Max:', round(series.max(), 2)),
            ('Sum:', round(series.sum(), 2)),
            ('Count:', round(series.count(), 2)),
        ]

    def side_by_side(*group):
        """Merge per-series rows into flat rows of label, value, label, value..."""
        merged = []
        for stat_cells in zip(*(summarise(member) for member in group)):
            flat = []
            for position, (caption, figure) in enumerate(stat_cells):
                # Every column after the first gets a three-space indent.
                flat.append(caption if position == 0 else '   ' + caption)
                flat.append(figure)
            merged.append(tuple(flat))
        return merged

    # Build both tables before printing anything.
    first_table = side_by_side(column, column2, column3)
    second_table = side_by_side(column4, column5)

    print('\n')
    print(column.name,' stats                         ',column2.name,' stats           ',column3.name)
    for cap_a, val_a, cap_b, val_b, cap_c, val_c in first_table:
        print('{:20} {:.>15} {:20} {:.>15} {:22} {:.>17}'.format(
            cap_a, val_a, cap_b, val_b, cap_c, val_c))
    print('\n')
    print(column4.name,' stats             ',column5.name)
    for cap_d, val_d, cap_e, val_e in second_table:
        print('{:20} {:.>15} {:20} {:.>15}'.format(cap_d, val_d, cap_e, val_e))

### Monte Carlo Price and Return Sims for Two Columns <br> and Data Sets Function

In [None]:
# Monte Carlo price and return simulations for two columns and data sets
def price_sims(data1, prices1, string1, data2, prices2, string2, number_of_sims, years_compound):
    """Monte Carlo comparison of two price columns, with compounded returns.

    For each simulation one price is drawn from a normal distribution fitted to
    ``data1[prices1]`` (mean and standard deviation) and one from
    ``data2[prices2]``.  The share of simulations in which the first draw is
    larger, smaller, or within one unit of the second is printed, together with
    the average simulated and the actual compounded annual return.

    Parameters
    ----------
    data1, data2 : tables indexable by column name and supporting
        ``.loc[row, column]`` (presumably pandas DataFrames - confirm).
    prices1, prices2 : column names of the price series in each table.
    string1, string2 : display labels.  The character at index 7 of each label
        must be a digit (as in ``'Decade 1'``); the returns are shown as
        ``'N/A'`` when the second label's digit is lower than the first's.
    number_of_sims : number of simulations to run; must be at least 1.
    years_compound : number of years over which returns are compounded.

    Raises
    ------
    ValueError
        If ``number_of_sims`` is less than 1.

    Returns
    -------
    None.  The results are printed.
    """
    if number_of_sims < 1:
        raise ValueError('number_of_sims must be at least 1')

    # The distribution parameters do not change between draws, so they are
    # computed once instead of once per simulation.
    mean1 = data1[prices1].mean()
    sd1 = data1[prices1].std()
    mean2 = data2[prices2].mean()
    sd2 = data2[prices2].std()

    # Monte Carlo random number generator and comparison
    def price_sim():
        """Draw one price pair; return ``(outcome, second price, first price)``."""
        rnd1 = rnd.gauss(mean1, sd1)
        rnd2 = rnd.gauss(mean2, sd2)
        # int() truncates toward zero, so gaps smaller than one unit in either
        # direction count as break-even (outcome 0).
        gap = int(rnd1 - rnd2)
        outcome = (gap > 0) - (gap < 0)
        return outcome, rnd2, rnd1

    profit = 0
    loss = 0
    break_even = 0
    sim_total = []
    sim_invested = []

    for _ in range(number_of_sims):
        # One draw per simulation keeps the outcome and both prices consistent.
        outcome, total, invested = price_sim()
        sim_total.append(total)
        sim_invested.append(invested)
        # Tally every simulation, not only the last one.
        if outcome == 1:
            profit += 1
        elif outcome == -1:
            loss += 1
        else:
            break_even += 1

    # Average simulated compounded return (geometric means) calculations
    sim_total = sum(sim_total)/number_of_sims
    sim_invested = sum(sim_invested)/number_of_sims
    try:
        compound_return = round((((sim_total/sim_invested)**(1/years_compound))-1)*100,2)
    except TypeError:
        # Kept from the original: fall back to years_compound when the
        # expression cannot be rounded or evaluated.
        compound_return = years_compound

    # Actual compounded returns calculations
    # NOTE(review): both lookups are label-based; they assume each table's
    # index runs 0..n-1 with no gaps - confirm against callers.
    actual_invested = data1.loc[0,prices1]
    actual_return = data2.loc[data2[prices2].count()-1,prices2]
    actual_compound_return = round((((actual_return/actual_invested)**(1/years_compound))-1)*100,2)

    # strings and tuple unpacking to present results
    print('Probability that ' + string1 + ' closing prices are greater than ' + string2 + ' closing prices')

    profit_string = string1 + ' larger than ' + string2 + ' (profit):'
    comp_return_string = string1 + ' to ' + string2 + ' average compounded return:'
    loss_string = string1 + ' lesser than ' + string2 + ' (loss):'
    break_even_string = string1 + ' even with ' + string2 +  ' (Break Even):'
    num_sims_string = 'Number of simulations:'
    actual_string = string1 + ' to ' +  string2 + ' actual compounded return:'

    runs = profit + loss + break_even
    profit_value = round((profit/runs)*100,2)
    loss_value = round((loss/runs)*100,2)
    break_even_value = round((break_even/runs)*100,2)

    # Returns are only shown when the second label's digit is not lower
    # than the first's.
    if int(string2[7]) - int(string1[7]) > -1:
        comp_return_value = str(compound_return) + '%'
        actual_comp_value = str(actual_compound_return) + '%'
    else:
        comp_return_value = 'N/A'
        actual_comp_value = 'N/A'

    sims = [(profit_string, str(profit_value) + '% of the time'),
            (loss_string, str(loss_value) + '% of the time'),
            (break_even_string, str(break_even_value) + '% of the time'),
            (comp_return_string, str(comp_return_value)),
            (num_sims_string, number_of_sims),
            (actual_string, str(actual_comp_value))]

    for label, value in sims:
        print(f"{label:{53}} {value:.>{53}}")

### Correlations and Scatter Plots for the Same Two Columns <br> in Four Different Data Sets Function

In [None]:
# Correlations and scatter plots for the same two columns in four different data sets
def correlations(string1, string2, column, column2, data1, data2, data3, data4):
    """Plot ``column`` against ``column2`` for four data sets, one panel each.

    A decade legend is printed first.  Each panel is a regression scatter plot
    titled with the Pearson correlation coefficient between the two columns in
    that data set.

    Parameters
    ----------
    string1 : y-axis label (shown on the first panel only).
    string2 : x-axis label (shown on every panel).
    column : name of the column plotted on the y axis.
    column2 : name of the column plotted on the x axis.
    data1 .. data4 : tables indexable by column name, titled "Decade 1" to
        "Decade 4" in order (presumably pandas DataFrames - confirm).

    Returns
    -------
    Whatever ``plt.show()`` returns.
    """
    # Legend
    print('\n')
    # The Decade4 range now follows the same pattern as the other three.
    legend = [('Legend',''),('',''),('Decade1','years >= 1979 & < 1989'),('Decade2','years >= 1989 & < 1999'),
            ('Decade3','years >= 1999 & < 2009'),('Decade4','years >= 2009 & < 2019')]

    for label, value in legend:
        print(f"{label:{17}} {value:.{30}}")

    print('\n')

    # Scatter plots (sub plots)
    sns.set_style("whitegrid")
    fig, axes = plt.subplots(ncols=4, sharey=True, figsize=(20, 4))

    for decade, (ax, data) in enumerate(zip(axes, (data1, data2, data3, data4)), start=1):
        x_values = data[column2]
        y_values = data[column]
        # x and y are passed by keyword: recent seaborn releases no longer
        # accept them positionally.
        sns.regplot(x=x_values, y=y_values, ax=ax).grid(False)
        correlation = round(stats.pearsonr(y_values, x_values)[0], 2)
        ax.set_title(' Decade {} Correlation: {} \n'.format(decade, correlation))
        ax.set_xlim(x_values.min(), x_values.max())
        ax.set_xlabel('\n' + string2)
        # The y axis is shared, so only the first panel carries the label.
        ax.set_ylabel(string1 + '\n' if decade == 1 else '')
    return plt.show()

### Line Plots for Five Columns Function

In [None]:
# Line plots for five columns
def line_plot(string1, string2, string3, string4, string5, string6, 
              column, column2, column3, column4, column5, column6, data):
    """Draw five line plots of different columns against one shared x column.

    Parameters
    ----------
    string1 : x-axis label, also used in each title as ``'<y label> by <x label>'``.
    string2 .. string6 : y-axis labels for panels one to five.
    column : name of the x column in ``data``.
    column2 .. column6 : names of the y columns for panels one to five.
    data : table indexable by column name.

    Returns
    -------
    Whatever ``plt.show()`` returns.
    """
    # Line plots (sub plots)
    print('\n')
    sns.set_style('white')
    fig, axs = plt.subplots(1,5, figsize=(25, 4))

    y_labels = (string2, string3, string4, string5, string6)
    y_columns = (column2, column3, column4, column5, column6)
    palette = ('darkblue', 'darkorange', 'darkgreen', 'darkred', 'blueviolet')

    for panel, y_label, y_column, shade in zip(axs, y_labels, y_columns, palette):
        panel.grid(False)
        panel.set_title(y_label+' by '+string1)
        panel.plot(data[column], data[y_column], color=shade)
        panel.set_xlabel(string1)
        panel.set_ylabel(y_label)
    return plt.show()

### Line Plots for Two Columns in Four Different Data <br> Sets Function

In [None]:
# Line plots for two columns in four different data sets
def line_plot2(string2, string1, column2, column, data1, data2, data3, data4):
    """Draw one line plot of ``column`` against ``column2`` per data set.

    Parameters
    ----------
    string2 : x-axis label (shown on every panel and in each title).
    string1 : y-axis label (shown on the first panel and in each title).
    column2 : name of the column plotted on the x axis.
    column : name of the column plotted on the y axis.
    data1 .. data4 : tables indexable by column name, titled "Decade 1" to
        "Decade 4" in order (presumably pandas DataFrames - confirm).

    Returns
    -------
    Whatever ``plt.show()`` returns.
    """
    # Line plots (sub plots)
    sns.set_style("whitegrid")
    fig, axes = plt.subplots(ncols=4, sharey=True, figsize=(20, 4))

    datasets = (data1, data2, data3, data4)
    colours = ('darkblue', 'darkorange', 'darkgreen', 'darkred')

    for decade, (ax, data, colour) in enumerate(zip(axes, datasets, colours), start=1):
        x_values = data[column2]
        # x and y are passed by keyword: recent seaborn releases no longer
        # accept them positionally.
        sns.lineplot(x=x_values, y=data[column], ax=ax, color=colour).grid(False)
        # ' by ' has a space on both sides so the two labels do not run together.
        ax.set_title('Decade {}: '.format(decade) + string1 + ' by ' + string2)
        ax.set_xlim(x_values.min(), x_values.max())
        ax.set_xlabel('\n' + string2)
        # The y axis is shared, so only the first panel carries the label.
        ax.set_ylabel(string1 + '\n' if decade == 1 else '')
    return plt.show()