In [23]:
import pandas as pd
import numpy as np
import math

### Investigating Pearson Correlation Coefficients Using Paired T-Test with Hacker Statistics

We run a resampling test (a pairs bootstrap of the Pearson Correlation Coefficient) to check the statistical significance of the correlation between each population group and each keyword. For easier reading and analysis, each keyword is given its own DataFrame, as described in the comments below.

In [125]:
""" dataframe key, for reference:

how to find number and routing number = kw_df_1
fiduciary bank = kw_df_2
foreign bank account = kw_df_3
balance your checkbook = kw_df_4
purchase money order = kw_df_5
savings account = kw_df_6
personal bankruptcy = kw_df_7
savings plan = kw_df_8
direct deposit = kw_df_9
529 plan = kw_df_10
credit card = kw_df_11
bankruptcy = kw_df_12
check cashing = kw_df_13
ATM = kw_df_14
fafsa = kw_df_15
savings association = kw_df_16
deposit money order = kw_df_17
deposit check = kw_df_18
best bank accounts = kw_df_19
small business bank = kw_df_20
"""

# extract data from cleaned csv: df
# NOTE(review): this is an absolute, machine-specific path; the notebook only
# runs where the file exists at exactly this location.
df = pd.read_csv('/Users/jacobschroeder/anaconda3/projects/test.csv')

# pop_list: the population-group column names "POP_0" through "POP_100"
pop_list = ["POP_" + str(age) for age in range(101)]

# pearson_r: Pearson Correlation Coefficient function
def pearson_r(x, y):
    """Return the Pearson correlation coefficient of the arrays x and y."""
    # np.corrcoef builds the 2x2 correlation matrix of (x, y); its
    # off-diagonal element is the correlation between the two inputs.
    return np.corrcoef(x, y)[0][1]

# draw_bs_pairs: function to draw bootstrap pairs
def draw_bs_pairs(x, y, func, size=1):
    """Draw pairs-bootstrap replicates of a two-sample statistic.

    For each replicate, positions 0..len(x)-1 are resampled with replacement
    (via np.random.choice, so the global NumPy random state is used) and
    ``func`` is evaluated on the x and y values at those same positions.

    x, y: array-likes of paired observations; only the first len(x)
          positions of y are ever indexed
    func: callable ``func(x_sample, y_sample)`` returning a scalar
    size: number of replicates to draw

    Returns a float array of length ``size`` holding the replicates.
    """
    # Coerce to ndarrays so the indexing below is always positional: a plain
    # list would raise TypeError and a pandas Series would look up by label.
    x = np.asarray(x)
    y = np.asarray(y)

    # Set up array of indices to sample from: inds
    inds = np.arange(len(x))

    # Initialize replicates
    bs_replicates = np.empty(size)

    # Generate replicates
    for i in range(size):
        # the same resampled indices are applied to x and y to keep pairs intact
        bs_inds = np.random.choice(inds, len(inds))
        bs_replicates[i] = func(x[bs_inds], y[bs_inds])

    return bs_replicates

# function to remove unnecessary columns for this excercise: clean_dataframe
def clean_dataframe(df):
    """Remove the year, total-population and keyword columns from ``df``.

    The columns are deleted in place, one after another, and the same
    DataFrame object is returned. A missing column raises KeyError at the
    point it is reached; columns deleted before that stay deleted.
    """
    unwanted_columns = (
        'YEAR',
        'TOTAL_POP',
        'how to find number and routing number',
        'fiduciary bank',
        'foreign bank account',
        'balance your checkbook',
        'purchase money order',
        'savings account',
        'personal bankruptcy',
        'savings plan',
        'direct deposit',
        '529 plan',
        'credit card',
        'bankruptcy',
        'check cashing',
        'ATM',
        'fafsa',
        'savings association',
        'deposit money order',
        'deposit check',
        'best bank accounts',
        'small business bank',
    )
    for column in unwanted_columns:
        del df[column]
    return df

In this test, we compare Pearson Correlation Coefficients computed from randomly resampled data against the Pearson Correlation Coefficient computed from the original dataset.

The null hypothesis is that the correlation is no higher than what was initially sampled.

Testing assumes a 95% confidence level, i.e. a significance level of 0.05 against which each p-value is evaluated.

In [126]:
""" Our null hypothesis is that the selected keyword does not have
    a higher correlation to the age group than initially computed
"""

def test_pearson(kw,  i, size, pop_list):
    """Bootstrap the Pearson correlation of one keyword against each population group.

    Reads the module-level DataFrame ``df``. For every column name in
    ``pop_list`` it takes ``size`` consecutive rows starting at positional
    row ``i`` from that column and from the ``kw`` column, draws 10000
    pairs-bootstrap replicates of the Pearson correlation with
    ``draw_bs_pairs``, and prints and stores the mean replicate together
    with a p-value-style fraction.

        kw: name of the keyword column in ``df`` (dependent variable)
        i: positional index of the first row of the data slice
        size: number of rows counted in the data analysis
        pop_list: sequence of population-group column names in ``df``
                  (independent variables)

    Returns ``{'results': [means, pvals]}`` where both entries are float
    arrays of length ``len(pop_list)``, in ``pop_list`` order.
    """
    n_replicates = 10000

    return_list = np.empty(len(pop_list))
    return_pval = np.empty(len(pop_list))

    # ydata: keyword demand (dependent). It is identical for every group, so
    # it is extracted once. The slice honours i/size; earlier revisions
    # hard-coded rows 4:19 and ignored both parameters.
    ydata = np.asarray(df[kw].iloc[i:i + size], dtype=float)

    # for every age group in the population list (t is its position)
    for t, p in enumerate(pop_list):

        # xdata: size of the selected population group (independent)
        xdata = np.asarray(df[p].iloc[i:i + size], dtype=float)

        # run replicates test for selected age
        replicates = draw_bs_pairs(xdata, ydata, pearson_r, n_replicates)

        # get actual data for comparison
        actual_pearson = pearson_r(xdata, ydata)

        # p-value: share of replicates at least as extreme as the observed r.
        # NOTE(review): the replicates come from a pairs bootstrap (x and y are
        # resampled together), not from permuting one variable, so this is not
        # a permutation-test p-value under a no-correlation null. Confirm the
        # intended method before interpreting these numbers.
        n_extreme = np.count_nonzero(np.abs(replicates) >= abs(actual_pearson))
        return_pval[t] = n_extreme / n_replicates

        return_list[t] = np.mean(replicates)
        print('Age Group {}: '.format(str(t)) + str(return_list[t]))
        print('P-Value: {}'.format(str(return_pval[t])))

    return { 'results': [return_list, return_pval] }

In [120]:
# keyword 1: all-NaN frame shaped like df, with the YEAR, TOTAL_POP and
# keyword columns removed by clean_dataframe
kw_df_1 = clean_dataframe(pd.DataFrame().reindex_like(df))
kw = 'how to find number and routing number'
kw_reps_1 = test_pearson(kw, 4, 15, pop_list)
print('Permutation Test Complete')

# row index 1 receives the mean bootstrap correlation for each column
for col in range(kw_df_1.shape[1]):
    kw_df_1.iloc[1, col] = kw_reps_1['results'][0][col]
    print('success 1: ' + str(col))
print('Checkpoint 2')

# row index 2 receives the matching p-value for each column
for col in range(kw_df_1.shape[1]):
    kw_df_1.iloc[2, col] = kw_reps_1['results'][1][col]
    print('success 2: ' + str(col))
print('Checkpoint 3')

print('Complete')
print(kw_df_1.head())

Age Group 0: 0.928853466095
P-Value: 0.6653
Age Group 1: 0.936578473304
P-Value: 0.6729
Age Group 2: 0.944359661295
P-Value: 0.667
Age Group 3: 0.955692339859
P-Value: 0.6836
Age Group 4: 0.931112784336
P-Value: 0.591
Age Group 5: 0.914321790533
P-Value: 0.6355
Age Group 6: 0.902887782543
P-Value: 0.6332
Age Group 7: 0.895554176574
P-Value: 0.614
Age Group 8: 0.911675095779
P-Value: 0.6071
Age Group 9: 0.927689096789
P-Value: 0.6057
Age Group 10: 0.939006940571
P-Value: 0.6147
Age Group 11: 0.922364706627
P-Value: 0.5782
Age Group 12: 0.841574845797
P-Value: 0.545
Age Group 13: 0.661400889039
P-Value: 0.5159
Age Group 14: 0.434305757749
P-Value: 0.5213
Age Group 15: 0.251357729979
P-Value: 0.5489
Age Group 16: 0.100375434713
P-Value: 0.6775
Age Group 17: -0.0119835952287
P-Value: 0.9067
Age Group 18: -0.138071606366
P-Value: 0.6428
Age Group 19: -0.0250033989599
P-Value: 0.9042
Age Group 20: 0.167459839506
P-Value: 0.6213
Age Group 21: 0.388581570996
P-Value: 0.5667
Age Group 22: 0.591

In [124]:
# keyword 2: all-NaN frame shaped like df, with the YEAR, TOTAL_POP and
# keyword columns removed by clean_dataframe
kw_df_2 = clean_dataframe(pd.DataFrame().reindex_like(df))
kw = 'fiduciary bank'
kw_reps_2 = test_pearson(kw, 4, 15, pop_list)
print('Permutation Test Complete')

# row index 1 receives the mean bootstrap correlation for each column
for col in range(kw_df_2.shape[1]):
    kw_df_2.iloc[1, col] = kw_reps_2['results'][0][col]
    print('success 1: ' + str(col))
print('Checkpoint 2')

# row index 2 receives the matching p-value for each column
for col in range(kw_df_2.shape[1]):
    kw_df_2.iloc[2, col] = kw_reps_2['results'][1][col]
    print('success 2: ' + str(col))
print('Checkpoint 3')

print('Complete')

print(kw_df_2.head())

Age Group 0: -0.0542550688788
P-Value: 0.8026
Age Group 1: -0.0719183438766
P-Value: 0.7944
Age Group 2: -0.0618070253454
P-Value: 0.7979
Age Group 3: -0.013704198808
P-Value: 0.8938
Age Group 4: -0.0807880931536
P-Value: 0.768
Age Group 5: -0.144159691192
P-Value: 0.6921
Age Group 6: -0.0991724387202
P-Value: 0.7502
Age Group 7: -0.0165259083844
P-Value: 0.8813
Age Group 8: 0.0573688345094
P-Value: 0.9329
Age Group 9: 0.0919945931027
P-Value: 0.7922
Age Group 10: 0.0720642131524
P-Value: 0.7786
Age Group 11: 0.129183575731
P-Value: 0.5698
Age Group 12: 0.278018691148
P-Value: 0.5207
Age Group 13: 0.395737057736
P-Value: 0.5515
Age Group 14: 0.400146735807
P-Value: 0.5398
Age Group 15: 0.160834202397
P-Value: 0.635
Age Group 16: -0.14014640278
P-Value: 0.6391
Age Group 17: -0.164772560763
P-Value: 0.6541
Age Group 18: -0.155397690405
P-Value: 0.6646
Age Group 19: -0.19624173487
P-Value: 0.6309
Age Group 20: -0.32472051482
P-Value: 0.5308
Age Group 21: -0.356074566431
P-Value: 0.5083
Ag

Unnamed: 0,POP_0,POP_1,POP_2,POP_3,POP_4,POP_5,POP_6,POP_7,POP_8,POP_9,...,POP_91,POP_92,POP_93,POP_94,POP_95,POP_96,POP_97,POP_98,POP_99,POP_100
0,,,,,,,,,,,...,,,,,,,,,,
1,-0.054255,-0.071918,-0.061807,-0.013704,-0.080788,-0.14416,-0.099172,-0.016526,0.057369,0.091995,...,-0.105631,-0.09417,-0.058084,-0.030027,-0.023141,-0.00964,-0.013711,-0.031344,-0.046073,-0.024901
2,0.8026,0.7944,0.7979,0.8938,0.768,0.6921,0.7502,0.8813,0.9329,0.7922,...,0.7335,0.7491,0.8088,0.864,0.8634,0.9115,0.9066,0.8835,0.8371,0.8895
3,,,,,,,,,,,...,,,,,,,,,,
4,,,,,,,,,,,...,,,,,,,,,,


In [None]:
# keyword 3: all-NaN frame shaped like df, with the YEAR, TOTAL_POP and
# keyword columns removed by clean_dataframe
kw_df_3 = clean_dataframe(pd.DataFrame().reindex_like(df))
kw = 'foreign bank account'
kw_reps_3 = test_pearson(kw, 4, 15, pop_list)
print('Permutation Test Complete')

# row index 1 receives the mean bootstrap correlation for each column
for col in range(kw_df_3.shape[1]):
    kw_df_3.iloc[1, col] = kw_reps_3['results'][0][col]
    print('success 1: ' + str(col))
print('Checkpoint 2')

# row index 2 receives the matching p-value for each column
for col in range(kw_df_3.shape[1]):
    kw_df_3.iloc[2, col] = kw_reps_3['results'][1][col]
    print('success 2: ' + str(col))
print('Checkpoint 3')

print('Complete')

print(kw_df_3.head())

In [None]:
# keyword 4: all-NaN frame shaped like df, with the YEAR, TOTAL_POP and
# keyword columns removed by clean_dataframe
kw_df_4 = clean_dataframe(pd.DataFrame().reindex_like(df))
kw = 'balance your checkbook'
kw_reps_4 = test_pearson(kw, 4, 15, pop_list)
print('Permutation Test Complete')

# row index 1 receives the mean bootstrap correlation for each column
for col in range(kw_df_4.shape[1]):
    kw_df_4.iloc[1, col] = kw_reps_4['results'][0][col]
    print('success 1: ' + str(col))
print('Checkpoint 2')

# row index 2 receives the matching p-value for each column
for col in range(kw_df_4.shape[1]):
    kw_df_4.iloc[2, col] = kw_reps_4['results'][1][col]
    print('success 2: ' + str(col))
print('Checkpoint 3')

print('Complete')

print(kw_df_4.head())

In [None]:
# keyword 5: all-NaN frame shaped like df, with the YEAR, TOTAL_POP and
# keyword columns removed by clean_dataframe
kw_df_5 = clean_dataframe(pd.DataFrame().reindex_like(df))
kw = 'purchase money order'
kw_reps_5 = test_pearson(kw, 4, 15, pop_list)
print('Permutation Test Complete')

# row index 1 receives the mean bootstrap correlation for each column
for col in range(kw_df_5.shape[1]):
    kw_df_5.iloc[1, col] = kw_reps_5['results'][0][col]
    print('success 1: ' + str(col))
print('Checkpoint 2')

# row index 2 receives the matching p-value for each column
for col in range(kw_df_5.shape[1]):
    kw_df_5.iloc[2, col] = kw_reps_5['results'][1][col]
    print('success 2: ' + str(col))
print('Checkpoint 3')

print('Complete')

print(kw_df_5.head())

In [None]:
# keyword 6: all-NaN frame shaped like df, with the YEAR, TOTAL_POP and
# keyword columns removed by clean_dataframe
kw_df_6 = clean_dataframe(pd.DataFrame().reindex_like(df))
kw = 'savings account'
kw_reps_6 = test_pearson(kw, 4, 15, pop_list)
print('Permutation Test Complete')

# row index 1 receives the mean bootstrap correlation for each column
for col in range(kw_df_6.shape[1]):
    kw_df_6.iloc[1, col] = kw_reps_6['results'][0][col]
    print('success 1: ' + str(col))
print('Checkpoint 2')

# row index 2 receives the matching p-value for each column
for col in range(kw_df_6.shape[1]):
    kw_df_6.iloc[2, col] = kw_reps_6['results'][1][col]
    print('success 2: ' + str(col))
print('Checkpoint 3')

print('Complete')

print(kw_df_6.head())

In [None]:
# keyword 7: all-NaN frame shaped like df, with the YEAR, TOTAL_POP and
# keyword columns removed by clean_dataframe
kw_df_7 = clean_dataframe(pd.DataFrame().reindex_like(df))
kw = 'personal bankruptcy'
kw_reps_7 = test_pearson(kw, 4, 15, pop_list)
print('Permutation Test Complete')

# row index 1 receives the mean bootstrap correlation for each column
for col in range(kw_df_7.shape[1]):
    kw_df_7.iloc[1, col] = kw_reps_7['results'][0][col]
    print('success 1: ' + str(col))
print('Checkpoint 2')

# row index 2 receives the matching p-value for each column
for col in range(kw_df_7.shape[1]):
    kw_df_7.iloc[2, col] = kw_reps_7['results'][1][col]
    print('success 2: ' + str(col))
print('Checkpoint 3')

print('Complete')

print(kw_df_7.head())

In [None]:
# keyword 8: all-NaN frame shaped like df, with the YEAR, TOTAL_POP and
# keyword columns removed by clean_dataframe
kw_df_8 = clean_dataframe(pd.DataFrame().reindex_like(df))
kw = 'savings plan'
kw_reps_8 = test_pearson(kw, 4, 15, pop_list)
print('Permutation Test Complete')

# row index 1 receives the mean bootstrap correlation for each column
for col in range(kw_df_8.shape[1]):
    kw_df_8.iloc[1, col] = kw_reps_8['results'][0][col]
    print('success 1: ' + str(col))
print('Checkpoint 2')

# row index 2 receives the matching p-value for each column
for col in range(kw_df_8.shape[1]):
    kw_df_8.iloc[2, col] = kw_reps_8['results'][1][col]
    print('success 2: ' + str(col))
print('Checkpoint 3')

print('Complete')

print(kw_df_8.head())

In [None]:
# keyword 9: all-NaN frame shaped like df, with the YEAR, TOTAL_POP and
# keyword columns removed by clean_dataframe
kw_df_9 = clean_dataframe(pd.DataFrame().reindex_like(df))
kw = 'direct deposit'
kw_reps_9 = test_pearson(kw, 4, 15, pop_list)
print('Permutation Test Complete')

# row index 1 receives the mean bootstrap correlation for each column
for col in range(kw_df_9.shape[1]):
    kw_df_9.iloc[1, col] = kw_reps_9['results'][0][col]
    print('success 1: ' + str(col))
print('Checkpoint 2')

# row index 2 receives the matching p-value for each column
for col in range(kw_df_9.shape[1]):
    kw_df_9.iloc[2, col] = kw_reps_9['results'][1][col]
    print('success 2: ' + str(col))
print('Checkpoint 3')

print('Complete')

print(kw_df_9.head())

In [None]:
# keyword 10: all-NaN frame shaped like df, with the YEAR, TOTAL_POP and
# keyword columns removed by clean_dataframe
kw_df_10 = clean_dataframe(pd.DataFrame().reindex_like(df))
kw = '529 plan'
kw_reps_10 = test_pearson(kw, 4, 15, pop_list)
print('Permutation Test Complete')

# row index 1 receives the mean bootstrap correlation for each column
for col in range(kw_df_10.shape[1]):
    kw_df_10.iloc[1, col] = kw_reps_10['results'][0][col]
    print('success 1: ' + str(col))
print('Checkpoint 2')

# row index 2 receives the matching p-value for each column
for col in range(kw_df_10.shape[1]):
    kw_df_10.iloc[2, col] = kw_reps_10['results'][1][col]
    print('success 2: ' + str(col))
print('Checkpoint 3')

print('Complete')

print(kw_df_10.head())

In [None]:
# keyword 11: all-NaN frame shaped like df, with the YEAR, TOTAL_POP and
# keyword columns removed by clean_dataframe
kw_df_11 = clean_dataframe(pd.DataFrame().reindex_like(df))
kw = 'credit card'
kw_reps_11 = test_pearson(kw, 4, 15, pop_list)
print('Permutation Test Complete')

# row index 1 receives the mean bootstrap correlation for each column
for col in range(kw_df_11.shape[1]):
    kw_df_11.iloc[1, col] = kw_reps_11['results'][0][col]
    print('success 1: ' + str(col))
print('Checkpoint 2')

# row index 2 receives the matching p-value for each column
for col in range(kw_df_11.shape[1]):
    kw_df_11.iloc[2, col] = kw_reps_11['results'][1][col]
    print('success 2: ' + str(col))
print('Checkpoint 3')

print('Complete')

print(kw_df_11.head())

In [None]:
# setup dataframe 12: all-NaN frame shaped like df, with the YEAR, TOTAL_POP
# and keyword columns removed by clean_dataframe
kw_df_12 = pd.DataFrame().reindex_like(df)
kw_df_12 = clean_dataframe(kw_df_12)
kw = 'bankruptcy'
# The result must be bound to kw_reps_12: the loops below read that name, and
# binding it to kw_reps_11 would both raise NameError here on a fresh kernel
# and overwrite the 'credit card' results.
kw_reps_12 = test_pearson(kw, 4, 15, pop_list)
print('Permutation Test Complete')

# place the mean bootstrap correlations in row index 1 of the dataframe
for i in range(len(kw_df_12.columns)):
    kw_df_12.iloc[1, i] = kw_reps_12['results'][0][i]
    print('success 1: ' + str(i))
print('Checkpoint 2')

# place the matching p-values in row index 2 of the dataframe
for i in range(len(kw_df_12.columns)):
    kw_df_12.iloc[2, i] = kw_reps_12['results'][1][i]
    print('success 2: ' + str(i))
print('Checkpoint 3')

print('Complete')

print(kw_df_12.head())

In [None]:
# keyword 13: all-NaN frame shaped like df, with the YEAR, TOTAL_POP and
# keyword columns removed by clean_dataframe
kw_df_13 = clean_dataframe(pd.DataFrame().reindex_like(df))
kw = 'check cashing'
kw_reps_13 = test_pearson(kw, 4, 15, pop_list)
print('Permutation Test Complete')

# row index 1 receives the mean bootstrap correlation for each column
for col in range(kw_df_13.shape[1]):
    kw_df_13.iloc[1, col] = kw_reps_13['results'][0][col]
    print('success 1: ' + str(col))
print('Checkpoint 2')

# row index 2 receives the matching p-value for each column
for col in range(kw_df_13.shape[1]):
    kw_df_13.iloc[2, col] = kw_reps_13['results'][1][col]
    print('success 2: ' + str(col))
print('Checkpoint 3')

print('Complete')

print(kw_df_13.head())

In [None]:
# keyword 14: all-NaN frame shaped like df, with the YEAR, TOTAL_POP and
# keyword columns removed by clean_dataframe
kw_df_14 = clean_dataframe(pd.DataFrame().reindex_like(df))
kw = 'ATM'
kw_reps_14 = test_pearson(kw, 4, 15, pop_list)
print('Permutation Test Complete')

# row index 1 receives the mean bootstrap correlation for each column
for col in range(kw_df_14.shape[1]):
    kw_df_14.iloc[1, col] = kw_reps_14['results'][0][col]
    print('success 1: ' + str(col))
print('Checkpoint 2')

# row index 2 receives the matching p-value for each column
for col in range(kw_df_14.shape[1]):
    kw_df_14.iloc[2, col] = kw_reps_14['results'][1][col]
    print('success 2: ' + str(col))
print('Checkpoint 3')

print('Complete')

print(kw_df_14.head())

In [None]:
# keyword 15: all-NaN frame shaped like df, with the YEAR, TOTAL_POP and
# keyword columns removed by clean_dataframe
kw_df_15 = clean_dataframe(pd.DataFrame().reindex_like(df))
kw = 'fafsa'
kw_reps_15 = test_pearson(kw, 4, 15, pop_list)
print('Permutation Test Complete')

# row index 1 receives the mean bootstrap correlation for each column
for col in range(kw_df_15.shape[1]):
    kw_df_15.iloc[1, col] = kw_reps_15['results'][0][col]
    print('success 1: ' + str(col))
print('Checkpoint 2')

# row index 2 receives the matching p-value for each column
for col in range(kw_df_15.shape[1]):
    kw_df_15.iloc[2, col] = kw_reps_15['results'][1][col]
    print('success 2: ' + str(col))
print('Checkpoint 3')

print('Complete')

print(kw_df_15.head())

In [None]:
# keyword 16: all-NaN frame shaped like df, with the YEAR, TOTAL_POP and
# keyword columns removed by clean_dataframe
kw_df_16 = clean_dataframe(pd.DataFrame().reindex_like(df))
kw = 'savings association'
kw_reps_16 = test_pearson(kw, 4, 15, pop_list)
print('Permutation Test Complete')

# row index 1 receives the mean bootstrap correlation for each column
for col in range(kw_df_16.shape[1]):
    kw_df_16.iloc[1, col] = kw_reps_16['results'][0][col]
    print('success 1: ' + str(col))
print('Checkpoint 2')

# row index 2 receives the matching p-value for each column
for col in range(kw_df_16.shape[1]):
    kw_df_16.iloc[2, col] = kw_reps_16['results'][1][col]
    print('success 2: ' + str(col))
print('Checkpoint 3')

print('Complete')

print(kw_df_16.head())

In [None]:
# keyword 17: all-NaN frame shaped like df, with the YEAR, TOTAL_POP and
# keyword columns removed by clean_dataframe
kw_df_17 = clean_dataframe(pd.DataFrame().reindex_like(df))
kw = 'deposit money order'
kw_reps_17 = test_pearson(kw, 4, 15, pop_list)
print('Permutation Test Complete')

# row index 1 receives the mean bootstrap correlation for each column
for col in range(kw_df_17.shape[1]):
    kw_df_17.iloc[1, col] = kw_reps_17['results'][0][col]
    print('success 1: ' + str(col))
print('Checkpoint 2')

# row index 2 receives the matching p-value for each column
for col in range(kw_df_17.shape[1]):
    kw_df_17.iloc[2, col] = kw_reps_17['results'][1][col]
    print('success 2: ' + str(col))
print('Checkpoint 3')

print('Complete')

print(kw_df_17.head())

In [None]:
# keyword 18: all-NaN frame shaped like df, with the YEAR, TOTAL_POP and
# keyword columns removed by clean_dataframe
kw_df_18 = clean_dataframe(pd.DataFrame().reindex_like(df))
kw = 'deposit check'
kw_reps_18 = test_pearson(kw, 4, 15, pop_list)
print('Permutation Test Complete')

# row index 1 receives the mean bootstrap correlation for each column
for col in range(kw_df_18.shape[1]):
    kw_df_18.iloc[1, col] = kw_reps_18['results'][0][col]
    print('success 1: ' + str(col))
print('Checkpoint 2')

# row index 2 receives the matching p-value for each column
for col in range(kw_df_18.shape[1]):
    kw_df_18.iloc[2, col] = kw_reps_18['results'][1][col]
    print('success 2: ' + str(col))
print('Checkpoint 3')

print('Complete')

print(kw_df_18.head())

In [None]:
# keyword 19: all-NaN frame shaped like df, with the YEAR, TOTAL_POP and
# keyword columns removed by clean_dataframe
kw_df_19 = clean_dataframe(pd.DataFrame().reindex_like(df))
kw = 'best bank accounts'
kw_reps_19 = test_pearson(kw, 4, 15, pop_list)
print('Permutation Test Complete')

# row index 1 receives the mean bootstrap correlation for each column
for col in range(kw_df_19.shape[1]):
    kw_df_19.iloc[1, col] = kw_reps_19['results'][0][col]
    print('success 1: ' + str(col))
print('Checkpoint 2')

# row index 2 receives the matching p-value for each column
for col in range(kw_df_19.shape[1]):
    kw_df_19.iloc[2, col] = kw_reps_19['results'][1][col]
    print('success 2: ' + str(col))
print('Checkpoint 3')

print('Complete')

print(kw_df_19.head())

In [None]:
# keyword 20: all-NaN frame shaped like df, with the YEAR, TOTAL_POP and
# keyword columns removed by clean_dataframe
kw_df_20 = clean_dataframe(pd.DataFrame().reindex_like(df))
kw = 'small business bank'
kw_reps_20 = test_pearson(kw, 4, 15, pop_list)
print('Permutation Test Complete')

# row index 1 receives the mean bootstrap correlation for each column
for col in range(kw_df_20.shape[1]):
    kw_df_20.iloc[1, col] = kw_reps_20['results'][0][col]
    print('success 1: ' + str(col))
print('Checkpoint 2')

# row index 2 receives the matching p-value for each column
for col in range(kw_df_20.shape[1]):
    kw_df_20.iloc[2, col] = kw_reps_20['results'][1][col]
    print('success 2: ' + str(col))
print('Checkpoint 3')

print('Complete')

print(kw_df_20.head())