
    Page 114: 9-1 (As sample size increases, the power of a hypothesis test increases, which means it is more likely to be positive if the effect is real…)
    Page 128: 10-1 (Using the data from the BRFSS, compute the linear least squares fit for log(weight) versus height…)


# Stoneburner, Kurt
- ## DSC 530 - Week 08
- ## Chapter 9, Exercise 1

**Exercise:** As sample size increases, the power of a hypothesis test increases, which means it is more likely to be positive if the effect is real. Conversely, as sample size decreases, the test is less likely to be positive even if the effect is real.

To investigate this behavior, run the tests in this chapter with different subsets of the NSFG data. You can use `thinkstats2.SampleRows` to select a random subset of the rows in a DataFrame.

What happens to the p-values of these tests as sample size decreases? What is the smallest sample size that yields a positive test?

In [10]:
# //****************************************************************************************
# //*** Set Working Directory to thinkstats folder.
# //*** This pseudo-relative path call should work on all Stoneburner localized projects. 
# //*** The "coding" part of the current directory is swapped for the ThinkStats2 code folder,
# //*** the result is added to the module search path (so nsfg, thinkstats2, ... can be
# //*** imported) and becomes the new working directory.
# //****************************************************************************************

import os
import sys

# //*** os.path.join supplies the platform's own separator; the previous hard-coded
# //*** backslash only produced a valid path on Windows.
workingPath = os.getcwd().replace("coding", os.path.join("ThinkStats2", "code"))

# //*** Re-running the cell should not stack duplicate entries on sys.path
if workingPath not in sys.path:
    sys.path.insert(1, workingPath)
os.chdir(workingPath)

In [11]:
# //*** Imports and Load Data
import nsfg
import thinkstats2
import thinkplot
import first
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import scipy

# //*** Load the two NSFG tables through the reader functions of the nsfg module.
# //*** NOTE(review): presumably the respondent table and the pregnancy table, returned as
# //*** DataFrames (preg is later used with .dropna and column selection) - confirm in nsfg.py
resp = nsfg.ReadFemResp()
preg = nsfg.ReadFemPreg()


In [12]:
def getSample_from_series(input_df,input_percentage=.1):
    """Randomly split a DataFrame or Series into a sample and the remaining rows.

    Rows are drawn without replacement using numpy's global random state, so
    ``np.random.seed`` makes the split reproducible. Both returned objects keep
    the rows in their original order and have the same type as ``input_df``.

    Parameters
    ----------
    input_df : pandas.DataFrame or pandas.Series
        Data to split.
    input_percentage : float, default .1
        Fraction of the rows to place in the sample. The sample size is
        ``int(len(input_df) * input_percentage)``, clamped to the range
        ``0 .. len(input_df)``.

    Returns
    -------
    (sample, remainder) : tuple
        ``sample`` holds the randomly selected rows and ``remainder`` holds
        every other row; together they contain each input row exactly once.
    """
    row_count = len(input_df)

    # //*** Clamp so out-of-range fractions degrade to "no rows" / "all rows"
    sample_size = min(max(int(row_count * input_percentage), 0), row_count)

    # //*** Pick unique row POSITIONS. A permutation covers every position, including
    # //*** the last row, which randint(len - 1) could never return.
    chosen_positions = np.random.permutation(row_count)[:sample_size]

    # //*** Boolean mask: True is sample, False is remainder.
    # //*** Positional masking keeps the original row order and also works when
    # //*** index labels are duplicated.
    is_sample = np.zeros(row_count, dtype=bool)
    is_sample[chosen_positions] = True

    sample_df = input_df.iloc[is_sample]
    remainder_df = input_df.iloc[~is_sample]

    return sample_df, remainder_df

In [13]:
def get_test_stat(input_ts_type, data1, data2):
    """Compute the test statistic named by ``input_ts_type`` for two samples.

    Parameters
    ----------
    input_ts_type : str
        Name of the statistic. Only ``'mean_diff'`` is implemented.
    data1, data2 : sequence of numbers
        The two groups to compare.

    Returns
    -------
    float or str
        For ``'mean_diff'``: the absolute difference between the two means.
        For any other name: the empty string.
    """
    if input_ts_type ==  'mean_diff':
        return abs( np.mean(data1) - np.mean(data2) )    

    return ""

def hypothesis_test(input_dict):
    """Run a permutation test and return its p-value.

    The observed statistic is computed from the two groups, the groups are
    pooled, and the pool is reshuffled ``count`` times. Each shuffle is split
    back into two groups of the original sizes and the statistic recomputed.
    The p-value is the share of shuffles whose statistic is at least as large
    as the observed one.

    Parameters
    ----------
    input_dict : dict
        ``'type'``   (required) name of the test statistic; must be ``'mean_diff'``.
        ``'data'``   (required) pair ``(data1, data2)`` of the two groups.
        ``'count'``  (optional) number of permutations, default 1000.
        ``'sample'`` (optional) fraction passed to ``getSample_from_series`` so
                     that each group is replaced by a random subset of itself
                     before testing. Without it all data is used.

    Returns
    -------
    float or str
        The p-value in ``[0, 1]``. If the input is invalid, a message is
        printed and the empty string is returned.
    """
    valid_types = ["mean_diff"]
    
    if 'type' not in input_dict:
        print(f"hypothesis test requires a type value")
        return ""
    
    if input_dict['type'] not in valid_types:
        print("Need a valid input type")
        print(f"{valid_types}")
        return ""
    
    if 'data' not in input_dict:
        print(f"Need valid data")
        return ""
    
    # //*** Assign the total number of tests to run.
    # //*** Defaults to 1000
    max_test_count = input_dict.get('count', 1000)
    
    # //*** A non-positive count would divide by zero when the p-value is formed
    if max_test_count <= 0:
        print("count must be a positive integer")
        return ""
    
    test_type = input_dict['type']
    data1, data2 = input_dict['data'][0], input_dict['data'][1]

    # //*** Sample a random subset of data
    # //*** Defaults to all data
    if 'sample' in input_dict:
        data1 = getSample_from_series(data1,input_dict['sample'])[0]
        data2 = getSample_from_series(data2,input_dict['sample'])[0]
    
    # //*** Convert data to Lists
    data1 = list(data1)
    data2 = list(data2)
    
    # //*** Size of the first group; every shuffle is split at this position
    n = len(data1)
    
    # //*** Concatenate the lists into one pool (a new array, so shuffling it
    # //*** in place never touches the caller's data)
    combined_data = np.hstack( (data1,data2) )
    
    # //*** get the test statistic. Function performs calculation based on type
    # //*** Assume data has been properly validated at this point.
    test_statistic = get_test_stat(test_type,data1,data2)
    
    null_count = 0
    # //*** Build random permutations
    for _ in range(max_test_count):
        
        # //*** randomly shuffle the combined data
        np.random.shuffle(combined_data)
        
        # //*** Split at n so both groups keep their original sizes.
        # //*** (Slicing from n+1 would silently discard one observation.)
        shuffled1,shuffled2 = combined_data[:n],combined_data[n:]
        
        loop_test_statistic = get_test_stat(test_type,shuffled1,shuffled2)
        
        # //*** Count shuffles at least as extreme as the observed statistic.
        # //*** Ties count, otherwise identical groups would report p = 0.
        if loop_test_statistic >= test_statistic:
            null_count = null_count + 1
    
    return (null_count / max_test_count)

#sample_df,remainder_df = getSample_from_series(total_weight,.1)

#total_weight['totalwgt_lb']
#total_weight['agepreg']

#Permutation Test
#Difference in standard deviation
#print(np.std( total_weight['totalwgt_lb']))
#print(np.std( total_weight['agepreg']))
#abs(np.std( total_weight['totalwgt_lb']) - np.std( total_weight['agepreg']))

#get_p_scores(total_weight['totalwgt_lb'])

#print(f"{scipy.stats.pearsonr(total_weight['totalwgt_lb'],total_weight['agepreg'])}")
#correlation Testing
#Testing Proportions
#Chi Squared test

#Difference of Means Permutation test
# Generate a test statistic for reference.
# This is the difference of the means
# test statistic or t-value is the abs(difference of means)
# combine both data sets.
# 1000 times:
#      Randomly split combined data in half.
#      Find the difference of the means for the random samples, for a random sample test statistic
#      count the random test statistics that are greater than the base line test statistic
# The P-value is the count / total tests run (1000)
# The P-value estimates the chance of a difference at least this large occurring randomly (i.e. when there is no real effect).
# Reference: https://www.ohbmbrainmappingblog.com/blog/a-brief-overview-of-permutation-testing-with-examples


In [27]:
# //*** Get the 'prglngth' Series for first births and for all other births
# //*** Rows missing totalwgt_lb, agepreg or birthord are dropped first.
preg = preg.dropna(subset=['totalwgt_lb','agepreg','birthord'])

firsts_df = preg[preg.birthord == 1]
others_df = preg[preg.birthord != 1]

first_preglen = firsts_df['prglngth']
other_preglen = others_df['prglngth']

# //*** Observed test statistic on ALL data: the absolute difference of the two means.
# //*** This is the statistic itself, not a p-value, so it is labelled accordingly.
baseline_test_statistic = get_test_stat('mean_diff',first_preglen,other_preglen)
print(f"=======================================================")
print(f"Baseline (all data) test statistic (mean diff): {baseline_test_statistic}")
print(f"=======================================================")

# //*** For each sample fraction x, run 50 independent permutation tests (each on a fresh
# //*** random subset of both groups) and summarize the spread of the resulting p-values.
for x in [.05,.1,.2,.5,.75]:
    pvals = [ hypothesis_test({ 'type':'mean_diff','data':(first_preglen,other_preglen),'sample':x }) for i in range(50) ]
    pvals = np.array(pvals)
    pval_mean, pval_min, pval_max, pval_range = np.mean(pvals),pvals.min(),pvals.max(),pvals.max()-pvals.min()
    print(f"Sample size {round(x*100,2)}%: p-val(mean) {round(pval_mean,4)} [ min: {pval_min} max: {pval_max} range: {round(pval_range,4)} ]")
    print(f"================================================================================================")
    


Baseline (all data) p-val: 0.07511149297508268
Sample size 5.0%: p-val(mean) 0.5074 [ min: 0.001 max: 1.0 range: 0.999 ]
Sample size 10.0%: p-val(mean) 0.4429 [ min: 0.003 max: 1.0 range: 0.997 ]
Sample size 20.0%: p-val(mean) 0.4618 [ min: 0.006 max: 0.971 range: 0.965 ]
Sample size 50.0%: p-val(mean) 0.3603 [ min: 0.005 max: 0.929 range: 0.924 ]
Sample size 75.0%: p-val(mean) 0.2941 [ min: 0.012 max: 0.903 range: 0.891 ]


- ## Chapter 10, Exercise 1



In [15]:
# //*** CODE HERE

In [16]:
# //*** CODE HERE

In [17]:
# //*** CODE HERE