# Hypothesis Testing

In [1]:
from scipy import stats
import numpy as np
import pandas as pd

In [3]:
# Load the planet table from the CSV file (path is relative to this notebook) into a DataFrame
planet_data = pd.read_csv(filepath_or_buffer='../data/distinct_planets.csv')

### Is the mean planet radius for the leading method (Transit) smaller than that of the next method (Radial Velocity)?

#### Null hypothesis:  The means are equal
#### Alternative hypothesis:  Mean for the transit method < Mean for radial velocity method

In [4]:
# One-sided test setup: is the mean planet radius for one discovery method
# smaller than the mean planet radius for another method?

# The two discovery methods being compared: transit (method1) vs. radial velocity (method2)
method1 = 'Transit'
method2 = 'Radial Velocity'

# Planet radius ('pl_rade') values for each method; rows with a missing radius are dropped
sample_m1 = (
    planet_data
    .loc[planet_data['discoverymethod'] == method1, 'pl_rade']
    .dropna()
)
sample_m2 = (
    planet_data
    .loc[planet_data['discoverymethod'] == method2, 'pl_rade']
    .dropna()
)

def hypothesis_test(sample_1, sample_2, alpha, alt):
    """Run Welch's two-sample t-test and report whether the null is rejected.

    Prints both sample means for reference, then compares the two samples with
    ``scipy.stats.ttest_ind`` using ``equal_var=False`` (Welch's t-test, which
    does not assume the two populations share a variance).

    Parameters
    ----------
    sample_1, sample_2 : sized collections exposing ``.mean()``
        The two samples to compare, e.g. pandas Series or NumPy arrays.
        Missing values are not removed here; the callers in this notebook
        call ``.dropna()`` before passing the samples in.
    alpha : float
        Significance level; must lie strictly between 0 and 1.
    alt : str
        Forwarded to ``ttest_ind`` as ``alternative`` ('two-sided', 'less' or
        'greater'). The direction is relative to ``sample_1``: 'less' tests
        whether the mean behind ``sample_1`` is less than that of ``sample_2``.

    Returns
    -------
    tuple
        ``(p_val, reject)`` where ``reject`` is ``p_val <= alpha``.

    Raises
    ------
    ValueError
        If ``alpha`` is not in the open interval (0, 1), or if either sample
        has fewer than two observations.
    """
    if not 0 < alpha < 1:
        raise ValueError(f'alpha must be strictly between 0 and 1, got {alpha!r}')

    # With fewer than two observations the sample variance is undefined, so the
    # test yields a NaN p-value; `nan <= alpha` is False, which would silently
    # read as "fail to reject". Fail loudly instead (this also catches an empty
    # sample produced by a mistyped filter value).
    for label, sample in (('sample_1', sample_1), ('sample_2', sample_2)):
        if len(sample) < 2:
            raise ValueError(
                f'{label} needs at least 2 observations for a t-test, got {len(sample)}'
            )

    # Print sample means for reference
    print(f'Sample 1 mean = {sample_1.mean()}, Sample 2 mean = {sample_2.mean()}')
    # Perform t_test
    _, p_val = stats.ttest_ind(sample_1, sample_2, equal_var = False, alternative = alt)
    return p_val, p_val <= alpha

# One-sided test at the 5% level: is the Transit mean less than the Radial Velocity mean?
hypothesis_test(sample_1=sample_m1, sample_2=sample_m2, alpha=0.05, alt='less')

Sample 1 mean = 4.209055455635496, Sample 2 mean = 10.195298224852074


(4.566567296422839e-178, True)

The p-value is far below alpha (0.05), so we reject the null hypothesis that the mean detected planet radius is equal for the Transit and Radial Velocity methods, in favor of the alternative that the mean radius of planets detected by the Transit method is smaller.

In [5]:
# Split planet radii ('pl_rade') by discovery year: 2010 and earlier vs. after 2010.
# Rows with a missing radius are dropped from each sample.
sample_old = (
    planet_data
    .loc[planet_data['disc_year'] <= 2010, 'pl_rade']
    .dropna()
)
sample_new = (
    planet_data
    .loc[planet_data['disc_year'] > 2010, 'pl_rade']
    .dropna()
)

# One-sided test at the 5% level: is the mean radius for the earlier discoveries
# greater than the mean radius for the later ones?
hypothesis_test(sample_1=sample_old, sample_2=sample_new, alpha=0.05, alt='greater')

Sample 1 mean = 12.005313229571989, Sample 2 mean = 4.858201910663569


(2.1672062321268732e-175, True)