# Hypothesis Testing

In [1]:
import os
import sys

import numpy as np
import pandas as pd
from scipy import stats

# `pwd` is the PARENT of the current working directory (despite its name);
# the project's local modules are expected under its `src/` folder.
pwd = os.path.dirname(os.getcwd())
src_dir = os.path.join(pwd, 'src')

# Put `src/` at the FRONT of the module search path so the project's own
# `hypothesis` module is found before any installed third-party package of
# the same name (e.g. the Hypothesis testing library on PyPI). The membership
# check keeps re-running this cell from piling up duplicate entries.
if src_dir not in sys.path:
    sys.path.insert(0, src_dir)

import hypothesis

In [2]:
# Load the planet table; the path is relative to the notebook's working directory
planets_csv = '../data/distinct_planets.csv'
planet_data = pd.read_csv(planets_csv)

### Is the mean planet radius for the leading method (Transit) smaller than that of the next method (Radial Velocity)?

$H_0$:   $\mu_{\text{transit}} = \mu_{\text{radial velocity}}$

$H_A$:   $\mu_{\text{transit}} < \mu_{\text{radial velocity}}$

In [3]:
# Compare planet radii ('pl_rade') between two discovery methods:
# Transit (method1) and Radial Velocity (method2).
method1, method2 = 'Transit', 'Radial Velocity'

# Boolean row masks, one per discovery method
is_method1 = planet_data['discoverymethod'] == method1
is_method2 = planet_data['discoverymethod'] == method2

# Radius samples with missing values removed
sample_m1 = planet_data.loc[is_method1, 'pl_rade'].dropna()
sample_m2 = planet_data.loc[is_method2, 'pl_rade'].dropna()

# Run the test with alternative 'less' at the 0.05 level. `hypothesis_test`
# lives in the project's src/ module and is not shown here; presumably 'less'
# tests mean(sample_m1) < mean(sample_m2), matching H_A above.
# As the cell's last expression, the returned value is displayed.
hypothesis.hypothesis_test(sample_m1, sample_m2, 0.05, 'less')

Sample 1 mean = 4.209055455635496, Sample 2 mean = 10.195298224852074


(4.566567296422839e-178, True)

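The p-value is much less than alpha (0.05); therefore, we reject the null hypothesis that the mean detected planet radius is equal between the Transit and Radial Velocity methods, in favor of the alternative hypothesis that the mean radius for the Transit method is smaller.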

### Is the mean radius of planets discovered in 1989 - 2010 greater than that of planets discovered in 2011 - 2021?

$H_0$:   $\mu_{1989-2010} = \mu_{2011-2021}$

$H_A$:   $\mu_{1989-2010} > \mu_{2011-2021}$

In [4]:
# Split planet radii ('pl_rade') by discovery year: 2010 and earlier vs. after 2010.
# Both comparisons are written out explicitly (rather than negating one mask)
# so that rows with a missing 'disc_year' fall into neither sample.
discovered_by_2010 = planet_data['disc_year'] <= 2010
discovered_after_2010 = planet_data['disc_year'] > 2010

sample_old = planet_data.loc[discovered_by_2010, 'pl_rade'].dropna()
sample_new = planet_data.loc[discovered_after_2010, 'pl_rade'].dropna()

# Run the test with alternative 'greater' at the 0.05 level. `hypothesis_test`
# is defined in the project's src/ module, not shown here; presumably 'greater'
# tests mean(sample_old) > mean(sample_new), matching H_A above.
hypothesis.hypothesis_test(sample_old, sample_new, 0.05, 'greater')

Sample 1 mean = 12.005313229571989, Sample 2 mean = 4.858201910663569


(2.1672062321268732e-175, True)

The p-value is much less than alpha (0.05); therefore, we reject the null hypothesis that the mean detected planet radius is equal between the two time periods, in favor of the alternative hypothesis that the mean radius was greater in the earlier period.