# Bonus: Temperature Analysis I

In [1]:
import pandas as pd
from datetime import datetime as dt
from scipy import stats

In [2]:
# Load the Hawaii station measurements; the "tobs" column holds the
# temperature observations used throughout this notebook.
df = pd.read_csv(filepath_or_buffer="Resources/hawaii_measurements.csv")
df.head(n=5)

Unnamed: 0,station,date,prcp,tobs
0,USC00519397,2010-01-01,0.08,65
1,USC00519397,2010-01-02,0.0,63
2,USC00519397,2010-01-03,0.0,74
3,USC00519397,2010-01-04,0.0,76
4,USC00519397,2010-01-06,,73


In [3]:
# Parse the "date" strings into datetime values so that date components
# (such as the month) can be extracted later.
df["date"] = pd.to_datetime(df["date"])

In [4]:
# Promote "date" to the DataFrame index; drop=True removes it from the
# regular columns so it is not kept twice.
date_df = df.set_index(keys="date", drop=True)
date_df

Unnamed: 0_level_0,station,prcp,tobs
date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
2010-01-01,USC00519397,0.08,65
2010-01-02,USC00519397,0.00,63
2010-01-03,USC00519397,0.00,74
2010-01-04,USC00519397,0.00,76
2010-01-06,USC00519397,,73
...,...,...,...
2017-08-19,USC00516128,0.09,71
2017-08-20,USC00516128,,78
2017-08-21,USC00516128,0.56,76
2017-08-22,USC00516128,0.50,76


### Compare June and December data across all years 

In [5]:
# Store the calendar month number of every index date in a new "month" column.
# The index already holds parsed datetimes (the "date" column was converted
# with pd.to_datetime before becoming the index), so .month is read directly.
date_df["month"] = date_df.index.month
date_df

Unnamed: 0_level_0,station,prcp,tobs,month
date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
2010-01-01,USC00519397,0.08,65,1
2010-01-02,USC00519397,0.00,63,1
2010-01-03,USC00519397,0.00,74,1
2010-01-04,USC00519397,0.00,76,1
2010-01-06,USC00519397,,73,1
...,...,...,...,...
2017-08-19,USC00516128,0.09,71,8
2017-08-20,USC00516128,,78,8
2017-08-21,USC00516128,0.56,76,8
2017-08-22,USC00516128,0.50,76,8


In [6]:
# Keep only the observations recorded in June (month number 6), across all years
june_df = date_df.loc[date_df["month"] == 6]
june_df

Unnamed: 0_level_0,station,prcp,tobs,month
date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
2010-06-01,USC00519397,0.00,78,6
2010-06-02,USC00519397,0.01,76,6
2010-06-03,USC00519397,0.00,78,6
2010-06-04,USC00519397,0.00,76,6
2010-06-05,USC00519397,0.00,77,6
...,...,...,...,...
2017-06-26,USC00516128,0.02,79,6
2017-06-27,USC00516128,0.10,74,6
2017-06-28,USC00516128,0.02,74,6
2017-06-29,USC00516128,0.04,76,6


In [7]:
# Keep only the observations recorded in December (month number 12), across all years
december_df = date_df.loc[date_df["month"] == 12]
december_df

Unnamed: 0_level_0,station,prcp,tobs,month
date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
2010-12-01,USC00519397,0.04,76,12
2010-12-03,USC00519397,0.00,74,12
2010-12-04,USC00519397,0.00,74,12
2010-12-06,USC00519397,0.00,64,12
2010-12-07,USC00519397,0.00,64,12
...,...,...,...,...
2016-12-27,USC00516128,0.14,71,12
2016-12-28,USC00516128,0.14,71,12
2016-12-29,USC00516128,1.03,69,12
2016-12-30,USC00516128,2.37,65,12


In [8]:
# Mean of all June temperature observations ("tobs")
avg_june_temp = june_df.loc[:, "tobs"].mean()
avg_june_temp

74.94411764705882

In [9]:
# Mean of all December temperature observations ("tobs")
avg_dec_temp = december_df.loc[:, "tobs"].mean()
avg_dec_temp

71.04152933421226

In [12]:
# Pull the temperature observations out of each monthly frame as NumPy arrays;
# these are the two samples compared by the t-test.
june_temp_list = june_df["tobs"].to_numpy()
dec_temp_list = december_df["tobs"].to_numpy()

In [13]:
# Unpaired (independent two-sample) t-test of June vs. December temperatures.
# equal_var=False selects Welch's variant, which does not assume that the two
# samples share the same variance.
stats.ttest_ind(a=june_temp_list, b=dec_temp_list, equal_var=False)

Ttest_indResult(statistic=31.355036920962423, pvalue=4.193529835915755e-187)

### Analysis

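We use an unpaired (independent) t-test because we are comparing observed temperatures from two completely independent, non-overlapping samples (in this case, June and December) rather than matched pairs of observations. The test is run with `equal_var=False` (Welch's t-test), so the two samples are not assumed to have equal variances.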

Since the p-value (≈ 4.19e-187) is far below 0.05, we reject the null hypothesis that the mean temperatures in June and December are equal. We conclude that there is a meaningful difference in temperature between June and December in Hawaii: the difference between the June mean (≈ 74.9) and the December mean (≈ 71.0) is statistically significant.