# Bonus: Temperature Analysis I

In [1]:
import pandas as pd
from datetime import datetime as dt

In [2]:
# "tobs" is "temperature observations"
# Use a forward slash in the relative path: it resolves on Windows, macOS and
# Linux alike, and avoids the invalid "\h" escape sequence that the
# backslash form produced inside a normal string literal.
df = pd.read_csv('Resources/hawaii_measurements.csv')
# Preview the first rows to confirm the file loaded as expected
df.head()

Unnamed: 0,station,date,prcp,tobs
0,USC00519397,2010-01-01,0.08,65
1,USC00519397,2010-01-02,0.0,63
2,USC00519397,2010-01-03,0.0,74
3,USC00519397,2010-01-04,0.0,76
4,USC00519397,2010-01-06,,73


In [3]:
# Re-type the date column from string to nanosecond-resolution datetimes
df = df.astype({'date': 'datetime64[ns]'})

# Show the resulting column types to confirm the conversion
df.dtypes

station            object
date       datetime64[ns]
prcp              float64
tobs                int64
dtype: object

In [4]:
# Promote the date column to be the DataFrame index
df = df.set_index(keys=["date"])

In [5]:
# Drop the date column

# Already done: the set_index call in the previous cell moved "date" out of
# the regular columns and into the index, so there is nothing left to drop.
# Display the frame to confirm that only station, prcp and tobs remain as columns.
df

Unnamed: 0_level_0,station,prcp,tobs
date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
2010-01-01,USC00519397,0.08,65
2010-01-02,USC00519397,0.00,63
2010-01-03,USC00519397,0.00,74
2010-01-04,USC00519397,0.00,76
2010-01-06,USC00519397,,73
...,...,...,...
2017-08-19,USC00516128,0.09,71
2017-08-20,USC00516128,,78
2017-08-21,USC00516128,0.56,76
2017-08-22,USC00516128,0.50,76


### Compare June and December data across all years 

In [6]:
from scipy import stats

In [7]:
# Keep only the rows whose index date falls in June (6) or December (12)
is_june_or_dec = df.index.month.isin([6, 12])
junedec_df = df.loc[is_june_or_dec]
junedec_df

Unnamed: 0_level_0,station,prcp,tobs
date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
2010-06-01,USC00519397,0.00,78
2010-06-02,USC00519397,0.01,76
2010-06-03,USC00519397,0.00,78
2010-06-04,USC00519397,0.00,76
2010-06-05,USC00519397,0.00,77
...,...,...,...
2017-06-26,USC00516128,0.02,79
2017-06-27,USC00516128,0.10,74
2017-06-28,USC00516128,0.02,74
2017-06-29,USC00516128,0.04,76


In [8]:
# Average June temperature: select the month-6 rows, then take the mean of tobs
june_df = junedec_df.loc[junedec_df.index.month == 6]
juneav = june_df.loc[:, 'tobs'].mean()
print(f'The average temperature during June in this dataset is {round(juneav,2)}.')

The average temperature during June in this dataset is 74.94.


In [9]:
# Average December temperature: select the month-12 rows, then take the mean of tobs
dec_df = junedec_df.loc[junedec_df.index.month == 12]
decav = dec_df.loc[:, 'tobs'].mean()
print(f'The average temperature during December in this dataset is {round(decav,2)}.')

The average temperature during December in this dataset is 71.04.


In [10]:
# Create collections of temperature data
# Summary statistics for each month, printed so both tables appear in one output
print('Temperature data for June:')
print(june_df.describe())

print('Temperature data for December:')
print(dec_df.describe())

print('----------')

# Number of observations available for each month
june_len = len(june_df)
dec_len = len(dec_df)
print(f'There are {june_len} temperature observations for the June data.')
print(f'There are {dec_len} temperature observations for the December data.')
print('----------')

# Fraction of the shorter series used as the common sample size for the t-test
SUBSET_FRACTION = .9

# Identify the month with fewer observations; a tie falls through to June,
# which is harmless because both lengths are then equal.
if june_len > dec_len:
    shorter = 'December'
    shorter_value = dec_len
else:
    shorter = 'June'
    # This must be an assignment: the previous "shorter_value - june_len" was a
    # bare subtraction that left shorter_value undefined on this branch.
    shorter_value = june_len

# Both samples are drawn with this size so the paired test gets equal-length inputs
subset = round(shorter_value * SUBSET_FRACTION)

print(f'To create two equal amounts of data with which to perform the paired T-Test, we will determine whether the length'
      f' of the June or December list of temperature observations is shorter, and will make our subset 90% of the shorter'
      f' length. As the June list is {june_len} observations long, and the December list is {dec_len} long, we will use the'
      f' length of {shorter} to determine the size of our subset. As {shorter} is {shorter_value} long, and 90% of'
      f' {shorter_value} is {subset}, we will use a random selection of {subset} observations to perform our paired T-Test.')

Temperature data for June:
              prcp         tobs
count  1574.000000  1700.000000
mean      0.136360    74.944118
std       0.335731     3.257417
min       0.000000    64.000000
25%       0.000000    73.000000
50%       0.020000    75.000000
75%       0.120000    77.000000
max       4.430000    85.000000
Temperature data for December:
              prcp         tobs
count  1405.000000  1517.000000
mean      0.216819    71.041529
std       0.541399     3.745920
min       0.000000    56.000000
25%       0.000000    69.000000
50%       0.030000    71.000000
75%       0.150000    74.000000
max       6.420000    83.000000
----------
There are 1700 temperature observations for the June data.
There are 1517 temperature observations for the December data.
----------
To create two equal amounts of data with which to perform the paired T-Test, we will determine whether the length of the June or December list of temperature observations is shorter, and will make our subset 90% of the sho

In [11]:
# Run paired t-test
# Seed for the random draws so the samples (and therefore the reported p-value)
# are reproducible after a kernel restart.
SAMPLE_SEED = 42
# Significance threshold the p-value is compared against
ALPHA = 0.05

# Draw `subset` observations from each month and hand them to scipy as plain
# arrays, so values are paired strictly by position rather than by date index.
# NOTE(review): the two samples are drawn independently, so the pairing is by
# sample order only — confirm whether an unpaired test (stats.ttest_ind) would
# be more appropriate for this comparison.
june_sample = june_df['tobs'].sample(subset, random_state=SAMPLE_SEED).to_numpy()
dec_sample = dec_df['tobs'].sample(subset, random_state=SAMPLE_SEED).to_numpy()

value = stats.ttest_rel(june_sample, dec_sample)
p_value = value.pvalue

if p_value <= ALPHA:
    print(f'As the pvalue of the average temperatures in June vs. December for this dataset is {p_value}, '
          'the difference is statistically significant. We must reject the null hypothesis.')
else:
    # A p-value above the threshold means the difference is NOT significant;
    # the null hypothesis is not rejected (it is never "accepted").
    print(f'As the pvalue of the average temperatures in June vs. December for this dataset is {p_value}, '
          'the difference is not statistically significant. We fail to reject the null hypothesis.')

As the pvalue of the average temperatures in June vs. December for this dataset is 1.0136070891676038e-150, the difference is statistically significant. We must reject the null hypthesis.


### Analysis

In [12]:
# Question: will you use a paired t-test, or an unpaired t-test? Why?

# The answer is assembled from its two sentences and printed as a single line.
test_choice_rationale = ' '.join((
    'I chose to use a paired T-Test because I am comparing two observations- June vs. December- '
    'of the same object- the temperature of a specific region.',
    'Were I testing unrelated objects, I would have used an unpaired T-Test.',
))
print(test_choice_rationale)

I chose to use a paired T-Test because I am comparing two observations- June vs. December- of the same object- the temperature of a specific region. Were I testing unrelated objects, I would have used an unpaired T-Test.
