# Bonus: Temperature Analysis I

In [25]:
import pandas as pd
from datetime import datetime as dt

In [26]:
# "tobs" is "temperature observations"
# Load the raw station measurements and preview the first rows
measurements_csv = './Resources/hawaii_measurements.csv'
tobs_df = pd.read_csv(measurements_csv)
tobs_df.head()

Unnamed: 0,station,date,prcp,tobs
0,USC00519397,2010-01-01,0.08,65
1,USC00519397,2010-01-02,0.0,63
2,USC00519397,2010-01-03,0.0,74
3,USC00519397,2010-01-04,0.0,76
4,USC00519397,2010-01-06,,73


In [27]:
# Convert the date column format from string to datetime.
# Read from tobs_df itself (the frame loaded above) and use a dashed format,
# matching the "YYYY-MM-DD" strings shown in the CSV preview.
tobs_df['date'] = pd.to_datetime(tobs_df['date'], format="%Y-%m-%d")
tobs_df['date']


0       2010-01-01
1       2010-01-02
2       2010-01-03
3       2010-01-04
4       2010-01-06
           ...    
19545   2017-08-19
19546   2017-08-20
19547   2017-08-21
19548   2017-08-22
19549   2017-08-23
Name: date, Length: 19550, dtype: datetime64[ns]

In [28]:
# Promote the "date" column to the row index of the DataFrame
date_indexed = tobs_df.set_index(keys="date")
tobs_df = date_indexed
tobs_df

Unnamed: 0_level_0,station,prcp,tobs
date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
2010-01-01,USC00519397,0.08,65
2010-01-02,USC00519397,0.00,63
2010-01-03,USC00519397,0.00,74
2010-01-04,USC00519397,0.00,76
2010-01-06,USC00519397,,73
...,...,...,...
2017-08-19,USC00516128,0.09,71
2017-08-20,USC00516128,,78
2017-08-21,USC00516128,0.56,76
2017-08-22,USC00516128,0.50,76


In [29]:
# Drop the date column: it currently lives in the index, so resetting the
# index with drop=True discards it and restores a plain 0..n-1 index.
# Rebinding the name (rather than inplace=True) keeps the step explicit.
tobs_df = tobs_df.reset_index(drop=True)
tobs_df

Unnamed: 0,station,prcp,tobs
0,USC00519397,0.08,65
1,USC00519397,0.00,63
2,USC00519397,0.00,74
3,USC00519397,0.00,76
4,USC00519397,,73
...,...,...,...
19545,USC00516128,0.09,71
19546,USC00516128,,78
19547,USC00516128,0.56,76
19548,USC00516128,0.50,76


### Compare June and December data across all years 

In [38]:
from scipy.stats import ttest_ind
import numpy as np
import sqlalchemy
from sqlalchemy.ext.automap import automap_base
from sqlalchemy.orm import Session
from sqlalchemy import create_engine, inspect, func, extract

## create engine to hawaii.sqlite
engine = create_engine("sqlite:///hawaii.sqlite")

## reflect an existing database into a new model
Base = automap_base()
## reflect the tables (done once; a second automap base would only repeat
## the same reflection work against the same engine)
# NOTE(review): newer SQLAlchemy releases spell this `autoload_with=engine`
# instead of `reflect=True` -- confirm against the installed version.
Base.prepare(engine, reflect=True)

## lowercase alias kept so any cell that refers to `base` still works
base = Base

## Save references to each table
station = Base.classes.station
measurement = Base.classes.measurement

## Create our session (link) from Python to the DB
session = Session(engine)

In [40]:
# Filter data for desired months
# Pull every (date, tobs) measurement whose month is June, across all years.
# The month is compared against the integer 6 so the match does not depend
# on the database coercing a string to a number.
jun_rows = session.query(measurement.date, measurement.tobs)\
    .filter(extract("month", measurement.date) == 6).all()

# Convert to data frame (the raw rows keep their own name so this cell can
# be re-read without one variable meaning two different things)
jun_analysis = pd.DataFrame(jun_rows, columns = ["Date", "Temperature"])
jun_analysis

Unnamed: 0,Date,Temperature
0,2010-06-01,78.0
1,2010-06-02,76.0
2,2010-06-03,78.0
3,2010-06-04,76.0
4,2010-06-05,77.0
...,...,...
1695,2017-06-26,79.0
1696,2017-06-27,74.0
1697,2017-06-28,74.0
1698,2017-06-29,76.0


In [52]:

# Average observed temperature per station, restricted to June readings
is_june = func.strftime("%m", measurement.date) == '06'
june_avg = (
    session.query(func.avg(measurement.tobs))
    .filter(is_june)
    .group_by(measurement.station)
    .all()
)
june_avg

[(74.13939393939394,),
 (74.05084745762711,),
 (76.00537634408602,),
 (71.9372197309417,),
 (76.6554054054054,),
 (73.39473684210526,),
 (73.27118644067797,),
 (77.55932203389831,),
 (76.66810344827586,)]

In [55]:
# Average observed temperature per station, restricted to December readings
is_december = func.strftime("%m", measurement.date) == '12'
dec_avg = (
    session.query(func.avg(measurement.tobs))
    .filter(is_december)
    .group_by(measurement.station)
    .all()
)
dec_avg

[(69.6842105263158,),
 (71.06944444444444,),
 (73.2247191011236,),
 (69.29126213592232,),
 (71.8348623853211,),
 (72.42105263157895,),
 (69.90322580645162,),
 (71.10952380952381,),
 (72.43333333333334,)]

In [56]:
# Create collections of temperature data
# Each query result row is a 1-tuple `(avg,)`; unpack them into flat lists
# of floats so the t-test receives two 1-D samples and reports scalar
# statistics instead of one-element arrays.
june = [avg for (avg,) in june_avg]
dec = [avg for (avg,) in dec_avg]

In [69]:
# Run paired t-test
import scipy.stats as stats

# Capture the statistic and p-value from this call so the report below
# reflects this test, not names left over in the kernel from another run.
# NOTE(review): pairing assumes june and dec list the stations in the same
# order (both come from queries grouped by station) -- confirm.
t, p = stats.ttest_rel(june, dec)
print(f'T-Test Analysis results:\n ----------------------\n t value:{t}\n p value:{p}\n')


T-Test Analysis results:
 ----------------------
 t value:[4.61586542]
 p value:[0.00028626]



### Analysis

Since the p-value [0.00028626] is less than 0.05, we reject the null hypothesis. 
We have sufficient evidence to say there is a statistically significant difference in average temperature in Hawaii between the months of June and December in the dataset available.