# Bonus: Temperature Analysis I

In [28]:
import pandas as pd
from datetime import datetime as dt

In [29]:
# "tobs" is "temperature observations"
df = pd.read_csv('Resources/hawaii_measurements.csv')
df.head()

Unnamed: 0,station,date,prcp,tobs
0,USC00519397,2010-01-01,0.08,65
1,USC00519397,2010-01-02,0.0,63
2,USC00519397,2010-01-03,0.0,74
3,USC00519397,2010-01-04,0.0,76
4,USC00519397,2010-01-06,,73


In [30]:
# Convert the date column format from string to datetime
df["date"] = pd.to_datetime(df["date"])
df.dtypes

station            object
date       datetime64[ns]
prcp              float64
tobs                int64
dtype: object

In [31]:
# Set the date column as the DataFrame index
df.set_index(df.date)

Unnamed: 0_level_0,station,date,prcp,tobs
date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
2010-01-01,USC00519397,2010-01-01,0.08,65
2010-01-02,USC00519397,2010-01-02,0.00,63
2010-01-03,USC00519397,2010-01-03,0.00,74
2010-01-04,USC00519397,2010-01-04,0.00,76
2010-01-06,USC00519397,2010-01-06,,73
...,...,...,...,...
2017-08-19,USC00516128,2017-08-19,0.09,71
2017-08-20,USC00516128,2017-08-20,,78
2017-08-21,USC00516128,2017-08-21,0.56,76
2017-08-22,USC00516128,2017-08-22,0.50,76


In [32]:
# Drop the date column
df.drop(axis=1, columns="date")

Unnamed: 0,station,prcp,tobs
0,USC00519397,0.08,65
1,USC00519397,0.00,63
2,USC00519397,0.00,74
3,USC00519397,0.00,76
4,USC00519397,,73
...,...,...,...
19545,USC00516128,0.09,71
19546,USC00516128,,78
19547,USC00516128,0.56,76
19548,USC00516128,0.50,76


### Compare June and December data across all years 

In [33]:
from scipy import stats

In [34]:
# Create new column for month
df["month"] = df['date'].map(lambda x: x.strftime("%m"))
df.head()

Unnamed: 0,station,date,prcp,tobs,month
0,USC00519397,2010-01-01,0.08,65,1
1,USC00519397,2010-01-02,0.0,63,1
2,USC00519397,2010-01-03,0.0,74,1
3,USC00519397,2010-01-04,0.0,76,1
4,USC00519397,2010-01-06,,73,1


In [35]:
# Filter data for June
june_data = df.loc[df["month"] == "06"]
june_data

Unnamed: 0,station,date,prcp,tobs,month
133,USC00519397,2010-06-01,0.00,78,06
134,USC00519397,2010-06-02,0.01,76,06
135,USC00519397,2010-06-03,0.00,78,06
136,USC00519397,2010-06-04,0.00,76,06
137,USC00519397,2010-06-05,0.00,77,06
...,...,...,...,...,...
19492,USC00516128,2017-06-26,0.02,79,06
19493,USC00516128,2017-06-27,0.10,74,06
19494,USC00516128,2017-06-28,0.02,74,06
19495,USC00516128,2017-06-29,0.04,76,06


In [36]:
# Return series of unique values for June
june_data['tobs'].value_counts()

76    194
78    183
77    176
75    175
74    169
72    152
73    145
79    129
71     97
70     82
80     65
69     49
81     32
68     23
67     10
82      7
65      3
66      3
83      2
84      2
85      1
64      1
Name: tobs, dtype: int64

In [40]:
# Identify the average temperature for June
june_avg_temp = june_data["tobs"].groupby([june_data["station"]]).mean()
june_avg_temp

station
USC00511918    74.139394
USC00513117    74.050847
USC00514830    76.005376
USC00516128    71.937220
USC00517948    76.655405
USC00518838    73.394737
USC00519281    73.271186
USC00519397    77.559322
USC00519523    76.668103
Name: tobs, dtype: float64

In [41]:
# Filter data for Dec
dec_data = df.loc[df["month"] == "12"]
dec_data

Unnamed: 0,station,date,prcp,tobs,month
305,USC00519397,2010-12-01,0.04,76,12
306,USC00519397,2010-12-03,0.00,74,12
307,USC00519397,2010-12-04,0.00,74,12
308,USC00519397,2010-12-06,0.00,64,12
309,USC00519397,2010-12-07,0.00,64,12
...,...,...,...,...,...
19323,USC00516128,2016-12-27,0.14,71,12
19324,USC00516128,2016-12-28,0.14,71,12
19325,USC00516128,2016-12-29,1.03,69,12
19326,USC00516128,2016-12-30,2.37,65,12


In [42]:
# Return series of unique values for Dec
dec_data['tobs'].value_counts()

71    174
72    166
70    149
73    148
69    144
74    138
68     99
75     98
76     77
67     68
65     45
66     45
77     38
78     32
64     28
63     22
62     15
79      9
61      6
60      3
80      3
57      2
56      2
81      2
82      1
59      1
58      1
83      1
Name: tobs, dtype: int64

In [43]:
# Identify the average temperature for December
dec_avg_temp = dec_data["tobs"].groupby([dec_data["station"]]).mean()
dec_avg_temp

station
USC00511918    69.684211
USC00513117    71.069444
USC00514830    73.224719
USC00516128    69.291262
USC00517948    71.834862
USC00518838    72.421053
USC00519281    69.903226
USC00519397    71.109524
USC00519523    72.433333
Name: tobs, dtype: float64

In [47]:
# Run paired t-test
stats.ttest_ind(june_avg_temp, dec_avg_temp, equal_var=False)

Ttest_indResult(statistic=4.615865424404701, pvalue=0.0003657335214469917)

### Analysis

The pvalue is less than 0.05. The temperature observations between June and December are statistically significant. There is a difference between temperatures for June and December. The means are not equal and used the unpaired t-test. 