# Bonus: Temperature Analysis I

In [58]:
import pandas as pd
from datetime import datetime as dt
from statistics import mean

In [59]:
# "tobs" is "temperature observations"
# Load the raw station measurements from the project-relative CSV and
# preview the first five rows.
measurements_csv = 'Resources/hawaii_measurements.csv'
df = pd.read_csv(measurements_csv)
df.head(5)

Unnamed: 0,station,date,prcp,tobs
0,USC00519397,2010-01-01,0.08,65
1,USC00519397,2010-01-02,0.0,63
2,USC00519397,2010-01-03,0.0,74
3,USC00519397,2010-01-04,0.0,76
4,USC00519397,2010-01-06,,73


In [60]:
# Summarize the freshly loaded frame: column dtypes, non-null counts and memory use.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 19550 entries, 0 to 19549
Data columns (total 4 columns):
 #   Column   Non-Null Count  Dtype  
---  ------   --------------  -----  
 0   station  19550 non-null  object 
 1   date     19550 non-null  object 
 2   prcp     18103 non-null  float64
 3   tobs     19550 non-null  int64  
dtypes: float64(1), int64(1), object(2)
memory usage: 611.1+ KB


In [61]:
# Parse the string dates into datetime values so that year and month can be
# read off them in later cells, then re-check the dtypes to confirm the change.
df['date'] = df['date'].pipe(pd.to_datetime)
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 19550 entries, 0 to 19549
Data columns (total 4 columns):
 #   Column   Non-Null Count  Dtype         
---  ------   --------------  -----         
 0   station  19550 non-null  object        
 1   date     19550 non-null  datetime64[ns]
 2   prcp     18103 non-null  float64       
 3   tobs     19550 non-null  int64         
dtypes: datetime64[ns](1), float64(1), int64(1), object(1)
memory usage: 611.1+ KB


In [62]:
# Promote "date" from a regular column to the index. `df` is rebound to the
# re-indexed frame instead of being mutated in place.
# Run once per fresh load: a second run raises KeyError because "date" is
# no longer a column.
df = df.set_index("date")

In [63]:
# Re-inspect the frame after set_index: the index line and the column list
# show whether "date" moved from the columns into the index.
df.info()

<class 'pandas.core.frame.DataFrame'>
DatetimeIndex: 19550 entries, 2010-01-01 to 2017-08-23
Data columns (total 3 columns):
 #   Column   Non-Null Count  Dtype  
---  ------   --------------  -----  
 0   station  19550 non-null  object 
 1   prcp     18103 non-null  float64
 2   tobs     19550 non-null  int64  
dtypes: float64(1), int64(1), object(1)
memory usage: 610.9+ KB


In [64]:
# Preview the first rows of the date-indexed frame.
df.head()

Unnamed: 0_level_0,station,prcp,tobs
date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
2010-01-01,USC00519397,0.08,65
2010-01-02,USC00519397,0.0,63
2010-01-03,USC00519397,0.0,74
2010-01-04,USC00519397,0.0,76
2010-01-06,USC00519397,,73


In [65]:
# Exclude every observation recorded in 2017.
# NOTE(review): presumably because 2017 is a partial year (df.info() reports
# the index ending at 2017-08-23, so it has no December data) — confirm intent.
from_2017 = df.index.year == 2017
new_df = df[~from_2017]
new_df

Unnamed: 0_level_0,station,prcp,tobs
date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
2010-01-01,USC00519397,0.08,65
2010-01-02,USC00519397,0.00,63
2010-01-03,USC00519397,0.00,74
2010-01-04,USC00519397,0.00,76
2010-01-06,USC00519397,,73
...,...,...,...
2016-12-27,USC00516128,0.14,71
2016-12-28,USC00516128,0.14,71
2016-12-29,USC00516128,1.03,69
2016-12-30,USC00516128,2.37,65


In [66]:
# Drop rows that have a missing value in any column, then summarize the
# numeric columns of what remains.
# The result is reassigned instead of using inplace=True: `new_df` was
# produced by boolean-filtering `df`, and mutating such a frame in place is
# what triggered pandas' SettingWithCopyWarning.
# NOTE(review): per df.info() only "prcp" contains nulls, so this discards
# rows whose temperature reading is present — confirm that is intended for
# a temperature-only analysis.
new_df = new_df.dropna(how="any")
new_df.describe()

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  """Entry point for launching an IPython kernel.


Unnamed: 0,prcp,tobs
count,16836.0,16836.0
mean,0.160246,72.917676
std,0.469854,4.46791
min,0.0,53.0
25%,0.0,70.0
50%,0.01,73.0
75%,0.11,76.0
max,11.53,87.0


### Compare June and December data across all years 

In [67]:
from scipy import stats

In [68]:
# Keep only the June observations (calendar month 6) across all remaining years.
in_june = new_df.index.month == 6
june_df = new_df[in_june]
june_df

Unnamed: 0_level_0,station,prcp,tobs
date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
2010-06-01,USC00519397,0.00,78
2010-06-02,USC00519397,0.01,76
2010-06-03,USC00519397,0.00,78
2010-06-04,USC00519397,0.00,76
2010-06-05,USC00519397,0.00,77
...,...,...,...
2016-06-23,USC00516128,0.90,72
2016-06-26,USC00516128,0.70,74
2016-06-27,USC00516128,0.30,72
2016-06-28,USC00516128,0.25,74


In [69]:
# Keep only the December observations (calendar month 12) across all remaining years.
in_december = new_df.index.month == 12
dec_df = new_df[in_december]
dec_df

Unnamed: 0_level_0,station,prcp,tobs
date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
2010-12-01,USC00519397,0.04,76
2010-12-03,USC00519397,0.00,74
2010-12-04,USC00519397,0.00,74
2010-12-06,USC00519397,0.00,64
2010-12-07,USC00519397,0.00,64
...,...,...,...
2016-12-27,USC00516128,0.14,71
2016-12-28,USC00516128,0.14,71
2016-12-29,USC00516128,1.03,69
2016-12-30,USC00516128,2.37,65


In [70]:
# Pull each month's temperature observations out into a plain Python list.
june_temps = june_df["tobs"].to_list()
dec_temps = dec_df["tobs"].to_list()

# The paired t-test needs two samples of equal length, so trim the tail of
# the longer list down to the length of the shorter one.
# NOTE(review): no station/date alignment is performed between the two
# lists, so the pairing used by the test is purely positional.
common_len = min(len(june_temps), len(dec_temps))
del june_temps[common_len:]
del dec_temps[common_len:]
    

In [71]:
# Identify the average temperature for June
# (computed on the june_temps list as left by the equal-length trim in the
# previous cell, not on june_df directly)
mean(june_temps)

74.6143977191732

In [72]:
# Identify the average temperature for December
# (computed on the dec_temps list as left by the equal-length trim in the
# earlier cell, not on dec_df directly)
mean(dec_temps)

70.93870277975766

In [54]:
# Paired (dependent-samples) t-test on the two equal-length temperature
# lists; report the test statistic and its p-value.
paired_result = stats.ttest_rel(june_temps, dec_temps)
t_val = paired_result.statistic
p_val = paired_result.pvalue
print(f"t-stat: {t_val}, p_value: {p_val}")

t-stat: 30.743577628125248, p_value: 4.410577439791356e-159


### Analysis