In [1]:
import sqlalchemy
from sqlalchemy.ext.automap import automap_base
from sqlalchemy.orm import Session
from sqlalchemy import create_engine, inspect
from datetime import datetime
from sqlalchemy import func ,bindparam,distinct
import dateutil.relativedelta

import matplotlib
matplotlib.use('nbagg')
from matplotlib import style
style.use('seaborn')
import matplotlib.pyplot as plt
import pandas as pd
import numpy as np

In [2]:
# Create the connection engine
engine = create_engine("sqlite:///hawaii.sqlite")

In [3]:
Base = automap_base()

In [4]:
Base.prepare(engine, reflect=True)

In [5]:
Base.classes.keys()

['measurements', 'stations']

In [6]:
Station = Base.classes.stations
Measurement = Base.classes.measurements

In [9]:
start_trip_date = input('Enter trip start date in YYYY-MM-DD format')
try:
    year, month, day = map(int, start_trip_date.split('-'))
except ValueError:
    print("Incorrect format,please re enter Start Date")


Enter trip start date in YYYY-MM-DD format2018-01-01


In [10]:
start_date = datetime(year,month,day)

In [11]:
end_trip_date = input('Enter trip end date in YYYY-MM-DD format')
try:
    year, month, day = map(int, end_trip_date.split('-'))
except ValueError:
    print("Incorrect format,please re enter end Date")

Enter trip end date in YYYY-MM-DD format2018-01-11


In [12]:
end_date = datetime(year,month,day)

In [13]:
d = end_date - start_date

In [14]:
if (d.days < 3 or d.days > 15):
    print("Please re-enter dates ,trip should be between 3 and 15 days ")
else:
    session = Session(engine)

In [15]:
    max_date = session.query(func.max(Measurement.date))\
             .all()[0][0]
    d1 = datetime.strptime(max_date,"%Y-%m-%d")
    d2 = d1 - dateutil.relativedelta.relativedelta(months=12)
    min_date = d2.strftime("%Y-%m-%d")
    print(max_date)
    print(min_date)

2017-08-23
2016-08-23


In [16]:
    stmt = session.query(Measurement.date,func.avg(Measurement.prcp).label("precipitation"))\
            .filter(Measurement.date >= min_date)\
            .filter(Measurement.date <= max_date)\
            .group_by(Measurement.date).order_by(Measurement.date.asc()).statement

In [17]:
    df2 = pd.read_sql_query(stmt, session.bind)
    

In [18]:
    df2 = df2.set_index('date')


In [19]:
    ax = df2[["precipitation"]].plot(kind='bar',figsize=(10, 10))
#     ax.set_xticklabels()

#     plt.bar(x_axis, users, color='r', alpha=0.5, align="edge")

In [20]:
    plt.show()


<IPython.core.display.Javascript object>

In [21]:
    df2.describe()

Unnamed: 0,precipitation
count,366.0
mean,0.156734
std,0.275544
min,0.0
25%,0.008571
50%,0.065357
75%,0.176786
max,2.04


#    # Station Analysis

In [22]:
    total_stations = session.query(distinct(Station.id)).count()

In [23]:
    print(f"There are a Total of {total_stations} Stations") 

There are a Total of 9 Stations


In [24]:
    result = session.query(Measurement.station_id,func.count(Measurement.tobs).label("Total_tobs"))\
             .filter(Measurement.tobs > 0 )\
             .group_by(Measurement.station_id).order_by(func.count(Measurement.tobs).desc()).all()

In [25]:
    print("*" * 100)
    print("Station" + "     -    " + " Observation Count")
    print("*" * 100)
    for rec in result:
        
        print(f"{rec[0]} -     {rec[1]}")

****************************************************************************************************
Station     -     Observation Count
****************************************************************************************************
USC00519281 -     2772
USC00519397 -     2724
USC00513117 -     2709
USC00519523 -     2669
USC00516128 -     2612
USC00514830 -     2202
USC00511918 -     1979
USC00517948 -     1372
USC00518838 -     511


In [26]:
    print(f"Station {result[0][0]} has a max of {result[0][1]} Observations") 
    

Station USC00519281 has a max of 2772 Observations


In [27]:
    max_date = session.query(func.max(Measurement.date))\
             .all()[0][0]
    d1 = datetime.strptime(max_date,"%Y-%m-%d")
    d2 = d1 - dateutil.relativedelta.relativedelta(months=12)
    min_date = d2.strftime("%Y-%m-%d")
    print(max_date)
    print(min_date)

2017-08-23
2016-08-23


In [28]:
    stmt = session.query(Measurement.tobs.label("tobs")).filter(Measurement.date >= min_date)\
                  .filter(Measurement.date <= max_date).filter(Measurement.tobs > 0 )\
                  .filter(Measurement.station_id == result[0][0] ).statement

In [29]:
    df3 = pd.read_sql_query(stmt, session.bind)
    df3.head()

Unnamed: 0,tobs
0,77
1,77
2,80
3,80
4,75


In [30]:
    df3["tobs"].hist(bins=12)
    plt.ylabel("Frequency")
    plt.savefig("Histogram Tobs.png")
    plt.show()

<IPython.core.display.Javascript object>

In [31]:
    # Write a function called `calc_temps` that will accept start date and end date in the format '%Y-%m-%d' 
    # and return the minimum, average, and maximum temperatures for that range of dates
    def calc_temps(start_date, end_date):
        """TMIN, TAVG, and TMAX for a list of dates.
    
        Args:
            start_date (string): A date string in the format %Y-%m-%d
            end_date (string): A date string in the format %Y-%m-%d
        
        Returns:
            TMIN, TAVE, and TMAX
        """
    
        return session.query(func.min(Measurement.tobs), func.avg(Measurement.tobs), func.max(Measurement.tobs)).\
            filter(Measurement.date >= start_date).filter(Measurement.date <= end_date).all()


In [32]:
    
    start_date_12 = start_date - dateutil.relativedelta.relativedelta(months=12)
    yr_from_start = d2.strftime("%Y-%m-%d")
    print(yr_from_start)
    
    t_min,t_avg,t_max = calc_temps(yr_from_start, start_trip_date)[0]

2016-08-23


In [33]:
    plt.bar(0, t_avg, color='r', alpha=0.5, align="edge",yerr=(t_max - t_min))
    plt.xlim(-0.25, 1) 
    plt.ylim(0, t_avg + ((t_max - t_min)) + 20)
    plt.xticks([0.4],[' '])
    plt.title("Trip Avg Temp")
    plt.ylabel("Temp (F)")
    plt.tight_layout()
#     plt.figsize=(10, 20)

In [34]:
    plt.show()

<IPython.core.display.Javascript object>

# Optional Recommended Analysis

In [35]:
    d1 = datetime.strptime(end_trip_date,"%Y-%m-%d")
    d2 = d1 - dateutil.relativedelta.relativedelta(months=12)
    prev_max_date = d2.strftime("%Y-%m-%d")
    print(prev_max_date)
    d1 = datetime.strptime(start_trip_date,"%Y-%m-%d")
    d2 = d1 - dateutil.relativedelta.relativedelta(months=12)
    prev_min_date = d2.strftime("%Y-%m-%d")
    print(prev_min_date)
    
    session.query(Measurement.station_id, func.avg(Measurement.prcp)).\
            filter(Measurement.date >= prev_min_date).filter(Measurement.date <= prev_max_date)\
            .group_by(Measurement.station_id).all()


2017-01-11
2017-01-01


[('USC00513117', 0.031818181818181815),
 ('USC00514830', 0.057272727272727274),
 ('USC00516128', 0.057272727272727274),
 ('USC00517948', 0.0),
 ('USC00519281', 0.018181818181818184),
 ('USC00519397', 0.0),
 ('USC00519523', 0.06777777777777777)]

In [36]:
    # Write a function called `calc_temps` that will accept start date and end date in the format '%Y-%m-%d' 
    # and return the minimum, average, and maximum temperatures for that range of dates
    def daily_normals(match_date):
        """TMIN, TAVG, and TMAX for a list of dates.
    
        Args:
            start_date (string): A date string in the format %Y-%m-%d
            end_date (string): A date string in the format %Y-%m-%d
        
        Returns:
            TMIN, TAVE, and TMAX
        """
        match_date = "%" + match_date
        return session.query(func.min(Measurement.tobs), func.avg(Measurement.tobs)\
                             , func.max(Measurement.tobs)).\
                filter(Measurement.date.like(match_date) ).all()

In [37]:
lst = np.arange(np.datetime64(start_trip_date), np.datetime64(end_trip_date))

In [38]:
lst = np.append(lst, np.datetime64(end_trip_date))

In [39]:
df_daily_normals = pd.DataFrame(index=lst ,columns=["tmin","tavg","tmax","date"])

In [None]:
# df_daily_normals = df_daily_normals.fillna(value=0)

In [40]:
for index,row in df_daily_normals.iterrows():
    result = daily_normals(index.strftime("%m-%d"))
    row["tmin"]= result[0][0]
    row["tavg"]= result[0][1]
    row["tmax"]= result[0][2]
    row["date"] = index.strftime("%Y-%m-%d")

In [41]:
df_daily_normals = df_daily_normals.reset_index()

In [42]:
df_daily_normals = df_daily_normals.set_index("date")

In [43]:
df_daily_normals[["tmin","tavg",'tmax']].plot(kind='area',stacked=False)
plt.show()

<IPython.core.display.Javascript object>

[('USC00511918', 'HONOLULU OBSERVATORY 702.2, HI US'), ('USC00513117', 'KANEOHE 838.1, HI US'), ('USC00514830', 'KUALOA RANCH HEADQUARTERS 886.9, HI US'), ('USC00516128', 'MANOA LYON ARBO 785.2, HI US'), ('USC00517948', 'PEARL CITY, HI US'), ('USC00518838', 'UPPER WAHIAWA 874.3, HI US'), ('USC00519281', 'WAIHEE 837.5, HI US'), ('USC00519397', 'WAIKIKI 717.2, HI US'), ('USC00519523', 'WAIMANALO EXPERIMENTAL FARM, HI US')]


2017


OperationalError: (sqlite3.OperationalError) no such function: float [SQL: 'SELECT float(measurements.tobs) AS float_1 \nFROM measurements \nWHERE strftime(?, measurements.date) = ?'] [parameters: ('%Y', '2017')]