# Data Preparation

National Centers for Environmental Information (NCEI) is responsible for hosting and providing access to one of the most significant archives on Earth, with comprehensive oceanic, atmospheric, and geophysical data. From the depths of the ocean to the surface of the sun and from million-year-old ice core records to near real-time satellite images, NCEI is the Nation’s leading authority for environmental information.

The Global Summary of the Month (GSOM) and Global Summary of the Year (GSOY) datasets consist of 55 climatological variables computed from summary of the day observations of the Global Historical Climatology Network Daily dataset. Of these, 53 are monthly and annual summary variables and two are season-to-date variables. 

This project consumes GSOY data. To keep the data volume manageable, we consider only precipitation and temperature data for the US.

In [2]:
# Stretch the notebook's `.container` element to the full browser width.
# `display` and `HTML` come from the public `IPython.display` module;
# importing `display` from `IPython.core.display` is deprecated.
from IPython.display import display, HTML
display(HTML("<style>.container { width:100% !important; }</style>"))

## Import necessary libraries

In [19]:
import os
import pandas as pd
from collections import OrderedDict
from sqlalchemy import create_engine, inspect
import tarfile

### Path to historic yearly weather information

In [4]:
# Relative path to the GSOY archive (resolved against the current working directory).
_ARCHIVE_PATH_PARTS = ("..", "data", "gsoy-latest.tar.gz")
RAW_DATA_FILE = os.path.join(*_ARCHIVE_PATH_PARTS)

### Process archive downloaded from NCEI FTP site - https://www.ncei.noaa.gov/data/gsoy/archive/

In [16]:
# Columns that are relevant for this analysis; all other columns are dropped.
COLS_OF_INTEREST = ['DATE', 'NAME', 'STATION', 'LATITUDE', 'LONGITUDE', 'ELEVATION', 'PRCP', 'TAVG', 'TMAX', 'TMIN']

# Collect one frame per station file and concatenate once after the loop.
# Calling pd.concat inside the loop re-copies every row accumulated so far on
# each iteration, which is quadratic in the number of rows.
station_frames = []

with tarfile.open(RAW_DATA_FILE, "r:gz") as tar:
    for member in tar.getmembers():
        # Extract only US data. Non-file members (e.g. directories) are
        # skipped because extractfile() returns None for them.
        if not member.isfile() or not member.name.startswith('US'):
            continue
        # Close the extracted file handle as soon as the CSV has been parsed.
        with tar.extractfile(member) as csv_file:
            member_df = pd.read_csv(csv_file)
        # Skip files with no PRCP or TAVG column.
        if 'TAVG' not in member_df.columns or 'PRCP' not in member_df.columns:
            continue
        # Files with TAVG are expected to also carry TMAX and TMIN. reindex()
        # selects the same columns as member_df[COLS_OF_INTEREST] but fills a
        # missing column with NaN instead of raising KeyError.
        station_frames.append(member_df.reindex(columns=COLS_OF_INTEREST))

# pd.concat raises ValueError on an empty list, so fall back to an empty frame
# with the expected columns when no member qualified.
if station_frames:
    final_df = pd.concat(station_frames, sort=False)
else:
    final_df = pd.DataFrame(columns=COLS_OF_INTEREST)

final_df.head()

Unnamed: 0,DATE,NAME,STATION,LATITUDE,LONGITUDE,ELEVATION,PRCP,TAVG,TMAX,TMIN
0,2009,"SIOUX FALLS ENVIRON. CANADA, SD US",US009052008,43.7333,-96.6333,482.0,549.7,6.26,12.07,0.45
1,2010,"SIOUX FALLS ENVIRON. CANADA, SD US",US009052008,43.7333,-96.6333,482.0,755.1,7.32,12.91,1.72
2,2011,"SIOUX FALLS ENVIRON. CANADA, SD US",US009052008,43.7333,-96.6333,482.0,632.0,7.12,13.0,1.25
3,2012,"SIOUX FALLS ENVIRON. CANADA, SD US",US009052008,43.7333,-96.6333,482.0,,9.75,16.33,3.17
4,2013,"SIOUX FALLS ENVIRON. CANADA, SD US",US009052008,43.7333,-96.6333,482.0,547.5,6.34,12.06,0.62


In [17]:
# (rows, columns) of the combined frame built from the archive.
final_df.shape

(579764, 10)

In [18]:
# Summary statistics for the numeric columns, including non-null counts per column.
final_df.describe()

Unnamed: 0,DATE,LATITUDE,LONGITUDE,ELEVATION,PRCP,TAVG,TMAX,TMIN
count,579764.0,579764.0,579764.0,578073.0,511383.0,408475.0,431044.0,427684.0
mean,1968.361395,39.497033,-98.341976,603.196878,860.981855,11.309188,17.848783,4.789526
std,32.038526,6.185794,17.570509,699.658988,499.924479,5.215137,5.420899,5.366153
min,1845.0,19.0614,-177.35,-59.1,0.0,-15.45,-12.49,-19.91
25%,1946.0,35.4661,-110.3986,146.0,473.4,7.67,14.1,1.03
50%,1971.0,39.68333,-96.25361,310.0,837.5,10.79,17.36,4.49
75%,1996.0,43.3,-85.1313,841.9,1143.9,14.97,21.82,8.29
max,2018.0,71.3213,179.28333,13109.4,10037.2,26.97,34.66,24.27


### Write to SQLite DB

In [37]:
# Persist the combined frame: rows are appended to the CLIMATE_HISTORY table of
# the SQLite database, and the DataFrame index is written as a column as well.
# NOTE: because of if_exists='append', re-running this cell attempts to append
# the same rows again.
db_url = 'sqlite:///../db/adventurously.sqlite'
engine = create_engine(db_url, echo=False)
final_df.to_sql(name='CLIMATE_HISTORY', con=engine, if_exists='append', index=True)

In [38]:
# Sanity-check the write by reading back a small sample. Selecting the whole
# table would load every stored row into memory just to display a preview.
engine = create_engine('sqlite:///../db/adventurously.sqlite', echo=False)
pd.read_sql_query('select * from CLIMATE_HISTORY limit 10', con=engine)

Unnamed: 0,index,STATION,DATE,NAME,LATITUDE,LONGITUDE,ELEVATION,PRCP,TAVG,TMAX,TMIN
0,0,US009052008,2009,"SIOUX FALLS ENVIRON. CANADA, SD US",43.7333,-96.6333,482.0,549.7,6.26,12.07,0.45
1,1,US009052008,2010,"SIOUX FALLS ENVIRON. CANADA, SD US",43.7333,-96.6333,482.0,755.1,7.32,12.91,1.72
2,2,US009052008,2011,"SIOUX FALLS ENVIRON. CANADA, SD US",43.7333,-96.6333,482.0,632.0,7.12,13.00,1.25
3,3,US009052008,2012,"SIOUX FALLS ENVIRON. CANADA, SD US",43.7333,-96.6333,482.0,,9.75,16.33,3.17
4,4,US009052008,2013,"SIOUX FALLS ENVIRON. CANADA, SD US",43.7333,-96.6333,482.0,547.5,6.34,12.06,0.62
5,5,US009052008,2014,"SIOUX FALLS ENVIRON. CANADA, SD US",43.7333,-96.6333,482.0,,,,
6,6,USC00010063,1941,"ADDISON, AL US",34.2553,-87.1814,249.3,1186.4,,,
7,7,USC00010063,1949,"ADDISON, AL US",34.2553,-87.1814,249.3,,,,
8,8,USC00010063,1986,"ADDISON, AL US",34.2553,-87.1814,249.3,1156.9,,,
9,9,USC00010063,1987,"ADDISON, AL US",34.2553,-87.1814,249.3,1386.1,,,


In [39]:
from sqlalchemy.ext.automap import automap_base

# Create an automap base and reflect the tables of the database behind `engine`.
Base = automap_base()
Base.prepare(engine, reflect=True)

# Show the classes automap generated for the reflected tables.
mapped_classes = list(Base.classes)
print(mapped_classes)

[<class 'sqlalchemy.ext.automap.CLIMATE_HISTORY'>]
