# Assessing Dimension Schema
Objective: Investigate the CSV files for station and state, clean them as needed, and determine an appropriate schema to use in Redshift.

In [4]:
# List the raw dimension files this notebook works from.
!ls ../data/raw/dimension

hpd_states.txt	HPD_v02r02_stationinv_c20221129.csv


In [17]:
from pathlib import Path

import pandas as pd
from sqlalchemy import create_engine

In [18]:
# Engine built from a bare PostgreSQL URL (no host or credentials). In this
# notebook it is only passed as `con=` to pd.io.sql.get_schema, so the generated
# DDL is rendered for the PostgreSQL dialect.
# NOTE(review): presumably no database connection is ever opened — confirm.
engine = create_engine('postgresql://')

## Station

In [20]:
# Load the raw station inventory; column dtypes are left to pandas' inference.
df_st = pd.read_csv(
    filepath_or_buffer='../data/raw/dimension/HPD_v02r02_stationinv_c20221129.csv',
)

In [3]:
# Check the dtypes pandas inferred for each station column.
df_st.dtypes

StnID                     object
Lat                      float64
Lon                      float64
Elev                     float64
State/Province            object
Name                      object
WMO_ID                   float64
Sample_Interval (min)      int64
UTC_Offset                 int64
POR_Date_Range            object
PCT_POR_Good              object
Last_Half_POR             object
PCT_Last_Half_Good        object
Last_Qtr_POR              object
PCT_Last_Qtr_Good         object
dtype: object

In [26]:
# Normalise every column label to lower case.
df_st.columns = df_st.columns.map(str.lower)

In [27]:
# Replace the two labels that are awkward as SQL identifiers (one contains a
# slash, the other a space and parentheses). The keys are the lower-cased
# labels, so this must run after the lower-casing cell.
# The result is rebound rather than using inplace=True, so the cell reads as a
# plain transformation of its input and stays safe to re-run.
df_st = df_st.rename(columns={
    'state/province': 'state',
    'sample_interval (min)': 'sample_interval_min',
})

In [28]:
# Confirm the lower-cased and renamed column labels.
df_st.columns

Index(['stnid', 'lat', 'lon', 'elev', 'state', 'name', 'wmo_id',
       'sample_interval_min', 'utc_offset', 'por_date_range', 'pct_por_good',
       'last_half_por', 'pct_last_half_good', 'last_qtr_por',
       'pct_last_qtr_good'],
      dtype='object')

In [31]:
# Preview the cleaned station frame (pandas elides the middle rows in the display).
df_st

Unnamed: 0,stnid,lat,lon,elev,state,name,wmo_id,sample_interval_min,utc_offset,por_date_range,pct_por_good,last_half_por,pct_last_half_good,last_qtr_por,pct_last_qtr_good
0,USC00010008,31.5702,-85.2482,139.0,AL,ABBEVILLE,,15,-6,19480601-20180418,83.6%,19830510-20180418,70.2%,20001028-20180418,65.8%
1,USC00010063,34.2110,-87.1784,239.6,AL,ADDISON,,15,-6,19480601-20210606,85.8%,19841203-20210606,83.2%,20030306-20210606,74.6%
2,USC00010140,32.2322,-87.4104,53.3,AL,ALBERTA,,15,-6,19630926-20211231,94.4%,19921112-20211231,92.0%,20070607-20211231,89.7%
3,USC00010252,31.3071,-86.5226,76.2,AL,ANDALUSIA 3 W,,15,-6,19800301-20180205,89.1%,19990217-20180205,92.3%,20080812-20180205,97.5%
4,USC00010369,33.2941,-85.7788,311.5,AL,ASHLAND 3 ENE,,15,-6,19480601-20130804,84.6%,19810101-20130804,81.4%,19970418-20130804,70.7%
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2003,AQC00914594,-14.3333,-170.7667,42.4,AS,MALAELOA,,15,-11,20190929-20210102,27.9%,20200516-20210102,55.2%,20200908-20210102,38.5%
2004,USC00914670,13.4619,144.6894,3.0,GU,PITI,,15,10,19780701-20210205,71.7%,19991019-20210205,65.6%,20100613-20210205,72.4%
2005,CQC00914801,14.1717,145.2428,179.2,MP,ROTA AIRPORT,91221.0,15,10,19820601-20210209,61.1%,20011005-20210209,46.0%,20110608-20210209,48.9%
2006,CQC00914855,15.1167,145.7167,65.5,MP,SAIPAN INTL AP,91232.0,15,10,19790901-20210201,68.2%,20000517-20210201,51.6%,20100924-20210201,56.5%


In [34]:
# Write the cleaned station dimension without the pandas index column.
# to_csv does not create missing parent directories, so make sure the output
# folder exists first; otherwise this cell fails on a fresh checkout.
station_out_path = Path('../data/out/dimension/dim_station.csv')
station_out_path.parent.mkdir(parents=True, exist_ok=True)
df_st.to_csv(station_out_path, index=False)

In [30]:
# Render the CREATE TABLE statement for the cleaned station frame using the
# dialect of `engine`, then show it as plain text.
station_ddl = pd.io.sql.get_schema(df_st, 'dim_station', con=engine)
print(station_ddl)


CREATE TABLE dim_station (
	stnid TEXT, 
	lat FLOAT(53), 
	lon FLOAT(53), 
	elev FLOAT(53), 
	state TEXT, 
	name TEXT, 
	wmo_id FLOAT(53), 
	sample_interval_min BIGINT, 
	utc_offset BIGINT, 
	por_date_range TEXT, 
	pct_por_good TEXT, 
	last_half_por TEXT, 
	pct_last_half_good TEXT, 
	last_qtr_por TEXT, 
	pct_last_qtr_good TEXT
)




In [24]:
# Same schema helper, but without `con`, so pandas emits its generic fallback
# DDL (note the REAL/INTEGER types and quoted names) instead of the engine's dialect.
# NOTE(review): the saved output still shows the original CSV headers — this
# cell was executed before the lower-casing/rename cells (see the execution
# counts), so a fresh Run All will print the renamed columns instead.
print(pd.io.sql.get_schema(df_st, 'station'))

CREATE TABLE "station" (
"StnID" TEXT,
  "Lat" REAL,
  "Lon" REAL,
  "Elev" REAL,
  "State/Province" TEXT,
  "Name" TEXT,
  "WMO_ID" REAL,
  "Sample_Interval (min)" INTEGER,
  "UTC_Offset" INTEGER,
  "POR_Date_Range" TEXT,
  "PCT_POR_Good" TEXT,
  "Last_Half_POR" TEXT,
  "PCT_Last_Half_Good" TEXT,
  "Last_Qtr_POR" TEXT,
  "PCT_Last_Qtr_Good" TEXT
)


## State

In [35]:
# The states file has no header row; each line is loaded into a single
# column named `state`.
df_s = pd.read_csv(
    '../data/raw/dimension/hpd_states.txt',
    header=None,
    names=['state'],
)

In [36]:
# Split on the first space only, so multi-word names stay intact in `state_name`.
abbr_and_name = df_s['state'].str.split(' ', n=1, expand=True)
df_s[['state_abbr', 'state_name']] = abbr_and_name

In [37]:
# Check the split: the original `state` text next to the two derived columns.
df_s

Unnamed: 0,state,state_abbr,state_name
0,AK ALASKA,AK,ALASKA
1,AL ALABAMA,AL,ALABAMA
2,AR ARKANSAS,AR,ARKANSAS
3,AZ ARIZONA,AZ,ARIZONA
4,CA CALIFORNIA,CA,CALIFORNIA
5,CO COLORADO,CO,COLORADO
6,CT CONNECTICUT,CT,CONNECTICUT
7,DE DELAWARE,DE,DELAWARE
8,FL FLORIDA,FL,FLORIDA
9,GA GEORGIA,GA,GEORGIA


In [40]:
# The combined `state` text is redundant once split; keep only the derived columns.
df_s = df_s.drop(columns=['state'])

In [41]:
# Confirm only `state_abbr` and `state_name` remain.
df_s

Unnamed: 0,state_abbr,state_name
0,AK,ALASKA
1,AL,ALABAMA
2,AR,ARKANSAS
3,AZ,ARIZONA
4,CA,CALIFORNIA
5,CO,COLORADO
6,CT,CONNECTICUT
7,DE,DELAWARE
8,FL,FLORIDA
9,GA,GEORGIA


In [42]:
# Write the cleaned state dimension without the pandas index column.
# to_csv does not create missing parent directories, so make sure the output
# folder exists first; otherwise this cell fails on a fresh checkout.
state_out_path = Path('../data/out/dimension/dim_state.csv')
state_out_path.parent.mkdir(parents=True, exist_ok=True)
df_s.to_csv(state_out_path, index=False)

In [43]:
# Render the CREATE TABLE statement for the state frame using the dialect of
# `engine`, then show it as plain text.
state_ddl = pd.io.sql.get_schema(df_s, 'dim_state', con=engine)
print(state_ddl)


CREATE TABLE dim_state (
	state_abbr TEXT, 
	state_name TEXT
)


