## Create Dependencies

In [1]:
import numpy as np
import pandas as pd
import datetime as dt
import os 

from config import pwd, uname

# Python SQL toolkit and Object Relational Mapper
import sqlalchemy
from sqlalchemy.ext.automap import automap_base
from sqlalchemy.orm import Session
from sqlalchemy import create_engine, func
from sqlalchemy import Column, Integer, String, Float 
from sqlalchemy import desc

## Create Variables

In [2]:
caDataSet = os.path.join('..','DataSets','CA Weed Data.csv')
cenDataSet = os.path.join('..','DataSets','Census Data.csv')
coDataSet = os.path.join('..','DataSets','CO Weed Data.csv')
massDataSet = os.path.join('..','DataSets','Mass Weed Data.csv')
stateDataSet = os.path.join('..','DataSets','State Names and Abbr.csv')


## Establish Database Connections and Get Tables Lists

In [3]:
# create engine to postgres db
postgres = f'postgresql://{uname}:{pwd}@localhost:5432/etl_project'  #path to local db

engine = create_engine(postgres)

In [4]:
# reflect an existing database into a new model
base = automap_base()

# reflect the tables
base.prepare(engine, reflect=True)

# View all of the classes that automap found
base.classes.keys()

['ca_raw', 'co_raw', 'mass_raw', 'states', 'sales_by_qtr', 'census']

In [5]:
# Save references to each table
# adding these for fun and for sanity checks

caRaw = base.classes.ca_raw
coRaw = base.classes.co_raw
massRaw = base.classes.mass_raw
states = base.classes.states
sales = base.classes.sales_by_qtr
census = base.classes.census

In [6]:
# Create our session (link) from Python to the DB
session = Session(bind=engine)

## Exploratory Analysis

In [7]:
# Load data into dataframes
massRawDF = pd.read_csv(massDataSet) 
cenRawDF = pd.read_csv(cenDataSet)
stateDF = pd.read_csv(stateDataSet)
caRawDF = pd.read_csv(caDataSet)
coRawDF = pd.read_csv(coDataSet)

### Mass Explore

In [8]:
massRawDF.head()

Unnamed: 0,activitysummarydate,total_plantimmaturecount,total_planttrackedcount,total_plantfloweringcount,total_plantvegetativecount,total_plantdestroyedcount,total_plantharvestedcount,total_plantcount,salestotal,total_active_harvestcount,total_active_packagecount,total_plantbatchcount,total_activeproducts,total_activestrains,total_employees
0,6/4/2021 0:00,140747,225685,114894,110791,231460,966667,1423812,1658394000.0,1231,131289,3388,155452,30337,8334
1,6/3/2021 0:00,140747,225685,114894,110791,231460,966667,1423812,1658393000.0,1231,131271,3388,155452,30337,8334
2,6/2/2021 0:00,146669,221316,111723,109593,229462,961421,1412199,1651406000.0,1208,128029,3466,155053,30165,8282
3,6/1/2021 0:00,153497,211654,108947,102707,228692,960539,1400885,1648229000.0,1212,126709,3520,154325,30045,8282
4,5/31/2021 0:00,155021,210406,108882,101524,228595,959964,1398965,1645279000.0,1232,127630,3626,154348,30010,8274


## State Explore and Transform

In [9]:
stateDF.head()

Unnamed: 0,State,Abbrev,Code
0,Alabama,Ala.,AL
1,Alaska,Alaska,AK
2,Arizona,Ariz.,AZ
3,Arkansas,Ark.,AR
4,California,Calif.,CA


In [10]:
stateDF=stateDF.drop('Abbrev', 1)
stateDF.head()

Unnamed: 0,State,Code
0,Alabama,AL
1,Alaska,AK
2,Arizona,AZ
3,Arkansas,AR
4,California,CA


In [11]:
stateDF = stateDF.rename(columns = {'State' : 'StateDescName' , 'Code' : 'StateAbbrev'})
stateDF.head()

Unnamed: 0,StateDescName,StateAbbrev
0,Alabama,AL
1,Alaska,AK
2,Arizona,AZ
3,Arkansas,AR
4,California,CA


In [12]:
StateID = 0 
stateDF.insert(StateID, "StateID", value = range(len(stateDF)))
stateDF.head()

Unnamed: 0,StateID,StateDescName,StateAbbrev
0,0,Alabama,AL
1,1,Alaska,AK
2,2,Arizona,AZ
3,3,Arkansas,AR
4,4,California,CA


## Census Explore and Transform

In [13]:
cenRawDF.head()

Unnamed: 0,SUMLEV,REGION,DIVISION,STATE,NAME,CENSUS2010POP,ESTIMATESBASE2010,POPESTIMATE2010,POPESTIMATE2011,POPESTIMATE2012,...,RNETMIG2011,RNETMIG2012,RNETMIG2013,RNETMIG2014,RNETMIG2015,RNETMIG2016,RNETMIG2017,RNETMIG2018,RNETMIG2019,RNETMIG2020
0,10,0,0,0,United States,308745538,308758105,309327143,311583481,313877662,...,2.561705,2.745929,2.697816,2.980977,3.317393,3.308478,2.92626,2.208328,1.735857,1.450346
1,20,1,0,0,Northeast Region,55317240,55318414,55380764,55608318,55782661,...,0.951355,0.02291,-0.406301,-0.904675,-2.01199,-2.436197,-1.773545,-2.008364,-3.16071,-3.67449
2,20,2,0,0,Midwest Region,66927001,66929737,66975328,67164092,67348275,...,-0.883842,-0.895516,0.06239,-0.697566,-1.320504,-1.176126,-0.487257,-0.800567,-1.205668,-2.011623
3,20,3,0,0,South Region,114555744,114563042,114869421,116019483,117264196,...,5.218129,5.93444,5.373447,6.236211,7.355913,7.220696,6.272594,5.291755,5.479642,5.743507
4,20,4,0,0,West Region,71945553,71946912,72101630,72791588,73482530,...,2.752027,3.083047,3.183793,4.039593,5.004949,5.284859,4.022332,2.968883,1.822074,1.214309


In [14]:
cenRawDF.describe

<bound method NDFrame.describe of     SUMLEV REGION DIVISION  STATE                  NAME  CENSUS2010POP  \
0       10      0        0      0         United States      308745538   
1       20      1        0      0      Northeast Region       55317240   
2       20      2        0      0        Midwest Region       66927001   
3       20      3        0      0          South Region      114555744   
4       20      4        0      0           West Region       71945553   
5       40      3        6      1               Alabama        4779736   
6       40      4        9      2                Alaska         710231   
7       40      4        8      4               Arizona        6392017   
8       40      3        7      5              Arkansas        2915918   
9       40      4        9      6            California       37253956   
10      40      4        8      8              Colorado        5029196   
11      40      1        1      9           Connecticut        3574097   
12  

In [15]:
# Drop columns we do not need
cenNew = cenRawDF
cenNew = cenNew.drop({'SUMLEV' , 'REGION' , 'DIVISION' , 'CENSUS2010POP' , 'ESTIMATESBASE2010' , 'POPESTIMATE2010' , 'POPESTIMATE2011' , 'POPESTIMATE2012' , 'POPESTIMATE2013' , 'POPESTIMATE2014' , 
                         'POPESTIMATE2015' , 'POPESTIMATE2016' , 'POPESTIMATE2017' , 'NPOPCHG_2010' , 'NPOPCHG_2011' , 'NPOPCHG_2012' , 'NPOPCHG_2013' , 'NPOPCHG_2014' , 'NPOPCHG_2015' , 'NPOPCHG_2016' , 
                         'NPOPCHG_2017' , 'NPOPCHG_2018' , 'NPOPCHG_2019' , 'NPOPCHG_2020' , 'BIRTHS2010' , 'BIRTHS2011' , 'BIRTHS2012' , 'BIRTHS2013' , 'BIRTHS2014' , 'BIRTHS2015' , 'BIRTHS2016' , 'BIRTHS2017' , 
                          'BIRTHS2018' , 'BIRTHS2019' , 'BIRTHS2020' , 'DEATHS2010' , 'DEATHS2011' , 'DEATHS2012' , 'DEATHS2013' , 'DEATHS2014' , 'DEATHS2015' , 'DEATHS2016' , 'DEATHS2017' , 'DEATHS2018' , 'DEATHS2019' , 
                          'DEATHS2020' , 'NATURALINC2010' , 'NATURALINC2011' , 'NATURALINC2012' ,  'NATURALINC2013' ,  'NATURALINC2014' ,  'NATURALINC2015' ,  'NATURALINC2016' ,  'NATURALINC2017' ,  'NATURALINC2018' , 
                          'NATURALINC2019' ,  'NATURALINC2020' , 'INTERNATIONALMIG2010' , 'INTERNATIONALMIG2011' , 'INTERNATIONALMIG2012' , 'INTERNATIONALMIG2013' , 'INTERNATIONALMIG2014' , 'INTERNATIONALMIG2015' , 
                         'INTERNATIONALMIG2016' , 'INTERNATIONALMIG2017' , 'INTERNATIONALMIG2018' , 'INTERNATIONALMIG2019' , 'INTERNATIONALMIG2020' , 'DOMESTICMIG2010' , 'DOMESTICMIG2011' , 'DOMESTICMIG2012' , 'DOMESTICMIG2013' , 
                         'DOMESTICMIG2014' , 'DOMESTICMIG2015' , 'DOMESTICMIG2016' , 'DOMESTICMIG2017' , 'DOMESTICMIG2018' , 'DOMESTICMIG2019' , 'DOMESTICMIG2020' , 'NETMIG2010' , 'NETMIG2011' , 'NETMIG2012' , 'NETMIG2013' ,
                         'NETMIG2014' , 'NETMIG2015' , 'NETMIG2016' , 'NETMIG2017' , 'NETMIG2018' , 'NETMIG2019' , 'NETMIG2020' , 'RESIDUAL2010' ,  'RESIDUAL2011' , 'RESIDUAL2012' , 'RESIDUAL2013' , 'RESIDUAL2014' , 
                         'RESIDUAL2015' , 'RESIDUAL2016' , 'RESIDUAL2017' , 'RESIDUAL2018' , 'RESIDUAL2019' , 'RESIDUAL2020' ,  'RBIRTH2011' , 'RBIRTH2012' , 'RBIRTH2013' , 'RBIRTH2014' , 'RBIRTH2015' , 'RBIRTH2016' , 'RBIRTH2017' , 
                         'RBIRTH2018' ,  'RBIRTH2019' , 'RBIRTH2020' , 'RDEATH2011' ,  'RDEATH2012' , 'RDEATH2013' , 'RDEATH2014' , 'RDEATH2015' , 'RDEATH2016' , 'RDEATH2017' , 'RDEATH2018' , 'RDEATH2019' , 'RDEATH2020' , 
                         'RNATURALINC2011' , 'RNATURALINC2012' , 'RNATURALINC2013' , 'RNATURALINC2014' ,  'RNATURALINC2015' ,  'RNATURALINC2016' , 'RNATURALINC2017' , 'RNATURALINC2018' , 'RNATURALINC2019' , 'RNATURALINC2020' ,
                         'RINTERNATIONALMIG2011' , 'RINTERNATIONALMIG2012' , 'RINTERNATIONALMIG2013' , 'RINTERNATIONALMIG2014' , 'RINTERNATIONALMIG2015' , 'RINTERNATIONALMIG2016' , 'RINTERNATIONALMIG2017' , 'RINTERNATIONALMIG2018' ,
                         'RINTERNATIONALMIG2019' , 'RINTERNATIONALMIG2020' , 'RDOMESTICMIG2011' ,  'RDOMESTICMIG2012' , 'RDOMESTICMIG2013' ,  'RDOMESTICMIG2014' , 'RDOMESTICMIG2015' , 'RDOMESTICMIG2016' , 'RDOMESTICMIG2017' ,
                         'RDOMESTICMIG2018' , 'RDOMESTICMIG2019' , 'RDOMESTICMIG2020' , 'RNETMIG2011' , 'RNETMIG2012' , 'RNETMIG2013' , 'RNETMIG2014' , 'RNETMIG2015' , 'RNETMIG2016' , 'RNETMIG2017' , 'RNETMIG2018' , 'RNETMIG2019' , 
                         'RNETMIG2020'},1)
                    

cenNew.head()

Unnamed: 0,STATE,NAME,POPESTIMATE2018,POPESTIMATE2019,POPESTIMATE2020
0,0,United States,326838199,328329953,329484123
1,0,Northeast Region,56084543,56002934,55849869
2,0,Midwest Region,68263019,68340091,68316744
3,0,South Region,124649156,125686544,126662754
4,0,West Region,77841481,78300384,78654756


In [16]:
# Drop Regional Rows, US row, and Puerto Rico row
cenNew = cenNew.drop([ cenNew.index[0] , cenNew.index[1] , cenNew.index[2] , cenNew.index[3] , cenNew.index[4] , cenNew.index[56] ])
cenNew.head()

Unnamed: 0,STATE,NAME,POPESTIMATE2018,POPESTIMATE2019,POPESTIMATE2020
5,1,Alabama,4891628,4907965,4921532
6,2,Alaska,736624,733603,731158
7,4,Arizona,7164228,7291843,7421401
8,5,Arkansas,3012161,3020985,3030522
9,6,California,39437463,39437610,39368078


In [17]:
# Rename columns to match the other tables, and add in a total population column
cenNew = cenNew.rename(columns = {'NAME' : 'StateDescName' , 'POPESTIMATE2018' : '2018Pop' , 'POPESTIMATE2019' : '2019Pop' , 'POPESTIMATE2020' : '2020Pop' })
cenNew['TotalPop'] = cenNew['2018Pop'] + cenNew['2019Pop'] + cenNew['2020Pop']
cenNew = cenNew.drop('STATE' ,1)
cenNew.head()

Unnamed: 0,StateDescName,2018Pop,2019Pop,2020Pop,TotalPop
5,Alabama,4891628,4907965,4921532,14721125
6,Alaska,736624,733603,731158,2201385
7,Arizona,7164228,7291843,7421401,21877472
8,Arkansas,3012161,3020985,3030522,9063668
9,California,39437463,39437610,39368078,118243151


In [18]:
# Re-set index
cenNew.reset_index(drop=True, inplace=True)

In [19]:
cenNew["StateID"] = ""
cenNew.head()

Unnamed: 0,StateDescName,2018Pop,2019Pop,2020Pop,TotalPop,StateID
0,Alabama,4891628,4907965,4921532,14721125,
1,Alaska,736624,733603,731158,2201385,
2,Arizona,7164228,7291843,7421401,21877472,
3,Arkansas,3012161,3020985,3030522,9063668,
4,California,39437463,39437610,39368078,118243151,


In [20]:
#This confirms that both DF are in same order
cenNew["StateNameMatch"] = np.where(cenNew['StateDescName'] == stateDF['StateDescName'], 'True' , 'False')
cenNew.head()

Unnamed: 0,StateDescName,2018Pop,2019Pop,2020Pop,TotalPop,StateID,StateNameMatch
0,Alabama,4891628,4907965,4921532,14721125,,True
1,Alaska,736624,733603,731158,2201385,,True
2,Arizona,7164228,7291843,7421401,21877472,,True
3,Arkansas,3012161,3020985,3030522,9063668,,True
4,California,39437463,39437610,39368078,118243151,,True


In [21]:
# Now that you know both DFs match row per row, assign StateID for census from StateID for states
cenNew["StateID"] = stateDF["StateID"]
cenNew.head()

Unnamed: 0,StateDescName,2018Pop,2019Pop,2020Pop,TotalPop,StateID,StateNameMatch
0,Alabama,4891628,4907965,4921532,14721125,0,True
1,Alaska,736624,733603,731158,2201385,1,True
2,Arizona,7164228,7291843,7421401,21877472,2,True
3,Arkansas,3012161,3020985,3030522,9063668,3,True
4,California,39437463,39437610,39368078,118243151,4,True


In [22]:
# Drop the StatesNameMatch
cenNew = cenNew.drop('StateNameMatch' ,1)
cenNew.head()

Unnamed: 0,StateDescName,2018Pop,2019Pop,2020Pop,TotalPop,StateID
0,Alabama,4891628,4907965,4921532,14721125,0
1,Alaska,736624,733603,731158,2201385,1
2,Arizona,7164228,7291843,7421401,21877472,2
3,Arkansas,3012161,3020985,3030522,9063668,3
4,California,39437463,39437610,39368078,118243151,4


In [23]:
# Add in column for CensusID
CensusID = 0
cenNew.insert(CensusID, "CensusID", value = range(len(cenNew)))
cenNew.head()

Unnamed: 0,CensusID,StateDescName,2018Pop,2019Pop,2020Pop,TotalPop,StateID
0,0,Alabama,4891628,4907965,4921532,14721125,0
1,1,Alaska,736624,733603,731158,2201385,1
2,2,Arizona,7164228,7291843,7421401,21877472,2
3,3,Arkansas,3012161,3020985,3030522,9063668,3
4,4,California,39437463,39437610,39368078,118243151,4


In [24]:
# Re-org so that columns in same order as SQL table
cenNew = cenNew[["CensusID" , "StateID" , "2018Pop" , "2019Pop" , "2020Pop", "TotalPop"]]
cenNew.head()

Unnamed: 0,CensusID,StateID,2018Pop,2019Pop,2020Pop,TotalPop
0,0,0,4891628,4907965,4921532,14721125
1,1,1,736624,733603,731158,2201385
2,2,2,7164228,7291843,7421401,21877472
3,3,3,3012161,3020985,3030522,9063668
4,4,4,39437463,39437610,39368078,118243151


## Load Raw Data into Database

In [25]:
### TRUNCATE THE TABLE
connection = engine.connect()

connection.execute( '''TRUNCATE TABLE sales_by_qtr CASCADE''')
connection.execute( '''TRUNCATE TABLE census''' )
connection.execute( '''TRUNCATE TABLE mass_raw''')
connection.execute( '''TRUNCATE TABLE census_raw''' )
connection.execute( '''TRUNCATE TABLE ca_raw''' )
connection.execute( '''TRUNCATE TABLE co_raw''' )
connection.execute( '''TRUNCATE TABLE states CASCADE''' )


connection.close()

In [26]:
cenRawDF.head()

Unnamed: 0,SUMLEV,REGION,DIVISION,STATE,NAME,CENSUS2010POP,ESTIMATESBASE2010,POPESTIMATE2010,POPESTIMATE2011,POPESTIMATE2012,...,RNETMIG2011,RNETMIG2012,RNETMIG2013,RNETMIG2014,RNETMIG2015,RNETMIG2016,RNETMIG2017,RNETMIG2018,RNETMIG2019,RNETMIG2020
0,10,0,0,0,United States,308745538,308758105,309327143,311583481,313877662,...,2.561705,2.745929,2.697816,2.980977,3.317393,3.308478,2.92626,2.208328,1.735857,1.450346
1,20,1,0,0,Northeast Region,55317240,55318414,55380764,55608318,55782661,...,0.951355,0.02291,-0.406301,-0.904675,-2.01199,-2.436197,-1.773545,-2.008364,-3.16071,-3.67449
2,20,2,0,0,Midwest Region,66927001,66929737,66975328,67164092,67348275,...,-0.883842,-0.895516,0.06239,-0.697566,-1.320504,-1.176126,-0.487257,-0.800567,-1.205668,-2.011623
3,20,3,0,0,South Region,114555744,114563042,114869421,116019483,117264196,...,5.218129,5.93444,5.373447,6.236211,7.355913,7.220696,6.272594,5.291755,5.479642,5.743507
4,20,4,0,0,West Region,71945553,71946912,72101630,72791588,73482530,...,2.752027,3.083047,3.183793,4.039593,5.004949,5.284859,4.022332,2.968883,1.822074,1.214309


In [27]:
massRawDF.to_sql('mass_raw', engine, if_exists='append', index=False)
cenRawDF.to_sql('census_raw', engine, if_exists='append', index=False)
stateDF.to_sql('states',engine, if_exists='append',index=False)
cenNew.to_sql('census',engine, if_exists='append',index=False)
caRawDF.to_sql('ca_raw', engine, if_exists='append', index=False)
coRawDF.to_sql('co_raw', engine, if_exists='append', index=False)

## Sanity Checks

In [28]:
#check ca raw
session.query(caRaw).count()

13

In [29]:
#check co raw
session.query(coRaw).count()

88

In [30]:
#check mass raw 
session.query(massRaw).count()

956

In [31]:
#check census raw
engine.execute('select count(*) from census_raw').fetchall()

[(57,)]

In [32]:
#check states raw
session.query(states).count()

51

In [33]:
#check states raw
session.query(census).count()

51