### Sample Business Data modeling

Notebook models the sample dataset that has been retrived from AWS data exchange. 


In [1]:
import configparser
import numpy as np
import pandas as pd
import warnings
warnings.filterwarnings('ignore')

In [2]:
config = configparser.ConfigParser()

In [3]:
config.read('clustermds.config')

['clustermds.config']

In [4]:
config['POSTGRES']['PG_DB']

'mdsusbiz'

In [5]:
db = config['POSTGRES']['PG_DB']
user = config['POSTGRES']['PG_UNAME']
passwd = config['POSTGRES']['PG_PASS']
port = config['POSTGRES']['PG_PORT']
host = config['POSTGRES']['PG_HOST']

Using the pandas internal read_sql method to get the data from the DB

In [6]:
credentials = "postgresql://{}:{}@{}:{}/{}".format(user,passwd,host,port,db)

In [7]:
credentials

'postgresql://postgres:1234@172.17.0.2:5432/mdsusbiz'

In [8]:
#using psycopg2 to test connection since there are no tables

import psycopg2
try:
    conn = psycopg2.connect(host=host,dbname=db,user=user,password=passwd,port=port)
except Exception as e:
    print(e)

In [9]:
conn.set_session(autocommit=True)

In [10]:
try:
    cur = conn.cursor()
    
except:
    print(e)

Following steps to start the data modeling

0) Ingest the data into the database under mds Raw Table

1) Identify the primary key that will be used as reference. Design the schema around the data

2) Design the facttable and dimension table

4) Create tables and insert data into the tables

In [11]:
mdsdata = pd.read_csv('sample_mdsUsBusinessDataNoContacts.txt',delimiter='|')

#### Writing additional helpers

In [12]:
#Using pandas read_sql for getting schema
def getSchema(tableName, credentials):
    schema = pd.read_sql("""SELECT * FROM information_schema.columns where table_name='{}'""".format(tableName),con=credentials)
    return schema

In [13]:
#Issue is in using pd.read_sql to write data to the database. so using psycopg2
def queryTable(query):
    try:
        schema = cur.execute(query)
        return 
    except Exception as e:
        print(e)
        
#This doesn't return anything

In [14]:
#Using the pd.read_sql for getting data from db
def queryBase(query):
    requiredTable = pd.read_sql(query,con=credentials)
    return requiredTable

#This returns the dataframe

Data Specification:

Row: 2000 entries, 0 to 1999

Columns: 163 entries, EFX_ID to EFX_EXTRACT_DATE

dtypes: float64(42), int64(26), object(95)

In [15]:
def schemaGen(dataframe, schemaName):
    localSchema = pd.io.sql.get_schema(dataframe,schemaName)
    localSchema = localSchema.replace('TEXT','VARCHAR(255)').replace('INTEGER','NUMERIC').replace('\n','').replace('"',"")
    return "".join(localSchema)

In [53]:
schemaGen(mdsdata,'mdsdata')

'CREATE TABLE mdsdata (EFX_ID NUMERIC,  EFX_NAME VARCHAR(255),  EFX_LEGAL_NAME VARCHAR(255),  EFX_ADDRESS VARCHAR(255),  EFX_CITY VARCHAR(255),  EFX_STATE VARCHAR(255),  EFX_STATEC NUMERIC,  EFX_ZIPCODE NUMERIC,  EFX_ZIP4 REAL,  EFX_LAT REAL,  EFX_LO- REAL,  EFX_GEOPREC NUMERIC,  EFX_REGION REAL,  EFX_CTRYISOCD VARCHAR(255),  EFX_CTRYNUM NUMERIC,  EFX_CTRYNAME VARCHAR(255),  EFX_COUNTYNM VARCHAR(255),  EFX_COUNTY NUMERIC,  EFX_CMSA VARCHAR(255),  EFX_CMSADESC VARCHAR(255),  EFX_SOHO VARCHAR(255),  EFX_BIZ VARCHAR(255),  EFX_RES VARCHAR(255),  EFX_CMRA VARCHAR(255),  EFX_CONGRESS NUMERIC,  EFX_SECADR VARCHAR(255),  EFX_SECCTY VARCHAR(255),  EFX_SECSTAT VARCHAR(255),  EFX_STATEC2 REAL,  EFX_SECZIP REAL,  EFX_SECZIP4 REAL,  EFX_SECLAT REAL,  EFX_SECLO- REAL,  EFX_SECGEOPREC REAL,  EFX_SECREGION REAL,  EFX_SECCTRYISOCD VARCHAR(255),  EFX_SECCTRYNUM NUMERIC,  EFX_SECCTRYNAME VARCHAR(255),  EFX_CTRYTELCD NUMERIC,  EFX_GENDER VARCHAR(255),  EFX_ETHNICITY VARCHAR(255),  EFX_MBE VARCHAR(255),  

In [18]:
createTable1 = """CREATE TABLE mdsdata (EFX_ID NUMERIC,  EFX_NAME VARCHAR(255),  EFX_LEGAL_NAME VARCHAR(255),  EFX_ADDRESS VARCHAR(255),  EFX_CITY VARCHAR(255),  EFX_STATE VARCHAR(255),  EFX_STATEC NUMERIC,  EFX_ZIPCODE NUMERIC,  EFX_ZIP4 REAL,  EFX_LAT REAL,  EFX_LO REAL,  EFX_GEOPREC NUMERIC,  EFX_REGION REAL,  EFX_CTRYISOCD VARCHAR(255),  EFX_CTRYNUM NUMERIC,  EFX_CTRYNAME VARCHAR(255),  EFX_COUNTYNM VARCHAR(255),  EFX_COUNTY NUMERIC,  EFX_CMSA VARCHAR(255),  EFX_CMSADESC VARCHAR(255),  EFX_SOHO VARCHAR(255),  EFX_BIZ VARCHAR(255),  EFX_RES VARCHAR(255),  EFX_CMRA VARCHAR(255),  EFX_CONGRESS NUMERIC,  EFX_SECADR VARCHAR(255),  EFX_SECCTY VARCHAR(255),  EFX_SECSTAT VARCHAR(255),  EFX_STATEC2 REAL,  EFX_SECZIP REAL,  EFX_SECZIP4 REAL,  EFX_SECLAT REAL,  EFX_SECLO REAL,  EFX_SECGEOPREC REAL,  EFX_SECREGION REAL,  EFX_SECCTRYISOCD VARCHAR(255),  EFX_SECCTRYNUM NUMERIC,  EFX_SECCTRYNAME VARCHAR(255),  EFX_CTRYTELCD NUMERIC,  EFX_GENDER VARCHAR(255),  EFX_ETHNICITY VARCHAR(255),  EFX_MBE VARCHAR(255),  EFX_WBE VARCHAR(255),  EFX_VET VARCHAR(255),  EFX_BUSSIZE VARCHAR(255),  EFX_GOV VARCHAR(255),  EFX_FGOV VARCHAR(255),  EFX_NONPROFIT VARCHAR(255),  EFX_EDU VARCHAR(255),  EFX_BUSSTAT VARCHAR(255),  EFX_BUSSTATCD REAL,  EFX_WEB VARCHAR(255),  EFX_YREST NUMERIC,  EFX_CORPEMPCNT NUMERIC,  EFX_LOCEMPCNT NUMERIC,  
EFX_CORPEMPCD VARCHAR(255),  EFX_LOCEMPCD VARCHAR(255),  EFX_CORPAMOUNT NUMERIC,  EFX_CORPAMOUNTCD VARCHAR(255),  EFX_CORPAMOUNTTP VARCHAR(255),  EFX_CORPAMOUNTPREC VARCHAR(255),  EFX_LOCAMOUNT NUMERIC,  EFX_LOCAMOUNTCD VARCHAR(255),  EFX_LOCAMOUNTTP VARCHAR(255),  EFX_LOCAMOUNTPREC VARCHAR(255),  EFX_PUBLIC REAL,  
EFX_STKEXC REAL,  EFX_TCKSYM REAL,  EFX_PRIMSIC REAL,  EFX_SECSIC1 REAL,  EFX_SECSIC2 REAL,  EFX_SECSIC3 REAL,  EFX_SECSIC4 REAL,  EFX_PRIMSICDESC VARCHAR(255),  
EFX_SECSICDESC1 VARCHAR(255),  EFX_SECSICDESC2 VARCHAR(255),  EFX_SECSICDESC3 VARCHAR(255),  EFX_SECSICDESC4 VARCHAR(255),  EFX_PRIMNAICSCODE REAL,  EFX_SECNAICS1 REAL,  
EFX_SECNAICS2 REAL,  EFX_SECNAICS3 REAL,  EFX_SECNAICS4 REAL,  EFX_PRIMNAICSDESC VARCHAR(255),  EFX_SECNAICSDESC1 VARCHAR(255),  EFX_SECNAICSDESC2 VARCHAR(255),  EFX_SECNAICSDESC3 VARCHAR(255),  
EFX_SECNAICSDESC4 VARCHAR(255),  EFX_LEGSUBNUMALL NUMERIC,  EFX_LEGSUBNAMEALL VARCHAR(255),  EFX_LEGSUBADDRESSALL VARCHAR(255),  EFX_LEGSUBCITYALL VARCHAR(255),  EFX_LEGSUBSTATEALL VARCHAR(255),  EFX_LEGSUBZIPCODEALL NUMERIC,  
EFX_LEGSUBZIP4ALL REAL,  EFX_LEGSUBCOUNTYALL REAL,  EFX_LEGSUBCTRYISOCDALL VARCHAR(255),  EFX_LEGSUBCTRYNUMALL NUMERIC,  EFX_LEGSUBCTRYNAMEALL VARCHAR(255),  EFX_LEGDOMULTNUMALL NUMERIC,  EFX_LEGDOMULTNAMEALL VARCHAR(255),  EFX_LEGDOMULTADDRESSALL VARCHAR(255),  EFX_LEGDOMULTCITYALL VARCHAR(255),  EFX_LEGDOMULTSTATEALL VARCHAR(255),  EFX_LEGDOMULTZIPCODEALL NUMERIC,  EFX_LEGDOMULTZIP4ALL REAL,  EFX_LEGDOMULTCOUNTYALL REAL,  EFX_LEGDOMULTCTRYISOCDALL VARCHAR(255),  EFX_LEGDOMULTCTRYNUMALL NUMERIC,  EFX_LEGDOMULTCTRYNAMEALL VARCHAR(255),  EFX_LEGULTNUMALL NUMERIC,  EFX_LEGULTNAMEALL VARCHAR(255),  EFX_LEGULTADDRESSALL VARCHAR(255),  EFX_LEGULTCITYALL VARCHAR(255),  EFX_LEGULTSTATEALL VARCHAR(255),  EFX_LEGULTZIPCODEALL VARCHAR(255),  EFX_LEGULTZIP4ALL REAL,  EFX_LEGULTCOUNTYALL REAL,  EFX_LEGULTCTRYISOCDALL VARCHAR(255),  EFX_LEGULTCTRYNUMALL NUMERIC,  EFX_LEGULTCTRYNAMEALL VARCHAR(255),  EFX_LEGULTPARENTIND VARCHAR(255),  EFX_LEGPARENTIND VARCHAR(255),  EFX_LEGLINKEDIND VARCHAR(255),  EFX_AFFLULTNUMALL NUMERIC,  EFX_AFFLULTNAMEALL VARCHAR(255),  EFX_AFFLULTADDRESSALL VARCHAR(255),  EFX_AFFLULTCITYALL VARCHAR(255),  EFX_AFFLULTSTATEALL VARCHAR(255),  EFX_AFFLULTZIPCODEALL REAL,  EFX_AFFLULTZIP4ALL REAL,  EFX_AFFULTCOUNTYALL REAL,  EFX_AFFLULTCTRYISOCDALL VARCHAR(255),  EFX_AFFLULTCTRYNUMALL NUMERIC,  EFX_AFFLULTCTRYNAMEALL VARCHAR(255),  EFX_AFFLLINKEDIND VARCHAR(255),  EFX_FRANCHISE VARCHAR(255),  EFX_AFFLULTPARENTIND VARCHAR(255),  EFX_AFFLPARENTIND VARCHAR(255),  EFX_FOREIGN VARCHAR(255),  EFX_FAILRATE REAL,  EFX_FAILLEVEL REAL,  EFX_FAILREASON1 VARCHAR(255),  EFX_FAILREASON2 VARCHAR(255),  EFX_FAILREASON3 VARCHAR(255),  EFX_FAILREASON4 VARCHAR(255),  EFX_CREDITSCORE REAL,  EFX_CREDITCLASS REAL,  EFX_CREDITPERC REAL,  EFX_CREDITREASON1 VARCHAR(255),  EFX_CREDITREASON2 VARCHAR(255),  EFX_CREDITREASON3 VARCHAR(255),  EFX_CREDITREASON4 VARCHAR(255),  EFX_BANKRUPTCY VARCHAR(255),  EFX_DEAD REAL,  EFX_DEADDT REAL,  EFX_MRKT_TELEVER VARCHAR(255),  EFX_MRKT_TELESCORE NUMERIC,  EFX_MRKT_TOTALSCORE NUMERIC,  EFX_MRKT_TOTALIND VARCHAR(255),  EFX_MRKT_VACANT VARCHAR(255),  EFX_MRKT_SEASONAL REAL,  EFX_EXTRACT_DATE VARCHAR(255))"""
queryBase(createTable1)

ResourceClosedError: This result object does not return rows. It has been closed automatically.

### Loading Data

In [21]:
#check if the table in the database has the correct schema
getSchema("mdsdata",credentials)

Unnamed: 0,table_catalog,table_schema,table_name,column_name,ordinal_position,column_default,is_nullable,data_type,character_maximum_length,character_octet_length,...,is_identity,identity_generation,identity_start,identity_increment,identity_maximum,identity_minimum,identity_cycle,is_generated,generation_expression,is_updatable
0,mdsusbiz,public,mdsdata,efx_legsubcountyall,96,,YES,real,,,...,NO,,,,,,NO,NEVER,,YES
1,mdsusbiz,public,mdsdata,efx_locempcnt,55,,YES,numeric,,,...,NO,,,,,,NO,NEVER,,YES
2,mdsusbiz,public,mdsdata,efx_legsubctrynumall,98,,YES,numeric,,,...,NO,,,,,,NO,NEVER,,YES
3,mdsusbiz,public,mdsdata,efx_seclo,33,,YES,real,,,...,NO,,,,,,NO,NEVER,,YES
4,mdsusbiz,public,mdsdata,efx_legdomultnumall,100,,YES,numeric,,,...,NO,,,,,,NO,NEVER,,YES
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
158,mdsusbiz,public,mdsdata,efx_legultaddressall,113,,YES,character varying,255.0,1020.0,...,NO,,,,,,NO,NEVER,,YES
159,mdsusbiz,public,mdsdata,efx_legultcityall,114,,YES,character varying,255.0,1020.0,...,NO,,,,,,NO,NEVER,,YES
160,mdsusbiz,public,mdsdata,efx_legultstateall,115,,YES,character varying,255.0,1020.0,...,NO,,,,,,NO,NEVER,,YES
161,mdsusbiz,public,mdsdata,efx_legultzipcodeall,116,,YES,character varying,255.0,1020.0,...,NO,,,,,,NO,NEVER,,YES


In [24]:
copymdsData = """COPY mdsdata from '/var/lib/postgresql/data/sample_mdsUsBusinessDataNoContacts.txt'
        DELIMITER '|' CSV HEADER"""
queryTable(copymdsData)

In [26]:
checkmdsData = """SELECT * FROM mdsdata ORDER BY efx_id ASC LIMIT 5"""
queryBase(checkmdsData)

Unnamed: 0,efx_id,efx_name,efx_legal_name,efx_address,efx_city,efx_state,efx_statec,efx_zipcode,efx_zip4,efx_lat,...,efx_bankruptcy,efx_dead,efx_deaddt,efx_mrkt_telever,efx_mrkt_telescore,efx_mrkt_totalscore,efx_mrkt_totalind,efx_mrkt_vacant,efx_mrkt_seasonal,efx_extract_date
0,1.0,WOODS ARCADE WILLOWS,WOODS ARCADE WILLOWS,123 MAIN SRD,BOSTON,MA,25.0,10011.0,1869.0,42.356213,...,,,,Y,5.0,5.0,M,,,6/21/2021
1,2.0,ABCDE RENTALS,ABCDE RENTALS LLC,123 HOLYOKE SRD,BRISTOL,MA,25.0,10563.0,1220.0,42.345455,...,,,,Y,5.0,5.0,M,,,6/21/2021
2,4.0,ABCD DOWNSTREET BANK,ABCD DOWNSTREET BANK,1655 ABC SRD,PLYMOUTH,MA,25.0,65804.0,2590.0,37.156567,...,,,,Y,5.0,5.0,M,,,6/21/2021
3,5.0,ABCDE FURNITTURE CO,ABCDE CORPURNITTURE CO,320 BOWLES RD,WEST PLYMOUTH,MA,25.0,1089.0,2968.0,41.34544,...,,,,,1.0,1.0,A,,,6/21/2021
4,8.0,DUMMY PARK AND RECREATION INC,"DUMMY PARK ENTERPRISES, INC.",623 ABC SRD,WEST PLYMOUTH,MA,25.0,10890.0,2567.0,41.03747,...,,,,Y,5.0,5.0,M,,,6/21/2021


### What is the objective modeling mdsData?

1) To break the 163 column data into easily manageable chunks of conceptual information
2) Create a star Schema to enable ease of data analysis

In [37]:
dataColumns = mdsdata.columns

In [66]:
dataColumns[:20]

Index(['EFX_ID', 'EFX_NAME', 'EFX_LEGAL_NAME', 'EFX_ADDRESS', 'EFX_CITY',
       'EFX_STATE', 'EFX_STATEC', 'EFX_ZIPCODE', 'EFX_ZIP4', 'EFX_LAT',
       'EFX_LO-', 'EFX_GEOPREC', 'EFX_REGION', 'EFX_CTRYISOCD', 'EFX_CTRYNUM',
       'EFX_CTRYNAME', 'EFX_COUNTYNM', 'EFX_COUNTY', 'EFX_CMSA',
       'EFX_CMSADESC'],
      dtype='object')

### How to select the columns that will be important for the analysis?

1) Select the columns and run a describe(include='all) command 

2) Each column in the result will have a count in 1st row of the describe result

3) If the count is more than 50% of the overall count, then take the data in the row.

4) The columns that are below 50% are saved as miscellaneous. (this calls for unwanted work)

The above process uses the Schema generated by Pandas on the input data frame. It uses the describe() method to identify the important columns based on the count. 

5) The database tables are created from the columns that are selected from the Pandas dataframe. To build the tables, the schema function is used to get the selected schema. Then it is copied as shown below steps of table creation

6) After that immediately insert the data into the Table that has been created already

7) Check the table is having the data by querying it

In [30]:
dim_location = """CREATE TABLE dim_location (EFX_ID NUMERIC PRIMARY KEY,  EFX_NAME VARCHAR(255),  EFX_LEGAL_NAME VARCHAR(255),  EFX_ADDRESS VARCHAR(255),  EFX_CITY VARCHAR(255),  EFX_STATE VARCHAR(255),  EFX_STATEC NUMERIC,  EFX_ZIPCODE NUMERIC,  EFX_ZIP4 REAL,  EFX_LAT REAL,  EFX_LO REAL,  EFX_GEOPREC NUMERIC,  EFX_REGION REAL,  EFX_CTRYISOCD VARCHAR(255),  EFX_CTRYNUM NUMERIC,  EFX_CTRYNAME VARCHAR(255),  EFX_COUNTYNM VARCHAR(255),  EFX_COUNTY NUMERIC)"""
queryTable(dim_location)

In [70]:
insert_location = """INSERT INTO dim_location (EFX_ID, EFX_NAME, EFX_LEGAL_NAME, EFX_ADDRESS, EFX_CITY,
                       EFX_STATE, EFX_STATEC, EFX_ZIPCODE, EFX_ZIP4, EFX_LAT,
                       EFX_LO, EFX_GEOPREC, EFX_REGION, EFX_CTRYISOCD, EFX_CTRYNUM,
                       EFX_CTRYNAME, EFX_COUNTYNM, EFX_COUNTY)
                     SELECT EFX_ID, EFX_NAME, EFX_LEGAL_NAME, EFX_ADDRESS, EFX_CITY,
                       EFX_STATE, EFX_STATEC, EFX_ZIPCODE, EFX_ZIP4, EFX_LAT,
                       EFX_LO, EFX_GEOPREC, EFX_REGION, EFX_CTRYISOCD, EFX_CTRYNUM,
                       EFX_CTRYNAME, EFX_COUNTYNM, EFX_COUNTY
                     FROM mdsdata"""
queryTable(insert_location)

In [71]:
queryBase("""SELECT * FROM dim_location LIMIT 1""")

Unnamed: 0,efx_id,efx_name,efx_legal_name,efx_address,efx_city,efx_state,efx_statec,efx_zipcode,efx_zip4,efx_lat,efx_lo,efx_geoprec,efx_region,efx_ctryisocd,efx_ctrynum,efx_ctryname,efx_countynm,efx_county
0,1.0,WOODS ARCADE WILLOWS,WOODS ARCADE WILLOWS,123 MAIN SRD,BOSTON,MA,25.0,10011.0,1869.0,42.356213,-71.050415,9.0,,USA,840.0,United States of America,Hampden County,13.0


In [62]:
mdsdata[['EFX_PRIMSIC', 'EFX_SECSIC1', 'EFX_SECSIC2','EFX_PRIMSICDESC', 'EFX_SECSICDESC1', 'EFX_SECSICDESC2',
       'EFX_SECSICDESC3', 'EFX_SECSICDESC4', 'EFX_PRIMNAICSCODE',
       'EFX_SECNAICS1','EFX_PRIMNAICSDESC', 'EFX_SECNAICSDESC1', 'EFX_SECNAICSDESC2']].describe(include='all')

Unnamed: 0,EFX_PRIMSIC,EFX_SECSIC1,EFX_SECSIC2,EFX_PRIMSICDESC,EFX_SECSICDESC1,EFX_SECSICDESC2,EFX_SECSICDESC3,EFX_SECSICDESC4,EFX_PRIMNAICSCODE,EFX_SECNAICS1,EFX_PRIMNAICSDESC,EFX_SECNAICSDESC1,EFX_SECNAICSDESC2
count,1988.0,1418.0,961.0,1988,1418,961,602,362,1988.0,1331.0,1988,1331,793
unique,,,,345,378,309,271,200,,,407,411,317
top,,,,EATING AND DRINKING PLACES,EATING AND DRINKING PLACES,"MISCELLANEOUS RETAIL STORES, NEC","SOCIAL SERVICES, NEC","MISCELLANEOUS RETAIL STORES, NEC",,,Full-Service Restaurants,Food Services and Drinking Places,Food Services and Drinking Places
freq,,,,127,49,27,16,11,,,116,66,27
mean,6103.827465,6035.528209,5831.942768,,,,,,557544.133803,555520.930879,,,
std,2304.148349,2302.290898,2280.574656,,,,,,191932.221394,194761.392955,,,
min,181.0,139.0,181.0,,,,,,111211.0,51111.0,,,
25%,5199.0,5086.5,5000.0,,,,,,444190.0,441310.0,,,
50%,6411.0,6035.5,5995.0,,,,,,541310.0,541410.0,,,
75%,8011.0,7919.25,7539.0,,,,,,722511.0,722000.0,,,


In [58]:
dim_business = """CREATE TABLE dim_business(EFX_ID NUMERIC PRIMARY KEY,EFX_BUSSIZE VARCHAR(255),EFX_BUSSTAT VARCHAR(255),  EFX_BUSSTATCD REAL,  EFX_WEB VARCHAR(255),  EFX_YREST NUMERIC,  EFX_CORPEMPCNT NUMERIC,  EFX_LOCEMPCNT NUMERIC,  EFX_CORPEMPCD VARCHAR(255),  EFX_LOCEMPCD VARCHAR(255),  EFX_CORPAMOUNT NUMERIC,  EFX_CORPAMOUNTCD VARCHAR(255),  EFX_CORPAMOUNTTP VARCHAR(255),  EFX_CORPAMOUNTPREC VARCHAR(255),
                                EFX_LOCAMOUNT NUMERIC,  EFX_LOCAMOUNTCD VARCHAR(255),  EFX_LOCAMOUNTTP VARCHAR(255),  EFX_LOCAMOUNTPREC VARCHAR(255))"""
queryTable(dim_business)

In [73]:
insert_business = """ INSERT INTO dim_business(EFX_ID, EFX_BUSSIZE, EFX_BUSSTAT,
               EFX_BUSSTATCD, EFX_WEB, EFX_YREST, EFX_CORPEMPCNT,
               EFX_LOCEMPCNT, EFX_CORPEMPCD, EFX_LOCEMPCD, EFX_CORPAMOUNT,
               EFX_CORPAMOUNTCD, EFX_CORPAMOUNTTP, EFX_CORPAMOUNTPREC,
               EFX_LOCAMOUNT, EFX_LOCAMOUNTCD, EFX_LOCAMOUNTTP,
               EFX_LOCAMOUNTPREC)
            SELECT EFX_ID, EFX_BUSSIZE, EFX_BUSSTAT,
               EFX_BUSSTATCD, EFX_WEB, EFX_YREST, EFX_CORPEMPCNT,
               EFX_LOCEMPCNT, EFX_CORPEMPCD, EFX_LOCEMPCD, EFX_CORPAMOUNT,
               EFX_CORPAMOUNTCD, EFX_CORPAMOUNTTP, EFX_CORPAMOUNTPREC,
               EFX_LOCAMOUNT, EFX_LOCAMOUNTCD, EFX_LOCAMOUNTTP,
               EFX_LOCAMOUNTPREC
            FROM mdsdata"""
queryTable(insert_business)

In [74]:
queryBase("""SELECT * FROM dim_business LIMIT 1""")

Unnamed: 0,efx_id,efx_bussize,efx_busstat,efx_busstatcd,efx_web,efx_yrest,efx_corpempcnt,efx_locempcnt,efx_corpempcd,efx_locempcd,efx_corpamount,efx_corpamountcd,efx_corpamounttp,efx_corpamountprec,efx_locamount,efx_locamountcd,efx_locamounttp,efx_locamountprec
0,1.0,L,Corporation,4.0,www.woodsarcadewillows.com,1998.0,14229.0,77.0,K,E,4565823.0,K,Corporate Revenue,ACTUAL,4623.0,D,Location Sales,MDL


In [78]:
dim_security = """CREATE TABLE dim_security(EFX_ID NUMERIC PRIMARY KEY,EFX_PRIMSIC REAL,  EFX_SECSIC1 REAL,  EFX_SECSIC2 REAL,EFX_PRIMSICDESC VARCHAR(255),  EFX_SECSICDESC1 VARCHAR(255),  EFX_SECSICDESC2 VARCHAR(255), EFX_PRIMNAICSCODE REAL,  EFX_SECNAICS1 REAL,  EFX_SECNAICS2 REAL, EFX_PRIMNAICSDESC VARCHAR(255),  EFX_SECNAICSDESC1 VARCHAR(255),  EFX_SECNAICSDESC2 VARCHAR(255))"""
queryTable(dim_security)

relation "dim_security" already exists



In [80]:
insert_security = """ INSERT INTO dim_security(EFX_ID, EFX_PRIMSIC, EFX_SECSIC1, EFX_SECSIC2, EFX_PRIMSICDESC, EFX_SECSICDESC1, EFX_SECSICDESC2,
                            EFX_PRIMNAICSCODE,EFX_SECNAICS1, EFX_SECNAICS2,
                            EFX_PRIMNAICSDESC, EFX_SECNAICSDESC1, EFX_SECNAICSDESC2)
                    SELECT EFX_ID, EFX_PRIMSIC, EFX_SECSIC1, EFX_SECSIC2, EFX_PRIMSICDESC, EFX_SECSICDESC1, EFX_SECSICDESC2,
                            EFX_PRIMNAICSCODE,EFX_SECNAICS1, EFX_SECNAICS2, EFX_PRIMNAICSDESC, EFX_SECNAICSDESC1, EFX_SECNAICSDESC2
                    FROM mdsdata"""
queryTable(insert_security)

In [81]:
queryBase("""SELECT * FROM dim_security LIMIT 1""")

Unnamed: 0,efx_id,efx_primsic,efx_secsic1,efx_secsic2,efx_primsicdesc,efx_secsicdesc1,efx_secsicdesc2,efx_primnaicscode,efx_secnaics1,efx_secnaics2,efx_primnaicsdesc,efx_secnaicsdesc1,efx_secnaicsdesc2
0,1.0,8361.0,8011.0,8300.0,RESIDENTIAL CARE,OFFICES AND CLINICS OF DOCTORS OF MEDICINE,SOCIAL SERVICES,623990.0,623311.0,623110.0,Other Residential Care Facilities,Continuing Care Retirement Communities,Nursing Care Facilities (Skilled Nursing Facil...
