# ETL Project  12 January 2022

## Brief description of challenge here
what is ETL, what tools are used, what are the important steps.

## Brief description of the data here
Zillow Real Estate Data (formerly hosted on Quandl, now Nasdaq Data Link), delivered in table format

In [None]:
# Setting dependencies, etc.

import os

import numpy as np
import pandas as pd

# Show full cell contents when DataFrames are displayed (no column-width truncation).
pd.set_option('display.max_colwidth', None)

# for importing data:
import quandl
# Prefer a key supplied through the QUANDL_API_KEY environment variable; the
# literal is kept only as a fallback so the notebook still runs unchanged.
# NOTE(review): this key is committed in source -- rotate it and drop the fallback.
quandl.ApiConfig.api_key = os.environ.get('QUANDL_API_KEY', 'vxSYNGB-9owcVsRx_SEF')

import requests
import statistics
import time

# for communicating to database
import psycopg2  # was misspelled 'psycopgy2', which fails with ModuleNotFoundError
import io
from sqlalchemy import create_engine

# for displays (may not use)
#import matplotlib.pyplot as plt
#import seaborn as sns
#from pprint import pprint

## Step 1: Data Extraction and preparation

### Extraction: source and comments

The Zillow database comprises three tables: Indicators (56 market indicators), Regions (over 78,000!) and Data (market value). 

We extract the data using the quandl library. First, I will extract the data, then look at the data frame to identify any inconsistencies, type errors, etc. 

In [None]:
# Table 1: Data - values for all indicators
# Here, we're restricting the extraction to US records only (region_id = 102001)

# paginate = True is passed so the result is not limited to a single page of
# rows (presumably quandl's default page size -- confirm against the quandl docs).
zillow_data = quandl.get_table('ZILLOW/DATA', region_id = '102001', paginate = True)
# Preview the first rows of the extracted table.
zillow_data.head()

Here we see the indicator_id, which is a primary key, the region_id (we can eliminate downstream), the date, and the market value. Let's look at the info, dtypes, etc. 

In [None]:
# Summarize the Data table: column dtypes, non-null counts and number of rows.
zillow_data.info()

There are 11680 records.
The date is in datetime64 dtype, and the value is a float, so that's good. No changes necessary. 

In [None]:
# Table 2: Indicators - names and IDs of all indicators
# No filter is applied here, so the whole ZILLOW/INDICATORS table is requested.
zillow_ind = quandl.get_table('ZILLOW/INDICATORS', paginate = True)
# Preview the first rows of the extracted table.
zillow_ind.head()

The indicator_id is shared with the "Data" table and makes it amenable to querying. Indicator_id is a four-character classification of a market indicator, which is described in the column "indicator". There are 56 indicators, grouped into one of three categories: home values, rentals, and sales and inventories. 

In [None]:
# Summarize the Indicators table: column dtypes, non-null counts and number of rows.
zillow_ind.info()

In [None]:
# Table 3: Regions - names and IDs of all regions
# Even though we're restricting our attention to the US, there are lots of sub-regions.

# No filter is applied here, so the whole ZILLOW/REGIONS table is requested.
zillow_reg = quandl.get_table('ZILLOW/REGIONS', paginate = True)
# Preview the first rows of the extracted table.
zillow_reg.head()

In [None]:
# Preview the last rows of the Regions table.
zillow_reg.tail()

In [None]:
# Summarize the Regions table: column dtypes, non-null counts and number of rows.
zillow_reg.info()

In [None]:
# List the distinct values found in the 'region_type' column.
list(zillow_reg['region_type'].unique())

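There's a lot of variety in the information in the 'region' column: the six region types (listed above) pack different combinations of fields into that one string. For the database, each column has to hold one type of entry. The solution is to create additional columns for the fields found in the region strings (zip, state, neighborhood, county, and city). The data from 'region' will be parsed into those columns, and the 'region' column itself will then be dropped. 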

## Preparation
checking data types, removing uninformative characters, parsing info from one column into many

In [None]:
# Add one empty string column (object dtype) for each field that will be
# parsed out of 'region'.
for new_col in ('zip', 'state', 'neigh', 'county', 'city'):
    zillow_reg[new_col] = ''

# Preview the table with the new, still empty, columns.
zillow_reg.head()

In [None]:
# Parsing out information for the different region types.
# Each 'region' string is a list of fields separated by ';'. The fields are
# visited left to right, and every field is routed to one column by the first
# rule it matches:
#   all digits                        -> zip
#   exactly 3 characters              -> state (presumably a two-letter code plus
#                                        the space left by the split -- verify)
#   contains '-'                      -> neigh
#   contains ' County' or ' Borough'  -> county (boroughs are classified as counties)
#   anything else                     -> city
# Starting with region type: zip.

zipcode = zillow_reg[zillow_reg['region_type'] == 'zip']['region'].str.split(';', n=4, expand=True)
for i in zipcode.columns[:5]:  # up to 5 fields; tolerates fewer columns
    for j, field in zipcode[i].items():
        if pd.isna(field):
            # expand=True pads shorter rows with None; skipping the padding keeps it
            # from falling through to the last rule and overwriting 'city'.
            continue
        text = str(field)
        if text.isdigit():
            target = 'zip'
        elif len(text) == 3:
            target = 'state'
        elif '-' in text:
            target = 'neigh'
        elif ' County' in text or ' Borough' in text:
            target = 'county'
        else:
            target = 'city'
        # .at writes directly into zillow_reg; the chained form
        # zillow_reg[col][j] = ... can end up modifying a temporary copy.
        zillow_reg.at[j, target] = field
               
        

In [None]:
# Check the first rows after parsing the 'zip' regions.
zillow_reg.head()

In [None]:
# Check the last rows after parsing the 'zip' regions.
zillow_reg.tail()

So far, so good. Now, follow same procedure for remaining region types. 

In [None]:
# region type: city
# Each field of the ';'-separated 'region' string is routed to one column by
# the first rule it matches: all digits -> zip, exactly 3 characters -> state,
# contains '-' -> neigh, contains ' County' or ' Borough' -> county,
# anything else -> city.
# NOTE(review): only the first field is examined for this region type; the
# remaining fields (e.g. the state) are ignored -- confirm this is intended.

city = zillow_reg[zillow_reg['region_type'] == 'city']['region'].str.split(';', n=4, expand=True)
for i in city.columns[:1]:
    for j, field in city[i].items():
        if pd.isna(field):
            # Missing values must not fall through to the last rule and
            # overwrite 'city'.
            continue
        text = str(field)
        if text.isdigit():
            target = 'zip'
        elif len(text) == 3:
            target = 'state'
        elif '-' in text:
            target = 'neigh'
        elif ' County' in text or ' Borough' in text:
            target = 'county'
        else:
            target = 'city'
        # .at writes directly into zillow_reg; the chained form
        # zillow_reg[col][j] = ... can end up modifying a temporary copy.
        zillow_reg.at[j, target] = field

In [None]:
# region type: neighborhood ('neigh')
# there's not many of them
# Each field of the ';'-separated 'region' string is routed to one column by
# the first rule it matches: all digits -> zip, exactly 3 characters -> state,
# contains '-' -> neigh, contains ' County' or ' Borough' -> county,
# anything else -> city.
# NOTE(review): the split can yield 5 fields but only the first 4 are
# examined -- confirm this is intended.

nhood = zillow_reg[zillow_reg['region_type'] == 'neigh']['region'].str.split(';', n=4, expand=True)
for i in nhood.columns[:4]:  # tolerates fewer than 4 columns
    for j, field in nhood[i].items():
        if pd.isna(field):
            # expand=True pads shorter rows with None; skipping the padding keeps it
            # from falling through to the last rule and overwriting 'city'.
            continue
        text = str(field)
        if text.isdigit():
            target = 'zip'
        elif len(text) == 3:
            target = 'state'
        elif '-' in text:
            target = 'neigh'
        elif ' County' in text or ' Borough' in text:
            target = 'county'
        else:
            target = 'city'
        # .at writes directly into zillow_reg; the chained form
        # zillow_reg[col][j] = ... can end up modifying a temporary copy.
        zillow_reg.at[j, target] = field

In [None]:
# Display the split fields of the 'neigh' regions for inspection.
nhood

In [None]:
# Check the last rows of the Regions table after the parsing steps so far.
zillow_reg.tail()

In [None]:
# Display the split fields of the 'city' regions for inspection.
city

In [None]:
# region type: county
# Each field of the ';'-separated 'region' string is routed to one column by
# the first rule it matches: all digits -> zip, exactly 3 characters -> state,
# contains '-' -> neigh, contains ' County' or ' Borough' -> county,
# anything else -> city.
# Only the first 2 fields are examined for this region type.

county = zillow_reg[zillow_reg['region_type'] == 'county']['region'].str.split(';', n=4, expand=True)
for i in county.columns[:2]:  # tolerates fewer than 2 columns
    for j, field in county[i].items():
        if pd.isna(field):
            # expand=True pads shorter rows with None; skipping the padding keeps it
            # from falling through to the last rule and overwriting 'city'.
            continue
        text = str(field)
        if text.isdigit():
            target = 'zip'
        elif len(text) == 3:
            target = 'state'
        elif '-' in text:
            target = 'neigh'
        elif ' County' in text or ' Borough' in text:
            target = 'county'
        else:
            target = 'city'
        # .at writes directly into zillow_reg; the chained form
        # zillow_reg[col][j] = ... can end up modifying a temporary copy.
        zillow_reg.at[j, target] = field

In [None]:
# Display the split fields of the 'county' regions for inspection.
county

In [None]:
# region type: state
# The 'region' string is split once on ';' (at most 2 fields). Each field is
# routed to one column by the first rule it matches: all digits -> zip,
# exactly 3 characters -> state, contains '-' -> neigh,
# contains ' County' or ' Borough' -> county, anything else -> city.

state = zillow_reg[zillow_reg['region_type'] == 'state']['region'].str.split(';', n=1, expand=True)
for i in state.columns[:2]:  # tolerates a result with a single column
    for j, field in state[i].items():
        if pd.isna(field):
            # expand=True pads shorter rows with None; skipping the padding keeps it
            # from falling through to the last rule and overwriting 'city'.
            continue
        text = str(field)
        if text.isdigit():
            target = 'zip'
        elif len(text) == 3:
            target = 'state'
        elif '-' in text:
            target = 'neigh'
        elif ' County' in text or ' Borough' in text:
            target = 'county'
        else:
            target = 'city'
        # .at writes directly into zillow_reg; the chained form
        # zillow_reg[col][j] = ... can end up modifying a temporary copy.
        zillow_reg.at[j, target] = field

In [None]:
# Display the split fields of the 'state' regions for inspection.
state


In [None]:
# region type: metro
# The 'region' string is split once on ';', then the first part is split
# again on ','. Each of the first 2 resulting fields is routed to one column
# by the first rule it matches: all digits -> zip, exactly 3 characters ->
# state, contains '-' -> neigh, contains ' County' or ' Borough' -> county,
# anything else -> city.

metro = zillow_reg[zillow_reg['region_type'] == 'metro']['region'].str.split(';', n=1, expand=True)
met = metro[0].str.split(',', n=2, expand=True)  # there's a second delimiter to deal with
for i in met.columns[:2]:  # tolerates a result with a single column
    for j, field in met[i].items():
        if pd.isna(field):
            # expand=True pads shorter rows with None; skipping the padding keeps it
            # from falling through to the last rule and overwriting 'city'.
            continue
        text = str(field)
        if text.isdigit():
            target = 'zip'
        elif len(text) == 3:
            target = 'state'
        elif '-' in text:
            target = 'neigh'
        elif ' County' in text or ' Borough' in text:
            target = 'county'
        else:
            target = 'city'
        # .at writes directly into zillow_reg; the chained form
        # zillow_reg[col][j] = ... can end up modifying a temporary copy.
        zillow_reg.at[j, target] = field

In [None]:
# Display the comma-split fields of the 'metro' regions for inspection.
met

In [None]:
# Looking at the data frame: the last 15 rows after all region types were parsed.
zillow_reg.tail(15)

In [None]:
# First rows of the Regions table after all region types were parsed.
zillow_reg.head()

This is imperfect; there are plenty of cities not being parsed out correctly. I'm moving forward with the exercise and will not use 'city' in a query.

In [None]:
# Remove the original 'region' column from the table, modifying zillow_reg in place.
zillow_reg.drop(columns='region', inplace=True)

In [None]:
# Show only the rows whose region_type is 'zip'.
zillow_reg[zillow_reg['region_type'] == 'zip']

In [None]:
# Show only the rows whose region_type is 'neigh'.
zillow_reg[zillow_reg['region_type'] == 'neigh']

## Step 2: Transformation

### Creating database in PostgreSQL using psycopg2
Using the DDL to create tables (talk about staging, goal)

First, we need to create staging tables as a buffer in the database to hold newly extracted data. 
I create three staging tables (data, indicator, region). The creation of the extra columns in the region table requires the creation of additional database tables for querying. Tables are created using Data Definition Language (DDL). Tables are related by their primary and foreign keys. The embedded LucidChart figure illustrates the relationships among tables in this project. 

Also, we need to connect to PostgreSQL using psycopg2. 

In [None]:
##### DDL for table creation ####

def create_tables(*, database='zillow', user='postgres', password='postgresql',
                  host='127.0.0.1', port='5432'):
    """Create the staging and final Zillow tables in PostgreSQL.

    Connects with psycopg2, runs every CREATE TABLE statement inside one
    transaction and commits. If any statement fails, the transaction is
    rolled back (so no partial schema is left behind), the error is printed
    and the function returns normally. The connection is always closed.

    Args:
        database: Name of the database to connect to.
        user: Database user.
        password: Password for ``user``.
        host: Database server address.
        port: Database server port.

    Returns:
        None. Progress and errors are reported with ``print``.
    """
    # Imported locally so the function does not depend on the notebook's
    # import cell having run successfully.
    import psycopg2

    # Statements are ordered so that every table referenced by a foreign key
    # is created before the table that references it.
    commands = (
    '''
    CREATE TABLE stage_zillow_indicator(
    stage_indicator_id CHAR(5),
    stage_indicator VARCHAR(255),
    stage_category VARCHAR(255)
                                        )
    ''',
    # Name fixed from 'state_zillow_region' to match the other staging tables.
    # NOTE(review): there is no stage_state column although the final 'region'
    # table has 'state' -- confirm whether one should be added.
    '''
    CREATE TABLE stage_zillow_region(
    stage_region_id INTEGER,
    stage_region_type VARCHAR(255),
    stage_region VARCHAR(255),
    stage_zipcode CHAR(10),
    stage_county VARCHAR (255),
    stage_city VARCHAR(255),
    stage_neigh VARCHAR(255)
                                     )
    ''',
    # Column name fixed from 'stage value' (a SQL syntax error) to 'stage_value'.
    '''
    CREATE TABLE stage_zillow_data(
    stage_indicator_id CHAR(5),
    stage_region_id INTEGER,
    stage_date DATE,
    stage_value MONEY
                                   )
    ''',
    # z_indicator references category(category_id), so the table has to exist
    # first; it was referenced but never created.
    # NOTE(review): the column name 'category_name' is a guess modelled on
    # region_type -- adjust to the intended data model.
    '''
    CREATE TABLE category(
    category_id SERIAL PRIMARY KEY,
    category_name VARCHAR(255) NOT NULL
                          )
    ''',
    # Foreign-key columns are INTEGER NOT NULL rather than SERIAL: SERIAL would
    # attach an auto-increment default that produces ids unrelated to the
    # referenced table.
    '''
    CREATE TABLE z_indicator(
    z_ind_id SERIAL PRIMARY KEY,
    indicator_name CHAR(5) NOT NULL,
    indicator VARCHAR(255) NOT NULL,
    category_id INTEGER NOT NULL REFERENCES category(category_id)
                             )
    ''',
    '''
    CREATE TABLE region_type(
    region_type_id SERIAL PRIMARY KEY,
    region_type_name VARCHAR(255) NOT NULL
                             )
    ''',
    '''
    CREATE TABLE region(
    region_id SERIAL PRIMARY KEY,
    region_num INTEGER, 
    region_type_id INTEGER NOT NULL REFERENCES region_type(region_type_id),
    neigh VARCHAR(255),
    city VARCHAR(255),
    county VARCHAR(255),
    state CHAR(5),
    zip CHAR(10)
                       )
    ''',
    # 'REFFERENCES' corrected to 'REFERENCES'.
    '''
    CREATE TABLE zillow_data(
    z_data SERIAL PRIMARY KEY,
    region_id INTEGER NOT NULL REFERENCES region(region_id),
    z_ind_id INTEGER NOT NULL REFERENCES z_indicator(z_ind_id),
    z_date DATE,
    value MONEY
                             )
    ''')

    conn = None

    try:
        conn = psycopg2.connect(database=database, user=user, password=password,
                                host=host, port=port)
        print('Database connected')

        # The cursor is closed automatically when the with-block exits.
        with conn.cursor() as cur:
            print('cursor')

            for command in commands:
                cur.execute(command)

        print('Tables created')

        conn.commit()
        print('Tables committed')

    except psycopg2.Error as err:
        # Undo any tables created before the failing statement and report the
        # actual cause instead of hiding it.
        if conn is not None:
            conn.rollback()
        print('Error loading:', err)

    finally:
        if conn is not None:
            conn.close()
            print('Connection closed')
        
        
#if __name__ == '__main__':
#    create_tables()


## Step 3: Loading the data

### Connecting to PostgreSQL and loading data

## Step 4: Illustration of data model (perhaps this goes before transformation)

## Step 5: Query the database