<a href="https://colab.research.google.com/github/j-buss/wi-dpi-analysis/blob/development/eda/3.0_Refined.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Salary and Education in Wisconsin - 3.0 Refined Data

This is the 3rd in a series of notebooks depicting the steps to acquire, store, and analyze data pertaining to teachers in Wisconsin.



1.   List item
2.   List item
3.   Refined Data - Create ***all_staff_report*** tables by year



## Prep

In [0]:
# Upgrade the BigQuery client library inside the notebook runtime.
# NOTE(review): the version is unpinned; the saved output below shows 1.12.1
# was installed at the time -- consider pinning it for reproducibility.
!pip install --upgrade google-cloud-bigquery

Collecting google-cloud-bigquery
[?25l  Downloading https://files.pythonhosted.org/packages/6f/c1/74dce5b9ffde50910082431e9117e221f18978efec88a085e3ec46d63ed4/google_cloud_bigquery-1.12.1-py2.py3-none-any.whl (130kB)
[K     |██▌                             | 10kB 12.5MB/s eta 0:00:01[K     |█████                           | 20kB 1.8MB/s eta 0:00:01[K     |███████▌                        | 30kB 2.7MB/s eta 0:00:01[K     |██████████                      | 40kB 1.7MB/s eta 0:00:01[K     |████████████▋                   | 51kB 2.1MB/s eta 0:00:01[K     |███████████████                 | 61kB 2.6MB/s eta 0:00:01[K     |█████████████████▋              | 71kB 3.0MB/s eta 0:00:01[K     |████████████████████            | 81kB 3.4MB/s eta 0:00:01[K     |██████████████████████▋         | 92kB 3.8MB/s eta 0:00:01[K     |█████████████████████████▏      | 102kB 2.9MB/s eta 0:00:01[K     |███████████████████████████▋    | 112kB 2.9MB/s eta 0:00:01[K     |█████████████████████

### Import Libraries

In [0]:
import numpy as np
import pandas as pd
pd.set_option('display.max_columns', 50)
pd.set_option('display.max_rows', 5)
import seaborn as sns
import matplotlib.pyplot as plt

from google.cloud import bigquery

In [0]:
# Render matplotlib figures inline in the notebook output.
%matplotlib inline
# Use the 'bmh' style for every plot drawn after this point.
plt.style.use('bmh')

### Functions

In [0]:
def create_dataset(client, project_id, dataset_name):
  """
  Create a BigQuery dataset with location "US" and print its name.

  Args:
    client: BigQuery client whose ``create_dataset`` method issues the request.
    project_id: Project that will own the dataset.
    dataset_name: Name of the dataset inside that project.

  Returns nothing; any error raised by ``client.create_dataset`` propagates.
  """
  qualified_id = "{}.{}".format(project_id, dataset_name)

  new_dataset = bigquery.Dataset(qualified_id)
  new_dataset.location = "US"
  created = client.create_dataset(new_dataset)

  print("Created dataset {}.{}".format(client.project, created.dataset_id))

In [0]:
def convert_currency(val):
    """
    Convert a currency value to a float.

    Strings have every ``$`` and ``,`` removed before conversion, so
    ``"$1,234.50"`` becomes ``1234.5``. Values that are not strings (for
    example a cell that was already parsed as a number) are handed straight
    to ``float`` instead of failing on the missing ``replace`` method.

    Args:
        val: Currency amount as a string such as ``"$1,234.50"``, or a number.

    Returns:
        The amount as a ``float``.

    Raises:
        ValueError: If the cleaned string is not a valid number.
        TypeError: If ``val`` is not a string and ``float`` cannot convert it
            (e.g. ``None``).
    """
    if isinstance(val, str):
        # Drop the formatting characters that float() cannot parse.
        val = val.replace(',', '').replace('$', '')
    return float(val)

In [0]:
def prep_name(val):
  """
  Normalise the casing of a name.

  The value is lower-cased and then title-cased, so the first letter of every
  word is capitalised and the remaining letters are lower case
  (e.g. "MARY ANN" -> "Mary Ann").
  """
  lowered = val.lower()
  return lowered.title()

## Processing

In [0]:
# Authenticate the Colab user with Google Cloud (presumably supplying the
# credentials that the BigQuery client created later relies on).
from google.colab import auth
auth.authenticate_user()

In [0]:
# Google Cloud project and the BigQuery dataset names used by this notebook.
project_id = 'wi-dpi-010'
landing_dataset_name = 'landing'
refined_dataset_name = 'refined'

In [0]:
# BigQuery client bound to the project configured above.
bq_client = bigquery.Client(project=project_id)

In [0]:
# One-time setup: create the refined dataset. Left commented out because it has
# likely already been created; uncomment to run it.
# create_dataset(bq_client, project_id, refined_dataset_name)


### 2015



In [0]:
 select_2015 = '''SELECT
  all_staff_report.id_nbr,
  TRIM(all_staff_report.first_name) as first_name,
  TRIM(all_staff_report.last_name) as last_name,
  SAFE_CAST(all_staff_report.file_number as INT64) as file_number,
  TRIM(all_staff_report.gndr) as gender,
  TRIM(all_staff_report.raceethn) as race_ethnicity,
  all_staff_report.birth_year,
  SAFE_CAST(all_staff_report.high_degree as INT64) as high_degree_cd,
  TRIM(highest_degree.description) as high_degree_desc,
  TRIM(all_staff_report.year_session),
  all_staff_report.cntrct_days as contract_days,
  all_staff_report.local_exp,
  all_staff_report.total_exp,
  CAST(REGEXP_REPLACE(REGEXP_REPLACE(all_staff_report.tot_salary, r"^[$]",""), r",","") AS FLOAT64) as salary,
  CAST(REGEXP_REPLACE(REGEXP_REPLACE(all_staff_report.tot_fringe, r"^[$]",""), r",","") AS FLOAT64) as benefits,
  SAFE_CAST(all_staff_report.staff_cat as INT64) as staff_category_cd,
  TRIM(staff_cat.description) as staff_category_desc,
  all_staff_report.hire_agncy_cd,
  all_staff_report.work_agncy_cd,
  all_staff_report.hire_agncy_typ,
  hire_agency_type.description as hire_agency_desc,
  all_staff_report.work_agncy_typ,
  work_agency_type.description as work_agency_desc,
  TRIM(all_staff_report.school_cd),
  all_staff_report.position_cd,
  position.position_description,
  position.position_type as position_type_cd,
  pos_type.description as position_type_desc,
  all_staff_report.assgn_area_cd,
  assignment_area.assignment_area_description,
  TRIM(all_staff_report.low_grd),
  TRIM( high_grd ),
  all_staff_report.bilingual,
  all_staff_report.assgn_fte,
  TRIM(all_staff_report.work_location_name),
  TRIM(all_staff_report.school_name),
  TRIM(all_staff_report.grd_level),
  SAFE_CAST(TRIM(all_staff_report.cesa_number) as INT64) as cesa_num,
  all_staff_report.cnty_nbr as county_number,
  TRIM(all_staff_report.cnty_name) as county_name,
  all_staff_report.school_mailing_address1,
  all_staff_report.school_mailing_address2,
  all_staff_report.mail_city,
  all_staff_report.mail_st,
  all_staff_report.mail_zip_cd
FROM
  `wi-dpi-010.landing.2015_all_staff_report` all_staff_report 
  LEFT JOIN `wi-dpi-010.landing.2015_positions` position 
   ON all_staff_report.position_cd = position.code
  LEFT JOIN `wi-dpi-010.landing.2015_assignment_area` assignment_area
   ON all_staff_report.assgn_area_cd = CAST(assignment_area.code as INT64)
  LEFT JOIN `wi-dpi-010.landing.2015_highest_educational_degree` highest_degree
   ON SAFE_CAST(all_staff_report.high_degree as INT64) = highest_degree.code
  LEFT JOIN `wi-dpi-010.landing.2015_staff_category` staff_cat
   ON SAFE_CAST(all_staff_report.staff_cat as INT64) = staff_cat.code
  LEFT JOIN `wi-dpi-010.landing.2015_position_type` pos_type
   ON position.position_type = pos_type.code
  LEFT JOIN `wi-dpi-010.landing.2015_agency_type` hire_agency_type
   ON all_staff_report.hire_agncy_typ = SAFE_CAST(hire_agency_type.code as INT64)
  LEFT JOIN `wi-dpi-010.landing.2015_agency_type` work_agency_type
   ON all_staff_report.hire_agncy_typ = SAFE_CAST(work_agency_type.code as INT64)
'''

## Load Data for each Year

In [0]:
# Write the 2015 query result into a table in the refined dataset.
# NOTE(review): no write_disposition is set, and the saved cell output shows
# the table name `all_staff_report_2015` rather than `2015_all_staff_report`
# -- confirm the intended table name and the desired re-run behaviour.
table_ref = bq_client.dataset(refined_dataset_name).table('2015_all_staff_report')

job_config = bigquery.QueryJobConfig()
job_config.destination = table_ref  # query results land in this table

# Submit the query (API request). The location has to match both the
# dataset(s) the query reads and the destination table.
query_job = bq_client.query(
    select_2015,
    job_config=job_config,
    location='US')

# Block until the job completes, then report where the rows were written.
query_job.result()
print('Query results loaded to table {}'.format(table_ref.path))

Query results loaded to table /projects/wi-dpi-010/datasets/refined/tables/all_staff_report_2015


## Consolidate Yearly Tables

In [0]:
# Placeholder for the query that will consolidate the yearly tables; the SQL
# has not been written yet (the string holds only a newline).
consolidate_select='''
'''