<a href="https://colab.research.google.com/github/j-buss/wi-dpi-analysis/blob/development/eda/2.0_Landing.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Salary and Education in Wisconsin - 2.0 Load Landing BigQuery

This notebook is part of an analysis of teacher salaries within the Wisconsin Department of Public Instruction. This step loads the raw all-staff report files from Cloud Storage into the BigQuery landing dataset.

## Introduction

### Load libraries
Install the following packages in order to load data to BigQuery.

*Please note: installing these packages requires restarting the runtime.*

In [1]:
!pip install --upgrade google-cloud-bigquery
!pip install gcsfs
!pip install pandas-gbq -U

Collecting google-cloud-bigquery
[?25l  Downloading https://files.pythonhosted.org/packages/b3/33/236bdc6f5204bed8f69aecbabbe1a9b3a5be51f959fb51eba0545181ffa0/google_cloud_bigquery-1.11.2-py2.py3-none-any.whl (127kB)
[K     |██▋                             | 10kB 13.3MB/s eta 0:00:01[K     |█████▏                          | 20kB 1.8MB/s eta 0:00:01[K     |███████▊                        | 30kB 2.6MB/s eta 0:00:01[K     |██████████▎                     | 40kB 1.7MB/s eta 0:00:01[K     |████████████▉                   | 51kB 2.1MB/s eta 0:00:01[K     |███████████████▍                | 61kB 2.5MB/s eta 0:00:01[K     |██████████████████              | 71kB 2.9MB/s eta 0:00:01[K     |████████████████████▋           | 81kB 3.3MB/s eta 0:00:01[K     |███████████████████████▏        | 92kB 3.7MB/s eta 0:00:01[K     |█████████████████████████▊      | 102kB 2.9MB/s eta 0:00:01[K     |████████████████████████████▎   | 112kB 2.9MB/s eta 0:00:01[K     |█████████████████████

Collecting gcsfs
[?25l  Downloading https://files.pythonhosted.org/packages/30/7b/bb9dd860c64f15a06fdefdd3ea6c30ae336f3f5524f800cac59592769bf7/gcsfs-0.2.1.tar.gz (51kB)
[K     |████████████████████████████████| 61kB 2.3MB/s 
Building wheels for collected packages: gcsfs
  Building wheel for gcsfs (setup.py) ... [?25l[?25hdone
  Stored in directory: /root/.cache/pip/wheels/58/b5/19/7b0e8a870ef16e1c0b8eee819c511c789be5cde308e59f2752
Successfully built gcsfs
Installing collected packages: gcsfs
Successfully installed gcsfs-0.2.1
Collecting pandas-gbq
  Downloading https://files.pythonhosted.org/packages/6a/65/bc46678a5550c0cef1700d7292319deae716751af3f6158250d6a3a454ed/pandas_gbq-0.10.0-py2.py3-none-any.whl
Collecting pydata-google-auth (from pandas-gbq)
  Downloading https://files.pythonhosted.org/packages/89/c5/03b68c114bc2c2bcaa2e40fdf269a14361fa75b70a09415e8bad65413b75/pydata_google_auth-0.1.3-py2.py3-none-any.whl
Installing collected packages: pydata-google-auth, pandas-gbq
  Fou

### Authenticate to Google Cloud

In [0]:
# Colab-only helper: runs the interactive Google sign-in flow for this runtime.
# The Google Cloud clients created later in the notebook are constructed
# without explicit credentials, so this cell must run before them.
from google.colab import auth
auth.authenticate_user()

In [0]:
import numpy as np
import pandas as pd
pd.set_option('display.max_columns', 50)
pd.set_option('display.max_rows', 5)
import seaborn as sns
import matplotlib.pyplot as plt

from google.cloud import bigquery

import requests

In [0]:
# Render matplotlib figures inline in the notebook output.
%matplotlib inline
# Apply matplotlib's built-in 'bmh' style sheet to all subsequent plots.
plt.style.use('bmh')

### Functions

In [0]:
def download_file(url, filename):
  """Download `url` and write the response body to `filename`.

  Args:
    url: Address to fetch with an HTTP GET.
    filename: Local path the raw response bytes are written to
      (opened in binary mode, overwriting any existing file).

  Raises:
    requests.HTTPError: If the server answers with a 4xx/5xx status. The
      check happens before the file is opened, so an error page is never
      saved in place of the expected content.
  """
  response = requests.get(url)
  # Fail loudly instead of silently persisting e.g. a 404 HTML page.
  response.raise_for_status()
  # Context manager guarantees the handle is closed even if the write fails.
  with open(filename, 'wb') as out_file:
    out_file.write(response.content)

In [0]:
def create_dataset(client, project_id, dataset_name):
  """Create the BigQuery dataset `project_id.dataset_name` in the US location.

  The call is idempotent: if the dataset already exists it is left as-is
  instead of raising a Conflict error, so the notebook can be re-run from a
  fresh kernel.

  Args:
    client: A `bigquery.Client` used to issue the request.
    project_id: Project that owns the dataset.
    dataset_name: Name of the dataset to create inside `project_id`.
  """
  dataset_id = "{}.{}".format(project_id, dataset_name)
  dataset = bigquery.Dataset(dataset_id)
  dataset.location = "US"

  # exists_ok=True turns "already exists" into a no-op rather than an error.
  client.create_dataset(dataset, exists_ok=True)

### Define Values

In [0]:
# Pull the shared helper module from the project's GitHub repo into the
# working directory so it can be imported in the next cell.
utils_url = "https://raw.githubusercontent.com/j-buss/wi-dpi-analysis/development/eda/wi_dpi_utils.py"
utils_filename = "wi_dpi_utils.py"
download_file(utils_url, utils_filename)

In [0]:
import wi_dpi_utils as utils

In [0]:
# One entry per school-year staff file.
#   old_name          - object path under the temporary extraction folder
#   new_name          - object path under the per-school-year folder
#   landing_tablename - name of the table the file is loaded into
#   file_type         - only present on the 1995-1999 entries ("fixed")
file_dict = [
    # 1995 - 1999
    {
        'old_name': "all_staff_report/temp/AllStaff_Open_Files/95staff.txt",
        'new_name': "all_staff_report/1995_1996/95staff.txt",
        'landing_tablename': "1995",
        'file_type': "fixed",
    },
    {
        'old_name': "all_staff_report/temp/AllStaff_Open_Files/96staff.txt",
        'new_name': "all_staff_report/1996_1997/96staff.txt",
        'landing_tablename': "1996",
        'file_type': "fixed",
    },
    {
        'old_name': "all_staff_report/temp/AllStaff_Open_Files/97staff.txt",
        'new_name': "all_staff_report/1997_1998/97staff.txt",
        'landing_tablename': "1997",
        'file_type': "fixed",
    },
    {
        'old_name': "all_staff_report/temp/AllStaff_Open_Files/98staff.txt",
        'new_name': "all_staff_report/1998_1999/98staff.txt",
        'landing_tablename': "1998",
        'file_type': "fixed",
    },
    {
        'old_name': "all_staff_report/temp/AllStaff_Open_Files/99STAFF.DAT",
        'new_name': "all_staff_report/1999_2000/99STAFF.DAT",
        'landing_tablename': "1999",
        'file_type': "fixed",
    },

    # 2000 - 2004
    {
        'old_name': "all_staff_report/temp/AllStaff_Open_Files/00staff.dat",
        'new_name': "all_staff_report/2000_2001/00staff.dat",
        'landing_tablename': "2000",
    },
    {
        'old_name': "all_staff_report/temp/AllStaff_Open_Files/01staff.dat",
        'new_name': "all_staff_report/2001_2002/01staff.dat",
        'landing_tablename': "2001",
    },
    {
        'old_name': "all_staff_report/temp/AllStaff_Open_Files/02staff.txt",
        'new_name': "all_staff_report/2002_2003/02staff.txt",
        'landing_tablename': "2002",
    },
    {
        'old_name': "all_staff_report/temp/AllStaff_Open_Files/03staff.txt",
        'new_name': "all_staff_report/2003_2004/03staff.txt",
        'landing_tablename': "2003",
    },
    {
        'old_name': "all_staff_report/temp/AllStaff_Open_Files/04staff.dat",
        'new_name': "all_staff_report/2004_2005/04staff.dat",
        'landing_tablename': "2004",
    },

    # 2005 - 2009
    {
        'old_name': "all_staff_report/temp/AllStaff_Open_Files/05staff.txt",
        'new_name': "all_staff_report/2005_2006/05staff.txt",
        'landing_tablename': "2005",
    },
    {
        'old_name': "all_staff_report/temp/AllStaff_Open_Files/06staff.txt",
        'new_name': "all_staff_report/2006_2007/06staff.txt",
        'landing_tablename': "2006",
    },
    {
        'old_name': "all_staff_report/temp/AllStaff_Open_Files/07staff.txt",
        'new_name': "all_staff_report/2007_2008/07staff.txt",
        'landing_tablename': "2007",
    },
    {
        'old_name': "all_staff_report/temp/AllStaff_Open_Files/08STAFF.TXT",
        'new_name': "all_staff_report/2008_2009/08STAFF.TXT",
        'landing_tablename': "2008",
    },
    {
        'old_name': "all_staff_report/temp/AllStaff_Open_Files/09STAFF.TXT",
        'new_name': "all_staff_report/2009_2010/09STAFF.TXT",
        'landing_tablename': "2009",
    },

    # 2010 - 2014
    {
        'old_name': "all_staff_report/temp/AllStaff_Open_Files/10STAFF.TXT",
        'new_name': "all_staff_report/2010_2011/10STAFF.TXT",
        'landing_tablename': "2010",
    },
    {
        'old_name': "all_staff_report/temp/AllStaff_Open_Files/11STAFF.txt",
        'new_name': "all_staff_report/2011_2012/11STAFF.txt",
        'landing_tablename': "2011",
    },
    {
        'old_name': "all_staff_report/temp/AllStaff_Open_Files/12STAFF.txt",
        'new_name': "all_staff_report/2012_2013/12STAFF.txt",
        'landing_tablename': "2012",
    },
    {
        'old_name': "all_staff_report/temp/AllStaff_Open_Files/13staff.txt",
        'new_name': "all_staff_report/2013_2014/13staff.txt",
        'landing_tablename': "2013",
    },
    {
        'old_name': "all_staff_report/temp/AllStaff_Open_Files/14staff.txt",
        'new_name': "all_staff_report/2014_2015/14staff.txt",
        'landing_tablename': "2014",
    },

    # 2015 - 2016
    {
        'old_name': "all_staff_report/temp/AllStaff_Open_Files/2015.csv",
        'new_name': "all_staff_report/2015_2016/2015.csv",
        'landing_tablename': "2015",
    },
    {
        'old_name': "all_staff_report/temp/AllStaff_Open_Files/2016.csv",
        'new_name': "all_staff_report/2016_2017/2016.csv",
        'landing_tablename': "2016",
    },
]

In [0]:
# Google Cloud project and the Cloud Storage bucket used by this notebook.
project_id = "wi-dpi-010"
raw_data_bucket_name = "landing-009"

# BigQuery dataset names.
landing_dataset_name = "landing"
refined_dataset_name = "refined"

## Data Preparation

### Create Dataset

In [0]:
from google.cloud import bigquery

In [0]:
# Create the landing dataset. The dataset name comes from the configuration
# cell (landing_dataset_name) rather than a repeated string literal, so the
# two cannot drift apart.
bq_client = bigquery.Client(project=project_id)
create_dataset(bq_client, project_id, landing_dataset_name)

### Create Tables

In [0]:
from google.cloud import storage
from io import StringIO

In [0]:
# Select the first entry of file_dict (the 95staff.txt file); the following
# cells read its 'new_name' and 'landing_tablename' keys through `i`.
i = file_dict[0]

In [16]:
# Display the bucket path of the selected file as a sanity check.
i['new_name']

'all_staff_report/1995_1996/95staff.txt'

In [0]:
# Download the selected staff file and its column-layout metadata from the
# raw-data bucket as bytes.
storage_client = storage.Client(project=project_id)
bucket = storage_client.get_bucket(raw_data_bucket_name)

# NOTE: the metadata path is specific to the 1995 file, i.e. it only matches
# the entry selected as file_dict[0].
metadata_blob_name = 'all_staff_report/1995_1996/95_metadata.csv'

data_blob = bucket.get_blob(i['new_name'])
metadata_blob = bucket.get_blob(metadata_blob_name)

# get_blob() returns None for a missing object; raise a clear error instead of
# an AttributeError on the download call below.
if data_blob is None:
  raise FileNotFoundError(
      "gs://{}/{} not found".format(raw_data_bucket_name, i['new_name']))
if metadata_blob is None:
  raise FileNotFoundError(
      "gs://{}/{} not found".format(raw_data_bucket_name, metadata_blob_name))

# Fixed: the original called download_as_string() on the undefined name `blob`.
data = data_blob.download_as_string()
metadata = metadata_blob.download_as_string()

In [0]:
# Parse the metadata CSV: its 'length' column supplies the field widths and
# its 'description' column supplies the column names for the staff file.
metadata_text = metadata.decode('utf-8')
metadata_df = pd.read_csv(StringIO(metadata_text))
col_names = metadata_df['description']
col_widths = metadata_df['length']

In [0]:
# Parse the fixed-width staff file using the widths and names taken from the
# metadata frame.
data_text = data.decode('utf-8')
data_df = pd.read_fwf(
    StringIO(data_text),
    widths=col_widths,
    names=col_names,
)

### Load Data

In [63]:
# Load the parsed frame into <landing dataset>.<table>, replacing any existing
# table so the cell can be re-run. The dataset name comes from the
# configuration cell instead of a hard-coded 'landing.' prefix.
destination_table = "{}.{}".format(landing_dataset_name, i['landing_tablename'])
data_df.to_gbq(destination_table, project_id=project_id, if_exists='replace')

1it [00:06,  6.41s/it]
