<a href="https://colab.research.google.com/github/j-buss/wi-dpi-analysis/blob/development/eda/2.0_Landing.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Salary and Education in Wisconsin - 2.0 Load Landing BigQuery

This notebook loads the raw Wisconsin Department of Public Instruction files from Google Cloud Storage into a BigQuery landing dataset, in preparation for the analysis of teacher salaries.

## Introduction

### Load libraries
Install the following packages in order to load data to BigQuery.

*Please note that this requires restarting the runtime afterwards.*

In [1]:
# Install/upgrade the packages used to load data into BigQuery.
# The runtime has to be restarted after this cell has run.
!pip install --upgrade google-cloud-bigquery
!pip install gcsfs
!pip install pandas-gbq -U

Collecting google-cloud-bigquery
[?25l  Downloading https://files.pythonhosted.org/packages/6f/c1/74dce5b9ffde50910082431e9117e221f18978efec88a085e3ec46d63ed4/google_cloud_bigquery-1.12.1-py2.py3-none-any.whl (130kB)
[K     |████████████████████████████████| 133kB 2.9MB/s 
[?25hCollecting google-cloud-core<2.0dev,>=1.0.0 (from google-cloud-bigquery)
  Downloading https://files.pythonhosted.org/packages/34/ba/251d9b6a1695d25d9f081a3db537b0dfb15edaf2037b423e4a98280df7f9/google_cloud_core-1.0.0-py2.py3-none-any.whl
[31mERROR: google-cloud-translate 1.3.3 has requirement google-cloud-core<0.30dev,>=0.29.0, but you'll have google-cloud-core 1.0.0 which is incompatible.[0m
[31mERROR: google-cloud-storage 1.13.2 has requirement google-cloud-core<0.30dev,>=0.29.0, but you'll have google-cloud-core 1.0.0 which is incompatible.[0m
Installing collected packages: google-cloud-core, google-cloud-bigquery
  Found existing installation: google-cloud-core 0.29.1
    Uninstalling google-cloud-co

Collecting gcsfs
[?25l  Downloading https://files.pythonhosted.org/packages/76/19/68ab4e6570a7882698058be8ecf1b195b0b784b838ac1b0ea82c422c0f5a/gcsfs-0.2.2.tar.gz (52kB)
[K     |████████████████████████████████| 61kB 2.4MB/s 
Building wheels for collected packages: gcsfs
  Building wheel for gcsfs (setup.py) ... [?25l[?25hdone
  Stored in directory: /root/.cache/pip/wheels/9f/0f/b9/5bc5222756d121ccace51ab3084a1c733380908a4e2f939038
Successfully built gcsfs
Installing collected packages: gcsfs
Successfully installed gcsfs-0.2.2
Collecting pandas-gbq
  Downloading https://files.pythonhosted.org/packages/6a/65/bc46678a5550c0cef1700d7292319deae716751af3f6158250d6a3a454ed/pandas_gbq-0.10.0-py2.py3-none-any.whl
Collecting pydata-google-auth (from pandas-gbq)
  Downloading https://files.pythonhosted.org/packages/89/c5/03b68c114bc2c2bcaa2e40fdf269a14361fa75b70a09415e8bad65413b75/pydata_google_auth-0.1.3-py2.py3-none-any.whl
Installing collected packages: pydata-google-auth, pandas-gbq
  Fou

### Authenticate to Google Cloud

In [0]:
# Run the Colab authentication flow for the current user; the Google Cloud
# clients created later presumably rely on these credentials (confirm when
# running outside Colab, where this module is unavailable).
from google.colab import auth
auth.authenticate_user()

### Import Libraries

In [0]:
import numpy as np
import pandas as pd
# Display settings: show up to 50 columns, but truncate any frame longer than
# 5 rows so cell outputs stay short.
pd.set_option('display.max_columns', 50)
pd.set_option('display.max_rows', 5)
import seaborn as sns
import matplotlib.pyplot as plt


In [0]:
# Render figures inline in the notebook and use matplotlib's 'bmh' style sheet.
%matplotlib inline
plt.style.use('bmh')

In [0]:
from google.cloud import bigquery
from google.cloud import storage
from io import StringIO
import re
import collections

### Functions

In [0]:
def download_file(url, filename):
  """Download the resource at ``url`` and write its bytes to ``filename``.

  Args:
    url: Address handed to ``requests.get``.
    filename: Path of the local file, opened in binary write mode (an
      existing file is overwritten).

  Raises:
    requests.HTTPError: If the server answers with an error status, so that
      an error page is never saved as if it were the requested file.
  """
  # Imported locally: none of the notebook's visible import cells bring
  # ``requests`` into scope, so the original body failed with NameError.
  import requests

  response = requests.get(url)
  response.raise_for_status()
  # The context manager closes the file even when the write fails.
  with open(filename, 'wb') as out_file:
    out_file.write(response.content)

In [0]:
def create_dataset(client, project_id, dataset_name, location="US"):
  """Create the BigQuery dataset ``project_id.dataset_name`` if it is missing.

  Args:
    client: ``bigquery.Client`` used to issue the request.
    project_id: Project that owns the dataset.
    dataset_name: Name of the dataset inside the project.
    location: Location assigned to the dataset; defaults to "US", the value
      that used to be hard-coded.

  Returns:
    The dataset object returned by ``client.create_dataset``.
  """
  dataset = bigquery.Dataset("{}.{}".format(project_id, dataset_name))
  dataset.location = location

  # exists_ok=True keeps the cell re-runnable, mirroring the not_found_ok=True
  # used by delete_dataset: an already existing dataset is not an error.
  return client.create_dataset(dataset, exists_ok=True)

In [0]:
def delete_dataset(client, project_id, dataset_name):
  """Delete the dataset ``project_id.dataset_name`` together with its contents.

  The request is sent through ``client.delete_dataset`` with
  ``delete_contents=True`` and ``not_found_ok=True``, so a dataset that does
  not exist is not treated as an error. Nothing is returned.
  """
  client.delete_dataset(
      "{}.{}".format(project_id, dataset_name),
      delete_contents=True,
      not_found_ok=True,
  )

In [0]:
def return_blob_list(project_id, bucket_name):
    """Return the blob listing of ``bucket_name``.

    A storage client is created for ``project_id``; the value handed back is
    whatever ``bucket.list_blobs()`` returns for the looked-up bucket.
    """
    gcs_client = storage.Client(project=project_id)
    return gcs_client.get_bucket(bucket_name).list_blobs()

In [0]:
def clean_column_headers(columns):
  """Normalise column labels into lower-case, underscore-separated names.

  Labels are stripped of surrounding whitespace and lower-cased; then, in
  order: spaces become ``_``, parentheses are removed, ``.`` and ``/`` become
  ``_`` and apostrophes are removed.

  Args:
    columns: Object exposing the pandas ``.str`` accessor, e.g. ``df.columns``.

  Returns:
    The cleaned labels, as returned by the ``.str`` accessor.
  """
  # (old, new) pairs, applied in this order.
  replacements = (
      (' ', '_'),
      ('(', ''),
      (')', ''),
      ('.', '_'),
      ('/', '_'),
      ("'", ''),
  )
  cleaned = columns.str.strip().str.lower()
  for old, new in replacements:
    # regex=False: '(', ')' and '.' are regex metacharacters, and the default
    # of `regex` differs between pandas versions. These are literal swaps.
    cleaned = cleaned.str.replace(old, new, regex=False)
  return cleaned

### Define Values

In [0]:
# Google Cloud project and the storage bucket holding the raw files.
project_id='wi-dpi-010'
raw_data_bucket_name='landing-009'

# BigQuery dataset names.
landing_dataset_name='landing'
refined_dataset_name='refined'

## Data Preparation

### Create Dataset

In [0]:
# `landing_dataset_name` comes from the "Define Values" cell; it is no longer
# redefined here, so the two definitions cannot drift apart.
bq_client = bigquery.Client(project=project_id)

In [0]:
# Create the landing dataset in the configured project.
create_dataset(bq_client, project_id, landing_dataset_name)

In [0]:
###DELETE DATASET
# Destructive: drops the landing dataset and every table in it. The call is
# guarded by a flag so that "Run all" does not immediately delete the dataset
# created by the previous cell. Set the flag to True to reset the dataset.
RESET_LANDING_DATASET = False
if RESET_LANDING_DATASET:
  delete_dataset(bq_client, project_id, landing_dataset_name)

### Create Dataframes and Tables from GCS Blobs

For all files (blobs) in the bucket we will split the blob name into the following components:


1.   Source
2.   Year
3.   File
4.   File Type

Additionally, we will use a `namedtuple` to hold these components, together with the blob's full name, so that each one can be accessed by attribute name.

In [0]:
# Record describing one blob: the parsed parts of its name plus the full name.
blob_to_process = collections.namedtuple(
    typename='blob_to_process',
    field_names='source year file file_type fullname',
)

In [0]:
# Parse the name of every blob in the raw-data bucket into its components.
# The project and bucket come from the "Define Values" cell instead of being
# repeated here as literals.
list_of_blobs = []
for blob in return_blob_list(project_id, raw_data_bucket_name):
  # e.g. 'all_staff_report/2015/2015_assignment_area.csv' ->
  #   ['all_staff_report', '2015', '2015_assignment_area', 'csv']
  name_parts = re.findall(r"[\w']+", blob.name)
  # Names that do not split into exactly source/year/file/file_type are skipped.
  if len(name_parts) == 4:
    list_of_blobs.append(blob_to_process(*name_parts, fullname=blob.name))

In [0]:
#TEMPORARY REPROCESS
#list_of_blobs = [blob_to_process(source='all_staff_report', year='2015', file='2015_assignment_area', file_type='csv', fullname='all_staff_report/2015/2015_assignment_area.csv')]

In [19]:
# Inspect the parsed blob descriptors before loading anything.
print(list_of_blobs)

[blob_to_process(source='all_staff_report', year='2015', file='2015_assignment_area', file_type='csv', fullname='all_staff_report/2015/2015_assignment_area.csv')]


Now `list_of_blobs` holds the parsed name components of every blob. We will loop over it to create the corresponding tables in BigQuery.

In [0]:
# Storage client for the project and a handle on the raw-data bucket; the
# loading loop reads every blob through `bucket`.
storage_client = storage.Client(project=project_id)
bucket = storage_client.get_bucket(raw_data_bucket_name)

In [21]:
def _download_text(bucket, blob_name):
  """Return the contents of blob `blob_name` in `bucket`, decoded as UTF-8."""
  return bucket.get_blob(blob_name).download_as_string().decode('utf-8')


def _find_layout_blob_name(blob, all_blobs):
  """Return the full name of the blob that describes the columns of `blob`.

  The layout blob is the first entry of `all_blobs` with the same `file` value
  but a different `file_type`. Its stored `fullname` is used as-is rather than
  re-assembled from the parsed tokens, which need not reproduce the real name.

  Raises:
    LookupError: If no such blob exists.
  """
  layout_name = next(
      (other.fullname for other in all_blobs
       if other.file == blob.file and other.file_type != blob.file_type),
      None)
  if layout_name is None:
    raise LookupError('No column layout blob found for ' + blob.fullname)
  return layout_name


def _read_blob(blob, bucket, all_blobs):
  """Read `blob` into a DataFrame; return None for unsupported file types."""
  if blob.file_type == 'csv':
    return pd.read_csv(StringIO(_download_text(bucket, blob.fullname)), low_memory=False)
  if blob.file_type == 'fwf':
    # The layout file is read as CSV and supplies the 'length' and 'name'
    # columns used as fixed-width column widths and column names.
    layout_df = pd.read_csv(
        StringIO(_download_text(bucket, _find_layout_blob_name(blob, all_blobs))))
    return pd.read_fwf(
        StringIO(_download_text(bucket, blob.fullname)),
        widths=layout_df['length'].apply(int).tolist(),
        names=layout_df['name'].tolist())
  return None


# Load every supported blob into <landing dataset>.<file>, replacing the table
# if it already exists.
# NOTE(review): if the layout companion of an .fwf file is itself a .csv blob,
# it is also loaded through the csv path under the same table name - confirm
# that the listing order makes the data file win.
for blob in list_of_blobs:
  df = _read_blob(blob, bucket, list_of_blobs)
  if df is None:
    continue
  df.columns = clean_column_headers(df.columns)
  table_id = landing_dataset_name + '.' + blob.file
  print (table_id)
  df.to_gbq(table_id,project_id=project_id,if_exists='replace')
    

landing.2015_assignment_area


1it [00:03,  3.17s/it]
