<a href="https://colab.research.google.com/github/j-buss/wi-dpi-analysis/blob/development/eda/2.0_Landing.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Salary and Education in Wisconsin - 2.0 Load Landing BigQuery

This notebook is part of an analysis of teacher salaries reported by the Wisconsin Department of Public Instruction. It loads the raw source files from the Cloud Storage landing bucket into the BigQuery `landing` dataset.

## Introduction

### Install Libraries
Install the following packages, which are needed to load data into BigQuery.

*Please note that this will require a restart of the runtime.*

In [0]:
# BigQuery client library, imported further down as google.cloud.bigquery.
!pip install --upgrade google-cloud-bigquery
# NOTE(review): gcsfs is not imported by any visible cell - confirm it is still needed.
!pip install gcsfs
# pandas-gbq is the backend for DataFrame.to_gbq, used in the "Load Data" section.
!pip install pandas-gbq -U

### Authenticate to Google Cloud

In [0]:
# Colab-only helper that authenticates the notebook user.
# NOTE(review): the BigQuery and Storage clients created later presumably rely
# on these credentials - confirm when running outside Colab.
from google.colab import auth
auth.authenticate_user()

### Import Libraries

In [0]:
import numpy as np
import pandas as pd
# Keep displayed frames compact: at most 50 columns and 5 rows are shown, so
# the DataFrame/Series outputs in this notebook are truncated.
pd.set_option('display.max_columns', 50)
pd.set_option('display.max_rows', 5)
import seaborn as sns
import matplotlib.pyplot as plt


In [0]:
# Render matplotlib figures inside the notebook output.
%matplotlib inline
# Apply the 'bmh' matplotlib style sheet to subsequent plots.
plt.style.use('bmh')

In [0]:
from google.cloud import bigquery
from google.cloud import storage
from io import StringIO
import re

### Functions

In [0]:
def download_file(url, filename):
  """Download ``url`` and save the response body to ``filename``.

  Args:
    url: Address fetched with an HTTP GET request.
    filename: Local path to write. Opened in binary mode, so an existing
      file is overwritten.

  Raises:
    requests.HTTPError: If the server answers with an error status, so an
      error page is never saved as if it were data.
  """
  # Imported here because none of the notebook's import cells bring in
  # `requests`; without it the function fails with NameError on a fresh kernel.
  import requests

  r = requests.get(url)
  r.raise_for_status()
  # The context manager closes the file even if the write fails.
  with open(filename, 'wb') as f:
    f.write(r.content)

In [0]:
def create_dataset(client, project_id, dataset_name):
  """Create the BigQuery dataset ``project_id.dataset_name`` in the US location.

  The call is safe to repeat: if the dataset already exists it is left in
  place instead of raising a conflict error, so the notebook can be re-run
  from the top.

  Args:
    client: BigQuery client used to issue the request.
    project_id: Project that owns the dataset.
    dataset_name: Name of the dataset inside the project.
  """
  dataset_id = "{}.{}".format(project_id, dataset_name)
  dataset = bigquery.Dataset(dataset_id)
  dataset.location = "US"

  # exists_ok=True turns "already exists" into a no-op rather than an error.
  client.create_dataset(dataset, exists_ok=True)

In [0]:
def return_blob_list(project_id, bucket_name):
    """Return the blob listing for ``bucket_name``.

    A Cloud Storage client is created for ``project_id``, the bucket is
    looked up by name, and the result of its ``list_blobs()`` call is
    handed back unchanged.
    """
    gcs = storage.Client(project=project_id)
    return gcs.get_bucket(bucket_name).list_blobs()

### Define Values

In [0]:
# Google Cloud project used for the BigQuery and Cloud Storage clients.
project_id='wi-dpi-010'
# Cloud Storage bucket that holds the raw source files.
raw_data_bucket_name='landing-009'

# BigQuery dataset names.
# NOTE(review): refined_dataset_name is not referenced by any visible cell.
landing_dataset_name='landing'
refined_dataset_name='refined'

## Data Preparation

### Create Dataset

In [0]:
bq_client = bigquery.Client(project=project_id)
# Use the configured dataset name instead of repeating the "landing" literal,
# so changing it under "Define Values" takes effect here too.
create_dataset(bq_client, project_id, landing_dataset_name)

### Create Dataframes from GCS Blobs

For all files (blobs) in the bucket we will split the blob name into the following components:


1.   Source
2.   Year
3.   File
4.   File Type



In [0]:
import collections

# Record type for one bucket object, with one field per blob-name component:
# source, year, file and file type.
blob_to_process = collections.namedtuple(
    'blob_to_process',
    ['source', 'year', 'file', 'file_type'],
)

In [61]:
# Print how each blob name splits into word-like components. The project and
# bucket come from the "Define Values" cell rather than repeated literals.
for blob in return_blob_list(project_id, raw_data_bucket_name):
  print(re.findall(r"[\w']+",blob.name))

['all_staff_report']
['all_staff_report', '1995', '1995_all_staff_report', 'fwf']
['all_staff_report', '1995', '1995_all_staff_report', 'metadata']
['all_staff_report', '1995', '1995_degree', 'csv']
['all_staff_report', '1995', '1995_grade_code', 'csv']
['all_staff_report', '1995', '1995_race', 'csv']
['all_staff_report', '1995', '1995_salary_fund_source', 'csv']
['all_staff_report', '1995', '1995_school_type', 'csv']
['all_staff_report', '1995', '1995_staff_type', 'csv']
['all_staff_report', '1995', '1995_work_agency_type', 'csv']
['all_staff_report', '1996', '1996_all_staff_report', 'fwf']
['all_staff_report', '1996', '1996_all_staff_report', 'metadata']
['all_staff_report', '2015', '2015_agency_type', 'csv']
['all_staff_report', '2015', '2015_all_staff_report', 'csv']
['all_staff_report', '2015', '2015_assignment_area', 'csv']
['all_staff_report', '2015', '2015_grade_level', 'csv']
['all_staff_report', '2015', '2015_highest_educational_degree', 'csv']
['all_staff_report', '2015', '2

In [0]:
# Object paths inside the raw-data bucket for the 1995 all-staff report:
# the fixed-width data file and the metadata file that lists its fields.
fixed_width_file = 'all_staff_report/1995/1995_all_staff_report.fwf'
metadata_file = 'all_staff_report/1995/1995_all_staff_report.metadata'
# NOTE(review): staff_type_file is not read by any visible cell.
staff_type_file = 'all_staff_report/1995/1995_staff_type.csv'

In [0]:
# Fetch the fixed-width data file and its metadata file from the raw-data bucket.
storage_client = storage.Client(project=project_id)
bucket = storage_client.get_bucket(raw_data_bucket_name)
# NOTE(review): get_blob presumably returns None for a missing object, which
# would make the download call below fail - confirm.
data_blob = bucket.get_blob(fixed_width_file)
# The downloaded content is kept as bytes; later cells decode it as UTF-8.
data = data_blob.download_as_string()
metadata_blob = bucket.get_blob(metadata_file)
metadata = metadata_blob.download_as_string()

In [0]:
# Parse the downloaded metadata (UTF-8 text) as CSV.
metadata_df = pd.read_csv(StringIO(metadata.decode('utf-8')))

In [12]:
# A bare expression uses the notebook's rich DataFrame display instead of the
# plain-text dump produced by print().
metadata_df

                   name  start  length
0             id_number    1.0     9.0
1             last_name   10.0    20.0
..                  ...    ...     ...
60  administrators_name  499.0    30.0
61               filler  529.0     2.0

[62 rows x 3 columns]


In [0]:
# Field lengths from the metadata, converted element by element to Python ints.
col_widths = metadata_df['length'].map(int)

In [0]:
# Field names from the metadata; passed as `names` when the fixed-width file is parsed.
col_names = metadata_df['name']

In [15]:
# Preview the field names.
col_names

0               id_number
1               last_name
             ...         
60    administrators_name
61                 filler
Name: name, Length: 62, dtype: object

In [0]:
# Normalise the metadata field names before they become DataFrame column
# names: trim whitespace, lower-case, spaces -> underscores, drop parentheses.
# Derived from metadata_df (not edited in place) so re-running the cell gives
# the same result; regex=False treats '(' and ')' as literal characters.
col_names = (
    metadata_df['name']
    .str.strip()
    .str.lower()
    .str.replace(' ', '_', regex=False)
    .str.replace('(', '', regex=False)
    .str.replace(')', '', regex=False)
)

In [27]:
# Count the metadata rows recorded for each field name.
metadata_df.groupby('name').count()

Unnamed: 0_level_0,start,length
name,Unnamed: 1_level_1,Unnamed: 2_level_1
administrators_name,1,1
agency_name_of_work_location,1,1
...,...,...
work_agency_type,1,1
year_and_session,1,1


In [29]:
# pandas does not allow duplicate entries in `names` (older releases only
# warned and renamed repeats to "name.N"), so give every repeated field name
# a numeric suffix first: name, name_1, name_2, ...
name_counts = {}
unique_col_names = []
for name in col_names:
  repeat = name_counts.get(name, 0)
  name_counts[name] = repeat + 1
  unique_col_names.append(name if repeat == 0 else '{}_{}'.format(name, repeat))

# Parse the fixed-width text using the widths and names taken from the metadata.
data_df = pd.read_fwf(StringIO(data.decode('utf-8')), widths=col_widths, names=unique_col_names)

  return _read(filepath_or_buffer, kwds)


In [0]:
# Swap dots for underscores in the column names before loading to BigQuery.
# regex=False makes '.' a literal dot on every pandas version; interpreted as
# a regular expression it would match (and replace) every character.
data_df.columns = data_df.columns.str.replace('.', '_', regex=False)

### Load Data

In [31]:
# Write the parsed frame to the landing dataset, replacing any existing copy of
# the table. The dataset name comes from the "Define Values" cell.
data_df.to_gbq('{}.1995_test'.format(landing_dataset_name), project_id=project_id, if_exists='replace')

1it [00:06,  6.34s/it]
