In [19]:
# import necessary libraries
from google.cloud import bigquery
from google.oauth2 import service_account
import pandas as pd
import numpy as np

# Connect to the open-baltimore BigQuery project with a service-account file.
# The file path is relative, so it is resolved against the kernel's working directory.
project_id = "open-baltimore-data"
credentials = service_account.Credentials.from_service_account_file(
    "open-baltimore-data-4e3d67e89601.json"
)
client = bigquery.Client(project=project_id, credentials=credentials)

In [20]:
# Test query: pull ten rows from city_employee_salaries.main.
query_job = client.query(
    """
    SELECT *
    FROM city_employee_salaries.main
    LIMIT 10 
    """
)

# Wait for the job's result, then load the rows into a DataFrame.
results = (
    query_job
    .result()
    .to_dataframe()
)

In [21]:
# Load salary records from city_employee_salaries.main and attach each record's
# employeeSlug from city_employee_salaries.employee_info, matched on ObjectId.
# The LEFT JOIN keeps every row of main, so employeeSlug is NULL wherever no
# employee_info row matches.
employee_salaries_query = client.query(
    """
    SELECT 
        info.employeeSlug,
        main.objectId,
        main.agencyName,
        main.agencyId,
        main.annualSalary,
        main.fiscalYear
    FROM city_employee_salaries.main as main
    LEFT JOIN city_employee_salaries.employee_info as info
    ON main.ObjectId = info.ObjectId
    """
)

# Wait for the job's result, then load the rows into a DataFrame.
employee_salaries = (
    employee_salaries_query
    .result()
    .to_dataframe()
)

In [25]:
# Number of salary records per city agency: one output row for each
# (agencyId, agencyName) pair, with the record count in `salarieRecords`.
agency_record_count = (
    employee_salaries
    .groupby(["agencyId", "agencyName"], as_index=False)
    .agg(salarieRecords=("agencyId", "count"))
)
# Per-employee, per-agency salary summary: one row for each
# (employeeSlug, agencyId, agencyName) combination.
#   avgSalary / highestSalary / lowestSalary: mean, max and min of annualSalary
#   growth:   highest minus lowest annualSalary
#   tenure:   number of distinct fiscal years with a salary record
#   avgRaise: growth divided by the number of salary records in the group
# NOTE(review): groupby drops rows whose key is missing by default, so salary
# records without an employeeSlug are excluded here — confirm that is intended.
# NOTE(review): avgRaise divides by the record count, not by the number of
# year-to-year intervals (count - 1) — confirm which definition is wanted.
employee_salary_quality = employee_salaries.groupby(["employeeSlug", "agencyId", "agencyName"], as_index=False).agg(
    avgSalary=pd.NamedAgg(column="annualSalary", aggfunc="mean"),
    highestSalary=pd.NamedAgg(column="annualSalary", aggfunc="max"),
    lowestSalary=pd.NamedAgg(column="annualSalary", aggfunc="min"),
    # Series.max()/min() skip missing values; the Python builtins do not.
    growth=pd.NamedAgg(column="annualSalary", aggfunc=lambda salary: salary.max() - salary.min()),
    # Count fiscal years rather than distinct salary values: an employee whose
    # pay is identical in two different years must still be credited with both.
    tenure=pd.NamedAgg(column="fiscalYear", aggfunc="nunique"),
    avgRaise=pd.NamedAgg(column="annualSalary", aggfunc=lambda salary: (salary.max() - salary.min()) / len(salary)),
)
# Agency-level summary: the median, across each agency's employees, of every
# per-employee metric in employee_salary_quality. One row per
# (agencyId, agencyName) pair.
agency_salary_quality = employee_salary_quality.groupby(["agencyId", "agencyName"], as_index=False).agg(
    # Median, matching the column name and the other med* columns.
    medSalary=pd.NamedAgg(column="avgSalary", aggfunc="median"),
    medHighestSalary=pd.NamedAgg(column="highestSalary", aggfunc="median"),
    medLowestSalary=pd.NamedAgg(column="lowestSalary", aggfunc="median"),
    medGrowth=pd.NamedAgg(column="growth", aggfunc="median"),
    medTenure=pd.NamedAgg(column="tenure", aggfunc="median"),
    medRaise=pd.NamedAgg(column="avgRaise", aggfunc="median"),
)