In [1]:
# import necessary libraries
from google.cloud import bigquery
from google.oauth2 import service_account
import pandas as pd
import numpy as np

# BigQuery client bound to the open-baltimore project; every query below goes through it
project_id = "open-baltimore-data"
client = bigquery.Client(
    project=project_id,
)

In [2]:
# smoke test: fetch ten rows from the salaries table and load them into a DataFrame
query_job = client.query(
    """
    SELECT *
    FROM city_employee_salaries.main
    LIMIT 10 
    """
)

results = (
    query_job
    .result()
    .to_dataframe()
)

In [48]:
# Load salary records from `main`, left-joined to `employee_info` on ObjectId to
# pick up each record's employeeSlug. Any parenthesised text is stripped from
# agencyName and the result trimmed to give cleanAgencyName.
#
# The SQL is a *raw* Python string: the regex contains `\(` and `\)`, which are
# invalid escape sequences in a normal Python literal (DeprecationWarning, and a
# SyntaxWarning from Python 3.12). The text sent to BigQuery is unchanged.
employee_salaries_query = client.query(
    r"""
    SELECT 
        info.employeeSlug,
        main.objectId,
        TRIM(REGEXP_REPLACE(main.agencyName, r'\(.*?\)', '')) as cleanAgencyName,
        main.agencyId,
        main.annualSalary,
        main.fiscalYear
    FROM city_employee_salaries.main as main
    LEFT JOIN city_employee_salaries.employee_info as info
    ON main.ObjectId = info.ObjectId
    """
)
# Block until the query finishes, then materialise the rows as a pandas DataFrame.
employee_salaries = employee_salaries_query.result().to_dataframe()

In [50]:
# Lookup table from agency code to a short, human-readable agency name.
#
# NOTE(review): "Mayors OED" was previously keyed "A02", duplicating the
# "City Council" key; Python keeps only the last duplicate, so "City Council"
# was silently lost. It is keyed "A03" here, the gap in the A01, A02, A04 sequence.
# Confirm against the agencyId values in the salary data.
agencyDict = {
    "A01": "Mayors Office",
    "A02": "City Council",
    "A03": "Mayors OED",
    "A04": "Parks & Rec",
    "A05": "MONSE",
    "A06": "Housing & Community Dev",
    "A08": "M-R Human Services",
    "A09": "Liquor License Board",
    "A10": "Mayors Office of Children & Families",
    "A11": "Office of the Inspector General",
    "A12": "Finance - Accounting & Payroll",
    "A14": "Finance - Collections",
    "A15": "Comptroller - Real Estate",
    "A16": "Comptrollers Office",
    "A17": "Finance - Purchasing",
    "A18": "Finance - Treasury",
    "A19": "City Planning",
    "A23": "Finance - Admin & Budgets",
    "A24": "Comptroller - Audits",
    "A26": "M-R Labor Commissioner",
    "A28": "Wage Commissioner",
}
                                                                     

SyntaxError: ':' expected after dictionary key (1129860010.py, line 11)

In [49]:
# tally how many salary records exist for each (agencyId, cleanAgencyName) pair
agency_record_count = (
    employee_salaries
    .groupby(["agencyId", "cleanAgencyName"], as_index=False)
    .agg(salarieRecords=("agencyId", "count"))
)
# Per-employee salary profile within each agency; one output row per
# (employeeSlug, agencyId, cleanAgencyName) group:
#   avgSalary / highestSalary / lowestSalary - mean, max and min of annualSalary
#   growth   - highest minus lowest salary on record
#   tenure   - number of distinct fiscal years with a salary record. This used to
#              count distinct annualSalary values, which under-counts anyone whose
#              pay stayed flat from one year to the next.
#   avgRaise - growth divided by the number of salary records in the group
#              NOTE(review): divides by record count, not by (years - 1) - confirm intent
# Series.max()/min() replace the builtins so missing salaries are skipped instead
# of giving order-dependent results.
employee_salary_quality = (
    employee_salaries
    .groupby(["employeeSlug", "agencyId", "cleanAgencyName"], as_index=False)
    .agg(
        avgSalary=pd.NamedAgg(column="annualSalary", aggfunc="mean"),
        highestSalary=pd.NamedAgg(column="annualSalary", aggfunc="max"),
        lowestSalary=pd.NamedAgg(column="annualSalary", aggfunc="min"),
        growth=pd.NamedAgg(
            column="annualSalary",
            aggfunc=lambda salary: salary.max() - salary.min(),
        ),
        tenure=pd.NamedAgg(column="fiscalYear", aggfunc="nunique"),
        avgRaise=pd.NamedAgg(
            column="annualSalary",
            aggfunc=lambda salary: (salary.max() - salary.min()) / len(salary),
        ),
    )
)
# Roll the per-employee profiles up to one row per (agencyId, cleanAgencyName),
# taking the median of each employee-level metric so a few extreme earners do
# not dominate. medSalary previously used "mean", contradicting both its name
# and the other columns; it is now a median like the rest.
agency_salary_quality = (
    employee_salary_quality
    .groupby(["agencyId", "cleanAgencyName"], as_index=False)
    .agg(
        medSalary=pd.NamedAgg(column="avgSalary", aggfunc="median"),
        medHighestSalary=pd.NamedAgg(column="highestSalary", aggfunc="median"),
        medLowestSalary=pd.NamedAgg(column="lowestSalary", aggfunc="median"),
        medGrowth=pd.NamedAgg(column="growth", aggfunc="median"),
        medTenure=pd.NamedAgg(column="tenure", aggfunc="median"),
        medRaise=pd.NamedAgg(column="avgRaise", aggfunc="median"),
    )
)