In [1]:
import os

import numpy as np
import pandas as pd

In [2]:
# link that was used to obtain the data file (last accessed 8-MAR-2020)
link_stg0 = "https://data.cityofnewyork.us/api/views/k397-673e/rows.csv?accessType=DOWNLOAD"

# downloaded data file (not included in remote repository)
file_stg0 = "data_stg0/Citywide_Payroll_Data__Fiscal_Year_.csv"

In [3]:
# Read stage 0 "NYC payroll" data
nyc_pr = pd.read_csv(
    
    link_stg0, # link to the file (last accessed 8-MAR-2020)
    #file_stg0, # due to its size, the file is not included in the remote repository
    
    usecols = [
        'Fiscal Year',
        'Agency Name',
        'Agency Start Date',
        'Work Location Borough',
        'Title Description',
        'Leave Status as of June 30',
        'Base Salary',
        'Pay Basis',
        'OT Hours',
        'Total OT Paid'
    ],
    
    dtype = {
        'Fiscal Year'          : np.int64,
        'Agency Name'          : object,
        'Agency Start Date'    : object,
        'Work Location Borough': object,
        'Title Description'    : object,
        'Leave Status as of June 30': object,
        'Base Salary'          : np.float64,
        'Pay Basis'            : object,
        'OT Hours'             : np.float64,
        'Total OT Paid'        : np.float64
    },
    
    parse_dates = [
        'Agency Start Date'
    ]
)

In [4]:
# Create stage 1 "NYC payroll" DataFrames

# Row filter (boolean mask over `nyc_pr`): keep ACTIVE employees whose
# base salary
#   * is defined per annum
#   * exceeds 1 Dollar
pr_filter = (
    ( nyc_pr['Leave Status as of June 30'] == 'ACTIVE' ) &
    ( nyc_pr['Pay Basis'] == 'per Annum' ) &
    ( nyc_pr['Base Salary'] > 1 )
)

# Columns to keep. The two columns that are only used for filtering
# ('Leave Status as of June 30' and 'Pay Basis') are dropped.
pr_columns = [
    'Fiscal Year',
    'Agency Name',
    'Agency Start Date',
    'Work Location Borough',
    'Title Description',
    'Base Salary',
    'OT Hours',
    'Total OT Paid'
]

# Apply row and column filters, one DataFrame per fiscal year.
# The boolean mask is passed directly to `.loc`; this selects the same rows
# without first materialising a filtered copy of the whole frame just to
# read its index.
nyc_pr_2018 = nyc_pr.loc[pr_filter & ( nyc_pr['Fiscal Year'] == 2018 ), pr_columns]
nyc_pr_2019 = nyc_pr.loc[pr_filter & ( nyc_pr['Fiscal Year'] == 2019 ), pr_columns]

In [5]:
# Save stage 1 "NYC payroll" data

# Make sure the output directory exists first: `to_csv` does not create
# missing parent directories and raises if 'data_stg1' is absent.
os.makedirs('data_stg1', exist_ok=True)

# One CSV per fiscal year; the row index is not written.
nyc_pr_2018.to_csv('data_stg1/nyc_pr_2018.csv', index=False)
nyc_pr_2019.to_csv('data_stg1/nyc_pr_2019.csv', index=False)