# Exploratory Data Analysis for Florida Employee Salary data

In [3]:
import pandas as pd
from pathlib import Path

In [9]:
# Locate the raw archive folder relative to the directory the kernel was
# started in, so no absolute machine-specific path is baked into the notebook.
root = Path.cwd()
archive_path = root.joinpath("archives", "archive_raw")

## Import data
Eventually we'll have a whole database. For now I just have two consecutive weekly data drops. 

In [28]:
# Paths to the two consecutive weekly drops, built from the shared archive folder.
first_path, second_path = (
    archive_path / csv_name
    for csv_name in ("fl_salaries_2021-01-18.csv", "fl_salaries_2021-01-25.csv")
)

# Read each drop into its own DataFrame with pandas' default CSV parsing.
first, second = map(pd.read_csv, (first_path, second_path))

## Examine length of the two files
The second dataset has far fewer rows (and so, presumably, fewer employees) than the first one.

In [17]:
# Report the size of each drop and the difference between them, one line each.
print(
    f"The first salaries file has {len(first):,} rows.",
    f"The second salaries file has {len(second):,} rows.",
    f"The first salaries file has {len(first) - len(second):,} more rows than the second one.",
    sep="\n",
)

The first salaries file has 108,130 rows.
The second salaries file has 92,400 rows.
The first salaries file has 15,730 more rows than the second one.


## Examine a sample of rows
It looks like there is no employee ID column here. Too bad; that would have made things easier. Let's assume for now that the first, middle, and last names, plus the date of hire, are enough to uniquely identify the same employee across two of these data drops.

In [30]:
# Preview the first five rows of the first drop, ordered by last then first name.
first.sort_values(by=["Last Name", "First Name"]).head(n=5)

Unnamed: 0,Agency Name,Budget Entity,Position Number,Last Name,First Name,Middle Name,Employee Type,Full/Part Time,Class Code,Class Title,State Hire Date,Salary,OPS Hourly Rate
94154,Justice Admin Commission,STW/GUARDIAN AD LITEM,13132,AABERG,ELIZABETH,M,Salaried,Full Time,8004.0,ADMINISTRATIVE SPECIALIST II,2013-06-14,"$ 3,2443.20",
32857,Department of Health,DISABILITY BENEFITS DETERM,6863,AABERG,LEAH,MARIE,Salaried,Full Time,3471.0,MEDICAL DISABILITY EXAMINER,2020-08-21,"$ 3,5110.14",
54044,Department of Revenue,GENERAL TAX ADMINISTRATION,1074,AAMODT,JACQUELINE,K,Salaried,Full Time,1619.0,SENIOR REVENUE ADMINISTRATOR - SES,2005-04-01,"$ 7,2100.08",
53634,Department of Military Affairs,MILITARY READINESS,48,AARNIO,JAMES,M,Salaried,Full Time,425.0,DOCUMENT SPECIALIST,2020-05-08,"$ 2,5579.68",
8808,Department of Corrections,SPECIALTY INST OPERATIONS,37484,AARON,BRITTANY,NICOLE,Salaried,Full Time,8005.0,CORRECTIONAL OFFICER SERGEANT,2016-07-29,"$ 4,1955.42",


## Merge the two files

In [36]:
# Columns assumed to identify the same person in both drops (there is no employee ID).
join_keys = ["Last Name", "First Name", "Middle Name", "State Hire Date"]
# Columns from the second drop we want to compare against the first drop.
compared_fields = ["Salary", "Class Title"]

# Inner join keeps only people present in both drops. The overlapping
# non-key columns come back suffixed with _x (first drop) and _y (second drop).
merged = pd.merge(
    first,
    second[join_keys + compared_fields],
    how="inner",
    on=join_keys,
)

## Find all the rows where the salary changed

In [37]:
# A row counts as a salary change when the first drop has a salary value and
# the second drop's value for the same person is different.
# NOTE(review): the values are compared exactly as loaded; the displayed sample
# suggests they are formatted strings such as "$ 4,6506.48" -- confirm.
# NOTE: a missing value never compares equal, so rows whose Salary_y is missing
# (while Salary_x is present) are also counted as changed.
cond_1 = merged["Salary_x"] != merged["Salary_y"]
cond_2 = merged["Salary_x"].notna()

changed_salaries = merged[cond_1 & cond_2]
print(len(changed_salaries))

# Cap the sample size so the cell still runs when fewer than 3 rows changed
# (DataFrame.sample raises ValueError if n exceeds the row count), and fix the
# seed so Restart & Run All shows the same example rows every time.
changed_salaries.sample(n=min(3, len(changed_salaries)), random_state=0)

293


Unnamed: 0,Agency Name,Budget Entity,Position Number,Last Name,First Name,Middle Name,Employee Type,Full/Part Time,Class Code,Class Title_x,State Hire Date,Salary_x,OPS Hourly Rate,Salary_y,Class Title_y
63990,Dept of Financial Services,Finance Regulation,4668,FERNANDEZ,RA'TANYA,SHEREE,Salaried,Full Time,7736.0,ATTORNEY,2007-07-06,"$ 4,6506.48",,"$ 4,7151.24",FINANCIAL SPECIALIST
1976,Agency for Persons w Disabilit,DEVELOPMENTAL DISABILITY CNTRS. CIVIL,32291,GRANT,TONYA,,Salaried,Full Time,5709.0,HUMAN SERVICES WORKER II,2020-08-21,"$ 3,0626.18",,"$ 2,4500.84",HUMAN SERVICES WORKER II
65608,Dept of Children and Families,MENTAL HEALTH SERVICES,47844,GLASS,DANIELLE,RENEE,Salaried,Full Time,5784.0,HUMAN SERVICES WORKER II - F/C,2017-09-08,"$ 2,6751.92",,"$ 2,8089.62",UNIT TREATMNT & REHAB SR SUPV I-F/C -SES


In [39]:
# A row counts as a title change when the first drop has a class title and the
# second drop's title for the same person is different.
# NOTE: a missing value never compares equal, so rows whose Class Title_y is
# missing (while Class Title_x is present) are also counted as changed.
cond_3 = merged["Class Title_x"] != merged["Class Title_y"]
cond_4 = merged["Class Title_x"].notna()

changed_title = merged[cond_3 & cond_4]
print(len(changed_title))

# Cap the sample size so the cell still runs when fewer than 3 rows changed
# (DataFrame.sample raises ValueError if n exceeds the row count), and fix the
# seed so Restart & Run All shows the same example rows every time.
changed_title.sample(n=min(3, len(changed_title)), random_state=0)

174


Unnamed: 0,Agency Name,Budget Entity,Position Number,Last Name,First Name,Middle Name,Employee Type,Full/Part Time,Class Code,Class Title_x,State Hire Date,Salary_x,OPS Hourly Rate,Salary_y,Class Title_y
6531,Agriculture and Consumer Svcs,ANIMAL/PEST/DISEASE CONTRL,1975,STUCKEY,MYCALAH,,Salaried,Full Time,7530.0,AGRICULTURE & CONSUMER PROTECT INSPECTOR,2019-06-07,"$ 2,8867.80",,"$ 3,1754.58",AGRICULTURE AND CONSUMER PROTECT SPEC
34881,Department of Health,STATEWIDE HLTH SUPPORT SVC,68969,BURNS,MELISSA,LAUREN,Salaried,Full Time,4812.0,ENVIRONMENTAL SPECIALIST III,2017-06-16,"$ 4,2903.90",,"$ 4,7208.46",GOVERNMENT OPERATIONS CONSULTANT III
68732,Dept of Children and Families,FAMILY SAFETY & PRESERVATION SVCS,71546,MEYER,JUSTIN,L,Salaried,Full Time,8374.0,CHILD PROTECTIVE FIELD SUPPORT CONSULTAN,2016-10-24,"$ 4,6410.00",,"$ 5,1051.00",CHILD PROTECTIVE INVESTIGATOR SUPV-SES
