### Salaries Dataset Cleaning and Aggregation

##### This notebook presents a case-study analysis of a salaries dataset gathered from Kaggle, and will be used to highlight:
- Data Cleaning Techniques
- Data Aggregations and Pivot Tables
- Exploratory Data Analysis (EDA)

In [363]:
# import necessary libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

##### Simple 'ETL' Pipeline Class for the Dataset

In [364]:
# a basic data pipeline class that loads a csv and applies simple cleaning steps
class Pipeline:
    """Minimal load-and-clean helper wrapped around a single pandas DataFrame.

    The frame being worked on is stored in ``self.data``. Every method
    returns that frame so the caller can keep working with it directly.
    """

    def __init__(self) -> None:
        # no frame is held until load_csv() succeeds
        self.data = None

    # loads csv file
    def load_csv(self, src: str):
        """Read ``src`` with ``pd.read_csv`` and keep the result in ``self.data``.

        Returns:
            The loaded DataFrame. If pandas raises ``ValueError`` (this
            includes ``pandas.errors.EmptyDataError``), the string
            ``"The DataFrame is empty."`` is returned instead and
            ``self.data`` is left as it was.
        """
        try:
            self.data = pd.read_csv(src, low_memory=False)
            return self.data
        except ValueError:
            # kept for backward compatibility: callers receive a message
            # string rather than an exception
            return "The DataFrame is empty."

    # drops empty columns
    def drop_cols(self, cols: list):
        """Remove the columns labelled in ``cols`` and return the new frame."""
        self.data = self.data.drop(columns=cols)
        return self.data

    # drops specific rows
    def drop_rows(self, row: list):
        """Remove the rows whose index labels are listed in ``row``.

        ``DataFrame.drop`` has no ``rows`` keyword; row labels are passed
        through ``index``.
        """
        self.data = self.data.drop(index=row)
        return self.data

    # fill values
    def fill_values(self):
        """Replace every missing value with 0 and return the frame.

        The fill happens in place, so the object already referenced by
        ``self.data`` is the one that gets modified.
        """
        self.data.fillna(0, inplace=True)
        return self.data

    def drop_missing(self):
        """Drop every row that contains at least one missing value."""
        self.data = self.data.dropna()
        return self.data

In [365]:
# build the pipeline and read the raw salaries file; forward slashes keep the
# relative path portable and avoid the invalid "\d" / "\S" escape sequences
data = Pipeline()

df = data.load_csv("../data/Salaries.csv")

In [366]:
# list the column labels of the freshly loaded frame
df.columns

Index(['Id', 'EmployeeName', 'JobTitle', 'BasePay', 'OvertimePay', 'OtherPay',
       'Benefits', 'TotalPay', 'TotalPayBenefits', 'Year', 'Notes', 'Agency',
       'Status'],
      dtype='object')

In [367]:
# (rows, columns) of the data before any cleaning step has run
df.shape

(148654, 13)

In [368]:
# renaming columns to lower-case, hyphen-separated labels.
# The labels are assigned onto df itself: df is the very frame that the
# Pipeline instance `data` holds (load_csv returned it), so the later
# pipeline calls see the new names as well.
headers = [
    "id",
    "employee-name",
    "job-title",
    "base-pay",
    "overtime-pay",
    "other-pay",
    "benefits",
    "total-pay",
    "total-pay-benefits",
    "year",
    "notes",
    "agency",
    "status",
]

df.columns = headers

In [369]:
# remove the columns this analysis does not use, then replace what is
# still missing with 0 (both steps run on the frame held by the pipeline)
unused_columns = ['agency', 'notes', 'status']
data.drop_cols(unused_columns)
df = data.fill_values()

# confirm that the pay columns no longer contain missing values
pay_slice = df.loc[:, 'base-pay':'benefits']
pay_slice.isna().sum()

base-pay        0
overtime-pay    0
other-pay       0
benefits        0
dtype: int64

In [370]:
# inspect the dtype of every column after the drop/fill step
df.dtypes

id                      int64
employee-name          object
job-title              object
base-pay               object
overtime-pay           object
other-pay              object
benefits               object
total-pay             float64
total-pay-benefits    float64
year                    int64
dtype: object

In [371]:
# drop every row in which any field contains "Not provided" (case-insensitive).
# The pattern is matched column by column and the per-row result is taken
# with .any(axis=1), which avoids one Python-level call per row; .copy()
# gives an independent frame for the assignment below.
not_provided = (
    df.astype(str)
    .apply(lambda col: col.str.contains("Not provided", case=False))
    .any(axis=1)
)
df = df[~not_provided].copy()

# converting numerical object types into float data type
pay_cols = ['base-pay', 'overtime-pay', 'other-pay', 'benefits']
df[pay_cols] = df[pay_cols].astype(float)

# verify data types
df.dtypes

id                      int64
employee-name          object
job-title              object
base-pay              float64
overtime-pay          float64
other-pay             float64
benefits              float64
total-pay             float64
total-pay-benefits    float64
year                    int64
dtype: object

In [372]:
# keep only the rows whose pay figures are all non-negative
# (zeros pass this filter; only negative values are removed here)
non_negative = (df.loc[:, "base-pay":"total-pay-benefits"] >= 0).all(axis=1)
df = df[non_negative]

# dropping the 0 values on total-pay as we treat them as not provided;
# .copy() makes df an independent frame so the assignments that follow do
# not write into a filtered selection (avoids pandas' SettingWithCopyWarning)
df = df[df['total-pay'] > 0].copy()

# standardize the name and job-title strings to Title Case
df.loc[:, "employee-name":"job-title"] = df.loc[:, "employee-name":"job-title"].apply(lambda cap: cap.str.title())

# counting the rows whose name or job title is still exactly "Not Provided"
count = (df.loc[:, "employee-name":"job-title"] == "Not Provided").any(axis=1).sum()

print(count)

0


In [373]:
# formatting job title strings: title-casing turned roman numerals into
# "Iii" / "Ii" / "Iv", so put them back in upper case (applied in this order)
roman_fixes = {
    r'\bIii\b': 'III',
    r'\bIi\b': 'II',
    r'\bIv\b': 'IV',
}

job_titles = df['job-title']
for pattern, numeral in roman_fixes.items():
    job_titles = job_titles.str.replace(pattern, numeral, regex=True)
df['job-title'] = job_titles

df

Unnamed: 0,id,employee-name,job-title,base-pay,overtime-pay,other-pay,benefits,total-pay,total-pay-benefits,year
0,1,Nathaniel Ford,General Manager-Metropolitan Transit Authority,167411.18,0.00,400184.25,0.00,567595.43,567595.43,2011
1,2,Gary Jimenez,Captain III (Police Department),155966.02,245131.88,137811.38,0.00,538909.28,538909.28,2011
2,3,Albert Pardini,Captain III (Police Department),212739.13,106088.18,16452.60,0.00,335279.91,335279.91,2011
3,4,Christopher Chong,Wire Rope Cable Maintenance Mechanic,77916.00,56120.71,198306.90,0.00,332343.61,332343.61,2011
4,5,Patrick Gardner,"Deputy Chief Of Department,(Fire Department)",134401.60,9737.00,182234.59,0.00,326373.19,326373.19,2011
...,...,...,...,...,...,...,...,...,...,...
148621,148622,Kenneth Mackey,Transit Operator,0.00,0.00,15.35,0.00,15.35,15.35,2014
148622,148623,Jacqueline Hubbard,Senior Clerk,0.00,0.00,13.77,0.00,13.77,13.77,2014
148623,148624,Deborah B Honig,Attorney (Civil/Criminal),0.00,0.00,13.35,0.13,13.35,13.48,2014
148624,148625,Lorraine Rosenthal,Senior Clerk,0.00,0.00,12.89,0.00,12.89,12.89,2014


In [374]:
# summary statistics of the current dataframe's numeric columns,
# rounded to two decimals for readability
df.describe().round(2)

Unnamed: 0,id,base-pay,overtime-pay,other-pay,benefits,total-pay,total-pay-benefits,year
count,148266.0,148266.0,148266.0,148266.0,148266.0,148266.0,148266.0,148266.0
mean,74239.0,66223.56,5078.85,3658.14,18956.17,74960.54,93916.71,2012.52
std,42921.43,42811.29,11466.13,8064.84,17175.13,50440.14,62718.75,1.12
min,1.0,0.0,0.0,0.0,0.0,0.3,0.3,2011.0
25%,37071.25,33297.32,0.0,0.0,0.0,36690.87,44716.52,2012.0
50%,74174.5,64957.53,0.0,819.59,23376.78,71555.94,92577.83,2013.0
75%,111533.75,94691.01,4680.88,4255.35,33493.14,105967.1,133008.42,2014.0
max,148626.0,319275.01,245131.88,400184.25,96570.66,567595.43,567595.43,2014.0


##### Save the Cleaned DataFrame to a CSV File

In [375]:
# write the cleaned dataframe to a csv file without the index column; forward
# slashes keep the path portable and avoid the invalid "\d" / "\s" escapes
df.to_csv("../data/salaries_cleaned.csv", index=False)