## Salaries Dataset Cleaning and Aggregation

##### This notebook presents a case-study analysis of a salaries dataset from Kaggle and is used to demonstrate:
- Data Cleaning Techniques
- Data Aggregations and Pivot Tables
- Exploratory Data Analysis (EDA)

In [118]:
# import necessary libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

#### Simple 'ETL' Pipeline Class for the Dataset

In [119]:
# a basic data pipeline class that loads a CSV and applies simple cleaning steps
class Pipeline:
    """Minimal load-and-clean helper around a single pandas DataFrame.

    The working frame is stored in ``self.data`` (``None`` until a CSV has
    been loaded). Each cleaning method updates ``self.data`` and returns it,
    so callers can rebind their own variable to the result.
    """

    def __init__(self) -> None:
        self.data = None

    # loads csv file
    def load_csv(self, src: str):
        """Read the CSV at ``src`` into ``self.data`` and return the frame.

        If ``pd.read_csv`` raises a ``ValueError``, the string
        ``"The DataFrame is empty."`` is returned instead of a DataFrame and
        ``self.data`` is left as it was.
        """
        try:
            self.data = pd.read_csv(src)
            return self.data
        except ValueError:
            # NOTE(review): every ValueError is reported with this one message,
            # and the caller receives a str rather than a DataFrame -- kept
            # as-is for backward compatibility.
            return "The DataFrame is empty."

    # drops empty columns
    def drop_cols(self, cols: list):
        """Drop the columns named in ``cols`` and return the updated frame."""
        # ``columns=`` already selects the axis, so no ``axis`` argument is needed
        self.data = self.data.drop(columns=cols)
        return self.data

    # drops specific rows
    def drop_rows(self, row: list):
        """Drop the rows whose index labels are in ``row`` and return the updated frame."""
        # DataFrame.drop has no ``rows`` keyword; row labels are passed via ``index``
        self.data = self.data.drop(index=row)
        return self.data

    # fill values
    def fill_values(self):
        """Replace every missing value in ``self.data`` with 0 (in place) and return the frame."""
        self.data.fillna(0, inplace=True)
        return self.data

    def drop_missing(self):
        """Drop every row that contains at least one missing value and return the updated frame."""
        self.data = self.data.dropna()
        return self.data

In [120]:
# pipeline instance that owns the working DataFrame
data = Pipeline()

# forward slashes: "\d" and "\S" are not valid string escape sequences, and
# "/" is accepted as a path separator on Windows as well as on POSIX systems
df = data.load_csv("../data/Salaries.csv")

  self.data = pd.read_csv(src)


In [121]:
# inspect the current column labels
df.columns

Index(['Id', 'EmployeeName', 'JobTitle', 'BasePay', 'OvertimePay', 'OtherPay',
       'Benefits', 'TotalPay', 'TotalPayBenefits', 'Year', 'Notes', 'Agency',
       'Status'],
      dtype='object')

In [122]:
# (number of rows, number of columns)
df.shape

(148654, 13)

In [123]:
# new lowercase, hyphen-separated column labels, listed in column order
headers = [
    "id",
    "employee-name",
    "job-title",
    "base-pay",
    "overtime-pay",
    "other-pay",
    "benefits",
    "total-pay",
    "total-pay-benefits",
    "year",
    "notes",
    "agency",
    "status",
]

# labels are applied positionally, one per existing column
df.columns = headers

In [124]:
# remove the unnecessary/empty columns from the pipeline's frame
df = data.drop_cols(['agency', 'notes', 'status'])
# replace the remaining missing values with 0
df = data.fill_values()

# per-column null count over the base-pay .. benefits slice
df.loc[:, 'base-pay':'benefits'].isnull().sum(axis=0)

base-pay        0
overtime-pay    0
other-pay       0
benefits        0
dtype: int64

In [125]:
# list the dtype of every column
df.dtypes

id                      int64
employee-name          object
job-title              object
base-pay               object
overtime-pay           object
other-pay              object
benefits               object
total-pay             float64
total-pay-benefits    float64
year                    int64
dtype: object

In [126]:
# converting numerical object types into float data type

# Drop every row that contains the text "Not provided" (any letter case) in any
# column. The mask is built column by column and then reduced across columns,
# which gives the same rows as a per-row apply without a Python call per row.
df = df[
    ~df.astype(str)
    .apply(lambda col: col.str.contains("Not provided", case=False))
    .any(axis=1)
].copy()

df[['base-pay', 'overtime-pay', 'other-pay', 'benefits']] = df[['base-pay', 'overtime-pay', 'other-pay', 'benefits']].astype(float)

# verify data types
df.dtypes

id                      int64
employee-name          object
job-title              object
base-pay              float64
overtime-pay          float64
other-pay             float64
benefits              float64
total-pay             float64
total-pay-benefits    float64
year                    int64
dtype: object

In [127]:
# keep only the rows whose values in base-pay .. total-pay-benefits are all non-negative;
# .copy() makes the result an independent frame so the .loc assignment below
# does not raise SettingWithCopyWarning
df = df[(df.loc[:, "base-pay":"total-pay-benefits"] >= 0).all(axis=1)].copy()

# lambda function to standardize string values (title-case every column in the slice)
df.loc[:, "employee-name":"job-title"] = df.loc[:, "employee-name":"job-title"].apply(lambda cap: cap.str.title())

# counting rows where a column in the slice is still exactly "Not Provided"
count = (df.loc[:, "employee-name":"job-title"] == "Not Provided").any(axis=1).sum()

print(count)

0


In [None]:
# restore upper-case roman numerals (III, II, IV) in job titles;
# \b anchors restrict each pattern to whole words
df['job-title'] = (
    df['job-title']
    .str.replace(r'\bIii\b', 'III', regex=True)
    .str.replace(r'\bIi\b', 'II', regex=True)
    .str.replace(r'\bIv\b', 'IV', regex=True)
)