## Salaries Dataset Cleaning and Aggregation

##### This notebooks highlights case study analysis on salaries dataset gathered from Kaggle, and will be used to highlight:
- Data Cleaning Techniques
- Data Aggregations and Pivot Tables
- Exploratory Data Analysis (EDA)

In [49]:
# import necessary libraries
import sqlite3
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

#### Simple 'ETL' Pipeline Class for the Dataset

In [50]:
# a basic data pipeline class that loads and deletes empty cols
class Pipeline:

    def __init__(self) -> None:
        self.data = None

    # sql query method
    def query(self, query: any):
        pass
    
    # loads csv file
    def load_csv(self, src: str):
        
        try:
            self.data = pd.read_csv(src)
            return self.data
        except ValueError as e:
            return "The DataFrame is empty."

    # drops empty columns
    def drop_cols(self, cols: list):
        self.data = self.data.drop(columns=cols, axis=1)
        return self.data

    def drop_rows(self, row: list):
        self.data = self.data.drop(rows=row)
        return self.data
    
    # fill values
    def fill_values(self, column: str, values: int):
        self.data = self.data[column].fillna(value=values)
        return self.data
    
    def drop_missing(self):
        self.data = self.data.dropna()
        return self.data

In [51]:
data = Pipeline()

df = data.load_csv("..\data\Salaries.csv")

  self.data = pd.read_csv(src)


In [52]:
df.columns

Index(['Id', 'EmployeeName', 'JobTitle', 'BasePay', 'OvertimePay', 'OtherPay',
       'Benefits', 'TotalPay', 'TotalPayBenefits', 'Year', 'Notes', 'Agency',
       'Status'],
      dtype='object')

In [53]:
# renaming columns
headers = [
    "id", "employee-name", "job-title", "base-pay", "overtime-pay", "other-pay",
    "benefits", "total-pay", "total-pay-benefits", "year", "notes", "agency", "status"
]

df.columns = headers

In [54]:
df.describe(include='all')

Unnamed: 0,id,employee-name,job-title,base-pay,overtime-pay,other-pay,benefits,total-pay,total-pay-benefits,year,notes,agency,status
count,148654.0,148654,148654,148049.0,148654.0,148654.0,112495.0,148654.0,148654.0,148654.0,0.0,148654,38119
unique,,110811,2159,109900.0,66555.0,84968.0,99635.0,,,,,1,2
top,,Kevin Lee,Transit Operator,0.0,0.0,0.0,0.0,,,,,San Francisco,FT
freq,,13,7036,875.0,66103.0,35218.0,1053.0,,,,,148654,22334
mean,74327.5,,,,,,,74768.321972,93692.554811,2012.522643,,,
std,42912.857795,,,,,,,50517.005274,62793.533483,1.117538,,,
min,1.0,,,,,,,-618.13,-618.13,2011.0,,,
25%,37164.25,,,,,,,36168.995,44065.65,2012.0,,,
50%,74327.5,,,,,,,71426.61,92404.09,2013.0,,,
75%,111490.75,,,,,,,105839.135,132876.45,2014.0,,,


In [55]:
# summary statistics on all numerical columns
df.describe()

Unnamed: 0,id,total-pay,total-pay-benefits,year,notes
count,148654.0,148654.0,148654.0,148654.0,0.0
mean,74327.5,74768.321972,93692.554811,2012.522643,
std,42912.857795,50517.005274,62793.533483,1.117538,
min,1.0,-618.13,-618.13,2011.0,
25%,37164.25,36168.995,44065.65,2012.0,
50%,74327.5,71426.61,92404.09,2013.0,
75%,111490.75,105839.135,132876.45,2014.0,
max,148654.0,567595.43,567595.43,2014.0,


In [56]:
# dropping unnecessary/empty columns
data.drop_cols(['agency', 'notes', 'status'])

data.fill_values('benefits', 0)

0                    0
1                    0
2                    0
3                    0
4                    0
              ...     
148649            0.00
148650    Not Provided
148651    Not Provided
148652    Not Provided
148653            0.00
Name: benefits, Length: 148654, dtype: object