In [1]:
import pandas as pd
from bokeh.models import NumeralTickFormatter
#import janitor
# import numpy as np
# from plotly import __version__
# from plotly.offline import download_plotlyjs, init_notebook_mode, plot, iplot
# import plotly.graph_objs as go
import pprint
# init_notebook_mode(connected=True)
# Render matplotlib figures inline in the cell output.
%matplotlib inline
# Show up to 999 columns when displaying wide DataFrames.
pd.set_option('display.max_columns', 999)
pd.set_option('plotting.backend', 'pandas_bokeh') # new in pandas 0.25 - able to now set plotting backend
# NOTE(review): `output_notebook` is not part of pandas' own plotting module; presumably it is
# attached by pandas_bokeh when the backend is loaded — confirm.
pd.plotting.output_notebook()

In [2]:
# Show the active plotting backend. The full option name is used because pandas resolves
# abbreviated names by pattern matching, which fails as soon as more than one option matches.
pd.get_option('plotting.backend')

'pandas_bokeh'

In [3]:
# Record the installed pandas version.
pd.__version__

'0.25.0'

In [16]:
# Scratch check: wrapping a Series in DataFrame() yields a single-column frame.
sample_values = pd.Series({'a': 1, 'b': 2})
test = pd.DataFrame(sample_values)

In [4]:
# Read the cleaned payroll table stored under the 'njpayroll' key of the HDF5 file.
df = pd.read_hdf('processed_data/YourMoney_Agency_Payroll.hdf', key='njpayroll')

In [5]:
# Partition the payroll table by its `record_type` column.
is_master = df['record_type'] == 'master'
is_detail = df['record_type'] == 'detail'
df_master = df[is_master]
df_detail = df[is_detail]
#df_master.to_pickle('raw_data/df_master.pkl')
#df_detail.to_pickle('raw_data/df_detail.pkl')

In [6]:
# Pretty-print the column names, one per line.
pprint.pprint(df_master.columns.tolist())

['calendar_year',
 'calendar_quarter',
 'as_of_date',
 'payroll_id',
 'last_name',
 'first_name',
 'middle_initial',
 'full_name',
 'salary_hourly_rate',
 'master_department_agency_desc',
 'master_section_desc',
 'master_title_desc',
 'employee_relations_group',
 'compensation_method',
 'master_ytd_regular_pay',
 'master_ytd_overtime_payments',
 'master_ytd_all_other_payments',
 'master_ytd_earnings',
 'paid_department_agency_desc',
 'paid_section_desc',
 'regular_pay',
 'supplemental_pay',
 'one_time_payments',
 'legislator_or_back_pay',
 'overtime_payments',
 'clothing_uniform_payments',
 'retroactive_pay',
 'lump_sum_pay',
 'cash_in_lieu_maintenance',
 'ytd_earnings',
 'record_type']


In [22]:
# Number of master records per calendar year. `value_counts` orders by frequency, so the
# result is re-sorted by year to make the bars read chronologically.
df_by_year = df_master['calendar_year'].value_counts().sort_index()
# Fixed: the original `pd.DataFrame(` was never closed, which is a SyntaxError. The Series is
# wrapped in a DataFrame and then plotted; the trailing `;` suppresses the plot object's repr.
pd.DataFrame(df_by_year).plot(kind='bar', xlabel='Calendar Year', ylabel='Counts');

In [29]:
# Distinct calendar years present in the master records, in ascending order.
sorted(df_master['calendar_year'].unique())

[2010, 2011, 2012, 2013, 2014, 2015, 2016, 2017, 2018, 2019]

In [31]:
from bokeh.io import output_file, show
from bokeh.models.widgets import Slider

# output_file("slider.html")
# Year slider spanning the earliest to the latest calendar year in the master records,
# initialised on the earliest year.
years = sorted(df_master['calendar_year'].unique())
first_year, last_year = years[0], years[-1]
slider = Slider(title="Year", start=first_year, end=last_year, value=first_year, step=1)

show(slider)

In [None]:
# Bar chart of year-to-date earnings per department, with `calendar_year` passed as the
# groupby dimension.
# NOTE(review): no import enabling the `.hvplot` accessor is visible in this notebook — confirm.
bar_by_year_opts = dict(
    xlabel='Department Agency',
    ylabel='Earnings',
    width=1600,
    height=800,
    rot=90,
    yformatter=NumeralTickFormatter(),
    title='New Jersey Payroll Grouped by Department Agency',
    groupby='calendar_year',
)
df_master.hvplot.bar('master_department_agency_desc', 'master_ytd_earnings', **bar_by_year_opts)

In [None]:
# Columns used for the earnings model: the year, the descriptive job attributes, and the
# regression target (`master_ytd_earnings`) last.
descriptor_columns = [
    'master_department_agency_desc',
    'master_section_desc',
    'master_title_desc',
    'employee_relations_group',
    'compensation_method',
]
columns = ['calendar_year', *descriptor_columns, 'master_ytd_earnings']

In [None]:
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder

# Baseline model: predict year-to-date earnings from the columns listed in `columns`.
le = LabelEncoder()
# Fixed: work on an explicit copy. The original added columns to `df_master[columns]` directly,
# i.e. to a slice of `df_master`, which triggers SettingWithCopyWarning.
data = df_master[columns].copy()

# Integer-encode every object-dtype column. The encoder is refit for each column, so the
# codes are only meaningful within a single column.
object_col = data.select_dtypes(include='object').columns
for i in object_col:
    data[i+'enc'] = le.fit_transform(data[i])
    data[i+'enc'] = data[i+'enc'].astype('category')

# Fixed: reassign instead of dropping in place, so the cell stays re-runnable.
data = data.drop(object_col, axis=1)


x = data.drop('master_ytd_earnings', axis=1)
y = data['master_ytd_earnings']

x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.2, random_state=11)

# Fixed: seed the forest as well as the split, so the score and feature importances are
# reproducible across runs.
rf = RandomForestRegressor(random_state=11)
rf.fit(x_train, y_train)

In [None]:
# Predict earnings for the held-out test features.
pred = rf.predict(x_test)

In [None]:
# Number of predictions produced.
len(pred)

In [None]:
# Number of held-out targets, for comparison with the number of predictions.
len(y_test)

In [None]:
# Model score on the held-out split.
rf.score(x_test, y_test)

In [None]:
# Raw feature importances, in the column order of `x`.
rf.feature_importances_

In [None]:
# Print each feature name next to its importance.
for feature_name, importance in zip(x.columns, rf.feature_importances_):
    print(feature_name, importance)

In [None]:
# Year, payroll id, and the master-level pay columns.
master_list = [
    'calendar_year',
    'payroll_id',
    'salary_hourly_rate',
    'master_ytd_regular_pay',
    'master_ytd_overtime_payments',
    'master_ytd_all_other_payments',
    'master_ytd_earnings',
]
# Everything after the first two entries is removed from the detail frame; the year and
# payroll id stay on both frames.
master_only_columns = master_list[2:]
df_master2 = df_master[master_list]
df_detail2 = df_detail.drop(columns=master_only_columns)

In [None]:
# Per-department column sums for calendar year 2018, largest earnings first.
# Fixed: the year mask is built from `df_master` itself. The original built it from the full
# `df`, whose index is a superset of `df_master`'s, forcing pandas to reindex the mask.
df3 = (
    df_master[df_master['calendar_year'] == 2018]
    .groupby('master_department_agency_desc')
    .sum()
    .sort_values(by='master_ytd_earnings', ascending=False)
)

In [None]:
# Preview the first rows of the per-department totals.
df3.head()

In [None]:
# Fixed: the original `help(df_master.)` ended in a dangling attribute access and was a SyntaxError.
# NOTE(review): the attribute the author meant to look up is unknown; this is leftover
# exploration and the cell can probably be deleted.
help(df_master)

In [None]:
# Bar chart of the per-department earnings totals held in `df3`.
# NOTE(review): no import enabling the `.hvplot` accessor is visible in this notebook — confirm.
bar_opts = dict(
    xlabel='Department Agency',
    ylabel='Earnings',
    width=1600,
    height=800,
    rot=90,
    yformatter=NumeralTickFormatter(),
    title='New Jersey Payroll Grouped by Department Agency',
)
df3.hvplot.bar('master_department_agency_desc', 'master_ytd_earnings', **bar_opts)

In [None]:
# Static bar chart of 2018 earnings per department, saved to 'output.png'.
# NOTE(review): `plt`, `sns` and `ticker` are not imported in any visible cell — presumably
# matplotlib.pyplot, seaborn and matplotlib.ticker; confirm and import them in the first cell.
# Fixed: the year mask is built from `df_master` (not the full `df`), so it aligns with the
# frame it filters without reindexing.
df3 = (
    df_master[df_master['calendar_year'] == 2018]
    .groupby('master_department_agency_desc')
    .sum()
    .sort_values(by='master_ytd_earnings', ascending=False)
)
fig, ax = plt.subplots(figsize=(32,20))
# Fixed: x/y are passed by keyword (newer seaborn releases reject positional data arguments)
# and the bars are drawn on `ax` instead of rebinding `fig` to the returned Axes.
sns.barplot(x=df3.index, y=df3['master_ytd_earnings'], ax=ax)
plt.xticks(rotation='vertical')
plt.title('New Jersey Payroll Grouped by Department Agency')
plt.ylabel('Earnings')
plt.xlabel('Department Agency')
# Show y ticks as integers with thousands separators.
ax.get_yaxis().set_major_formatter(
    ticker.FuncFormatter(lambda x, p: format(int(x), ',')))
# Fixed: lay out only after every label exists, so the rotated tick labels are accounted for.
plt.tight_layout()
fig.savefig('output.png')
plt.show()

In [None]:
# Distinct calendar years present in the master records, in ascending order.
sorted(df_master['calendar_year'].unique())

In [None]:
# Master-level columns only, ordered chronologically by calendar year.
master_level_columns = [
    'calendar_year',
    'calendar_quarter',
    'as_of_date',
    'payroll_id',
    'last_name',
    'first_name',
    'middle_initial',
    'full_name',
    'salary_hourly_rate',
    'master_department_agency_desc',
    'master_section_desc',
    'master_title_desc',
    'employee_relations_group',
    'compensation_method',
    'master_ytd_regular_pay',
    'master_ytd_overtime_payments',
    'master_ytd_all_other_payments',
    'master_ytd_earnings',
]
df_by_years = df_master[master_level_columns].sort_values(by='calendar_year')

In [None]:
# Largest year-over-year relative change in earnings per payroll id, keeping only changes
# below `max_pct_change`.
# NOTE(review): this rebinds `df3`, which an earlier cell uses for the per-department totals.
max_pct_change = 1000 # np.inf
min_ytd_earnings = 20000
# Fixed: filter on `min_ytd_earnings` instead of repeating the literal 20000, so changing the
# threshold above actually changes the result.
# Resulting layout: one row per calendar_year, one column per payroll_id.
df2 = (
    df_by_years[df_by_years['master_ytd_earnings'] > min_ytd_earnings]
    .set_index(['calendar_year', 'payroll_id'])['master_ytd_earnings']
    .unstack(['payroll_id'])
)
# pct_change runs down the rows (years); max() keeps each payroll id's largest change.
df3 = df2.pct_change().max()
df4 = df3 < max_pct_change
df3[df4].sort_values(ascending=False)

In [None]:
# Master records for one payroll id, oldest calendar year first.
payroll_id = 160585
matches_payroll_id = df_master['payroll_id'] == payroll_id
df_master[matches_payroll_id].sort_values(by='calendar_year')

In [None]:
# Summary statistics of year-to-date earnings for each department.
df_master.groupby('master_department_agency_desc')['master_ytd_earnings'].describe()

In [None]:
# Rows whose department description is the literal string 'none', highest earnings first.
df_master[(df_master['master_department_agency_desc']=='none')].sort_values(by="master_ytd_earnings",ascending=False)

In [None]:
# Master records whose employee relations group contains 'unit q'.
# Fixed: `na=False` counts missing group values as non-matches. Without it a missing value
# puts NaN into the mask and the boolean indexing raises.
df_unit_q = df_master[df_master['employee_relations_group'].str.contains('unit q', na=False)]

In [None]:
# Load the retired pension members table from CSV.
retired_pension = pd.read_csv('raw_data/YourMoney_Retired_Pension_Members.csv')
# NOTE(review): `import janitor` is commented out in the first cell, so this line raises
# NameError on a fresh kernel unless that import is restored.
# clean_names presumably normalises the column headers (later cells use snake_case names) — confirm.
retired_pension = janitor.clean_names(retired_pension)


In [None]:
# Preview the first two pension records, transposed so each column reads as a row.
retired_pension.head(2).T

In [None]:
year=2018
# Pair pension members with 'unit q' payroll records that have the same full name.
# NOTE(review): joining on name alone can pair different people who share a name — confirm
# this is acceptable.
ret_still_salary = retired_pension.merge(df_unit_q, left_on='member_full_name', right_on='full_name')
# Fixed: the allowance filter is derived from `year` instead of a hard-coded '2018', so the
# two conditions cannot drift apart; `na=False` keeps missing allowance values out of the mask.
allowance_in_year = ret_still_salary['last_standard_allowance_yearmo'].str.contains(str(year), na=False)
ret_still_salary[allowance_in_year & (ret_still_salary['calendar_year']==year)]

In [None]:
# Preview the merged pension/payroll rows for the selected calendar year.
ret_still_salary[ret_still_salary['calendar_year']==year].head()

In [None]:
# Preview the first master-level rows, transposed.
df_master2.head().T

In [None]:
# Re-index by (year, payroll id), spread payroll ids across the columns, then total each
# payroll id's earnings over all years, largest first.
df_master3 = df_master2.set_index(['calendar_year', 'payroll_id'])
master_payroll_year = df_master3['master_ytd_earnings'].unstack(level='payroll_id')
top_earners = master_payroll_year.sum(axis=0).sort_values(ascending=False)

In [None]:
# Distinct combinations of payroll id and name fields across all records.
name_list = df[['payroll_id', 'last_name', 'first_name', 'middle_initial', 'full_name']].drop_duplicates()

In [None]:
# Look up the name fields recorded for one payroll id.
payroll_id = 91928
matches_payroll_id = name_list['payroll_id'] == payroll_id
name_list[matches_payroll_id]

In [None]:
# Total 2018 year-to-date earnings per department, largest first, as a static bar chart.
# NOTE(review): the title and file name say "Top 1000", but nothing here limits the rows to
# the top 1000 earners — every 2018 master row is summed. Confirm which is intended.
# NOTE(review): `plt`, `sns` and `ticker` are not imported in any visible cell — presumably
# matplotlib.pyplot, seaborn and matplotlib.ticker; confirm and import them in the first cell.
a = (
    df_master[df_master['calendar_year']==2018]
    .sort_values(by='master_ytd_earnings', ascending=False)
    .groupby('master_department_agency_desc')
    .agg({'master_ytd_earnings': 'sum'})
    .sort_values(by='master_ytd_earnings', ascending=False)
)
fig, ax = plt.subplots(figsize=(24,15))
# Fixed: x/y are passed by keyword (newer seaborn releases reject positional data arguments)
# and the bars are drawn on `ax` instead of rebinding `fig` to the returned Axes.
sns.barplot(x=a.index, y=a['master_ytd_earnings'], ax=ax)
plt.xticks(rotation='vertical')
plt.title('New Jersey Payroll Grouped by Department Agency (Top 1000 Earners from 2018)')
plt.ylabel('Earnings')
plt.xlabel('Department Agency')
# Show y ticks as integers with thousands separators.
ax.get_yaxis().set_major_formatter(
    ticker.FuncFormatter(lambda x, p: format(int(x), ',')))
# Fixed: lay out only after every label exists, so the rotated tick labels are accounted for.
plt.tight_layout()
fig.savefig('Top1000_2018only.png')
plt.show()

In [None]:
# Plotly version of the per-department earnings chart built from `a`.
# NOTE(review): the plotly imports (`go`, `iplot`, `init_notebook_mode`) are commented out in
# the first cell — restore them before running this cell.
init_notebook_mode(connected=True)

# The non-breaking spaces and newlines pad the axis title and the tick labels.
department_axis = dict(
    title = ''.join(['Department Agency'] + [u'\xa0'] * 20 + [u'\n\xa0'] * 10),
    tickangle=-45,
    tickprefix= u'\xa0' * 30 + '\n' * 10,
    automargin=True
)
earnings_axis = dict(
    title = 'Net Earnings',
    tickprefix= ' ' * 10,
    automargin=True
)
plot_margin = go.layout.Margin(l=100, r=50, b=250, t=50, pad=4)

data = [go.Bar(x=a.index, y=a['master_ytd_earnings'])]
layout = go.Layout(
    autosize=False,
    width=2000,
    height=1200,
    xaxis=department_axis,
    yaxis=earnings_axis,
    margin=plot_margin,
    barmode='group',
)
fig = go.Figure(data=data, layout=layout)
iplot(fig)

In [None]:
# List the distinct compensation methods present in the master records.
df_master['compensation_method'].unique()

In [None]:
# Per compensation method: the number of non-null payroll ids ('count') and that number
# divided by the total number of master rows ('ratio').
compensation_method_cnt = df_master.groupby('compensation_method').agg({'payroll_id': ['count', ('ratio', lambda x: x.count() / len(df_master))]})

In [None]:
# Bar chart of each compensation method's share of master records.
# NOTE(review): `plt` and `sns` are not imported in any visible cell — presumably
# matplotlib.pyplot and seaborn; confirm and import them in the first cell.
fig, ax = plt.subplots(figsize=(8,6))
ratio_by_method = compensation_method_cnt['payroll_id']['ratio']
# Fixed: x/y are passed by keyword (newer seaborn releases reject positional data arguments)
# and the bars are drawn on `ax` instead of rebinding `fig` to the returned Axes.
sns.barplot(x=ratio_by_method.index, y=ratio_by_method, ax=ax)
plt.xticks(rotation='vertical')
plt.title('Compensation Method Ratio')
# NOTE(review): the plotted values are fractions of the total, not percentages — consider
# relabelling or scaling.
plt.ylabel('Percentage')
# Fixed: axis-label typo ('Compenstation').
plt.xlabel('Compensation Method')
# Fixed: lay out only after every label exists.
plt.tight_layout()
fig.savefig('compensation_method_ratio.png')
plt.show()

In [None]:
# Total 2018 year-to-date earnings per (department, compensation method) pair, largest first.
rows_2018 = df_master[df_master['calendar_year']==2018]
b = (
    rows_2018
    .sort_values(by='master_ytd_earnings', ascending=False)
    .groupby(['master_department_agency_desc', 'compensation_method'])
    .agg({'master_ytd_earnings': 'sum'})
    .sort_values(by='master_ytd_earnings', ascending=False)
)

In [None]:
# Spread the compensation methods across the columns (one row per department); combinations
# with no earnings become 0.
# Fixed: fill by plain reassignment. The original called fillna(inplace=True) on the
# `c['master_ytd_earnings']` selection, which only updates `c` when that selection happens to
# be a view. Every column of `c` sits under 'master_ytd_earnings', so filling the whole frame
# is equivalent.
c = b.unstack().fillna(0)
# Flatten the two-level columns: the result has one row per department and one column per
# compensation method. Values come out as object dtype after the transpose.
d = c.unstack().unstack().reset_index().T[1:]
d = d.rename(columns=d.iloc[0]).drop(d.index[0])

In [None]:
# Stacked plotly bars: one trace per compensation method (the columns of `d`), one bar per
# department.
# NOTE(review): the plotly imports (`go`, `iplot`) are commented out in the first cell.
data = [
    go.Bar(x=d.index, y=d[method], name=method.lower())
    for method in d.columns
]

# The non-breaking spaces and newlines pad the axis title and the tick labels.
department_axis = dict(
    title = ''.join(['Department Agency'] + [u'\xa0'] * 20 + [u'\n\xa0'] * 10),
    tickangle=-45,
    tickprefix= u'\xa0' * 20 + '\n' * 10,
    automargin=True
)
earnings_axis = dict(
    title = 'Net Earnings',
    tickprefix= ' ' * 10,
    automargin=True
)
plot_margin = go.layout.Margin(l=100, r=50, b=250, t=50, pad=4)

layout = go.Layout(
    autosize=False,
    width=1000,
    height=600,
    xaxis=department_axis,
    yaxis=earnings_axis,
    margin=plot_margin,
    barmode='stack',
)
fig = go.Figure(data=data, layout=layout)
iplot(fig)

In [None]:
# Numeric copy of `d` (every column passed through pd.to_numeric) plus a per-row total across
# all of its columns.
relative_compensation_method_2018 = d.copy(deep=True).apply(pd.to_numeric)
relative_compensation_method_2018['total'] = relative_compensation_method_2018.sum(axis=1)

In [None]:
# Stacked plotly bars of each compensation method's share of the row total.
# NOTE(review): the plotly imports (`go`, `iplot`, `init_notebook_mode`) are commented out in
# the first cell.
init_notebook_mode(connected=True)

row_totals = relative_compensation_method_2018['total']
# One trace per column, skipping the helper 'total' column itself.
data = [
    go.Bar(
        x = relative_compensation_method_2018.index,
        y = relative_compensation_method_2018[method] / row_totals,
        name = method.lower()
    )
    for method in relative_compensation_method_2018.columns
    if method != 'total'
]

# The non-breaking spaces and newlines pad the axis title and the tick labels.
department_axis = dict(
    title = ''.join(['Department Agency'] + [u'\xa0'] * 20 + [u'\n\xa0'] * 10),
    tickangle=-45,
    tickprefix= u'\xa0' * 20 + '\n' * 10,
    automargin=True
)
ratio_axis = dict(
    title = 'Net Earnings Ratio',
    tickprefix= ' ' * 10,
    automargin=True
)
plot_margin = go.layout.Margin(l=100, r=50, b=250, t=50, pad=4)

layout = go.Layout(
    autosize=False,
    width=1000,
    height=600,
    title='100% ',
    xaxis=department_axis,
    yaxis=ratio_axis,
    margin=plot_margin,
    barmode='stack',
)
fig = go.Figure(data=data, layout=layout)
iplot(fig, filename='')

## Use the Social Security Administration's popular baby names by year to infer gender from first names

In [None]:
# Baby-name records; the three column names are supplied here rather than read from the file.
all_names = pd.read_csv('names/allnames.txt', names=['name', 'gender', 'count'])
# Fixed: lower-case BEFORE de-duplicating, so spellings that differ only by case collapse to
# one row (duplicate names would multiply rows in a later left merge). The frame is built
# with assign() instead of writing into the result of drop_duplicates, which triggered
# SettingWithCopyWarning.
# NOTE(review): drop_duplicates keeps the first row per name, so a name listed under both
# genders gets whichever gender appears first — confirm this is acceptable.
unique_names = (
    all_names
    .assign(name=all_names['name'].str.lower())
    .drop_duplicates(subset='name')
)

In [None]:
# Left join df_master with unique_names. We can see which names are not matched with unique names database
df_names_merged = df_master.merge(unique_names, left_on='first_name', right_on='name', how='left')

In [None]:
# Compare shapes after and before the merge; a left merge keeps the row count only when the
# right-hand key is unique.
print(df_names_merged.shape, df_master.shape)

In [None]:
# Rows of the merged frame with no matching name (the merged-in `name` column is NaN).
df_master_no_name_match = df_names_merged[df_names_merged['name'].isna()]
print(df_master_no_name_match.shape)
# NOTE(review): this keeps first names LONGER than one character, yet the variable name says
# "single_first_name". If the goal is to count single-initial first names, the comparison
# looks inverted — confirm.
df_master_no_name_match_single_first_name = df_master_no_name_match[df_master_no_name_match['first_name'].str.len() > 1]
print(df_master_no_name_match_single_first_name.shape)

51,955 entries have a first name that does not match any name in the unique names list.
10,670 entries use only a single initial for the first name.

In [None]:
import detector

In [None]:
# Instantiate the detector from the `detector` module.
# NOTE(review): this rebinds `d`, which earlier cells use for the department-by-compensation-method
# table; re-running those cells afterwards would pick up this object instead.
d = detector.Detector()