In [26]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from ydata_profiling import ProfileReport

In [27]:
import os    
from chardet import detect

# get file encoding type
def get_encoding_type(file):
    """Guess the text encoding of `file`.

    The file is opened in binary mode, its entire contents are handed to
    chardet's `detect`, and the 'encoding' entry of the result is returned.
    """
    with open(file, mode='rb') as stream:
        guess = detect(stream.read())
    return guess['encoding']

In [28]:
# Load the data dictionary and the parental-leave table, detecting each
# file's encoding before parsing it.
dictionary_path = '../data/data_dictionary.csv'
leave_path = '../data/parental_leave.csv'

dictionary = pd.read_csv(dictionary_path, encoding=get_encoding_type(dictionary_path))
data = pd.read_csv(leave_path, encoding=get_encoding_type(leave_path))

In [29]:
# Map each field name to its description, leaving out the dictionary's last row.
fields = dictionary['Field'][:-1]
descriptions = dictionary['Description'][:-1]
dict_dict = dict(zip(fields, descriptions))

dict_dict

{'Company': 'Company name',
 'Industry': 'Company industry & sub-industry (Industry: Sub-industry)',
 'Paid Maternity Leave': 'Paid weeks off from work for mothers after the birth of their child',
 'Unpaid Maternity Leave': 'Unpaid weeks off from work for mothers after the birth of their child',
 'Paid Paternity Leave': 'Paid weeks off from work for fathers after the birth of their child',
 'Unpaid Paternity Leave': 'Unpaid weeks off from work for fathers after the birth of their child'}

In [30]:
# Discard every column after the sixth, then make the headers lower-case
# with underscores in place of spaces.
surplus_columns = data.columns[6:]
data = data.drop(columns=surplus_columns)
data.columns = data.columns.str.lower().str.replace(' ', '_')

In [31]:
# Split "<main>: <sub>" industry strings on ':' once, tidy both parts
# ('&' -> 'and', surrounding whitespace stripped) and drop the source column.
industry_parts = data['industry'].str.split(':')
data = (
    data
    .assign(
        main_industry=industry_parts.str[0].str.replace('&', 'and').str.strip(),
        sub_industry=industry_parts.str[1].str.replace('&', 'and').str.strip(),
    )
    .drop(columns='industry')
)

In [32]:
# Show the rows where main_industry is missing.
data.loc[data['main_industry'].isna()]

Unnamed: 0,company,paid_maternity_leave,unpaid_maternity_leave,paid_paternity_leave,unpaid_paternity_leave,main_industry,sub_industry
1023,ASML,52.0,40.0,,,,
1024,INK Communications Co.,10.0,2.0,,,,
1025,Rokt,15.0,0.0,,,,


In [33]:
# Hand-entered industry values, applied to every row of the named company.
industry_patches = [
    ('ASML', 'main_industry', 'Electronics'),
    ('Rokt', 'main_industry', 'Technology'),
    ('Rokt', 'sub_industry', 'Software'),
    ('INK Communications Co.', 'main_industry', 'Public Relations'),
]
for company_name, column, value in industry_patches:
    data.loc[data['company'] == company_name, column] = value

In [34]:
# Preview the first rows of the frame after the industry clean-up.
data.head()

Unnamed: 0,company,paid_maternity_leave,unpaid_maternity_leave,paid_paternity_leave,unpaid_paternity_leave,main_industry,sub_industry
0,Epsilon,6.0,6.0,6.0,6.0,Advertising,
1,The Walt Disney Company,5.0,4.0,4.5,4.0,Arts and Entertainment,
2,Guild Education,14.0,0.0,8.0,4.0,Business Services,Other
3,WeWork,14.0,2.0,16.0,4.0,Business Services,Other
4,Randstad USA,5.0,7.0,0.0,0.0,Business Services,Staffing and Outsourcing


In [35]:
# Per-column dtypes and non-null counts, to see how sparse each leave column is.
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1601 entries, 0 to 1600
Data columns (total 7 columns):
 #   Column                  Non-Null Count  Dtype  
---  ------                  --------------  -----  
 0   company                 1601 non-null   object 
 1   paid_maternity_leave    1601 non-null   float64
 2   unpaid_maternity_leave  1494 non-null   float64
 3   paid_paternity_leave    289 non-null    float64
 4   unpaid_paternity_leave  64 non-null     float64
 5   main_industry           1601 non-null   object 
 6   sub_industry            1316 non-null   object 
dtypes: float64(4), object(3)
memory usage: 87.7+ KB


In [36]:
# Summary statistics, transposed so each numeric column becomes a row.
data.describe().transpose()

Unnamed: 0,count,mean,std,min,25%,50%,75%,max
paid_maternity_leave,1601.0,10.909119,8.024514,0.0,6.0,11.0,12.0,52.0
unpaid_maternity_leave,1494.0,6.628849,9.274953,0.0,0.0,4.0,10.0,52.0
paid_paternity_leave,289.0,7.33218,6.817591,0.0,2.0,6.0,12.0,51.0
unpaid_paternity_leave,64.0,7.734375,6.831863,0.0,2.0,6.0,12.0,29.0


In [37]:
# Set up the profiling report for `data` (explorative mode, full-width HTML style).
report_options = {
    'title': "Pandas Profiling Report",
    'html': {'style': {'full_width': True}},
    'explorative': True,
}
profile = ProfileReport(data, **report_options)

In [38]:
# Render the profiling report as interactive notebook widgets.
profile.to_widgets()

Summarize dataset: 100%|██████████| 33/33 [00:02<00:00, 14.41it/s, Completed]                                             
Generate report structure: 100%|██████████| 1/1 [00:01<00:00,  1.68s/it]
Render widgets:   0%|          | 0/1 [00:00<?, ?it/s]

                                                             

VBox(children=(Tab(children=(Tab(children=(GridBox(children=(VBox(children=(GridspecLayout(children=(HTML(valu…

In [39]:
# List the column names available before deriving new features.
data.columns

Index(['company', 'paid_maternity_leave', 'unpaid_maternity_leave',
       'paid_paternity_leave', 'unpaid_paternity_leave', 'main_industry',
       'sub_industry'],
      dtype='object')

In [43]:
# Derived leave features. Each callable receives the frame including the
# columns created by the earlier keywords, so later features can build on
# earlier ones. Row-wise means use pandas' default of skipping NaN values.
data = data.assign(
    avg_unpaid_leave=lambda df: df[['unpaid_maternity_leave', 'unpaid_paternity_leave']].mean(axis=1),
    avg_paid_leave=lambda df: df[['paid_maternity_leave', 'paid_paternity_leave']].mean(axis=1),
    avg_leave=lambda df: df[['avg_unpaid_leave', 'avg_paid_leave']].mean(axis=1),
    # Maternity minus paternity; NaN whenever either side is missing.
    paid_leave_discrepancy=lambda df: df['paid_maternity_leave'] - df['paid_paternity_leave'],
    unpaid_leave_discrepancy=lambda df: df['unpaid_maternity_leave'] - df['unpaid_paternity_leave'],
    # A zero unpaid average makes the division yield inf; store NaN instead.
    paid_to_unpaid_leave_ratio=lambda df: (
        df['avg_paid_leave'] / df['avg_unpaid_leave']
    ).replace(np.inf, np.nan),
)

In [46]:
# Write the cleaned, feature-enriched frame to disk without the index column.
clean_path = '../data/clean.csv'
data.to_csv(clean_path, index=False)