Dataset: https://www.kaggle.com/datasets/osmi/mental-health-in-tech-2016

In [67]:
import pandas as pd
import numpy as np
import seaborn as sns
from sklearn.decomposition import PCA, KernelPCA
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler


%matplotlib inline

# Let pandas show up to 63 columns when a frame is displayed, instead of eliding the middle ones.
pd.options.display.max_columns = 63

# Load the survey responses; the path is relative to the notebook's working directory.
df = pd.read_csv('data/mental.csv')

In [68]:
# Short replacement names for the survey columns, listed in the same order as the columns of the CSV.
new_column_names = ['is_self_employed', 'employee_count_bracket', 'is_tech_org', 'role_is_IT', 'empl_provides_mh_cov', 'knows_mh_cov_options',
                    'empl_discussed_mh','empl_offers_mh_rsrcs','anon_protec','if_askfor_mh_medical_leave_how_easy', 'discuss_mh_with_empl_wouldcause_neg_conseq', 'discuss_ph_with_empl_wouldcause_neg_conseq',
                    'comfy_discussing_mh_with_coworkers','comfy_discussing_mh_with_supervisors' , 'empl_takes_mh_asseriously_as_ph', 'observed_neg_conseq_for_coworkers_openabout_mh_inworkspace',
                    'has_medical_cov_incl_mh', 'knowsof_mh_resources','if_diag_would_reveal_toclients/bn_contacts', 'if_reveal_diag_toclient_didthis_impact_neg', 'if_diag_would_reveal_tocoworkers/employees',
                    'if_reveal_diag_tocoworker_didthis_impact_neg', 'productivity_isaffected_by_mh', 'percentage_worktime_affected_by_mh', 'has_prev_employers','prev_empl_provided_mh_benefits',
                    'was_aware_of_prevemployers_mhcare_options','prev_empl_discussed_mh', 'prev_empl_provided_mh_rsrc', 'prev_anon_protec', 'prev_discuss_mh_with_empl_wouldcause_neg_conseq',
                    'prev_discuss_ph_with_empl_wouldcause_neg_conseq', 'prev_wouldhavebeen_willing_discuss_mh_coworkers', 'prev_wouldhavebeen_willing_discuss_mh_supervisors',
                    'prev_empl_takes_mh_asseriously_as_ph', 'prev_observed_neg_conseq_for_coworkers_openabout_mh_inworkspace', 'willingto_bringup_ph_interview', 'why', 'willingto_bringup_mh_interview',
                    'why2', 'believes_beingident_as_mh_wouldhurt_career', 'thinks_coworkers_wouldviewthem_neg_if_mh', 'howwilling_share_mh_with_friendsfamily',
                    'observed_badly_handled_response_to_mh_inworkplace', 'observed_madethem_notwantto_talk_about_mh', 'has_family_history_mh', 'hashad_mh_inpast', 'HAS_MH', 'diagnoses_notprof',
                    'conditions_suspected', 'hasbeen_diag_byprof', 'diagnoses_prof', 'sought_treatment', 'mh_interferes_w/work_effective_trt', 'mh_interferes_w/work_ineffective_trt',
                    'age', 'sex', 'country_livesin', 'us_state_livesin', 'country_worksin', 'us_state_worksin', 'work_position', 'is_remote_working']

# The rename is purely positional: the i-th name replaces the i-th existing column label,
# and pandas rejects the assignment if the list length differs from the number of columns.
df.columns = new_column_names

In [69]:
# The free-text "why" follow-up answers are dropped: each one is too unique to be useful as a feature.
why_cols = [name for name in df.columns if 'why' in name]
df.drop(columns=why_cols, inplace=True)

# Most people (all but 26) work in the same country they live in, and this information is still
# preserved in the remote-working column, so the country of residence is dropped.
df.drop(columns='country_livesin', inplace=True)

In [70]:
# Remove the two US-state columns and the professional-diagnoses column.
df.drop(columns=['us_state_livesin', 'us_state_worksin', 'diagnoses_prof'], inplace=True)

In [71]:
# Drop rows with 25 or more missing values.
# The explicit .copy() makes the filtered frame independent of the unfiltered one, so the in-place
# edits and column assignments in later cells unambiguously target this frame and cannot be
# flagged by pandas as writes to a slice of another DataFrame.
df = df.loc[df.isna().sum(axis=1) < 25].copy()

In [72]:
# Replace the free-text gender answers with a MALE / FEMALE / OTHER convention.
#
# The result is assigned back to the column rather than calling
# df['sex'].replace(..., inplace=True). The in-place form mutates the intermediate Series returned
# by df['sex'] (chained assignment): pandas 2.2 warns about it, and with Copy-on-Write enabled it
# leaves df unchanged.
male_answers = ['Male', 'male', 'Male ','M','m','man','Male.','male 9:1 female, roughly','Male (cis)','Sex is male','Man',
                'cis male','Malr','Dude',"I'm a man why didn't you make this a drop down question. You should of asked sex? And I would of answered yes please. Seriously how much text can this take? ",
                'mail', 'M|', 'Male/genderqueer','male ','Cis Male', 'Male (trans, FtM)','cisdude','cis man','MALE','Cis male']

female_answers = ['Female', 'female', 'I identify as female.','female ','Female assigned at birth ','F', 'Woman', 'fm', 'f', 'Cis female ', 'Transitioned, M2F',
                  'Female or Multi-Gender Femme', 'Female ', 'woman', 'female/woman','Cisgender Female','genderqueer woman','mtf','fem', 'Female (props for making this a freeform field, though)',
                  ' Female','Cis-woman','Transgender woman'
                  ]

other_answers = ['Bigender', 'non-binary',
                 'Genderfluid (born female)', 'Other/Transfeminine', 'Androgynous', 'Other', 'nb masculine', 'none of your business',
                 'genderqueer', 'Human', 'Genderfluid', 'Enby', 'Queer', 'Agender',
                 'Fluid', 'Nonbinary', 'human', 'Unicorn', 'Genderqueer',
                 'Genderflux demi-girl', 'female-bodied; no feelings about gender',
                 'AFAB']

# Applied in the same order as before: MALE first, then FEMALE, then OTHER.
for answers, label in ((male_answers, 'MALE'), (female_answers, 'FEMALE'), (other_answers, 'OTHER')):
    df['sex'] = df['sex'].replace(to_replace=answers, value=label)

In [73]:
def show_value_counts(df):
    """Print the value counts of every column of ``df``, one table per column.

    Each table is followed by a blank line. Nothing is returned.
    """
    for name in df.columns:
        counts = df[name].value_counts()
        print(counts, end='\n\n')

In [74]:
def get_unique_conditions(cond_series):
    """Return the distinct individual entries found in a pipe-delimited Series.

    Each cell holds either a single entry (``'Support'``) or several entries
    joined by ``'|'`` (``'Back-end Developer|Support'``). Multi-entry cells are
    split, and every entry is reported once.

    Parameters
    ----------
    cond_series : pandas.Series
        Column whose cells are pipe-delimited strings. Missing cells
        (NaN / None) are skipped rather than raising ``TypeError``.

    Returns
    -------
    list
        The unique entries, sorted so the result is the same on every run.
    """
    unique_entries = set()
    for cell in cond_series.dropna().unique():
        # A cell without '|' splits into a one-element list, so both shapes share one path.
        unique_entries.update(str(cell).split('|'))

    return sorted(unique_entries)

In [75]:
# Compiled by hand from unique_diagnosed_conditions_by_prof: picking out the genuinely
# unique conditions manually was faster than doing it in code.
actually_unique_by_prof = {
    'Autism Spectrum Disorder', 'Mood disorder', 'PTSD', 'PDD-NOS', 'Addictive Disorder', 'ADHD', 'ADD',
    'Anxiety Disorder', 'Burnout', 'Dissociative Disorder', 'Eating Disorder', 'Gender Dysphoria',
    'Gender Identity Disorder', 'Intimate Disorder', 'MCD', 'OCD', 'Personality Disorder', 'Psychotic Disorder',
    'SAD', 'Stress Response Syndromes', 'Substance Use Disorder', 'Suicidal Ideation',
}


In [76]:
# The majority of respondents work in the USA, UK, Canada, Germany, the Netherlands or Australia
# (check with df['country_worksin'].value_counts()); all other countries are dropped for now.
main_countries = ['United States of America', 'United Kingdom', 'Canada', 'Germany', 'Netherlands', 'Australia']

# .copy() gives an independent frame, so the new column added in a later cell is written to this
# frame itself and cannot be flagged as a write to a slice of the unfiltered data.
df = df.loc[df['country_worksin'].isin(main_countries)].copy()

In [77]:
# NOTE(review): this call is not the last expression of the cell and its return value is not
# assigned, so nothing from it is displayed or kept.
get_unique_conditions(df['work_position'])

# Role names that count as technical; has_tech_role declares the same four names locally.
tech_roles = ['Back-end Developer', 'Front-end Developer','DevOps/SysAdmin', 'Dev Evangelist/Advocate']

# The role names joined into a single '|'-separated string.
joined = '|'.join(tech_roles)


In [78]:
def has_tech_role(row):
    """Return 1 if the row's ``work_position`` names a technical role, else 0.

    Parameters
    ----------
    row : mapping-like (e.g. a DataFrame row)
        Must provide a ``'work_position'`` entry. In this notebook it is a
        string of role names joined by ``'|'``.

    Returns
    -------
    int
        1 when any technical role name occurs in ``work_position``, 0 when
        none does or when the position is missing (NaN / None).
    """
    tech_roles = ('Back-end Developer', 'Front-end Developer', 'DevOps/SysAdmin', 'Dev Evangelist/Advocate')

    positions = row['work_position']
    try:
        return int(any(role in positions for role in tech_roles))
    except TypeError:
        # A missing position (NaN is a float, or None) does not support `in`;
        # such a respondent is counted as not having a technical role.
        return 0


# Compute the 0/1 technical-role flag row by row, then remove the survey's own 'role_is_IT' column.
df['has_tech_role'] = df.apply(has_tech_role, axis=1)
df.drop(columns='role_is_IT', inplace=True)

In [79]:
def get_new_bracket(row):
    """Map the row's ``employee_count_bracket`` onto one of three coarser brackets.

    '1-5' and '6-25' become '1-25'; '26-100' and '100-500' become '26-500';
    '500-1000' and 'More than 1000' become '500 or more'. Any other value is
    returned unchanged.
    """
    merged_brackets = (
        (('1-5', '6-25'), '1-25'),
        (('26-100', '100-500'), '26-500'),
        (('500-1000', 'More than 1000'), '500 or more'),
    )

    current = row['employee_count_bracket']
    for originals, merged in merged_brackets:
        if current in originals:
            return merged

    return current
        

# Overwrite the bracket column with the coarser brackets, computed row by row.
df['employee_count_bracket'] = df.apply(get_new_bracket, axis=1)

In [80]:
# Columns that will have missing values for self-employed respondents (209 people).
excluded_for_self_employed = [
    'employee_count_bracket',
    'empl_provides_mh_cov',
    'is_tech_org',
    'empl_discussed_mh',
    'empl_offers_mh_rsrcs',
    'anon_protec',
    'if_askfor_mh_medical_leave_how_easy',
    'discuss_mh_with_empl_wouldcause_neg_conseq',
    'discuss_ph_with_empl_wouldcause_neg_conseq',
    'comfy_discussing_mh_with_coworkers',
    'comfy_discussing_mh_with_supervisors',
    'empl_takes_mh_asseriously_as_ph',
    'observed_neg_conseq_for_coworkers_openabout_mh_inworkspace',
    'observed_badly_handled_response_to_mh_inworkplace',
]

# Columns that will have missing values for respondents with no previous employers (104 people).
excluded_for_no_prev_employers = [
    'prev_empl_provided_mh_benefits',
    'was_aware_of_prevemployers_mhcare_options',
    'prev_empl_discussed_mh',
    'prev_empl_provided_mh_rsrc',
    'prev_anon_protec',
    'prev_discuss_mh_with_empl_wouldcause_neg_conseq',
    'prev_discuss_ph_with_empl_wouldcause_neg_conseq',
    'prev_wouldhavebeen_willing_discuss_mh_coworkers',
    'prev_wouldhavebeen_willing_discuss_mh_supervisors',
    'prev_empl_takes_mh_asseriously_as_ph',
    'prev_observed_neg_conseq_for_coworkers_openabout_mh_inworkspace',
]

In [81]:
# Replace the relevant missing values with 'N/A' (not applicable): employer questions for the
# self-employed, and previous-employer questions for respondents without previous employers.
#
# The affected columns are cast to object first. At least 'is_tech_org' holds floats, and writing
# the string 'N/A' into a float column through .loc is an incompatible-dtype assignment that
# pandas has deprecated. After the cast the column already has the dtype it would end up with.
df = df.astype({column: object for column in excluded_for_self_employed + excluded_for_no_prev_employers})

self_employed = df['is_self_employed'] == 1
no_prev_employers = df['has_prev_employers'] == 0

df.loc[self_employed, excluded_for_self_employed] = df.loc[self_employed, excluded_for_self_employed].fillna("N/A")
df.loc[no_prev_employers, excluded_for_no_prev_employers] = df.loc[no_prev_employers, excluded_for_no_prev_employers].fillna("N/A")

In [82]:
# Drop the columns in which a clear majority of the values are missing.
df.drop(
    columns=[
        'knowsof_mh_resources',
        'if_diag_would_reveal_toclients/bn_contacts',
        'if_reveal_diag_toclient_didthis_impact_neg',
        'if_diag_would_reveal_tocoworkers/employees',
        'if_reveal_diag_tocoworker_didthis_impact_neg',
        'productivity_isaffected_by_mh',
        'percentage_worktime_affected_by_mh',
    ],
    inplace=True,
)

# Drop the non-professional and self-diagnosis columns: 1. most of their values are missing
# anyway, and 2. self-diagnoses can be very inaccurate.
df.drop(columns=['diagnoses_notprof', 'conditions_suspected'], inplace=True)

In [83]:
# Fill the 3 remaining missing values with MALE: the safest bet, as the overwhelming majority
# of the respondents are male.
df['sex'] = df['sex'].fillna('MALE')

In [84]:
# Show how the non-missing answers to this question are distributed.
df['observed_madethem_notwantto_talk_about_mh'].value_counts()

observed_madethem_notwantto_talk_about_mh
Yes      220
No       202
Maybe    152
Name: count, dtype: int64

In [85]:
# Treat a missing answer to the "badly handled response" question as "Maybe/Not sure".
# Only this one column is filled. Selecting the affected rows with .loc[rows] and no column, then
# assigning their .fillna(...) back, would write "Maybe/Not sure" into every missing cell of
# those rows, whatever the column.
# NOTE(review): if any other column relied on that side effect to become free of NaN, fill it
# explicitly with a value that suits that column.
badly_handled_col = 'observed_badly_handled_response_to_mh_inworkplace'
df[badly_handled_col] = df[badly_handled_col].fillna("Maybe/Not sure")

In [86]:
# Rows where the respondent's own coverage answer is missing but the employer provides
# mental-health coverage.
empl_cov_yes_filt = (df['empl_provides_mh_cov'] == 'Yes') & pd.isna(df['has_medical_cov_incl_mh'])

# Idea not applied: set has_medical_cov_incl_mh to 1.0 on the rows selected by empl_cov_yes_filt,
# on the reasoning that if the employer provides coverage, the respondent has it.
# Even with that step, almost 500 values would still be missing, so for now the medical-coverage
# column is dropped together with the other columns that still contain missing values.
df.drop(
    columns=['has_medical_cov_incl_mh', 'knows_mh_cov_options', 'observed_madethem_notwantto_talk_about_mh'],
    inplace=True,
)

In [87]:
# Number of missing values left in each column.
df.isna().sum()

is_self_employed                                                   0
employee_count_bracket                                             0
is_tech_org                                                        0
empl_provides_mh_cov                                               0
empl_discussed_mh                                                  0
empl_offers_mh_rsrcs                                               0
anon_protec                                                        0
if_askfor_mh_medical_leave_how_easy                                0
discuss_mh_with_empl_wouldcause_neg_conseq                         0
discuss_ph_with_empl_wouldcause_neg_conseq                         0
comfy_discussing_mh_with_coworkers                                 0
comfy_discussing_mh_with_supervisors                               0
empl_takes_mh_asseriously_as_ph                                    0
observed_neg_conseq_for_coworkers_openabout_mh_inworkspace         0
has_prev_employers                

In [88]:
# (rows, columns) of the frame at this point.
df.shape

(1219, 45)

In [89]:
# Display the frame; pandas elides the middle rows.
# NOTE(review): df.head() would keep the saved output smaller.
df

Unnamed: 0,is_self_employed,employee_count_bracket,is_tech_org,empl_provides_mh_cov,empl_discussed_mh,empl_offers_mh_rsrcs,anon_protec,if_askfor_mh_medical_leave_how_easy,discuss_mh_with_empl_wouldcause_neg_conseq,discuss_ph_with_empl_wouldcause_neg_conseq,comfy_discussing_mh_with_coworkers,comfy_discussing_mh_with_supervisors,empl_takes_mh_asseriously_as_ph,observed_neg_conseq_for_coworkers_openabout_mh_inworkspace,has_prev_employers,prev_empl_provided_mh_benefits,was_aware_of_prevemployers_mhcare_options,prev_empl_discussed_mh,prev_empl_provided_mh_rsrc,prev_anon_protec,prev_discuss_mh_with_empl_wouldcause_neg_conseq,prev_discuss_ph_with_empl_wouldcause_neg_conseq,prev_wouldhavebeen_willing_discuss_mh_coworkers,prev_wouldhavebeen_willing_discuss_mh_supervisors,prev_empl_takes_mh_asseriously_as_ph,prev_observed_neg_conseq_for_coworkers_openabout_mh_inworkspace,willingto_bringup_ph_interview,willingto_bringup_mh_interview,believes_beingident_as_mh_wouldhurt_career,thinks_coworkers_wouldviewthem_neg_if_mh,howwilling_share_mh_with_friendsfamily,observed_badly_handled_response_to_mh_inworkplace,has_family_history_mh,hashad_mh_inpast,HAS_MH,hasbeen_diag_byprof,sought_treatment,mh_interferes_w/work_effective_trt,mh_interferes_w/work_ineffective_trt,age,sex,country_worksin,work_position,is_remote_working,has_tech_role
0,0,26-500,1.0,Not eligible for coverage / N/A,No,No,I don't know,Very easy,No,No,Maybe,Yes,I don't know,No,1,"No, none did",N/A (not currently aware),I don't know,None did,I don't know,Some of them,None of them,Some of my previous employers,Some of my previous employers,I don't know,None of them,Maybe,Maybe,Maybe,"No, I don't think they would",Somewhat open,No,No,Yes,No,Yes,0,Not applicable to me,Not applicable to me,39,MALE,United Kingdom,Back-end Developer,Sometimes,1
1,0,1-25,1.0,No,Yes,Yes,Yes,Somewhat easy,No,No,Maybe,Yes,Yes,No,1,"Yes, they all did",I was aware of some,None did,Some did,"Yes, always",None of them,None of them,"No, at none of my previous employers",Some of my previous employers,Some did,None of them,Maybe,No,"No, I don't think it would","No, I don't think they would",Somewhat open,No,Yes,Yes,Yes,Yes,1,Rarely,Sometimes,29,MALE,United States of America,Back-end Developer|Front-end Developer,Never,1
2,0,1-25,1.0,No,No,No,I don't know,Neither easy nor difficult,Maybe,No,Maybe,Maybe,I don't know,No,1,"No, none did",N/A (not currently aware),None did,Some did,I don't know,I don't know,Some of them,Some of my previous employers,I don't know,I don't know,Some of them,Yes,Yes,Maybe,Maybe,Somewhat open,Maybe/Not sure,No,Maybe,No,No,1,Not applicable to me,Not applicable to me,38,MALE,United Kingdom,Back-end Developer,Always,1
3,1,,,,,,,,,,,,,,1,Some did,N/A (not currently aware),None did,None did,I don't know,Some of them,Some of them,Some of my previous employers,Some of my previous employers,I don't know,Some of them,Yes,Maybe,"Yes, I think it would",Maybe,Neutral,No,No,Yes,Yes,Yes,1,Sometimes,Sometimes,43,MALE,United Kingdom,Supervisor/Team Lead,Sometimes,0
4,0,1-25,0.0,Yes,No,No,No,Neither easy nor difficult,Yes,Maybe,Maybe,No,No,No,1,I don't know,N/A (not currently aware),Some did,None did,I don't know,Some of them,Some of them,"No, at none of my previous employers",Some of my previous employers,Some did,Some of them,Maybe,No,"Yes, I think it would",Maybe,Somewhat open,"Yes, I experienced",Yes,Yes,Yes,Yes,1,Sometimes,Sometimes,43,FEMALE,United States of America,Executive Leadership|Supervisor/Team Lead|Dev ...,Sometimes,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1427,0,500 or more,1.0,Yes,No,No,Yes,Somewhat easy,No,No,Yes,Yes,Yes,No,1,Some did,I was aware of some,None did,None did,I don't know,Some of them,None of them,Some of my previous employers,"No, at none of my previous employers",None did,"Yes, all of them",No,No,Maybe,"No, I don't think they would",Very open,"Yes, I experienced",Yes,Yes,Yes,Yes,1,Rarely,Often,38,FEMALE,United States of America,Support,Always,0
1428,1,,,,,,,,,,,,,,1,"Yes, they all did",I was aware of some,Some did,Some did,I don't know,I don't know,None of them,Some of my previous employers,Some of my previous employers,Some did,None of them,No,No,Maybe,Maybe,Somewhat open,Maybe/Not sure,Yes,No,No,No,1,Not applicable to me,Not applicable to me,34,FEMALE,United States of America,Other,Sometimes,0
1430,0,26-500,1.0,Yes,Yes,Yes,I don't know,Somewhat difficult,Maybe,Maybe,Yes,Yes,I don't know,Yes,1,Some did,I was aware of some,None did,Some did,Sometimes,"Yes, all of them",Some of them,Some of my previous employers,Some of my previous employers,None did,Some of them,Maybe,No,"Yes, it has","No, I don't think they would",Somewhat open,"Yes, I observed",Yes,Yes,Maybe,Yes,1,Rarely,Sometimes,52,MALE,United States of America,Back-end Developer,Sometimes,1
1431,0,26-500,0.0,I don't know,No,Yes,I don't know,Somewhat difficult,Maybe,No,Maybe,Yes,No,No,1,"No, none did",N/A (not currently aware),None did,None did,I don't know,"Yes, all of them",None of them,"No, at none of my previous employers","No, at none of my previous employers",None did,None of them,Maybe,No,"No, I don't think it would","No, I don't think they would",Somewhat open,"Yes, I experienced",Yes,Maybe,Yes,Yes,0,Sometimes,Often,30,FEMALE,United States of America,DevOps/SysAdmin,Sometimes,1


In [90]:
# Columns that are not one-hot encoded.
columns_to_not_encode = ['is_self_employed', 'has_prev_employers', 'sought_treatment', 'age', 'work_position']

# Every other column, in frame order, is one-hot encoded.
to_encode = list(filter(lambda name: name not in columns_to_not_encode, df.columns))

# work_position is dropped for now.
df.drop(columns='work_position', inplace=True)

In [91]:
# One-hot encode the columns listed in to_encode as float indicator columns named
# '<column>_<value>'; the remaining columns are carried over unchanged.
encoded = pd.get_dummies(df, columns=to_encode, dtype=float)

In [92]:
# Display the encoded frame; pandas elides the middle rows and columns.
# NOTE(review): encoded.head() would keep the saved output smaller.
encoded

Unnamed: 0,is_self_employed,has_prev_employers,sought_treatment,age,employee_count_bracket_1-25,employee_count_bracket_26-500,employee_count_bracket_500 or more,employee_count_bracket_N/A,is_tech_org_0.0,is_tech_org_1.0,is_tech_org_N/A,empl_provides_mh_cov_I don't know,empl_provides_mh_cov_N/A,empl_provides_mh_cov_No,empl_provides_mh_cov_Not eligible for coverage / N/A,empl_provides_mh_cov_Yes,empl_discussed_mh_I don't know,empl_discussed_mh_N/A,empl_discussed_mh_No,empl_discussed_mh_Yes,empl_offers_mh_rsrcs_I don't know,empl_offers_mh_rsrcs_N/A,empl_offers_mh_rsrcs_No,empl_offers_mh_rsrcs_Yes,anon_protec_I don't know,anon_protec_N/A,anon_protec_No,anon_protec_Yes,if_askfor_mh_medical_leave_how_easy_I don't know,if_askfor_mh_medical_leave_how_easy_N/A,if_askfor_mh_medical_leave_how_easy_Neither easy nor difficult,...,hashad_mh_inpast_No,hashad_mh_inpast_Yes,HAS_MH_Maybe,HAS_MH_No,HAS_MH_Yes,hasbeen_diag_byprof_No,hasbeen_diag_byprof_Yes,mh_interferes_w/work_effective_trt_Never,mh_interferes_w/work_effective_trt_Not applicable to me,mh_interferes_w/work_effective_trt_Often,mh_interferes_w/work_effective_trt_Rarely,mh_interferes_w/work_effective_trt_Sometimes,mh_interferes_w/work_ineffective_trt_Never,mh_interferes_w/work_ineffective_trt_Not applicable to me,mh_interferes_w/work_ineffective_trt_Often,mh_interferes_w/work_ineffective_trt_Rarely,mh_interferes_w/work_ineffective_trt_Sometimes,sex_FEMALE,sex_MALE,sex_OTHER,country_worksin_Australia,country_worksin_Canada,country_worksin_Germany,country_worksin_Netherlands,country_worksin_United Kingdom,country_worksin_United States of America,is_remote_working_Always,is_remote_working_Never,is_remote_working_Sometimes,has_tech_role_0,has_tech_role_1
0,0,1,0,39,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,1.0,0.0,1.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,1.0
1,0,1,1,29,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,...,0.0,1.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,1.0
2,0,1,1,38,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,...,0.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,1.0
3,1,1,1,43,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,...,0.0,1.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,1.0,0.0
4,0,1,1,43,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,...,0.0,1.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,1.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1427,0,1,1,38,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,...,0.0,1.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,0.0,0.0,1.0,0.0
1428,1,1,1,34,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,...,1.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,1.0,0.0
1430,0,1,1,52,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,1.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,1.0
1431,0,1,0,30,0.0,1.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,1.0


In [93]:
# Respondents per merged company-size bracket; 'N/A' is the value filled in for the self-employed.
df['employee_count_bracket'].value_counts()

employee_count_bracket
26-500         481
500 or more    300
1-25           229
N/A            209
Name: count, dtype: int64