In [1]:
import numpy as np
import pandas as pd
from datetime import datetime

In [2]:
data = pd.read_csv('SalesForce_Contact.csv', encoding='latin', low_memory=False)
data.shape

(132445, 391)

# General Methods

In [3]:
def count_value_null(feature, data):
    #print("Feature = ", feature)
    print("Number of records:", len(data[feature].index))
    print("Null Values:", data[feature].isna().sum())
    print("Null Values %:", data[feature].isna().sum()/len(data.index)*100)
    print("Unique Values:", data[feature].nunique())
    print("Top 10 Value Counts:")
    print(data[feature].value_counts().head(10))
    print("\n")

# Features exploration 1

This section are be removed without any impact to the result.

In [4]:
count_value_null('Volunteer__c', data)
count_value_null('Client__c', data)

Number of records: 132445
Null Values: 0
Null Values %: 0.0
Unique Values: 2
Top 10 Value Counts:
0    130070
1      2375
Name: Volunteer__c, dtype: int64


Number of records: 132445
Null Values: 0
Null Values %: 0.0
Unique Values: 2
Top 10 Value Counts:
1    105744
0     26701
Name: Client__c, dtype: int64




In [5]:
#Contacts that are neither Clients nor Volunteers (Who are they? Donor?)
data[(data['Client__c']== 0) & (data['Volunteer__c']== 0)].shape

(24484, 391)

In [6]:
#Contacts that are either Clients or Volunteers
data[(data['Client__c']== 1) & (data['Volunteer__c']== 1)].shape

(158, 391)

In [7]:
count_value_null('Hire_Heroes_USA_Confirmed_Hire__c', data)

Number of records: 132445
Null Values: 0
Null Values %: 0.0
Unique Values: 2
Top 10 Value Counts:
0    102095
1     30350
Name: Hire_Heroes_USA_Confirmed_Hire__c, dtype: int64




In [8]:
count_value_null('Regional_Manager_Approved__c', data)

Number of records: 132445
Null Values: 0
Null Values %: 0.0
Unique Values: 2
Top 10 Value Counts:
0    109447
1     22998
Name: Regional_Manager_Approved__c, dtype: int64




# Categorize clients (job seekers)

Job seeker include Hire and Non_hire. Hire include Approved_hire and Non_approved_hire.
Hire Hero has a multilayered approval process which is in place to ensure that their services are directly correlated to the client's success.

In [9]:
# All job seekers
client = data[(data['Client__c']==1)]
client.shape

(105744, 391)

In [10]:
# Job seekers who haven't got hired
non_hire = data[(data['Client__c']==1) & (data['Hire_Heroes_USA_Confirmed_Hire__c']==0)]
non_hire.shape

(75394, 391)

In [11]:
# NOTE: These cases should not be considered as non_hire because they are not clients to start with
data[(data['Client__c']==0) & (data['Hire_Heroes_USA_Confirmed_Hire__c']==0)].shape

(26701, 391)

In [12]:
# Job seekers who already got hired
hire = data[(data['Client__c']==1) & (data['Hire_Heroes_USA_Confirmed_Hire__c']==1)]
hire.shape

(30350, 391)

In [13]:
# Job seekers who got hired due to the Hire_Hero Serive
approved_hire = data[data['Regional_Manager_Approved__c'] == 1]
approved_hire.shape

(22998, 391)

In [14]:
# Job seekers who got hired but not correlated to Hire_Hero Serive
non_approved_hire = data[(data['Hire_Heroes_USA_Confirmed_Hire__c']==1) & (data['Regional_Manager_Approved__c'] == 0)]
non_approved_hire.shape

(7355, 391)

In [15]:
# NOTE: There are 3 cases that were not confirmed hire but approved by Division Manager. They are not counted as approved_hire
data[(data['Hire_Heroes_USA_Confirmed_Hire__c']==0) & (data['Regional_Manager_Approved__c'] == 1)].shape

(3, 391)

# Volunteers vs Owners

Examine if owners and volunteers are the same

In [16]:
# All volunteers
volunteer = data[(data['Volunteer__c']==1)]
volunteer.shape

(2375, 391)

In [17]:
count_value_null('OwnerId', client)

Number of records: 105744
Null Values: 0
Null Values %: 0.0
Unique Values: 196
Top 10 Value Counts:
00550000000ztRtAAI    30267
00550000001YDwUAAW     3049
00550000001zVqjAAE     2491
00550000001Vw8JAAS     1400
005500000042X0TAAU     1304
00538000004lJWlAAM     1244
00538000004SivDAAS     1221
005500000042GlIAAU     1200
00538000004kgf9AAA     1199
00538000004SiwfAAC     1177
Name: OwnerId, dtype: int64




In [18]:
count_value_null('OwnerId', volunteer)

Number of records: 2375
Null Values: 0
Null Values %: 0.0
Unique Values: 99
Top 10 Value Counts:
00550000001WkUeAAK    1496
00538000004kmwWAAQ     243
00550000000xADhAAM     180
005380000062eOzAAI     170
00538000004tArfAAE      87
00550000000z4FcAAI      10
00538000004gsNsAAI      10
00550000000ztRtAAI       7
00550000003KNnDAAW       7
00550000001XN4zAAG       7
Name: OwnerId, dtype: int64




In [19]:
data[data['Id'] == "00550000000ztRtAAI"]

Unnamed: 0,Id,AccountId,RecordTypeId,MailingState,MailingPostalCode,MailingCountry,LeadSource,OwnerId,HasOptedOutOfEmail,HasOptedOutOfFax,...,Send_Green_Survey__c,Professional_Certification__c,TS_Referral_Requested_By__c,RealZip__RealZip__c,Discharge_Type__c,Discharge_Disposition__c,Date_Turned_Black__c,Litmos__Litmos_Login_Access__c,Litmos__Total_Sum_Percentages__c,Willing_to_Relocate_to_High_Risk_Area__c


In [20]:
ownerId = data['OwnerId'].unique()
count = 0
for i in ownerId:
    if (i in data["Id"]):
        count = count + 1
count

0

Observations: Both Client and Volunteer has Owner. Owner itself doesn't have a record in the Contacts (which include Volunteers and Clients) because Owners are staff members. Conclusion: Owner and Volunteer are different. We want to assess performance of the Owner, not Volunteer.

# Owners

Check if the owner actually completed the assessment and resume for a case. And from there decide who should be the case owner.

In [21]:
print(approved_hire.shape)
result3 = approved_hire[(approved_hire['OwnerId']== approved_hire['Assessment_Completed_By__c']) & (approved_hire['OwnerId']== approved_hire['Resume_Completed_By__c'])]
print(result3.shape)

(22998, 391)
(17336, 391)


In [22]:
result4 = approved_hire[(approved_hire['Resume_Completed_By__c']== approved_hire['Assessment_Completed_By__c'])]
result4.shape

(22455, 391)

In [23]:
# How many cases the resume and assessment were completed by the same person but not the OwnerId
result4.shape[0]-result3.shape[0]

5119

In [24]:
# How many cases the resume and assessment were completed by diffrent person
result5 = approved_hire[(approved_hire['Resume_Completed_By__c']!= approved_hire['Assessment_Completed_By__c'])]
result5.shape
result5[['OwnerId','Resume_Completed_By__c','Assessment_Completed_By__c']]

Unnamed: 0,OwnerId,Resume_Completed_By__c,Assessment_Completed_By__c
864,00550000002ME8BAAW,00550000002ME8BAAW,
3606,0050z0000079dpsAAA,00538000004SivXAAS,005500000043hc0AAA
6254,0050z0000079ElgAAE,00538000004kCkJAAU,00550000000ztRtAAI
12641,005500000042TrsAAE,005500000042TrsAAE,
12650,0050z0000079ElvAAE,005500000044HdTAAU,00550000003HPPXAA4
12752,00550000003JObzAAG,,00550000003JObzAAG
12779,005500000043hc0AAA,005500000043hc0AAA,005500000043hbvAAA
12856,005500000042c6MAAQ,00550000002ME8LAAW,005500000042c6MAAQ
12988,00538000004lJWlAAM,00538000004lJWlAAM,005500000042I1HAAU
13058,00538000004lJWlAAM,00538000004lJWlAAM,005500000042I1HAAU


In [25]:
result7 = approved_hire[(approved_hire['OwnerId']!= approved_hire['Assessment_Completed_By__c']) & (approved_hire['OwnerId'] != approved_hire['Resume_Completed_By__c'])]
result5.shape


(543, 391)

In [26]:
count_value_null('Assessment_Completed_By__c', approved_hire)

Number of records: 22998
Null Values: 112
Null Values %: 0.4869988694669101
Unique Values: 142
Top 10 Value Counts:
005500000042TrsAAE    555
00550000001zVqjAAE    533
00550000001YDwUAAW    499
00550000002dthaAAA    499
005500000042c6HAAQ    494
00550000003HPPXAA4    457
00538000004lAqgAAE    443
005500000042GlIAAU    435
005500000042pkoAAA    403
00538000004lAqqAAE    399
Name: Assessment_Completed_By__c, dtype: int64




In [27]:
count_value_null('Resume_Completed_By__c', approved_hire)

Number of records: 22998
Null Values: 120
Null Values %: 0.5217845030002608
Unique Values: 142
Top 10 Value Counts:
005500000042TrsAAE    555
00550000001zVqjAAE    541
00550000002dthaAAA    519
00550000001YDwUAAW    499
005500000042c6HAAQ    492
00550000003HPPXAA4    456
005500000042GlIAAU    445
00538000004lAqgAAE    440
00550000003HPRrAAO    408
005500000042pkoAAA    407
Name: Resume_Completed_By__c, dtype: int64




In [28]:
count_value_null('OwnerId', approved_hire)

Number of records: 22998
Null Values: 0
Null Values %: 0.0
Unique Values: 150
Top 10 Value Counts:
00550000001zVqjAAE    649
005500000042TrsAAE    604
00550000001YDwUAAW    588
005500000042c6HAAQ    556
00538000004lAqgAAE    545
005500000042GlIAAU    542
00538000004sKV8AAM    529
005500000042X0TAAU    466
00550000003HPPXAA4    461
00538000004lAqqAAE    455
Name: OwnerId, dtype: int64




In [29]:
approved_hire = approved_hire.dropna(subset=['Assessment_Completed_By__c', 'Resume_Completed_By__c'], how='all')
approved_hire.shape

(22949, 391)

In [30]:
approved_hire['Case_Owner'] = approved_hire['OwnerId']
approved_hire.loc[approved_hire['Resume_Completed_By__c'] == approved_hire['Assessment_Completed_By__c'], 'Case_Owner'] = approved_hire['Resume_Completed_By__c']
approved_hire[['Case_Owner', 'OwnerId','Resume_Completed_By__c','Assessment_Completed_By__c']]

Unnamed: 0,Case_Owner,OwnerId,Resume_Completed_By__c,Assessment_Completed_By__c
380,00538000005F2ADAA0,00538000005F2ADAA0,00538000005F2ADAA0,00538000005F2ADAA0
864,00550000002ME8BAAW,00550000002ME8BAAW,00550000002ME8BAAW,
898,00538000005Z3wrAAC,00538000005Z3wrAAC,00538000005Z3wrAAC,00538000005Z3wrAAC
1638,00550000002dthaAAA,005500000042c6HAAQ,00550000002dthaAAA,00550000002dthaAAA
1676,00538000005U4bgAAC,00538000005U4bgAAC,00538000005U4bgAAC,00538000005U4bgAAC
1679,00538000005Z3wcAAC,00538000005Z3wcAAC,00538000005Z3wcAAC,00538000005Z3wcAAC
1689,005500000042GlIAAU,005500000042GlIAAU,005500000042GlIAAU,005500000042GlIAAU
1697,00538000004lAqqAAE,00538000004lAqqAAE,00538000004lAqqAAE,00538000004lAqqAAE
1716,00538000004yM8rAAE,00538000005TfVXAA0,00538000004yM8rAAE,00538000004yM8rAAE
1723,00538000005CuSeAAK,00538000004scF3AAI,00538000005CuSeAAK,00538000005CuSeAAK


In [31]:
count_value_null('Case_Owner', approved_hire)

Number of records: 22949
Null Values: 0
Null Values %: 0.0
Unique Values: 146
Top 10 Value Counts:
005500000042TrsAAE    558
00550000001zVqjAAE    543
00550000002dthaAAA    518
00550000001YDwUAAW    508
005500000042c6HAAQ    495
00550000003HPPXAA4    459
005500000042GlIAAU    448
00538000004lAqgAAE    445
00550000003HPRrAAO    409
005500000042pkoAAA    407
Name: Case_Owner, dtype: int64




# Actions Taken by the Owners

In [32]:
# How many times Staff member has attempted to reach contact by phone (successful AND unsuccessful)
count_value_null('ringdna100__Call_Attempts__c', approved_hire)
count_value_null('ringdna100__Call_Attempts__c', non_hire)

Number of records: 22949
Null Values: 15931
Null Values %: 69.4191468037823
Unique Values: 22
Top 10 Value Counts:
1.0     1316
2.0     1198
3.0     1097
4.0      971
5.0      768
6.0      574
7.0      360
8.0      262
9.0      186
10.0     113
Name: ringdna100__Call_Attempts__c, dtype: int64


Number of records: 75394
Null Values: 53493
Null Values %: 70.95126933177707
Unique Values: 31
Top 10 Value Counts:
1.0     4558
3.0     4046
4.0     2552
2.0     2550
5.0     1792
6.0     1466
7.0     1147
8.0      914
9.0      701
10.0     585
Name: ringdna100__Call_Attempts__c, dtype: int64




In [33]:
count_value_null('ringdna100__Email_Attempts__c', approved_hire)
count_value_null('ringdna100__Email_Attempts__c', non_hire)

Number of records: 22949
Null Values: 15252
Null Values %: 66.46041221839731
Unique Values: 53
Top 10 Value Counts:
1.0     1410
2.0     1015
3.0      905
4.0      767
5.0      638
6.0      529
7.0      394
8.0      371
9.0      290
10.0     275
Name: ringdna100__Email_Attempts__c, dtype: int64


Number of records: 75394
Null Values: 53467
Null Values %: 70.9167838289519
Unique Values: 52
Top 10 Value Counts:
1.0     3797
3.0     3787
2.0     2266
4.0     2166
5.0     1691
6.0     1543
7.0     1158
8.0      964
9.0      814
10.0     713
Name: ringdna100__Email_Attempts__c, dtype: int64




In [34]:
approved_hire['ringdna100__Email_Attempts__c'][approved_hire['ringdna100__Email_Attempts__c']> 10.0].value_counts()

11.0     204
12.0     158
14.0     119
13.0     117
15.0      85
16.0      71
17.0      58
18.0      49
19.0      41
20.0      33
21.0      27
23.0      20
26.0      15
22.0      14
24.0      12
27.0      11
28.0       9
30.0       7
29.0       7
25.0       7
31.0       5
33.0       5
40.0       3
34.0       3
39.0       2
51.0       2
44.0       2
47.0       2
58.0       1
92.0       1
36.0       1
32.0       1
37.0       1
53.0       1
43.0       1
104.0      1
38.0       1
42.0       1
35.0       1
100.0      1
63.0       1
50.0       1
95.0       1
Name: ringdna100__Email_Attempts__c, dtype: int64

In [35]:
# How many times Staff member has attempted to reach contact by phone (successful AND unsuccessful)
approved_hire['ringdna100__Call_Attempts__c'][approved_hire['ringdna100__Call_Attempts__c']>10.0].value_counts()

11.0    51
13.0    36
12.0    32
14.0    18
15.0    14
16.0     7
17.0     6
18.0     3
20.0     3
21.0     1
25.0     1
30.0     1
Name: ringdna100__Call_Attempts__c, dtype: int64

In [36]:
# Date contact was deemd unresponsive
count_value_null('Date_turned_grey__c', approved_hire)

Number of records: 22949
Null Values: 20104
Null Values %: 87.60294566212036
Unique Values: 887
Top 10 Value Counts:
7/27/2017 0:00     87
6/10/2016 0:00     16
6/3/2016 0:00      12
6/2/2017 0:00      12
6/27/2017 0:00     11
4/29/2016 0:00     11
7/7/2017 0:00      11
6/30/2017 0:00     11
12/20/2016 0:00    10
4/4/2016 0:00      10
Name: Date_turned_grey__c, dtype: int64




In [37]:
temp = approved_hire[['CreatedDate','Date_turned_grey__c', 'Date_Turned_Black__c','Date_turned_green__c','Date_Submitted_for_Hire__c']]
temp = temp.dropna(subset=['Date_turned_grey__c', 'Date_Turned_Black__c'], how='all')
temp


Unnamed: 0,CreatedDate,Date_turned_grey__c,Date_Turned_Black__c,Date_turned_green__c,Date_Submitted_for_Hire__c
1638,4/23/2014 23:19,10/8/2014 0:00,,10/8/2014 0:00,9/10/2015 0:00
2290,5/21/2015 19:15,11/23/2015 0:00,,5/27/2015 0:00,6/1/2016 0:00
4811,4/27/2016 13:38,8/14/2018 0:00,,1/2/2018 0:00,3/14/2017 0:00
4846,11/19/2013 5:33,5/20/2016 0:00,,4/7/2014 0:00,6/20/2018 0:00
6255,8/13/2015 13:27,3/15/2016 0:00,,,3/16/2016 0:00
6310,1/31/2017 18:52,,2/24/2017 0:00,6/7/2018 0:00,7/3/2018 0:00
6311,2/16/2014 3:23,7/20/2016 0:00,,11/9/2018 0:00,3/19/2018 0:00
9506,9/24/2015 14:37,12/10/2018 0:00,,10/1/2015 0:00,2/3/2016 0:00
12584,8/24/2015 0:02,5/24/2017 0:00,,9/3/2015 0:00,1/2/2018 0:00
12585,8/24/2015 0:41,9/15/2015 0:00,,9/19/2016 0:00,11/3/2016 0:00


In [38]:
approved_hire['Days_To_Grey'] = int()
from datetime import datetime
for index, row in approved_hire.iterrows():
    if pd.isnull(row['Date_turned_grey__c']) == False:
        date_format = "%m/%d/%Y %H:%M"
        a = datetime.strptime(str(row['CreatedDate']), date_format)
        b = datetime.strptime(str(row['Date_turned_grey__c']), date_format)
        approved_hire.at[index,'Days_To_Grey']=(b-a).days

In [39]:
approved_hire['Days_To_Grey'].value_counts()

0        20111
9          111
13          94
10          92
7           92
12          84
6           82
11          82
8           82
14          53
16          51
15          48
17          45
19          41
3           39
5           37
20          34
4           34
18          32
21          27
2           21
22          20
23          20
24          20
26          17
25          15
28          12
195         11
27          11
29          11
         ...  
617          1
633          1
681          1
969          1
1177         1
568          1
488          1
68014        1
823          1
135          1
151          1
359          1
439          1
471          1
599          1
663          1
679          1
759          1
887          1
472          1
903          1
1191         1
1879         1
1631         1
72           1
248          1
360          1
424          1
456          1
1855         1
Name: Days_To_Grey, Length: 666, dtype: int64

In [40]:
#Date contact was indicated to be "inactive"
count_value_null('Date_Turned_Black__c', approved_hire)

Number of records: 22949
Null Values: 22910
Null Values %: 99.83005795459498
Unique Values: 34
Top 10 Value Counts:
3/6/2018 0:00     2
3/7/2018 0:00     2
3/29/2018 0:00    2
4/4/2018 0:00     2
1/31/2018 0:00    2
4/26/2018 0:00    1
1/19/2018 0:00    1
2/13/2018 0:00    1
4/8/2016 0:00     1
5/17/2018 0:00    1
Name: Date_Turned_Black__c, dtype: int64




In [41]:
approved_hire['Days_To_Black'] = int()
from datetime import datetime
for index, row in approved_hire.iterrows():
    if pd.isnull(row['Date_Turned_Black__c']) == False:
        date_format = "%m/%d/%Y %H:%M"
        a = datetime.strptime(str(row['CreatedDate']), date_format)
        b = datetime.strptime(str(row['Date_Turned_Black__c']), date_format)
        approved_hire.at[index,'Days_To_Black']=(b-a).days

In [42]:
approved_hire['Days_To_Black'].value_counts()

0       22910
7           6
5           4
6           3
2           3
4           3
10          2
23          2
69          1
16          1
3           1
1283        1
20          1
564         1
188         1
12          1
813         1
216         1
9           1
25          1
90          1
11          1
91          1
15          1
Name: Days_To_Black, dtype: int64

In [43]:
count_value_null('Date_Submitted_for_Hire__c', approved_hire)

Number of records: 22949
Null Values: 7
Null Values %: 0.03050241840603076
Unique Values: 1007
Top 10 Value Counts:
4/20/2018 0:00    86
4/13/2018 0:00    84
3/30/2018 0:00    68
4/16/2018 0:00    65
12/1/2017 0:00    63
4/12/2018 0:00    62
6/5/2017 0:00     60
9/1/2017 0:00     60
4/5/2018 0:00     60
7/11/2017 0:00    60
Name: Date_Submitted_for_Hire__c, dtype: int64




In [44]:
approved_hire['Days_To_Hire'] = int()
from datetime import datetime
for index, row in approved_hire.iterrows():
    if pd.isnull(row['Date_Submitted_for_Hire__c']) == False:
        date_format = "%m/%d/%Y %H:%M"
        a = datetime.strptime(str(row['CreatedDate']), date_format)
        b = datetime.strptime(str(row['Date_Submitted_for_Hire__c']), date_format)
        approved_hire.at[index,'Days_To_Hire']=(b-a).days

In [45]:
approved_hire['Days_To_Hire'].value_counts()
#approved_hire['Days_To_Hire'].mean()

76      110
111     106
69      106
34      103
55      102
97      101
84      100
103     100
90       99
98       99
50       97
118      96
56       96
77       96
126      95
124      93
82       93
133      91
83       91
139      91
70       90
63       90
71       90
73       90
49       89
105      88
41       88
101      87
68       87
48       86
       ... 
1157      1
1365      1
1540      1
1396      1
1364      1
1284      1
1804      1
1500      1
1452      1
1404      1
1388      1
1292      1
1260      1
1196      1
1164      1
1132      1
1100      1
2693      1
724       1
852       1
868       1
932       1
964       1
980       1
1012      1
1108      1
1172      1
1220      1
1268      1
1887      1
Name: Days_To_Hire, Length: 1241, dtype: int64

In [46]:
print(approved_hire["Gender__c"].value_counts())
print(non_hire["Gender__c"].value_counts())
print(client["Gender__c"].value_counts())

Male      16542
Female     4216
Name: Gender__c, dtype: int64
Male      29457
Female     9841
Name: Gender__c, dtype: int64
Male      51515
Female    15429
Name: Gender__c, dtype: int64


In [47]:
count_value_null('Months_Unemployed__c', approved_hire)
count_value_null('Months_Unemployed__c', non_hire)
# Months_unemployed seemed to be calculated after a client get hired because 99.9% non_hire don't have months_unemployed


Number of records: 22949
Null Values: 17181
Null Values %: 74.86600723343065
Unique Values: 25
Top 10 Value Counts:
0.0     2364
1.0      734
2.0      644
3.0      501
4.0      352
6.0      247
5.0      224
12.0     165
8.0      122
7.0      110
Name: Months_Unemployed__c, dtype: int64


Number of records: 75394
Null Values: 75351
Null Values %: 99.94296628378916
Unique Values: 8
Top 10 Value Counts:
0.0    15
1.0    14
2.0     5
3.0     3
6.0     2
5.0     2
4.0     1
9.0     1
Name: Months_Unemployed__c, dtype: int64




In [48]:
count_value_null('Date_of_Service_Entry__c', approved_hire)

Number of records: 22949
Null Values: 21301
Null Values %: 92.81885920955162
Unique Values: 1447
Top 10 Value Counts:
1998         9
2007         5
11/4/2004    4
2004         4
1995         4
2001         4
2003         4
6-Jan        4
1990         4
2008         4
Name: Date_of_Service_Entry__c, dtype: int64




In [49]:
count_value_null('Date_of_Separation__c', approved_hire)

Number of records: 22949
Null Values: 21006
Null Values %: 91.53340014815461
Unique Values: 1356
Top 10 Value Counts:
UNK           17
Retirement    16
Unknown       16
TBD           15
5/1/2015      14
11/1/2014     13
RETIRED       12
2014          12
Retiring      11
1/15/2015     11
Name: Date_of_Separation__c, dtype: int64




Cannot calculate service length from these entry date and seperation date because there's a lot of null values

In [50]:
count_value_null('Finalized_HHUSA_revised_resume_on_file__c', approved_hire)
count_value_null('Finalized_HHUSA_revised_resume_on_file__c', non_hire)

Number of records: 22949
Null Values: 0
Null Values %: 0.0
Unique Values: 2
Top 10 Value Counts:
1    22919
0       30
Name: Finalized_HHUSA_revised_resume_on_file__c, dtype: int64


Number of records: 75394
Null Values: 0
Null Values %: 0.0
Unique Values: 2
Top 10 Value Counts:
0    49939
1    25455
Name: Finalized_HHUSA_revised_resume_on_file__c, dtype: int64




In [51]:
count_value_null('Dat_Initial_Assessment_was_Completed__c', approved_hire)

Number of records: 22949
Null Values: 106
Null Values %: 0.4618937644341801
Unique Values: 1363
Top 10 Value Counts:
5/16/2016 0:00     124
12/5/2016 0:00     117
5/2/2016 0:00      106
2/8/2016 0:00      106
4/11/2016 0:00     105
1/23/2017 0:00     104
4/18/2016 0:00     103
1/25/2016 0:00     103
2/27/2017 0:00     102
11/28/2016 0:00    101
Name: Dat_Initial_Assessment_was_Completed__c, dtype: int64




In [52]:
approved_hire['Days_To_Assessment'] = int()

for index, row in approved_hire.iterrows():
    if pd.isnull(row['Dat_Initial_Assessment_was_Completed__c']) == False:
        date_format = "%m/%d/%Y %H:%M"
        a = datetime.strptime(str(row['CreatedDate']), date_format)
        b = datetime.strptime(str(row['Dat_Initial_Assessment_was_Completed__c']), date_format)
        approved_hire.at[index,'Days_To_Assessment']=(b-a).days
approved_hire['Days_To_Assessment'].value_counts()
#approved_hire[approved_hire['Days_To_Assessment'] > 30].shape

-1       2559
 5       2558
 4       2335
 6       2204
 3       2118
 2       1542
 0       1268
 7       1126
 1        860
 8        652
 9        521
 10       513
 12       464
 11       459
 13       391
-2        347
 14       239
 17       154
 19       134
 15       132
 18       129
-3        124
 16       122
 20       110
 25        80
 21        77
 24        77
 23        64
 26        63
-4         60
         ... 
 281        1
 345        1
 327        1
 409        1
 473        1
 489        1
 617        1
 921        1
 1449       1
 1529       1
 2601       1
 552        1
 376        1
 360        1
 423        1
 455        1
 615        1
 695        1
 823        1
 3030       1
 999        1
 1031       1
 1239       1
 2023       1
 136        1
 168        1
 232        1
 280        1
 312        1
 2358       1
Name: Days_To_Assessment, Length: 444, dtype: int64

In [82]:
non_hire['Days_To_Current'] = int()

for index, row in non_hire.iterrows():
    date_format = "%m/%d/%Y %H:%M"
    a = datetime.strptime(str(row['CreatedDate']), date_format)
    b = datetime.strptime('1/25/2019 0:00', date_format)
    non_hire.at[index,'Days_To_Current']=(b-a).days
#non_hire['Days_To_Current'].value_counts()
non_hire['Days_To_Current'].mean()
non_hire = non_hire[non_hire['Days_To_Current'] > 60]
non_hire.shape

(72262, 392)

In [54]:
count_value_null('Date_Assigned_To_HHUSA__c', client)
client['CreatedDate']

Number of records: 105744
Null Values: 31451
Null Values %: 29.742585867756087
Unique Values: 1574
Top 10 Value Counts:
12/5/2016 0:00     447
12/12/2016 0:00    438
5/16/2016 0:00     435
11/28/2016 0:00    426
6/6/2016 0:00      418
1/23/2017 0:00     407
1/30/2017 0:00     402
10/3/2016 0:00     398
2/6/2017 0:00      395
10/24/2016 0:00    387
Name: Date_Assigned_To_HHUSA__c, dtype: int64




0           8/5/2018 5:28
1           8/5/2018 5:59
2          8/5/2018 11:45
3          8/5/2018 14:05
4          8/5/2018 15:41
5          8/5/2018 15:55
6          8/5/2018 16:28
7          8/5/2018 17:10
8          8/5/2018 17:24
9          8/5/2018 19:06
10         8/5/2018 20:27
11         8/5/2018 20:33
12         8/5/2018 20:40
13          8/6/2018 0:12
14          8/6/2018 0:24
16          8/6/2018 1:55
17          8/6/2018 2:23
18          8/6/2018 3:39
19          8/6/2018 4:47
20         8/6/2018 10:28
21         8/6/2018 11:06
26         8/6/2018 13:29
29         8/6/2018 13:40
30         8/6/2018 13:41
31         8/6/2018 14:28
32         8/6/2018 14:29
33         8/6/2018 14:41
34         8/6/2018 14:44
36         8/6/2018 14:54
37         8/6/2018 15:09
               ...       
132413    8/21/2015 16:53
132414    8/21/2015 17:01
132415    8/21/2015 17:32
132416    8/21/2015 17:39
132418    8/21/2015 18:25
132419    8/21/2015 18:32
132420    8/21/2015 18:33
132421    8/

In [86]:
count_value_null('Used_Volunteer_Services__c', approved_hire)
count_value_null('Used_Volunteer_Services__c', non_hire)

Number of records: 22949
Null Values: 0
Null Values %: 0.0
Unique Values: 2
Top 10 Value Counts:
0.0    20125
1.0     2824
Name: Used_Volunteer_Services__c, dtype: int64


Number of records: 72262
Null Values: 6304
Null Values %: 8.723810578173866
Unique Values: 2
Top 10 Value Counts:
0.0    64194
1.0     1764
Name: Used_Volunteer_Services__c, dtype: int64




In [90]:
count_value_null('Security_Clearance_Description__c', approved_hire)
count_value_null('Security_Clearance_Description__c', non_hire)
#Security_Clearance_Description__c
#If_Security_Clearance_Yes_What_kind__c

Number of records: 22949
Null Values: 1398
Null Values %: 6.091768704518715
Unique Values: 23
Top 10 Value Counts:
Yes                               13618
No                                 4675
Inactive                           1792
Active                              978
Secret                              329
Top Secret, SBI/SCI                  72
Top Secret                           26
SBI/SCI                              16
Top Secret, SBI/SCI, Polygraph        7
Confidential                          6
Name: Security_Clearance_Description__c, dtype: int64


Number of records: 72262
Null Values: 29714
Null Values %: 41.11981401012981
Unique Values: 188
Top 10 Value Counts:
Yes                     20420
No                      17474
Inactive                 1811
Secret                   1298
Active                    869
Top Secret, SBI/SCI       135
Top Secret                121
SBI/SCI                    49
Confidential               25
Confidential, Secret       17
Name: Securi

# Calculate Responsive Points

In [91]:
approved_hire['Responsive_Point'] = 0.0
count_value_null('Responsive_Point', approved_hire)

Number of records: 22949
Null Values: 0
Null Values %: 0.0
Unique Values: 1
Top 10 Value Counts:
0.0    22949
Name: Responsive_Point, dtype: int64




Create function to calculate responsive points for confirmed hires from actions

In [92]:
def update_responsive_points(feature, point):
    for index, rowData in approved_hire.iterrows():
        if rowData[feature] == 1.0:
            approved_hire.at[index,'Responsive_Point'] = rowData['Responsive_Point'] + point
    print(approved_hire['Responsive_Point'].value_counts().sort_index())

In [93]:
#True / False (indicates O2O coordinator has had first contact with client)
update_responsive_points('O2O_Initial_Assessment_Complete__c', 0.5)  

0.0    22621
0.5      328
Name: Responsive_Point, dtype: int64


In [94]:
# True / False (indicates new resume created / revised)
update_responsive_points('Finalized_HHUSA_revised_resume_on_file__c', 0.5)    

0.0       30
0.5    22591
1.0      328
Name: Responsive_Point, dtype: int64


In [95]:
# True / False (indicates client has created a profile on HHUSA job board)
update_responsive_points('On_Job_Board__c', 1)           

0.0       29
0.5    19877
1.0      188
1.5     2714
2.0      141
Name: Responsive_Point, dtype: int64


In [96]:
# Client has attended one or more virtual workshop events
update_responsive_points('Virtual_Workshop_Participant__c', 1.5)  

0.0       29
0.5    19854
1.0      188
1.5     2689
2.0      164
3.0       25
Name: Responsive_Point, dtype: int64


In [97]:
#Client has attended / participated in a virtual career fair
update_responsive_points('VCF_Participant__c', 1.5)  

0.0       28
0.5    19667
1.0      173
1.5     2525
2.0      321
2.5       15
3.0      176
3.5       30
4.5       14
Name: Responsive_Point, dtype: int64


In [98]:
#Client has attended one or more webinar events
update_responsive_points('Webinar_Participant__c', 1.5)  

0.0       28
0.5    19611
1.0      171
1.5     2486
2.0      364
2.5       16
3.0      204
3.5       40
4.0        1
4.5       24
5.0        3
6.0        1
Name: Responsive_Point, dtype: int64


In [99]:
#Participated in a mentoring session or mock phone interview
update_responsive_points('Used_Volunteer_Services__c', 2) 

0.0       25
0.5    17536
1.0      117
1.5     2014
2.0      251
2.5     2086
3.0      195
3.5      492
4.0      117
4.5       16
5.0       64
5.5       20
6.5       13
7.0        2
8.0        1
Name: Responsive_Point, dtype: int64


In [100]:
#Job Seeking client had a federal resume reviewed by HHUSA team
update_responsive_points('Used_Federal_Services__c', 2)

0.0       25
0.5    16898
1.0      109
1.5     1850
2.0      230
2.5     2561
3.0      179
3.5      591
4.0      124
4.5      175
5.0       73
5.5       82
6.0       14
6.5       14
7.0       17
7.5        3
8.0        1
8.5        3
Name: Responsive_Point, dtype: int64


In [101]:
# How many times Staff member has attempted to reach contact by phone (successful AND unsuccessful)
for index, rowData in approved_hire.iterrows():
    if rowData['ringdna100__Call_Attempts__c'] > 10.0:
        approved_hire.at[index,'Responsive_Point'] = rowData['Responsive_Point'] + 2
print(approved_hire['Responsive_Point'].value_counts().sort_index())

0.0        25
0.5     16844
1.0       109
1.5      1831
2.0       228
2.5      2584
3.0       168
3.5       586
4.0       117
4.5       199
5.0        81
5.5        99
6.0        22
6.5        19
7.0        18
7.5        10
8.0         2
8.5         4
9.0         2
10.5        1
Name: Responsive_Point, dtype: int64


In [102]:
# How many times Staff member has attempted to reach contact by phone (successful AND unsuccessful)
for index, rowData in approved_hire.iterrows():
    if rowData['ringdna100__Email_Attempts__c'] > 10.0:
        approved_hire.at[index,'Responsive_Point'] = rowData['Responsive_Point'] + 2
print(approved_hire['Responsive_Point'].value_counts().sort_index())

0.0        25
0.5     16536
1.0       102
1.5      1701
2.0       203
2.5      2641
3.0       145
3.5       585
4.0       108
4.5       387
5.0        82
5.5       185
6.0        42
6.5        70
7.0        36
7.5        48
8.0        15
8.5        14
9.0        11
9.5         7
10.0        1
10.5        2
11.0        2
12.5        1
Name: Responsive_Point, dtype: int64


In [103]:
for index, rowData in approved_hire.iterrows():
    if rowData['Days_To_Grey']> 0 & rowData['Days_To_Grey'] < 30:
        approved_hire.at[index,'Responsive_Point'] = rowData['Responsive_Point'] - 2
print(approved_hire['Responsive_Point'].value_counts().sort_index())

-2.0         4
-1.5      2035
-1.0        20
-0.5       222
 0.0        54
 0.5     14793
 1.0       103
 1.5      1553
 2.0       184
 2.5      2398
 3.0       137
 3.5       532
 4.0        98
 4.5       355
 5.0        73
 5.5       169
 6.0        39
 6.5        55
 7.0        33
 7.5        43
 8.0        14
 8.5        12
 9.0        11
 9.5         7
 10.0        1
 10.5        2
 11.0        1
 12.5        1
Name: Responsive_Point, dtype: int64


In [104]:
for index, rowData in approved_hire.iterrows():
    if rowData['Days_To_Black']> 0 & rowData['Days_To_Black'] < 30:
        approved_hire.at[index,'Responsive_Point'] = rowData['Responsive_Point'] - 3
print(approved_hire['Responsive_Point'].value_counts().sort_index())

-3.5         2
-2.5        17
-2.0         4
-1.5      2039
-1.0        20
-0.5       230
 0.0        56
 0.5     14779
 1.0       103
 1.5      1550
 2.0       184
 2.5      2388
 3.0       135
 3.5       529
 4.0        98
 4.5       354
 5.0        73
 5.5       169
 6.0        39
 6.5        55
 7.0        33
 7.5        43
 8.0        14
 8.5        12
 9.0        11
 9.5         7
 10.0        1
 10.5        2
 11.0        1
 12.5        1
Name: Responsive_Point, dtype: int64


In [105]:
approved_hire['Highest_Level_of_Education_Completed__c'].value_counts()

High School/GED                            7903
4 Year Degree (BA, BS, etc.)               6987
Post-Graduate Degree (MA, MS, JD, etc.)    3722
2 Year Degree (AA, AS, etc.)               3595
Doctorate (PhD, MD, etc.)                   153
Associate degree (A.A., A.S., etc.)          21
College/University                            6
Other, please specify                         1
HS/GED                                        1
Master?s degree                               1
Name: Highest_Level_of_Education_Completed__c, dtype: int64

In [106]:
for index, rowData in approved_hire.iterrows():
    if rowData['Highest_Level_of_Education_Completed__c'] == "Doctorate (PhD, MD, etc.)":
        approved_hire.at[index,'Responsive_Point'] = rowData['Responsive_Point'] + 10
    elif rowData['Highest_Level_of_Education_Completed__c'] == "Post-Graduate Degree (MA, MS, JD, etc.)":
        approved_hire.at[index,'Responsive_Point'] = rowData['Responsive_Point'] + 8
    elif rowData['Highest_Level_of_Education_Completed__c'] == "4 Year Degree (BA, BS, etc.)":
        approved_hire.at[index,'Responsive_Point'] = rowData['Responsive_Point'] + 6
    elif rowData['Highest_Level_of_Education_Completed__c'] == "2 Year Degree (AA, AS, etc.)":
        approved_hire.at[index,'Responsive_Point'] = rowData['Responsive_Point'] + 4
    elif rowData['Highest_Level_of_Education_Completed__c'] == "High School/GED":
        approved_hire.at[index,'Responsive_Point'] = rowData['Responsive_Point'] - 4
print(approved_hire['Responsive_Point'].value_counts().sort_index())

-6.5        7
-6.0        2
-5.5      796
-5.0        5
-4.5       66
-4.0       16
-3.5     5625
-3.0       37
-2.5      435
-2.0       36
-1.5      657
-1.0       20
-0.5      120
 0.0       26
 0.5      500
 1.0       14
 1.5       40
 2.0       10
 2.5      429
 3.0        5
 3.5       63
 4.0       16
 4.5     2916
 5.0       21
 5.5      336
 6.0       27
 6.5     4771
 7.0       57
 7.5      625
 8.0       96
 8.5     3042
 9.0       74
 9.5      527
 10.0      97
 10.5     726
 11.0      72
 11.5     225
 12.0      40
 12.5     135
 13.0      42
 13.5      81
 14.0      22
 14.5      27
 15.0      25
 15.5      17
 16.0       7
 16.5       5
 17.0       3
 17.5       2
 18.0       1
 18.5       2
 19.0       1
 19.5       1
 20.5       1
Name: Responsive_Point, dtype: int64


In [107]:
for index, rowData in approved_hire.iterrows():
    if rowData['Months_Unemployed__c']> 6.0:
        approved_hire.at[index,'Responsive_Point'] = rowData['Responsive_Point'] - 3
print(approved_hire['Responsive_Point'].value_counts().sort_index())

-8.5       37
-8.0        2
-7.5        7
-7.0        1
-6.5      120
-6.0        6
-5.5      776
-5.0        5
-4.5       79
-4.0       15
-3.5     5518
-3.0       34
-2.5      424
-2.0       35
-1.5      640
-1.0       20
-0.5      138
 0.0       27
 0.5      497
 1.0       14
 1.5      113
 2.0       11
 2.5      416
 3.0        4
 3.5      170
 4.0       17
 4.5     2870
 5.0       25
 5.5      411
 6.0       29
 6.5     4687
 7.0       59
 7.5      629
 8.0      100
 8.5     2978
 9.0       72
 9.5      507
 10.0      98
 10.5     698
 11.0      66
 11.5     205
 12.0      44
 12.5     130
 13.0      37
 13.5      77
 14.0      19
 14.5      25
 15.0      20
 15.5      16
 16.0       8
 16.5       3
 17.0       3
 17.5       2
 18.0       1
 18.5       2
 19.5       1
 20.5       1
Name: Responsive_Point, dtype: int64


In [108]:
approved_hire['Case_Type'] = "Medium"
for index, rowData in approved_hire.iterrows():
    if rowData['Responsive_Point'] < 0.0:
        approved_hire.at[index,'Case_Type'] = "Hard"
    if rowData['Responsive_Point'] > 5.0:
        approved_hire.at[index,'Case_Type'] = "Easy"
print(approved_hire['Case_Type'].value_counts())

Easy      10928
Hard       7857
Medium     4164
Name: Case_Type, dtype: int64


In [109]:
table = pd.crosstab(approved_hire['Case_Owner'], columns=approved_hire['Case_Type'])
table

Case_Type,Easy,Hard,Medium
Case_Owner,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
0050z0000079ElgAAE,1,0,0
0050z0000079ElvAAE,0,0,1
0050z0000079dpsAAA,1,0,0
00538000003nrcXAAQ,1,1,0
005380000044j39AAA,170,116,48
005380000044j3JAAQ,3,3,1
00538000004SivDAAS,149,116,87
00538000004SivXAAS,86,66,28
00538000004SiwBAAS,68,35,22
00538000004SiwLAAS,102,65,38
