# Read me

This code clean the data listed in our DSP data dictionary

In [1]:
import numpy as np
import pandas as pd

# Load Contact Dataset

In [2]:
contact_dataset = pd.read_csv('SalesForce_Contact.csv', encoding='latin', low_memory=False)
contact_dataset.shape

(132445, 391)

# Load Hire Info Dataset

In [3]:
Hireinfo_dataset = pd.read_csv('SalesForce_Hire_Information__c.csv', encoding='latin', low_memory=False)
Hireinfo_dataset.shape

(30754, 34)

## General Methods

In [4]:
def count_records_unique_null(feature, data):
    print("Number of records:", len(data[feature].index))
    print("Unique Values:", data[feature].nunique())
    print("Null Values:", data[feature].isna().sum())
    print("Null Values %:", data[feature].isna().sum()/len(data.index)*100)
def count_table(feature, data):
    data_by_feature = data.groupby(feature)
    countTable = data_by_feature[feature].agg(['count'])
    total_feature = countTable.values.sum()
    proportion = np.divide(countTable['count'], total_feature/100)
    countTable['%'] = proportion
    countTable = countTable.reset_index()
    return countTable.nlargest(30, 'count')
def drop_null(feature, data):
    print("*** Before ***")
    count_records_unique_null(feature, data)
    clean_dataset = data.dropna(subset = [feature])
    return clean_dataset
def summary(feature, data):
    print("*** After ***")
    count_records_unique_null(feature, data)
    return count_table(feature, data)

## 1. State

In [5]:
# Maxim's Code edited by Nameetha

In [6]:
clean_contact_dataset = drop_null('MailingState', contact_dataset)
summary('MailingState', clean_contact_dataset)

*** Before ***
Number of records: 132445
Unique Values: 381
Null Values: 8083
Null Values %: 6.102910642153347
*** After ***
Number of records: 124362
Unique Values: 381
Null Values: 0
Null Values %: 0.0


Unnamed: 0,MailingState,count,%
281,TX,14532,11.685241
57,CA,12999,10.45255
295,VA,9651,7.760409
96,GA,9223,7.416253
184,NC,7982,6.418359
90,FL,7686,6.180344
61,CO,6515,5.238739
309,WA,4418,3.552532
197,NY,4055,3.260642
154,MD,2740,2.203245


# Focusing on United States of America as they constitute most part of the data

In [7]:
states_dict = pd.read_csv('States_dict.csv', names=['State_abr', 'State_name'])
abr_list=states_dict['State_abr'].tolist()
name_list=states_dict['State_name'].tolist()

In [8]:
clean_contact_dataset['MailingState'] = clean_contact_dataset['MailingState'].str.upper()
clean_contact_dataset.MailingState.replace(abr_list,name_list,inplace=True)
clean_contact_dataset = clean_contact_dataset[clean_contact_dataset['MailingState'].isin(states_dict['State_name'] )]

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  """Entry point for launching an IPython kernel.
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  self._update_inplace(new_data)


In [9]:
summary('MailingState', clean_contact_dataset)

*** After ***
Number of records: 120070
Unique Values: 50
Null Values: 0
Null Values %: 0.0


Unnamed: 0,MailingState,count,%
42,Texas,14591,12.152078
4,California,13081,10.894478
45,Virginia,9692,8.071958
9,Georgia,9252,7.705505
32,North Carolina,8007,6.66861
8,Florida,7753,6.457067
5,Colorado,6531,5.439327
46,Washington,4453,3.70867
31,New York,4070,3.389689
19,Maryland,2749,2.289498


## 2. Gender

In [10]:
# Alhasan's Code

### Clean the data by droping all the rows with missing values

In [11]:
clean_contact_dataset = drop_null('Gender__c', clean_contact_dataset)
summary('Gender__c', clean_contact_dataset)

*** Before ***
Number of records: 120070
Unique Values: 3
Null Values: 53035
Null Values %: 44.17006746064796
*** After ***
Number of records: 67035
Unique Values: 3
Null Values: 0
Null Values %: 0.0


Unnamed: 0,Gender__c,count,%
2,Male,51175,76.340718
1,Female,15849,23.642873
0,--None--,11,0.016409


### Clean the data by droping all the rows with "--None--"

In [12]:
clean_contact_dataset = clean_contact_dataset.drop(clean_contact_dataset[clean_contact_dataset.Gender__c == "--None--"].index)
summary('Gender__c', clean_contact_dataset)

*** After ***
Number of records: 67024
Unique Values: 2
Null Values: 0
Null Values %: 0.0


Unnamed: 0,Gender__c,count,%
1,Male,51175,76.353247
0,Female,15849,23.646753


## 3. Race

In [13]:
# Alhasan's Code

## 4. Service_Branch

In [14]:
# Alhasan's Code

### Clean the data by droping all the rows with missing values

In [15]:
clean_contact_dataset = drop_null('Service_Branch__c', clean_contact_dataset)
summary('Service_Branch__c', clean_contact_dataset)

*** Before ***
Number of records: 67024
Unique Values: 8
Null Values: 6129
Null Values %: 9.14448555741227
*** After ***
Number of records: 60895
Unique Values: 8
Null Values: 0
Null Values %: 0.0


Unnamed: 0,Service_Branch__c,count,%
1,Army,33341,54.751622
5,Navy,9970,16.372444
0,Air Force,8543,14.029066
3,Marines,8482,13.928894
2,Coast Guard,553,0.908121
6,Not Applicable,3,0.004927
7,Spouse,2,0.003284
4,Merchant Marine,1,0.001642


### Clean the data by including {Spouse, Merchant Marine} into Not Applicable

In [16]:
col = clean_contact_dataset['Service_Branch__c']
col = col.where(col != "Spouse", "Not Applicable")
col = col.where(col != "Merchant Marine", "Not Applicable")
clean_contact_dataset['Service_Branch__c'] = col

In [17]:
summary('Service_Branch__c', clean_contact_dataset)

*** After ***
Number of records: 60895
Unique Values: 6
Null Values: 0
Null Values %: 0.0


Unnamed: 0,Service_Branch__c,count,%
1,Army,33341,54.751622
4,Navy,9970,16.372444
0,Air Force,8543,14.029066
3,Marines,8482,13.928894
2,Coast Guard,553,0.908121
5,Not Applicable,6,0.009853


## 5. Last_Service_Rank

In [18]:
# Alhasan's Code

### Clean the data by droping all the rows with missing values

In [19]:
clean_contact_dataset = drop_null('Service_Rank__c', clean_contact_dataset)
summary('Service_Rank__c', clean_contact_dataset)

*** Before ***
Number of records: 60895
Unique Values: 27
Null Values: 982
Null Values %: 1.6126118728959686
*** After ***
Number of records: 59913
Unique Values: 27
Null Values: 0
Null Values %: 0.0


Unnamed: 0,Service_Rank__c,count,%
6,E-4,13289,22.180495
7,E-5,12970,21.648056
8,E-6,8745,14.596164
9,E-7,7903,13.190793
10,E-8,3512,5.861833
15,O-3,2913,4.86205
5,E-3,2603,4.344633
16,O-4,1529,2.552034
11,E-9,1371,2.288318
17,O-5,1337,2.231569


### Adding Last_Service_Rank Column accourding to:
1. Enlisted Personnel (E)
2. Warrant Officers (W, CW)
3. Commissioned Officers (O)

source: https://www.infoplease.com/us/military-personnel/us-military-ranks

In [20]:
col = clean_contact_dataset['Service_Rank__c']
col = col.where(col.str.startswith('O') == False, "O")
col = col.where(col.str.startswith('E') == False, "E")
col = col.where(col.str.contains('W') == False, "W")
clean_contact_dataset['Last_Service_Rank'] = col

In [21]:
summary('Last_Service_Rank', clean_contact_dataset)

*** After ***
Number of records: 59913
Unique Values: 3
Null Values: 0
Null Values %: 0.0


Unnamed: 0,Last_Service_Rank,count,%
0,E,51250,85.540701
1,O,7361,12.286148
2,W,1302,2.173151


## 8. Education

In [22]:
# Maxim's Code edited by Nameetha

In [23]:
clean_contact_dataset = drop_null('Highest_Level_of_Education_Completed__c', clean_contact_dataset)
summary('Highest_Level_of_Education_Completed__c', clean_contact_dataset)

*** Before ***
Number of records: 59913
Unique Values: 26
Null Values: 6688
Null Values %: 11.162852803231353
*** After ***
Number of records: 53225
Unique Values: 26
Null Values: 0
Null Values %: 0.0


Unnamed: 0,Highest_Level_of_Education_Completed__c,count,%
14,High School/GED,20912,39.289807
2,"4 Year Degree (BA, BS, etc.)",14732,27.678722
0,"2 Year Degree (AA, AS, etc.)",9083,17.065289
21,"Post-Graduate Degree (MA, MS, JD, etc.)",8054,15.131987
9,"Doctorate (PhD, MD, etc.)",340,0.638798
8,College/University,40,0.075153
6,"Associate degree (A.A., A.S., etc.)",32,0.060122
3,4 year college,6,0.011273
20,"Other, please specify",3,0.005636
1,2 year college,2,0.003758


In [24]:
#Higher_education=['High School/GED','4 Year Degree (BA, BS, etc.)','2 Year Degree (AA, AS, etc.)','Post-Graduate Degree (MA, MS, JD, etc.','Doctorate (PhD, MD, etc.)','Post-Graduate Degree (MA, MS, JD, etc.)']
clean_contact_dataset = clean_contact_dataset[clean_contact_dataset['Highest_Level_of_Education_Completed__c'].isin(['High School/GED','4 Year Degree (BA, BS, etc.)','2 Year Degree (AA, AS, etc.)','Post-Graduate Degree (MA, MS, JD, etc.','Doctorate (PhD, MD, etc.)','Post-Graduate Degree (MA, MS, JD, etc.)'] )]

In [25]:
summary('Highest_Level_of_Education_Completed__c', clean_contact_dataset)

*** After ***
Number of records: 53121
Unique Values: 5
Null Values: 0
Null Values %: 0.0


Unnamed: 0,Highest_Level_of_Education_Completed__c,count,%
3,High School/GED,20912,39.366729
1,"4 Year Degree (BA, BS, etc.)",14732,27.732912
0,"2 Year Degree (AA, AS, etc.)",9083,17.098699
4,"Post-Graduate Degree (MA, MS, JD, etc.)",8054,15.161612
2,"Doctorate (PhD, MD, etc.)",340,0.640048


In [26]:
#Nameetha's code

# 9. Hire_Heroes_USA_Confirmed_Hire__c

In [27]:
clean_contact_dataset = drop_null('Hire_Heroes_USA_Confirmed_Hire__c', clean_contact_dataset)
summary('Hire_Heroes_USA_Confirmed_Hire__c', clean_contact_dataset)

*** Before ***
Number of records: 53121
Unique Values: 2
Null Values: 0
Null Values %: 0.0
*** After ***
Number of records: 53121
Unique Values: 2
Null Values: 0
Null Values %: 0.0


Unnamed: 0,Hire_Heroes_USA_Confirmed_Hire__c,count,%
0,0,29415,55.373581
1,1,23706,44.626419


# 10. Used_Volunteer_Services__c

In [28]:
clean_contact_dataset = drop_null('Used_Volunteer_Services__c', clean_contact_dataset)
summary('Used_Volunteer_Services__c', clean_contact_dataset)

*** Before ***
Number of records: 53121
Unique Values: 2
Null Values: 65
Null Values %: 0.12236215432691402
*** After ***
Number of records: 53056
Unique Values: 2
Null Values: 0
Null Values %: 0.0


Unnamed: 0,Used_Volunteer_Services__c,count,%
0,0.0,48704,91.797346
1,1.0,4352,8.202654


# 11. VCF_Participant__c

In [29]:
clean_contact_dataset = drop_null('VCF_Participant__c', clean_contact_dataset)
summary('VCF_Participant__c', clean_contact_dataset)

*** Before ***
Number of records: 53056
Unique Values: 2
Null Values: 0
Null Values %: 0.0
*** After ***
Number of records: 53056
Unique Values: 2
Null Values: 0
Null Values %: 0.0


Unnamed: 0,VCF_Participant__c,count,%
0,0.0,52159,98.309334
1,1.0,897,1.690666


# 12. Virtual_Workshop_Participant__c

In [30]:
clean_contact_dataset = drop_null('Virtual_Workshop_Participant__c', clean_contact_dataset)
summary('Virtual_Workshop_Participant__c', clean_contact_dataset)

*** Before ***
Number of records: 53056
Unique Values: 2
Null Values: 0
Null Values %: 0.0
*** After ***
Number of records: 53056
Unique Values: 2
Null Values: 0
Null Values %: 0.0


Unnamed: 0,Virtual_Workshop_Participant__c,count,%
0,0.0,52903,99.711625
1,1.0,153,0.288375


# 13. On_Job_Board__c 

In [31]:
clean_contact_dataset = drop_null('On_Job_Board__c', clean_contact_dataset)
summary('On_Job_Board__c', clean_contact_dataset)

*** Before ***
Number of records: 53056
Unique Values: 2
Null Values: 0
Null Values %: 0.0
*** After ***
Number of records: 53056
Unique Values: 2
Null Values: 0
Null Values %: 0.0


Unnamed: 0,On_Job_Board__c,count,%
0,0.0,46781,88.172874
1,1.0,6275,11.827126


# 14. Used_Federal_Services__c

In [32]:
clean_contact_dataset = drop_null('Used_Federal_Services__c', clean_contact_dataset)
summary('Used_Federal_Services__c', clean_contact_dataset)

*** Before ***
Number of records: 53056
Unique Values: 2
Null Values: 0
Null Values %: 0.0
*** After ***
Number of records: 53056
Unique Values: 2
Null Values: 0
Null Values %: 0.0


Unnamed: 0,Used_Federal_Services__c,count,%
0,0.0,50865,95.870401
1,1.0,2191,4.129599


# 15. Interview_Skills__c

In [33]:
clean_contact_dataset = drop_null('Interview_Skills__c', clean_contact_dataset)
summary('Interview_Skills__c', clean_contact_dataset)

*** Before ***
Number of records: 53056
Unique Values: 2
Null Values: 0
Null Values %: 0.0
*** After ***
Number of records: 53056
Unique Values: 2
Null Values: 0
Null Values %: 0.0


Unnamed: 0,Interview_Skills__c,count,%
1,1,35543,66.991481
0,0,17513,33.008519


# 16. Created_LinkedIn_account__c

In [34]:
clean_contact_dataset = drop_null('Created_LinkedIn_account__c', clean_contact_dataset)
summary('Created_LinkedIn_account__c', clean_contact_dataset)

*** Before ***
Number of records: 53056
Unique Values: 2
Null Values: 0
Null Values %: 0.0
*** After ***
Number of records: 53056
Unique Values: 2
Null Values: 0
Null Values %: 0.0


Unnamed: 0,Created_LinkedIn_account__c,count,%
1,1,28623,53.948658
0,0,24433,46.051342


# 17. Value_Proposition__c

In [35]:
clean_contact_dataset = drop_null('Value_Proposition__c', clean_contact_dataset)
summary('Value_Proposition__c', clean_contact_dataset)

*** Before ***
Number of records: 53056
Unique Values: 2
Null Values: 0
Null Values %: 0.0
*** After ***
Number of records: 53056
Unique Values: 2
Null Values: 0
Null Values %: 0.0


Unnamed: 0,Value_Proposition__c,count,%
1,1.0,36552,68.893245
0,0.0,16504,31.106755


# 18. Hired_with_EO_assistance__c

In [36]:
clean_contact_dataset = drop_null('Hired_with_EO_assistance__c', clean_contact_dataset)
summary('Hired_with_EO_assistance__c', clean_contact_dataset)

*** Before ***
Number of records: 53056
Unique Values: 2
Null Values: 0
Null Values %: 0.0
*** After ***
Number of records: 53056
Unique Values: 2
Null Values: 0
Null Values %: 0.0


Unnamed: 0,Hired_with_EO_assistance__c,count,%
0,0.0,53031,99.95288
1,1.0,25,0.04712


# 19. Updated_Resume_Complete__c

In [37]:
clean_contact_dataset = drop_null('Updated_Resume_Complete__c', clean_contact_dataset)
summary('Updated_Resume_Complete__c', clean_contact_dataset)

*** Before ***
Number of records: 53056
Unique Values: 2
Null Values: 0
Null Values %: 0.0
*** After ***
Number of records: 53056
Unique Values: 2
Null Values: 0
Null Values %: 0.0


Unnamed: 0,Updated_Resume_Complete__c,count,%
0,0.0,51699,97.442325
1,1.0,1357,2.557675


# 20. HHUSA_Workshop_Participant__c

In [38]:
clean_contact_dataset = drop_null('HHUSA_Workshop_Participant__c', clean_contact_dataset)
summary('HHUSA_Workshop_Participant__c', clean_contact_dataset)

*** Before ***
Number of records: 53056
Unique Values: 2
Null Values: 0
Null Values %: 0.0
*** After ***
Number of records: 53056
Unique Values: 2
Null Values: 0
Null Values %: 0.0


Unnamed: 0,HHUSA_Workshop_Participant__c,count,%
0,0,48866,92.102684
1,1,4190,7.897316


# 21. O2O_Initial_Assessment_Complete__c

In [39]:
clean_contact_dataset = drop_null('O2O_Initial_Assessment_Complete__c', clean_contact_dataset)
summary('O2O_Initial_Assessment_Complete__c', clean_contact_dataset)

*** Before ***
Number of records: 53056
Unique Values: 2
Null Values: 0
Null Values %: 0.0
*** After ***
Number of records: 53056
Unique Values: 2
Null Values: 0
Null Values %: 0.0


Unnamed: 0,O2O_Initial_Assessment_Complete__c,count,%
0,0.0,52677,99.28566
1,1.0,379,0.71434


# 22.  Is_the_Initial_Intake_Assessment_done__c

In [40]:
clean_contact_dataset = drop_null('Is_the_Initial_Intake_Assessment_done__c', clean_contact_dataset)
summary('Is_the_Initial_Intake_Assessment_done__c', clean_contact_dataset)

*** Before ***
Number of records: 53056
Unique Values: 2
Null Values: 0
Null Values %: 0.0
*** After ***
Number of records: 53056
Unique Values: 2
Null Values: 0
Null Values %: 0.0


Unnamed: 0,Is_the_Initial_Intake_Assessment_done__c,count,%
1,1,45654,86.048703
0,0,7402,13.951297


# 23.Permission_to_use_job_board_granted__c

In [41]:
clean_contact_dataset = drop_null('Permission_to_use_job_board_granted__c', clean_contact_dataset)
summary('Permission_to_use_job_board_granted__c', clean_contact_dataset)

*** Before ***
Number of records: 53056
Unique Values: 2
Null Values: 0
Null Values %: 0.0
*** After ***
Number of records: 53056
Unique Values: 2
Null Values: 0
Null Values %: 0.0


Unnamed: 0,Permission_to_use_job_board_granted__c,count,%
0,0,47494,89.516737
1,1,5562,10.483263


# 24. Finalized_HHUSA_revised_resume_on_file__c

In [42]:
clean_contact_dataset = drop_null('Finalized_HHUSA_revised_resume_on_file__c', clean_contact_dataset)
summary('Finalized_HHUSA_revised_resume_on_file__c', clean_contact_dataset)

*** Before ***
Number of records: 53056
Unique Values: 2
Null Values: 0
Null Values %: 0.0
*** After ***
Number of records: 53056
Unique Values: 2
Null Values: 0
Null Values %: 0.0


Unnamed: 0,Finalized_HHUSA_revised_resume_on_file__c,count,%
1,1,42145,79.434937
0,0,10911,20.565063


# 25.Resume_Tailoring_Tips__c

In [43]:
clean_contact_dataset = drop_null('Resume_Tailoring_Tips__c', clean_contact_dataset)
summary('Resume_Tailoring_Tips__c', clean_contact_dataset)

*** Before ***
Number of records: 53056
Unique Values: 2
Null Values: 0
Null Values %: 0.0
*** After ***
Number of records: 53056
Unique Values: 2
Null Values: 0
Null Values %: 0.0


Unnamed: 0,Resume_Tailoring_Tips__c,count,%
0,0.0,37446,70.578257
1,1.0,15610,29.421743


# 26. Resume_Completed_By__c

In [44]:
clean_contact_dataset = drop_null('Resume_Completed_By__c', clean_contact_dataset)
summary('Resume_Completed_By__c', clean_contact_dataset)



*** Before ***
Number of records: 53056
Unique Values: 171
Null Values: 11309
Null Values %: 21.315214113389626
*** After ***
Number of records: 41747
Unique Values: 171
Null Values: 0
Null Values %: 0.0


Unnamed: 0,Resume_Completed_By__c,count,%
103,00550000001YDwUAAW,1108,2.654083
106,00550000001zVqjAAE,1086,2.601385
151,005500000042X0TAAU,822,1.969004
148,005500000042TrsAAE,794,1.901933
32,00538000004lJWlAAM,793,1.899538
155,005500000042c6HAAQ,762,1.825281
145,005500000042GlIAAU,697,1.669581
29,00538000004lAqgAAE,693,1.66
20,00538000004SiwfAAC,675,1.616883
16,00538000004SivDAAS,651,1.559393


# Hire_Info Features 

# 27. Hired_but_still_active_and_looking__c_y

In [45]:
clean_Hireinfo_dataset = drop_null('Hired_but_still_active_and_looking__c', Hireinfo_dataset)
summary('Hired_but_still_active_and_looking__c', clean_Hireinfo_dataset)

*** Before ***
Number of records: 30754
Unique Values: 2
Null Values: 0
Null Values %: 0.0
*** After ***
Number of records: 30754
Unique Values: 2
Null Values: 0
Null Values %: 0.0


Unnamed: 0,Hired_but_still_active_and_looking__c,count,%
0,0,26875,87.387007
1,1,3879,12.612993


# 28. PIM_Approved__c

In [46]:
clean_Hireinfo_dataset = drop_null('PIM_Approved__c', clean_Hireinfo_dataset)
summary('PIM_Approved__c', clean_Hireinfo_dataset)

*** Before ***
Number of records: 30754
Unique Values: 2
Null Values: 0
Null Values %: 0.0
*** After ***
Number of records: 30754
Unique Values: 2
Null Values: 0
Null Values %: 0.0


Unnamed: 0,PIM_Approved__c,count,%
1,1,28709,93.350458
0,0,2045,6.649542


# 30 Confirmed_Hired_Date__c

In [47]:
clean_Hireinfo_dataset = drop_null('Confirmed_Hired_Date__c', clean_Hireinfo_dataset)
summary('Confirmed_Hired_Date__c', clean_Hireinfo_dataset)

*** Before ***
Number of records: 30754
Unique Values: 2179
Null Values: 18
Null Values %: 0.058528971841061324
*** After ***
Number of records: 30736
Unique Values: 2179
Null Values: 0
Null Values %: 0.0


Unnamed: 0,Confirmed_Hired_Date__c,count,%
1124,4/13/2018 0:00,81,0.263535
1162,4/20/2018 0:00,80,0.260281
2014,9/13/2018 0:00,68,0.221239
2068,9/20/2018 0:00,67,0.217985
205,10/12/2018 0:00,64,0.208225
428,11/15/2018 0:00,63,0.204971
1595,6/5/2017 0:00,63,0.204971
1985,9/1/2017 0:00,63,0.204971
133,1/3/2017 0:00,62,0.201718
435,11/16/2018 0:00,62,0.201718


In [48]:
clean_Hireinfo_dataset = drop_null('Revised_Resume_Used_to_Apply_for_Job__c', clean_Hireinfo_dataset)
summary('Revised_Resume_Used_to_Apply_for_Job__c', clean_Hireinfo_dataset)

*** Before ***
Number of records: 30736
Unique Values: 2
Null Values: 0
Null Values %: 0.0
*** After ***
Number of records: 30736
Unique Values: 2
Null Values: 0
Null Values %: 0.0


Unnamed: 0,Revised_Resume_Used_to_Apply_for_Job__c,count,%
1,1,27322,88.892504
0,0,3414,11.107496


In [49]:
clean_Hireinfo_dataset["Revised_Resume_Used_to_Apply_for_Job__c"].isna().sum()

0

# Merging contact and Hire info Data using Left Outer Join

In [50]:
merged_data=pd.merge(clean_contact_dataset, clean_Hireinfo_dataset, left_on='Id', right_on='Client_Name__c',how='left')

In [51]:
merged_data.shape

(41928, 426)

In [52]:
clean_Merge_dataset = drop_null('Revised_Resume_Used_to_Apply_for_Job__c_x', merged_data)
summary('Revised_Resume_Used_to_Apply_for_Job__c_x', clean_Merge_dataset)

*** Before ***
Number of records: 41928
Unique Values: 2
Null Values: 21693
Null Values %: 51.73869490555237
*** After ***
Number of records: 20235
Unique Values: 2
Null Values: 0
Null Values %: 0.0


Unnamed: 0,Revised_Resume_Used_to_Apply_for_Job__c_x,count,%
1,Yes,18751,92.666172
0,No,1484,7.333828


In [53]:
clean_Merge_dataset['Revised_Resume_Used_to_Apply_for_Job__c_x'].replace(["Yes","No"],[1,0],inplace=True)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  self._update_inplace(new_data)


In [54]:
clean_Merge_dataset['Revised_Resume_Used_to_Apply_for_Job__c_y']

153      1.0
157      0.0
163      1.0
344      1.0
356      1.0
638      1.0
639      1.0
651      1.0
653      0.0
660      1.0
665      1.0
675      1.0
679      1.0
681      1.0
682      1.0
683      1.0
684      1.0
686      1.0
687      1.0
825      0.0
879      1.0
880      0.0
881      1.0
884      1.0
885      1.0
899      1.0
1017     0.0
1023     1.0
1243     1.0
1259     1.0
        ... 
41885    1.0
41886    1.0
41887    0.0
41890    1.0
41894    1.0
41895    1.0
41896    0.0
41897    1.0
41899    1.0
41900    1.0
41901    1.0
41902    1.0
41903    1.0
41904    1.0
41906    1.0
41907    1.0
41908    1.0
41909    0.0
41910    1.0
41911    1.0
41913    1.0
41914    1.0
41915    1.0
41917    1.0
41918    1.0
41920    1.0
41923    1.0
41925    1.0
41926    1.0
41927    1.0
Name: Revised_Resume_Used_to_Apply_for_Job__c_y, Length: 20235, dtype: float64

In [55]:
#Mergind Hire Info and Cleaned co

In [56]:
merged_data['Revised_Resume_Used_to_Apply_for_Job__c_y'].fillna(merged_data['Revised_Resume_Used_to_Apply_for_Job__c_x'],inplace= True)