# Read me

This code cleans the data fields listed in our DSP data dictionary.

In [1]:
import numpy as np
import pandas as pd

In [2]:
# Load the raw Salesforce contact export. 'latin' is a Python codec alias for
# ISO-8859-1; low_memory=False has pandas process the file in one piece so
# column dtypes are inferred from the whole column.
contact_dataset = pd.read_csv(
    'SalesForce_Contact.csv',
    encoding='latin',
    low_memory=False,
)
contact_dataset.shape

(132445, 391)

## General Methods

In [3]:
def count_records_unique_null(feature, data):
    """Print record, distinct-value and missing-value statistics for a column.

    Parameters
    ----------
    feature : label of the column in ``data`` to describe.
    data : pandas DataFrame containing that column.

    Prints four lines (record count, number of unique non-null values,
    number of nulls, and nulls as a percentage of the frame's rows).
    Returns None.
    """
    column = data[feature]
    null_count = column.isna().sum()
    # The percentage is taken over the row count of the whole frame.
    null_percent = null_count / len(data.index) * 100

    print("Number of records:", len(column.index))
    print("Unique Values:", column.nunique())
    print("Null Values:", null_count)
    print("Null Values %:", null_percent)
def count_table(feature, data, top_n=30):
    """Build a frequency table for one column, most frequent values first.

    Parameters
    ----------
    feature : label of the column in ``data`` to tabulate.
    data : pandas DataFrame containing that column.
    top_n : maximum number of rows to return (defaults to 30, the value
        that was previously hard-coded).

    Returns
    -------
    pandas DataFrame with columns ``[feature, 'count', '%']`` holding the
    ``top_n`` values with the highest counts. Rows whose ``feature`` is
    null are not counted, because ``groupby`` drops null keys by default.
    """
    counts = data.groupby(feature)[feature].agg(['count'])
    total = counts['count'].sum()
    # Dividing by total/100 expresses each count as a percentage of the
    # non-null records.
    counts['%'] = np.divide(counts['count'], total / 100)
    # reset_index turns the grouped values back into a regular column so the
    # result reads as a plain table.
    return counts.reset_index().nlargest(top_n, 'count')
def drop_null(feature, data):
    """Report a column's statistics, then return the frame without its null rows.

    Parameters
    ----------
    feature : label of the column whose missing values decide which rows go.
    data : pandas DataFrame to filter; it is not modified.

    Prints a "*** Before ***" banner followed by the column statistics of the
    unfiltered frame, and returns a new DataFrame containing only the rows
    where ``feature`` is not null.
    """
    print("*** Before ***")
    count_records_unique_null(feature, data)
    return data.dropna(subset=[feature])
def summary(feature, data):
    """Print a column's statistics and return its frequency table.

    Parameters
    ----------
    feature : label of the column in ``data`` to summarise.
    data : pandas DataFrame containing that column.

    Prints an "*** After ***" banner followed by the column statistics, then
    returns the table produced by ``count_table`` for the same column.
    """
    print("*** After ***")
    count_records_unique_null(feature, data)
    frequency_table = count_table(feature, data)
    return frequency_table

## 1. State

In [4]:
# Maxim's Code

## 2. Gender

In [5]:
# Alhasan's Code

### Clean the data by dropping all the rows with missing values

In [6]:
# Start the cleaned frame from the raw contacts: remove rows with no gender
# recorded, then show the distribution of what is left.
clean_contact_dataset = drop_null(feature='Gender__c', data=contact_dataset)
summary(feature='Gender__c', data=clean_contact_dataset)

*** Before ***
Number of records: 132445
Unique Values: 3
Null Values: 63159
Null Values %: 47.68696440031711
*** After ***
Number of records: 69286
Unique Values: 3
Null Values: 0
Null Values %: 0.0


Unnamed: 0,Gender__c,count,%
2,Male,52760,76.14814
1,Female,16515,23.835984
0,--None--,11,0.015876


### Clean the data by dropping all the rows with "--None--"

In [7]:
# "--None--" is a placeholder value, not a real gender, so it is treated like
# a missing value. Filtering with a boolean mask removes exactly the matching
# rows; dropping by index label would also remove any other row that happened
# to share a label with a match.
is_placeholder = clean_contact_dataset['Gender__c'] == "--None--"
clean_contact_dataset = clean_contact_dataset[~is_placeholder]
summary('Gender__c', clean_contact_dataset)

*** After ***
Number of records: 69275
Unique Values: 2
Null Values: 0
Null Values %: 0.0


Unnamed: 0,Gender__c,count,%
1,Male,52760,76.160231
0,Female,16515,23.839769


## 3. Race

In [8]:
# Alhasan's Code

## 4. Service_Branch

In [9]:
# Alhasan's Code

### Clean the data by dropping all the rows with missing values

In [10]:
# Continue from the gender-cleaned frame: remove rows with no service branch,
# then show the distribution of the remaining branches.
clean_contact_dataset = drop_null(feature='Service_Branch__c', data=clean_contact_dataset)
summary(feature='Service_Branch__c', data=clean_contact_dataset)

*** Before ***
Number of records: 69275
Unique Values: 8
Null Values: 6675
Null Values %: 9.635510645976183
*** After ***
Number of records: 62600
Unique Values: 8
Null Values: 0
Null Values %: 0.0


Unnamed: 0,Service_Branch__c,count,%
1,Army,34203,54.63738
5,Navy,10204,16.300319
0,Air Force,8969,14.327476
3,Marines,8657,13.829073
2,Coast Guard,560,0.894569
6,Not Applicable,4,0.00639
7,Spouse,2,0.003195
4,Merchant Marine,1,0.001597


### Clean the data by merging {Spouse, Merchant Marine} into Not Applicable

In [11]:
# Fold the rare "Spouse" and "Merchant Marine" values into the existing
# "Not Applicable" category in a single pass.
col = clean_contact_dataset['Service_Branch__c']
col = col.mask(col.isin(["Spouse", "Merchant Marine"]), "Not Applicable")
# .assign returns a new frame instead of writing into the frame produced by
# dropna(), which avoids pandas' SettingWithCopyWarning and keeps the cell
# safe to re-run.
clean_contact_dataset = clean_contact_dataset.assign(Service_Branch__c=col)

In [12]:
# Re-check the branch distribution after the rare values were merged.
summary(feature='Service_Branch__c', data=clean_contact_dataset)

*** After ***
Number of records: 62600
Unique Values: 6
Null Values: 0
Null Values %: 0.0


Unnamed: 0,Service_Branch__c,count,%
1,Army,34203,54.63738
4,Navy,10204,16.300319
0,Air Force,8969,14.327476
3,Marines,8657,13.829073
2,Coast Guard,560,0.894569
5,Not Applicable,7,0.011182


## 5. Last_Service_Rank

In [13]:
# Alhasan's Code

### Clean the data by dropping all the rows with missing values

In [14]:
# Remove rows with no service rank, then show the most common ranks.
clean_contact_dataset = drop_null(feature='Service_Rank__c', data=clean_contact_dataset)
summary(feature='Service_Rank__c', data=clean_contact_dataset)

*** Before ***
Number of records: 62600
Unique Values: 27
Null Values: 1056
Null Values %: 1.6869009584664536
*** After ***
Number of records: 61544
Unique Values: 27
Null Values: 0
Null Values %: 0.0


Unnamed: 0,Service_Rank__c,count,%
6,E-4,13547,22.011894
7,E-5,13308,21.623554
8,E-6,8978,14.587937
9,E-7,8200,13.323801
10,E-8,3637,5.909593
15,O-3,3016,4.900559
5,E-3,2660,4.322111
16,O-4,1588,2.580268
11,E-9,1397,2.269921
17,O-5,1383,2.247173


### Adding a Last_Service_Rank column according to the rank category:
1. Enlisted Personnel (E)
2. Warrant Officers (W, CW)
3. Commissioned Officers (O)

source: https://www.infoplease.com/us/military-personnel/us-military-ranks

In [15]:
# Collapse each detailed rank into its category letter. The checks run in
# this order, so a value matching an earlier rule is not revisited:
#   starts with "O" -> "O", starts with "E" -> "E", contains "W" -> "W".
# na=False makes a missing rank count as "no match", so it stays missing
# instead of being relabelled (comparing the string test to False would turn
# nulls into "O").
col = clean_contact_dataset['Service_Rank__c']
col = col.mask(col.str.startswith('O', na=False), "O")
col = col.mask(col.str.startswith('E', na=False), "E")
col = col.mask(col.str.contains('W', na=False, regex=False), "W")
# .assign returns a new frame instead of writing into the frame produced by
# dropna(), which avoids pandas' SettingWithCopyWarning.
clean_contact_dataset = clean_contact_dataset.assign(Last_Service_Rank=col)

In [16]:
# Show how the records split across the derived rank categories.
summary(feature='Last_Service_Rank', data=clean_contact_dataset)

*** After ***
Number of records: 61544
Unique Values: 3
Null Values: 0
Null Values %: 0.0


Unnamed: 0,Last_Service_Rank,count,%
0,E,52602,85.470558
1,O,7611,12.366762
2,W,1331,2.16268


## 8. Education

In [17]:
# Maxim's Code