# Salary Classification

### This is my final data science project in Python - Data Science Online Class conducted by SanberCode Indonesia. In this project, I would like to classify whether their respective salary is equal or less than 7 million or more than 7 million. My project conducted by using Sklearn with Random Forest algorithm, GridSearchCV for Hyperparameter Tuning, and ROC_AUC_Score for evaluation metrics.

***

## Import Libraries

In [36]:
import pandas as pd
import numpy as np
from sklearn.model_selection import GridSearchCV
from sklearn.ensemble import RandomForestClassifier

## Load the Data

In [37]:
# dataset already separated for training and testing according to the projects' instructions.
train = pd.read_csv('train.csv')
test = pd.read_csv('test.csv')

In [38]:
# looks like our training data mostly consists of categorical variabels with string value
train

Unnamed: 0,Age,Working Class,Education,Marital Status,Occupation,Gender,Capital Gain,Capital Loss,Hour per Week,Salary
0,21,Entrepreneur,Senior High School,Single,Other Services,Female,0.0,0.0,35.0,<=7jt
1,49,Entrepreneur,Senior High School,Divorced,Executive Manager,Female,0.0,0.0,40.0,<=7jt
2,44,Entrepreneur,Associate Degree,Married,Executive Manager,Male,61404000.0,0.0,45.0,>7jt
3,24,Entrepreneur,Bachelor,Single,Specialist,Male,0.0,0.0,35.0,<=7jt
4,33,Entrepreneur,Master,Married,Specialist,Male,210336000.0,0.0,40.0,>7jt
...,...,...,...,...,...,...,...,...,...,...
27984,47,Non-Corporate Freelance,Senior High School,Married,Handicraft Refinement,Male,0.0,0.0,40.0,<=7jt
27985,69,Corporate Freelance,Associate Degree,Married,Sales,Male,0.0,0.0,40.0,>7jt
27986,24,Non-Corporate Freelance,Bachelor,Married,Specialist,Female,0.0,0.0,20.0,>7jt
27987,47,Entrepreneur,Senior High School,Married,Sales,Male,0.0,0.0,45.0,<=7jt


In [39]:
# same goes for testing data
test

Unnamed: 0,Age,Working Class,Education,Marital Status,Occupation,Gender,Capital Gain,Capital Loss,Hour per Week
0,30,Local Government,Associate Degree,Divorced,Religious Leader,Female,0.0,0.0,40.0
1,28,Entrepreneur,Senior High School,Married,Machinery Supervisor,Male,0.0,0.0,50.0
2,17,Entrepreneur,Senior High School,Single,Cleaning Service,Male,0.0,0.0,20.0
3,63,Entrepreneur,Senior High School,Married,Executive Manager,Male,0.0,0.0,40.0
4,60,Non-Corporate Freelance,Senior High School,Divorced,Sales,Male,36358000.0,0.0,55.0
...,...,...,...,...,...,...,...,...,...
7418,27,Entrepreneur,Senior High School,Married,Handicraft Refinement,Male,0.0,0.0,40.0
7419,58,Country Government,Junior High School,Married,Farmer,Male,0.0,0.0,40.0
7420,45,Non-Corporate Freelance,Senior High School,Married,Handicraft Refinement,Male,0.0,0.0,50.0
7421,41,Local Government,Senior High School,Separated,Religious Leader,Female,0.0,0.0,40.0


## Data Preprocessing - Train Data

In [40]:
# encode every categorical variables using .get_dummies function from pandas
train = pd.get_dummies(train, columns = ['Working Class','Marital Status','Occupation','Gender','Salary'])

In [41]:
train

Unnamed: 0,Age,Education,Capital Gain,Capital Loss,Hour per Week,Working Class_Corporate Freelance,Working Class_Country Government,Working Class_Entrepreneur,Working Class_Local Government,Working Class_Never Worked,...,Occupation_Other Services,Occupation_Religious Leader,Occupation_Sales,Occupation_Security,Occupation_Specialist,Occupation_Tech Support,Gender_Female,Gender_Male,Salary_<=7jt,Salary_>7jt
0,21,Senior High School,0.0,0.0,35.0,0,0,1,0,0,...,1,0,0,0,0,0,1,0,1,0
1,49,Senior High School,0.0,0.0,40.0,0,0,1,0,0,...,0,0,0,0,0,0,1,0,1,0
2,44,Associate Degree,61404000.0,0.0,45.0,0,0,1,0,0,...,0,0,0,0,0,0,0,1,0,1
3,24,Bachelor,0.0,0.0,35.0,0,0,1,0,0,...,0,0,0,0,1,0,0,1,1,0
4,33,Master,210336000.0,0.0,40.0,0,0,1,0,0,...,0,0,0,0,1,0,0,1,0,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
27984,47,Senior High School,0.0,0.0,40.0,0,0,0,0,0,...,0,0,0,0,0,0,0,1,1,0
27985,69,Associate Degree,0.0,0.0,40.0,1,0,0,0,0,...,0,0,1,0,0,0,0,1,0,1
27986,24,Bachelor,0.0,0.0,20.0,0,0,0,0,0,...,0,0,0,0,1,0,1,0,0,1
27987,47,Senior High School,0.0,0.0,45.0,0,0,1,0,0,...,0,0,1,0,0,0,0,1,1,0


In [42]:
# since it's only two variable, we could remove one of them as one variable is enough to represent the encoding
train.drop(columns = 'Salary_<=7jt', axis = 1, inplace = True)

In [43]:
train.rename(columns = {'Salary_>7jt':'Salary'}, inplace = True)

In [44]:
train

Unnamed: 0,Age,Education,Capital Gain,Capital Loss,Hour per Week,Working Class_Corporate Freelance,Working Class_Country Government,Working Class_Entrepreneur,Working Class_Local Government,Working Class_Never Worked,...,Occupation_Military,Occupation_Other Services,Occupation_Religious Leader,Occupation_Sales,Occupation_Security,Occupation_Specialist,Occupation_Tech Support,Gender_Female,Gender_Male,Salary
0,21,Senior High School,0.0,0.0,35.0,0,0,1,0,0,...,0,1,0,0,0,0,0,1,0,0
1,49,Senior High School,0.0,0.0,40.0,0,0,1,0,0,...,0,0,0,0,0,0,0,1,0,0
2,44,Associate Degree,61404000.0,0.0,45.0,0,0,1,0,0,...,0,0,0,0,0,0,0,0,1,1
3,24,Bachelor,0.0,0.0,35.0,0,0,1,0,0,...,0,0,0,0,0,1,0,0,1,0
4,33,Master,210336000.0,0.0,40.0,0,0,1,0,0,...,0,0,0,0,0,1,0,0,1,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
27984,47,Senior High School,0.0,0.0,40.0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,1,0
27985,69,Associate Degree,0.0,0.0,40.0,1,0,0,0,0,...,0,0,0,1,0,0,0,0,1,1
27986,24,Bachelor,0.0,0.0,20.0,0,0,0,0,0,...,0,0,0,0,0,1,0,1,0,1
27987,47,Senior High School,0.0,0.0,45.0,0,0,1,0,0,...,0,0,0,1,0,0,0,0,1,0


In [45]:
# create a dictionary named obj to encode every values in education variable
obj = {
    'Elementary School':1,
    'Junior High School':2,
    'Senior High School':3,
    'Associate Degree':4,
    'Bachelor':5,
    'Professional School':6,
    'Master':7,
    'Doctoral':8
}

In [46]:
# replace existing value with every values in obj variable
train.Education = train.Education.replace(obj)

In [47]:
train

Unnamed: 0,Age,Education,Capital Gain,Capital Loss,Hour per Week,Working Class_Corporate Freelance,Working Class_Country Government,Working Class_Entrepreneur,Working Class_Local Government,Working Class_Never Worked,...,Occupation_Military,Occupation_Other Services,Occupation_Religious Leader,Occupation_Sales,Occupation_Security,Occupation_Specialist,Occupation_Tech Support,Gender_Female,Gender_Male,Salary
0,21,3,0.0,0.0,35.0,0,0,1,0,0,...,0,1,0,0,0,0,0,1,0,0
1,49,3,0.0,0.0,40.0,0,0,1,0,0,...,0,0,0,0,0,0,0,1,0,0
2,44,4,61404000.0,0.0,45.0,0,0,1,0,0,...,0,0,0,0,0,0,0,0,1,1
3,24,5,0.0,0.0,35.0,0,0,1,0,0,...,0,0,0,0,0,1,0,0,1,0
4,33,7,210336000.0,0.0,40.0,0,0,1,0,0,...,0,0,0,0,0,1,0,0,1,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
27984,47,3,0.0,0.0,40.0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,1,0
27985,69,4,0.0,0.0,40.0,1,0,0,0,0,...,0,0,0,1,0,0,0,0,1,1
27986,24,5,0.0,0.0,20.0,0,0,0,0,0,...,0,0,0,0,0,1,0,1,0,1
27987,47,3,0.0,0.0,45.0,0,0,1,0,0,...,0,0,0,1,0,0,0,0,1,0


## Data Preprocessing - Test Data

In [48]:
test = pd.get_dummies(test, columns = ['Working Class','Marital Status','Occupation','Gender'])

In [49]:
test

Unnamed: 0,Age,Education,Capital Gain,Capital Loss,Hour per Week,Working Class_Corporate Freelance,Working Class_Country Government,Working Class_Entrepreneur,Working Class_Local Government,Working Class_Never Worked,...,Occupation_Machinery Supervisor,Occupation_Military,Occupation_Other Services,Occupation_Religious Leader,Occupation_Sales,Occupation_Security,Occupation_Specialist,Occupation_Tech Support,Gender_Female,Gender_Male
0,30,Associate Degree,0.0,0.0,40.0,0,0,0,1,0,...,0,0,0,1,0,0,0,0,1,0
1,28,Senior High School,0.0,0.0,50.0,0,0,1,0,0,...,1,0,0,0,0,0,0,0,0,1
2,17,Senior High School,0.0,0.0,20.0,0,0,1,0,0,...,0,0,0,0,0,0,0,0,0,1
3,63,Senior High School,0.0,0.0,40.0,0,0,1,0,0,...,0,0,0,0,0,0,0,0,0,1
4,60,Senior High School,36358000.0,0.0,55.0,0,0,0,0,0,...,0,0,0,0,1,0,0,0,0,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
7418,27,Senior High School,0.0,0.0,40.0,0,0,1,0,0,...,0,0,0,0,0,0,0,0,0,1
7419,58,Junior High School,0.0,0.0,40.0,0,1,0,0,0,...,0,0,0,0,0,0,0,0,0,1
7420,45,Senior High School,0.0,0.0,50.0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,1
7421,41,Senior High School,0.0,0.0,40.0,0,0,0,1,0,...,0,0,0,1,0,0,0,0,1,0


In [50]:
test.Education = test.Education.replace(obj)

In [51]:
test

Unnamed: 0,Age,Education,Capital Gain,Capital Loss,Hour per Week,Working Class_Corporate Freelance,Working Class_Country Government,Working Class_Entrepreneur,Working Class_Local Government,Working Class_Never Worked,...,Occupation_Machinery Supervisor,Occupation_Military,Occupation_Other Services,Occupation_Religious Leader,Occupation_Sales,Occupation_Security,Occupation_Specialist,Occupation_Tech Support,Gender_Female,Gender_Male
0,30,4,0.0,0.0,40.0,0,0,0,1,0,...,0,0,0,1,0,0,0,0,1,0
1,28,3,0.0,0.0,50.0,0,0,1,0,0,...,1,0,0,0,0,0,0,0,0,1
2,17,3,0.0,0.0,20.0,0,0,1,0,0,...,0,0,0,0,0,0,0,0,0,1
3,63,3,0.0,0.0,40.0,0,0,1,0,0,...,0,0,0,0,0,0,0,0,0,1
4,60,3,36358000.0,0.0,55.0,0,0,0,0,0,...,0,0,0,0,1,0,0,0,0,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
7418,27,3,0.0,0.0,40.0,0,0,1,0,0,...,0,0,0,0,0,0,0,0,0,1
7419,58,2,0.0,0.0,40.0,0,1,0,0,0,...,0,0,0,0,0,0,0,0,0,1
7420,45,3,0.0,0.0,50.0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,1
7421,41,3,0.0,0.0,40.0,0,0,0,1,0,...,0,0,0,1,0,0,0,0,1,0


## Hyperparameter Tuning using GridSearchCV

In [53]:
# split the training data to feature (X) and label (y)
X = train.drop(['Salary'], axis = 1)
y = train.Salary

In [54]:
# find the optimal hyperparameter using GridSearchCV
# param_grid dictionary contains every hyperparameter and its value that we would like to iterate

forest_model = RandomForestClassifier()
param_grid = {
    'n_estimators':[1000,2000],
    'max_features':['auto','sqrt','log2'], 
    'max_depth':[10,20],
    'criterion':['gini','entropy']
}
CV_rfc = GridSearchCV(forest_model,param_grid=param_grid,scoring = 'roc_auc',cv=5)
CV_rfc.fit(X,y)

GridSearchCV(cv=5, estimator=RandomForestClassifier(),
             param_grid={'criterion': ['gini', 'entropy'],
                         'max_depth': [10, 20],
                         'max_features': ['auto', 'sqrt', 'log2'],
                         'n_estimators': [1000, 2000]},
             scoring='roc_auc')

In [55]:
# show the best hyperparameter based on GridCV
CV_rfc.best_params_

{'criterion': 'gini',
 'max_depth': 10,
 'max_features': 'auto',
 'n_estimators': 1000}

In [56]:
# show the best accuracy score based on GridCV
CV_rfc.best_score_

0.9132454299571116

## Fitting and Predicting with Random Forest, based on prior Hyperparameter Tuning

In [64]:
# refers to previous tuning, fit current data with random forest model
forest = RandomForestClassifier(criterion = 'gini', max_depth = 10, n_estimators = 1000, max_features = 'auto')
forest.fit(X,y)

RandomForestClassifier(max_depth=10, n_estimators=1000)

In [65]:
# predict the label
salaryClass = forest.predict(test)

In [66]:
# create dataframe for our predictions
dfPredict = pd.DataFrame({'id':np.arange(28794,36217),'Salary Classification':salaryClass})

In [67]:
dfPredict

Unnamed: 0,id,Salary Classification
0,28794,0
1,28795,0
2,28796,0
3,28797,0
4,28798,0
...,...,...
7418,36212,0
7419,36213,0
7420,36214,0
7421,36215,0


In [68]:
# 0: Salary less or equal than 7 million
# 1: Salary more than 7 million
dfPredict.groupby('Salary Classification').count()

Unnamed: 0_level_0,id
Salary Classification,Unnamed: 1_level_1
0,6049
1,1374
