# Salary Classification with KNN using Sklearn and GridSearchCV for Hyperparameter Tuning

### This is my final project for the Python Data Science online class conducted by SanberCode Indonesia. In this project, I classify whether a person's salary is at most 7 million or more than 7 million (in Rupiah).

***

## Import Libraries

In [83]:
import pandas as pd
import numpy as np
from sklearn.model_selection import GridSearchCV, RandomizedSearchCV
from sklearn.neighbors import KNeighborsClassifier

## Load the Data

In [84]:
# Load the labelled training set.
# NOTE(review): the file's first column has no header, so it is read as a
# regular 'Unnamed: 0' column and later ends up among the model features.
# Passing index_col=0 would avoid that, but it has to be done for test.csv
# as well so both frames keep the same columns.
train = pd.read_csv('train.csv')

In [85]:
# Load the unlabelled test set (same columns as train.csv minus 'Salary').
# NOTE(review): as with train.csv, the headerless first column is read as
# 'Unnamed: 0'; any change to how it is handled must be mirrored for train.
test = pd.read_csv('test.csv')

In [86]:
# Preview the training data as loaded from the CSV.
train

Unnamed: 0,Age,Working Class,Education,Marital Status,Occupation,Gender,Capital Gain,Capital Loss,Hour per Week,Salary
0,21,Entrepreneur,Senior High School,Single,Other Services,Female,0.0,0.0,35.0,<=7jt
1,49,Entrepreneur,Senior High School,Divorced,Executive Manager,Female,0.0,0.0,40.0,<=7jt
2,44,Entrepreneur,Associate Degree,Married,Executive Manager,Male,61404000.0,0.0,45.0,>7jt
3,24,Entrepreneur,Bachelor,Single,Specialist,Male,0.0,0.0,35.0,<=7jt
4,33,Entrepreneur,Master,Married,Specialist,Male,210336000.0,0.0,40.0,>7jt
...,...,...,...,...,...,...,...,...,...,...
27984,47,Non-Corporate Freelance,Senior High School,Married,Handicraft Refinement,Male,0.0,0.0,40.0,<=7jt
27985,69,Corporate Freelance,Associate Degree,Married,Sales,Male,0.0,0.0,40.0,>7jt
27986,24,Non-Corporate Freelance,Bachelor,Married,Specialist,Female,0.0,0.0,20.0,>7jt
27987,47,Entrepreneur,Senior High School,Married,Sales,Male,0.0,0.0,45.0,<=7jt


In [88]:
# Preview the test data as loaded from the CSV.
test

Unnamed: 0,Age,Working Class,Education,Marital Status,Occupation,Gender,Capital Gain,Capital Loss,Hour per Week
0,30,Local Government,Associate Degree,Divorced,Religious Leader,Female,0.0,0.0,40.0
1,28,Entrepreneur,Senior High School,Married,Machinery Supervisor,Male,0.0,0.0,50.0
2,17,Entrepreneur,Senior High School,Single,Cleaning Service,Male,0.0,0.0,20.0
3,63,Entrepreneur,Senior High School,Married,Executive Manager,Male,0.0,0.0,40.0
4,60,Non-Corporate Freelance,Senior High School,Divorced,Sales,Male,36358000.0,0.0,55.0
...,...,...,...,...,...,...,...,...,...
7418,27,Entrepreneur,Senior High School,Married,Handicraft Refinement,Male,0.0,0.0,40.0
7419,58,Country Government,Junior High School,Married,Farmer,Male,0.0,0.0,40.0
7420,45,Non-Corporate Freelance,Senior High School,Married,Handicraft Refinement,Male,0.0,0.0,50.0
7421,41,Local Government,Senior High School,Separated,Religious Leader,Female,0.0,0.0,40.0


## Data Preprocessing - Train Data

In [90]:
# One-hot encode the nominal columns. 'Salary' is included so the target
# becomes 0/1 indicator columns; 'Education' is left out because it is
# encoded ordinally further down.
categorical_cols = [
    'Working Class',
    'Marital Status',
    'Occupation',
    'Gender',
    'Salary',
]
train = pd.get_dummies(train, columns=categorical_cols)

In [91]:
# Inspect the training frame after one-hot encoding.
train

Unnamed: 0,Age,Education,Capital Gain,Capital Loss,Hour per Week,Working Class_Corporate Freelance,Working Class_Country Government,Working Class_Entrepreneur,Working Class_Local Government,Working Class_Never Worked,...,Occupation_Other Services,Occupation_Religious Leader,Occupation_Sales,Occupation_Security,Occupation_Specialist,Occupation_Tech Support,Gender_Female,Gender_Male,Salary_<=7jt,Salary_>7jt
0,21,Senior High School,0.0,0.0,35.0,0,0,1,0,0,...,1,0,0,0,0,0,1,0,1,0
1,49,Senior High School,0.0,0.0,40.0,0,0,1,0,0,...,0,0,0,0,0,0,1,0,1,0
2,44,Associate Degree,61404000.0,0.0,45.0,0,0,1,0,0,...,0,0,0,0,0,0,0,1,0,1
3,24,Bachelor,0.0,0.0,35.0,0,0,1,0,0,...,0,0,0,0,1,0,0,1,1,0
4,33,Master,210336000.0,0.0,40.0,0,0,1,0,0,...,0,0,0,0,1,0,0,1,0,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
27984,47,Senior High School,0.0,0.0,40.0,0,0,0,0,0,...,0,0,0,0,0,0,0,1,1,0
27985,69,Associate Degree,0.0,0.0,40.0,1,0,0,0,0,...,0,0,1,0,0,0,0,1,0,1
27986,24,Bachelor,0.0,0.0,20.0,0,0,0,0,0,...,0,0,0,0,1,0,1,0,0,1
27987,47,Senior High School,0.0,0.0,45.0,0,0,1,0,0,...,0,0,1,0,0,0,0,1,1,0


In [92]:
# The two salary indicator columns are exact complements; keep only one.
train = train.drop(columns=['Salary_<=7jt'])

In [93]:
# The remaining indicator is the binary target: 1 means salary > 7jt.
train = train.rename(columns={'Salary_>7jt': 'Salary'})

In [94]:
# Check that the target is now a single binary 'Salary' column.
train

Unnamed: 0,Age,Education,Capital Gain,Capital Loss,Hour per Week,Working Class_Corporate Freelance,Working Class_Country Government,Working Class_Entrepreneur,Working Class_Local Government,Working Class_Never Worked,...,Occupation_Military,Occupation_Other Services,Occupation_Religious Leader,Occupation_Sales,Occupation_Security,Occupation_Specialist,Occupation_Tech Support,Gender_Female,Gender_Male,Salary
0,21,Senior High School,0.0,0.0,35.0,0,0,1,0,0,...,0,1,0,0,0,0,0,1,0,0
1,49,Senior High School,0.0,0.0,40.0,0,0,1,0,0,...,0,0,0,0,0,0,0,1,0,0
2,44,Associate Degree,61404000.0,0.0,45.0,0,0,1,0,0,...,0,0,0,0,0,0,0,0,1,1
3,24,Bachelor,0.0,0.0,35.0,0,0,1,0,0,...,0,0,0,0,0,1,0,0,1,0
4,33,Master,210336000.0,0.0,40.0,0,0,1,0,0,...,0,0,0,0,0,1,0,0,1,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
27984,47,Senior High School,0.0,0.0,40.0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,1,0
27985,69,Associate Degree,0.0,0.0,40.0,1,0,0,0,0,...,0,0,0,1,0,0,0,0,1,1
27986,24,Bachelor,0.0,0.0,20.0,0,0,0,0,0,...,0,0,0,0,0,1,0,1,0,1
27987,47,Senior High School,0.0,0.0,45.0,0,0,1,0,0,...,0,0,0,1,0,0,0,0,1,0


In [95]:
# Count rows per education level to see which labels need an ordinal code.
train.groupby('Education').count()

Unnamed: 0_level_0,Age,Capital Gain,Capital Loss,Hour per Week,Working Class_Corporate Freelance,Working Class_Country Government,Working Class_Entrepreneur,Working Class_Local Government,Working Class_Never Worked,Working Class_Non-Corporate Freelance,...,Occupation_Military,Occupation_Other Services,Occupation_Religious Leader,Occupation_Sales,Occupation_Security,Occupation_Specialist,Occupation_Tech Support,Gender_Female,Gender_Male,Salary
Education,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
Associate Degree,2675,2675,2675,2675,2675,2675,2675,2675,2675,2675,...,2675,2675,2675,2675,2675,2675,2675,2675,2675,2675
Bachelor,5928,5928,5928,5928,5928,5928,5928,5928,5928,5928,...,5928,5928,5928,5928,5928,5928,5928,5928,5928,5928
Doctoral,451,451,451,451,451,451,451,451,451,451,...,451,451,451,451,451,451,451,451,451,451
Elementary School,634,634,634,634,634,634,634,634,634,634,...,634,634,634,634,634,634,634,634,634,634
Junior High School,1252,1252,1252,1252,1252,1252,1252,1252,1252,1252,...,1252,1252,1252,1252,1252,1252,1252,1252,1252,1252
Master,1944,1944,1944,1944,1944,1944,1944,1944,1944,1944,...,1944,1944,1944,1944,1944,1944,1944,1944,1944,1944
Professional School,622,622,622,622,622,622,622,622,622,622,...,622,622,622,622,622,622,622,622,622,622
Senior High School,14483,14483,14483,14483,14483,14483,14483,14483,14483,14483,...,14483,14483,14483,14483,14483,14483,14483,14483,14483,14483


In [96]:
# Ordinal codes for 'Education': levels are listed from lowest to highest
# and numbered starting at 1.
_education_levels = [
    'Elementary School',
    'Junior High School',
    'Senior High School',
    'Associate Degree',
    'Bachelor',
    'Professional School',
    'Master',
    'Doctoral',
]
obj = {level: rank for rank, level in enumerate(_education_levels, start=1)}

In [97]:
# Encode education as an ordinal integer. Series.map is a plain dictionary
# lookup, which avoids the dtype-downcasting path of Series.replace that
# recent pandas releases deprecate. A label missing from `obj` becomes NaN
# instead of silently remaining a string.
train['Education'] = train['Education'].map(obj)

In [98]:
# Inspect the fully preprocessed training frame.
train

Unnamed: 0,Age,Education,Capital Gain,Capital Loss,Hour per Week,Working Class_Corporate Freelance,Working Class_Country Government,Working Class_Entrepreneur,Working Class_Local Government,Working Class_Never Worked,...,Occupation_Military,Occupation_Other Services,Occupation_Religious Leader,Occupation_Sales,Occupation_Security,Occupation_Specialist,Occupation_Tech Support,Gender_Female,Gender_Male,Salary
0,21,3,0.0,0.0,35.0,0,0,1,0,0,...,0,1,0,0,0,0,0,1,0,0
1,49,3,0.0,0.0,40.0,0,0,1,0,0,...,0,0,0,0,0,0,0,1,0,0
2,44,4,61404000.0,0.0,45.0,0,0,1,0,0,...,0,0,0,0,0,0,0,0,1,1
3,24,5,0.0,0.0,35.0,0,0,1,0,0,...,0,0,0,0,0,1,0,0,1,0
4,33,7,210336000.0,0.0,40.0,0,0,1,0,0,...,0,0,0,0,0,1,0,0,1,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
27984,47,3,0.0,0.0,40.0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,1,0
27985,69,4,0.0,0.0,40.0,1,0,0,0,0,...,0,0,0,1,0,0,0,0,1,1
27986,24,5,0.0,0.0,20.0,0,0,0,0,0,...,0,0,0,0,0,1,0,1,0,1
27987,47,3,0.0,0.0,45.0,0,0,1,0,0,...,0,0,0,1,0,0,0,0,1,0


## Data Preprocessing - Test Data

In [99]:
# One-hot encode the same nominal columns as for the training data
# (the test file has no 'Salary' column).
test = pd.get_dummies(
    test,
    columns=['Working Class', 'Marital Status', 'Occupation', 'Gender'],
)

In [100]:
# Inspect the test frame after one-hot encoding.
test

Unnamed: 0,Age,Education,Capital Gain,Capital Loss,Hour per Week,Working Class_Corporate Freelance,Working Class_Country Government,Working Class_Entrepreneur,Working Class_Local Government,Working Class_Never Worked,...,Occupation_Machinery Supervisor,Occupation_Military,Occupation_Other Services,Occupation_Religious Leader,Occupation_Sales,Occupation_Security,Occupation_Specialist,Occupation_Tech Support,Gender_Female,Gender_Male
0,30,Associate Degree,0.0,0.0,40.0,0,0,0,1,0,...,0,0,0,1,0,0,0,0,1,0
1,28,Senior High School,0.0,0.0,50.0,0,0,1,0,0,...,1,0,0,0,0,0,0,0,0,1
2,17,Senior High School,0.0,0.0,20.0,0,0,1,0,0,...,0,0,0,0,0,0,0,0,0,1
3,63,Senior High School,0.0,0.0,40.0,0,0,1,0,0,...,0,0,0,0,0,0,0,0,0,1
4,60,Senior High School,36358000.0,0.0,55.0,0,0,0,0,0,...,0,0,0,0,1,0,0,0,0,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
7418,27,Senior High School,0.0,0.0,40.0,0,0,1,0,0,...,0,0,0,0,0,0,0,0,0,1
7419,58,Junior High School,0.0,0.0,40.0,0,1,0,0,0,...,0,0,0,0,0,0,0,0,0,1
7420,45,Senior High School,0.0,0.0,50.0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,1
7421,41,Senior High School,0.0,0.0,40.0,0,0,0,1,0,...,0,0,0,1,0,0,0,0,1,0


In [101]:
# Apply the same ordinal education codes as for the training data.
# Series.map is a plain dictionary lookup, which avoids the dtype-downcasting
# path of Series.replace that recent pandas releases deprecate. A label
# missing from `obj` becomes NaN instead of silently remaining a string.
test['Education'] = test['Education'].map(obj)

In [102]:
# Inspect the fully preprocessed test frame.
test

Unnamed: 0,Age,Education,Capital Gain,Capital Loss,Hour per Week,Working Class_Corporate Freelance,Working Class_Country Government,Working Class_Entrepreneur,Working Class_Local Government,Working Class_Never Worked,...,Occupation_Machinery Supervisor,Occupation_Military,Occupation_Other Services,Occupation_Religious Leader,Occupation_Sales,Occupation_Security,Occupation_Specialist,Occupation_Tech Support,Gender_Female,Gender_Male
0,30,4,0.0,0.0,40.0,0,0,0,1,0,...,0,0,0,1,0,0,0,0,1,0
1,28,3,0.0,0.0,50.0,0,0,1,0,0,...,1,0,0,0,0,0,0,0,0,1
2,17,3,0.0,0.0,20.0,0,0,1,0,0,...,0,0,0,0,0,0,0,0,0,1
3,63,3,0.0,0.0,40.0,0,0,1,0,0,...,0,0,0,0,0,0,0,0,0,1
4,60,3,36358000.0,0.0,55.0,0,0,0,0,0,...,0,0,0,0,1,0,0,0,0,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
7418,27,3,0.0,0.0,40.0,0,0,1,0,0,...,0,0,0,0,0,0,0,0,0,1
7419,58,2,0.0,0.0,40.0,0,1,0,0,0,...,0,0,0,0,0,0,0,0,0,1
7420,45,3,0.0,0.0,50.0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,1
7421,41,3,0.0,0.0,40.0,0,0,0,1,0,...,0,0,0,1,0,0,0,0,1,0


## Hyperparameter Tuning using GridSearchCV

In [107]:
# Separate the binary target from the feature matrix.
# NOTE(review): X still contains the 'Unnamed: 0' column carried over from
# the CSV; it looks like a row counter rather than a real feature. Confirm,
# and if it is removed, remove it from the test frame as well.
y = train['Salary']
X = train.drop(columns=['Salary'])

In [131]:
# Exhaustive search over k = 1..50 and both weighting schemes, scored by
# ROC AUC with 10-fold cross-validation.
# NOTE(review): features are passed unscaled, so large-valued columns such
# as 'Capital Gain' likely dominate the neighbour distances. Consider
# scaling inside a pipeline.
model = KNeighborsClassifier()
param_grid = {
    'n_neighbors': np.arange(1, 51),
    'weights': ['distance', 'uniform'],
}
gscv = GridSearchCV(
    estimator=model,
    param_grid=param_grid,
    scoring='roc_auc',
    cv=10,
)
gscv.fit(X, y)

GridSearchCV(cv=10, estimator=KNeighborsClassifier(),
             param_grid={'n_neighbors': array([ 1,  2,  3,  4,  5,  6,  7,  8,  9, 10, 11, 12, 13, 14, 15, 16, 17,
       18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33, 34,
       35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47, 48, 49, 50]),
                         'weights': ['distance', 'uniform']},
             scoring='roc_auc')

In [132]:
# Hyperparameter combination that achieved the best cross-validated score.
gscv.best_params_

{'n_neighbors': 23, 'weights': 'uniform'}

In [133]:
# Mean cross-validated ROC AUC of the best hyperparameter combination.
gscv.best_score_

0.8979691018552142

## Fitting and Predicting Based on Prior Hyperparameter Tuning

In [134]:
# Build the final classifier from the hyperparameters the grid search
# selected rather than re-typing them by hand, so this cell cannot drift
# out of sync with the search result. Then fit it on the full training set.
knn = KNeighborsClassifier(**gscv.best_params_)
knn.fit(X, y)

KNeighborsClassifier(n_neighbors=23)

In [135]:
# Align the test columns to the training features before predicting.
# get_dummies only creates columns for categories present in each file, so
# a category missing from the test data is added as an all-zero column, a
# column the model never saw is dropped, and the column order is made
# identical to the one the model was fitted on.
salaryClass = knn.predict(test.reindex(columns=X.columns, fill_value=0))

In [136]:
# Pair each prediction with its id. The id column starts at 28794; the end
# of the range is derived from the number of predictions so the frame stays
# valid if the test set size changes (a fixed end would raise a length
# mismatch error).
first_id = 28794
dfPredict = pd.DataFrame({
    'id': np.arange(first_id, first_id + len(salaryClass)),
    'Salary Classification': salaryClass,
})

In [137]:
# Preview the prediction table.
dfPredict

Unnamed: 0,id,Salary Classification
0,28794,0
1,28795,0
2,28796,0
3,28797,0
4,28798,0
...,...,...
7418,36212,0
7419,36213,0
7420,36214,0
7421,36215,0


In [138]:
# Number of test rows predicted for each class (0: <=7jt, 1: >7jt).
dfPredict.groupby('Salary Classification').count()

Unnamed: 0_level_0,id
Salary Classification,Unnamed: 1_level_1
0,5906
1,1517
