# import Libraries

In [1]:
from sklearn.datasets import load_breast_cancer
from sklearn.metrics import accuracy_score,precision_score,recall_score,confusion_matrix,roc_auc_score
from sklearn.preprocessing import StandardScaler,LabelEncoder
from sklearn.model_selection import train_test_split
from sklearn.svm import SVC

import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import warnings
warnings.filterwarnings('ignore')

# Importing Data

In [2]:
salary_data=pd.read_csv('SalaryData_Train(1).csv')

In [3]:
salary_data.head()

Unnamed: 0,age,workclass,education,educationno,maritalstatus,occupation,relationship,race,sex,capitalgain,capitalloss,hoursperweek,native,Salary
0,39,State-gov,Bachelors,13,Never-married,Adm-clerical,Not-in-family,White,Male,2174,0,40,United-States,<=50K
1,50,Self-emp-not-inc,Bachelors,13,Married-civ-spouse,Exec-managerial,Husband,White,Male,0,0,13,United-States,<=50K
2,38,Private,HS-grad,9,Divorced,Handlers-cleaners,Not-in-family,White,Male,0,0,40,United-States,<=50K
3,53,Private,11th,7,Married-civ-spouse,Handlers-cleaners,Husband,Black,Male,0,0,40,United-States,<=50K
4,28,Private,Bachelors,13,Married-civ-spouse,Prof-specialty,Wife,Black,Female,0,0,40,Cuba,<=50K


In [4]:
salary_data.shape

(30161, 14)

# Data Understanding

In [5]:
salary_data.isnull().sum()

age              0
workclass        0
education        0
educationno      0
maritalstatus    0
occupation       0
relationship     0
race             0
sex              0
capitalgain      0
capitalloss      0
hoursperweek     0
native           0
Salary           0
dtype: int64

In [6]:
salary_data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 30161 entries, 0 to 30160
Data columns (total 14 columns):
 #   Column         Non-Null Count  Dtype 
---  ------         --------------  ----- 
 0   age            30161 non-null  int64 
 1   workclass      30161 non-null  object
 2   education      30161 non-null  object
 3   educationno    30161 non-null  int64 
 4   maritalstatus  30161 non-null  object
 5   occupation     30161 non-null  object
 6   relationship   30161 non-null  object
 7   race           30161 non-null  object
 8   sex            30161 non-null  object
 9   capitalgain    30161 non-null  int64 
 10  capitalloss    30161 non-null  int64 
 11  hoursperweek   30161 non-null  int64 
 12  native         30161 non-null  object
 13  Salary         30161 non-null  object
dtypes: int64(5), object(9)
memory usage: 3.2+ MB


# Data Preparation

In [7]:
le=LabelEncoder()

In [8]:
salary_data['education']=le.fit_transform(salary_data.education)
salary_data['Salary']=le.fit_transform(salary_data.Salary)
salary_data['native']=le.fit_transform(salary_data.native)
salary_data['sex']=le.fit_transform(salary_data.sex)
salary_data['race']=le.fit_transform(salary_data.race)
salary_data['relationship']=le.fit_transform(salary_data.relationship)
salary_data['occupation']=le.fit_transform(salary_data.occupation)
salary_data['maritalstatus']=le.fit_transform(salary_data.maritalstatus)
salary_data['workclass']=le.fit_transform(salary_data.workclass)

In [11]:
scaler   = StandardScaler()
x_scaled = scaler.fit_transform(x)
x_scaled

array([[ 0.04277892,  2.93594688, -0.34982703, ..., -0.21850808,
        -0.07773541,  0.26635842],
       [ 0.88026081,  1.88764517, -0.34982703, ..., -0.21850808,
        -2.33149486,  0.26635842],
       [-0.0333558 , -0.20895825,  0.1748048 , ..., -0.21850808,
        -0.07773541,  0.26635842],
       ...,
       [ 1.48933854, -0.20895825,  0.1748048 , ..., -0.21850808,
        -0.07773541,  0.26635842],
       [-1.25151126, -0.20895825,  0.1748048 , ..., -0.21850808,
        -1.74718685,  0.26635842],
       [ 1.03253024,  0.83934346,  0.1748048 , ..., -0.21850808,
        -0.07773541,  0.26635842]])

In [12]:
scaled_x_data=pd.DataFrame(x_scaled,columns=salary_data.columns.drop({'Salary'}))

# Model Building

In [13]:
x=salary_data.drop({'Salary'},axis=1)
y=salary_data.Salary

In [14]:
x_train,x_test,y_train,y_test = train_test_split(x,y,test_size = 0.20,random_state = 12,stratify = y)

In [15]:
x_train.shape,y_train.shape,x_test.shape,y_test.shape

((24128, 13), (24128,), (6033, 13), (6033,))

# Kernel - Linear
#### Model Training without scaling | Model Testing | Model Evaluation¶

In [None]:
X_train,X_test,y_train,y_test = train_test_split(x,y,test_size = 0.20,random_state = 12,stratify = y)

svc_classifier = SVC(kernel='linear')
svc_classifier.fit(x_train,y_train)
y_pred = svc_classifier.predict(x_test)

print("Overall Accuracy : ",round(accuracy_score(y_test,y_pred),4))
print("Precision        : ",round(precision_score(y_test,y_pred),4))
print("Recall           : ",round(recall_score(y_test,y_pred),4))
print("AUC Score        : ",round(roc_auc_score(y_test,y_pred),4))
print("Confusion Matrix :")

plt.figure(figsize = (12,10))
confu_matrix = confusion_matrix(y_test,y_pred)
sns.heatmap(data = confu_matrix,
            annot=True,
            linewidths=0.8,
            xticklabels=salary_data.columns,
            yticklabels=salary_data.columns,
            )
plt.xlabel("Prediction")
plt.ylabel("Actual labels")
plt.title("Prediction Vs Actual",size = 20)

# No Use

In [62]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.tree import DecisionTreeClassifier

In [55]:
rf=RandomForestClassifier()

In [63]:
dt=DecisionTreeClassifier()

In [56]:
x=salary_data.drop({'Salary'},axis=1)
y=salary_data.Salary

In [57]:
rf.fit(x,y)

RandomForestClassifier()

In [64]:
dt.fit(x,y)

DecisionTreeClassifier()

In [58]:
rf.feature_importances_

array([0.21810029, 0.04847588, 0.03524615, 0.09905653, 0.08421199,
       0.08517946, 0.1037317 , 0.01751926, 0.01313559, 0.1267162 ,
       0.04086043, 0.1085051 , 0.01926141])

In [61]:
salary_data.columns

Index(['age', 'workclass', 'education', 'educationno', 'maritalstatus',
       'occupation', 'relationship', 'race', 'sex', 'capitalgain',
       'capitalloss', 'hoursperweek', 'native', 'Salary'],
      dtype='object')

In [65]:
dt.feature_importances_

array([0.17407264, 0.05007697, 0.0150709 , 0.13146322, 0.01166056,
       0.08168239, 0.22026014, 0.02043214, 0.00684132, 0.12693715,
       0.04149704, 0.09954212, 0.02046342])