In [13]:
import numpy as np 
import pandas as pd
from sklearn.tree import DecisionTreeClassifier

### Load Data From CSV File

In [14]:
# Location of the Levels.fyi salary export, named once so it is easy to change.
csv_path = "Levels_Fyi_Salary_Data.csv"

# Read the CSV into a DataFrame and preview its first rows.
dataset = pd.read_csv(csv_path)
dataset.head()

Unnamed: 0,timestamp,company,level,title,totalyearlycompensation,location,yearsofexperience,yearsatcompany,tag,basesalary,stockgrantvalue,bonus,gender,otherdetails,cityid,dmaid,rowNumber,Masters_Degree,Bachelors_Degree,Doctorate_Degree,Highschool,Some_College,Race_Asian,Race_White,Race_Two_Or_More,Race_Black,Race_Hispanic,Race,Education
0,6/7/2017 11:33:27,Oracle,L3,Product Manager,127000,"Redwood City, CA",1.5,1.5,,107000.0,20000.0,10000.0,,,7392,807.0,1,0,0,0,0,0,0,0,0,0,0,,
1,6/10/2017 17:11:29,eBay,SE 2,Software Engineer,100000,"San Francisco, CA",5.0,3.0,,0.0,0.0,0.0,,,7419,807.0,2,0,0,0,0,0,0,0,0,0,0,,
2,6/11/2017 14:53:57,Amazon,L7,Product Manager,310000,"Seattle, WA",8.0,0.0,,155000.0,0.0,0.0,,,11527,819.0,3,0,0,0,0,0,0,0,0,0,0,,
3,6/17/2017 0:23:14,Apple,M1,Software Engineering Manager,372000,"Sunnyvale, CA",7.0,5.0,,157000.0,180000.0,35000.0,,,7472,807.0,7,0,0,0,0,0,0,0,0,0,0,,
4,6/20/2017 10:58:51,Microsoft,60,Software Engineer,157000,"Mountain View, CA",5.0,3.0,,0.0,0.0,0.0,,,7322,807.0,9,0,0,0,0,0,0,0,0,0,0,,


<div id="pre-processing">
    <h2>Pre-processing</h2>
</div>

Using <b>dataset</b> as the Levels_Fyi_Salary_Data.csv data read by pandas, declare the following variables: <br>

<ul>
    <li> <b> X </b> as the <b> Feature Matrix </b> (data of dataset) </li>
    <li> <b> y </b> as the <b> response vector (target) </b> </li>
</ul>


In [15]:
from sklearn.impute import SimpleImputer  # NOTE(review): not used in this cell; kept in case other cells rely on it

# Drop every row that has a missing value in any column.
# NOTE(review): with no `subset`, NaNs in columns that are not used as features
# (the preview shows empty values in e.g. gender, otherdetails, Race, Education)
# also discard rows -- consider dropna(subset=[...]) if more data is wanted.
dataset = dataset.dropna()

# Fail early with a clear message instead of an obscure error further down.
assert not dataset.empty, "dropna() removed every row of the dataset"

# Numeric columns used as model inputs.
feature_columns = [
    'totalyearlycompensation',
    'yearsofexperience',
    'yearsatcompany',
    'basesalary',
    'stockgrantvalue',
    'bonus',
]
features = dataset[feature_columns]

# Feature matrix as a NumPy array: one row per sample, one column per feature.
X = features.to_numpy()
X[:6]

array([[4.00e+05, 5.00e+00, 5.00e+00, 2.10e+05, 1.45e+05, 4.50e+04],
       [1.36e+05, 3.00e+00, 2.00e+00, 1.24e+05, 1.00e+03, 1.10e+04],
       [3.37e+05, 6.00e+00, 6.00e+00, 1.77e+05, 1.25e+05, 3.60e+04],
       [2.22e+05, 4.00e+00, 4.00e+00, 1.64e+05, 3.80e+04, 2.00e+04],
       [1.87e+05, 5.00e+00, 0.00e+00, 1.65e+05, 2.20e+04, 0.00e+00],
       [3.10e+05, 1.50e+01, 3.00e+00, 1.60e+05, 1.50e+05, 0.00e+00]])

Now we can fill the target variable.


In [16]:
# Target vector: the job title of each row kept in `dataset`.
y = dataset.loc[:, "title"]

# Preview the first six labels (positional slice).
y.iloc[:6]

15710    Software Engineer
23532    Software Engineer
23533    Software Engineer
23534    Software Engineer
23535    Software Engineer
23537    Software Engineer
Name: title, dtype: object

<hr>

<div id="setting_up_tree">
    <h2>Setting up the Decision Tree</h2>
    Using <b>train/test split</b> on our <b>decision tree</b>. I'll import <b>train_test_split</b> from <b>sklearn.model_selection</b>.
</div>

In [17]:
from sklearn.model_selection import train_test_split

# Hold out 30% of the rows for testing; the fixed seed keeps the split repeatable.
(X_trainset, X_testset,
 y_trainset, y_testset) = train_test_split(
    X,
    y,
    test_size=0.3,
    random_state=3,
)

Checking that the shapes match for both the training and test sets.

In [18]:
# Report the feature-matrix and target shapes of each split.
# The original second line labelled the test split as "training set".
for split_name, X_part, y_part in (
    ("training", X_trainset, y_trainset),
    ("test", X_testset, y_testset),
):
    # Features and targets of a split must describe the same rows.
    assert X_part.shape[0] == y_part.shape[0], \
        "{} split: X and y row counts differ".format(split_name)
    print(
        'Shape of X {} set {}'.format(split_name, X_part.shape),
        '&',
        ' Size of Y {} set {}'.format(split_name, y_part.shape),
    )

Shape of X training set (15064, 6) &  Size of Y training set (15064,)
Shape of X training set (6457, 6) &  Size of Y training set (6457,)


<hr>

<div id="modeling">
    <h2>Modeling</h2>
    I'll first create an instance of the <b>DecisionTreeClassifier</b> called <b>titleTree</b>.<br>
    Inside of the classifier, specify <i> criterion="entropy" </i> so we can see the information gain of each node.
</div>

In [19]:
# Entropy-based decision tree limited to a depth of 4.
# random_state is fixed (same seed as the train/test split) because
# scikit-learn permutes the candidate features at every split, so equally
# good splits could otherwise be chosen differently from run to run.
titleTree = DecisionTreeClassifier(
    criterion="entropy",
    max_depth=4,
    random_state=3,
)
titleTree  # echo the configured estimator

DecisionTreeClassifier(criterion='entropy', max_depth=4)

Fitting the data with the training feature matrix <b> X_trainset </b> and training response vector <b> y_trainset </b>

In [20]:
# Train the tree on the training split; the fitted estimator is the cell's output.
titleTree.fit(X=X_trainset, y=y_trainset)

DecisionTreeClassifier(criterion='entropy', max_depth=4)

<hr>

<div id="prediction">
    <h2>Prediction</h2>
    Making <b>predictions</b> on the testing dataset and storing it into a variable called <b>predTree</b> and printing out the results.
</div>

In [21]:
# Predict a title for every row of the held-out test features.
predTree = titleTree.predict(X_testset)

# Show the first six predictions, then the first six true labels.
print(predTree[:6])
print(y_testset.iloc[:6])

['Software Engineer' 'Software Engineer' 'Software Engineer'
 'Software Engineer' 'Software Engineer' 'Software Engineer']
56525            Software Engineer
40940            Software Engineer
26992            Software Engineer
28849            Software Engineer
41466    Technical Program Manager
56248            Software Engineer
Name: title, dtype: object


<hr>

<div id="evaluation">
    <h2>Evaluation</h2>
    Importing <b>metrics</b> from sklearn to check the accuracy of the model.
</div>

In [22]:
from sklearn import metrics
import matplotlib.pyplot as plt

# Fraction of test rows whose predicted title equals the true title.
# NOTE(review): worth comparing against a majority-class baseline before
# reading much into this number.
test_accuracy = metrics.accuracy_score(y_testset, predTree)
print("DecisionTrees's Accuracy: ", test_accuracy)

DecisionTrees's Accuracy:  0.6337308347529813
