# Classification using Decision Trees
Simple, clean example of classification using decision trees

In [13]:
# Import modules
import pandas as pd
from sklearn import tree
import sklearn.metrics as metrics
from sklearn.model_selection import train_test_split


In [14]:
# Load data
df = pd.read_csv("salaries_final.csv", index_col = 0)

In [15]:
# Inspect data
df.head()

Unnamed: 0,Age,Education,Occupation,Relationship,Race,Sex,Target
0,39,Bachelors,Adm-clerical,Not-in-family,White,Male,<=50K
1,50,Bachelors,Exec-managerial,Husband,White,Male,<=50K
2,38,HS-grad,Handlers-cleaners,Not-in-family,White,Male,<=50K
3,53,11th,Handlers-cleaners,Husband,Black,Male,<=50K
4,28,Bachelors,Prof-specialty,Wife,Black,Female,<=50K


In [22]:
# SkLearn doesn't like dealing with categorical inputs, so convert to "Dummies".  This is the "One-hot-encode" approach.
df = pd.get_dummies(df)

In [17]:
# Inspect again
df.head()

Unnamed: 0,Age,Education_10th,Education_11th,Education_12th,Education_1st-4th,Education_5th-6th,Education_7th-8th,Education_9th,Education_Assoc-acdm,Education_Assoc-voc,...,Relationship_Wife,Race_Amer-Indian-Eskimo,Race_Asian-Pac-Islander,Race_Black,Race_Other,Race_White,Sex_Female,Sex_Male,Target_<=50K,Target_>50K
0,39,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,1,0,1,1,0
1,50,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,1,0,1,1,0
2,38,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,1,0,1,1,0
3,53,0,1,0,0,0,0,0,0,0,...,0,0,0,1,0,0,0,1,1,0
4,28,0,0,0,0,0,0,0,0,0,...,1,0,0,1,0,0,1,0,1,0


In [18]:
# Split into X (inputs) and y (outputs)
X = df.drop(["Target_<=50K", "Target_>50K"], axis=1)
y = df["Target_<=50K"]

In [25]:
# Create a model - can use gini or entropy for the criterion
model = tree.DecisionTreeClassifier(criterion = "gini", max_depth = 12, min_samples_split = 300, min_samples_leaf = 150)

# Split into training and test sets (4 sets in total) 
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.25)

# Use training data to fit model (i.e. train the model) 
model.fit(X_train, y_train)

# Use training inputs to predict training outputs
y_hat_train = model.predict(X_train)

# Use test inputs to predict test outputs
y_hat_test = model.predict(X_test)


In [26]:
# Check how our model did with the test data
print(metrics.confusion_matrix(y_test, y_hat_test))
print(metrics.classification_report(y_test, y_hat_test))

metrics.accuracy_score(y_test, y_hat_test)

[[ 940  988]
 [ 467 5746]]
             precision    recall  f1-score   support

          0       0.67      0.49      0.56      1928
          1       0.85      0.92      0.89      6213

avg / total       0.81      0.82      0.81      8141



0.8212750276378823