# Classification using Decision Trees

This example uses our `lfsklearn` library to classify salary data with a decision tree, then evaluates the predictions with `sklearn.metrics`.

In [1]:
# Import modules
import pandas as pd
import lfsklearn as lf
import sklearn.metrics as metrics


In [2]:
# Load the salary data set; the first CSV column is used as the row index
df = pd.read_csv("salaries_final.csv", index_col=0)

In [3]:
# Preview the first five rows of the raw data
df.head(n=5)

Unnamed: 0,Age,Education,Occupation,Relationship,Race,Sex,Target
0,39,Bachelors,Adm-clerical,Not-in-family,White,Male,<=50K
1,50,Bachelors,Exec-managerial,Husband,White,Male,<=50K
2,38,HS-grad,Handlers-cleaners,Not-in-family,White,Male,<=50K
3,53,11th,Handlers-cleaners,Husband,Black,Male,<=50K
4,28,Bachelors,Prof-specialty,Wife,Black,Female,<=50K


In [4]:
# scikit-learn does not accept categorical (string) inputs, so one-hot encode
# them: each categorical column is replaced by one 0/1 "dummy" column per
# category. Note that the Target column is encoded as well, which yields the
# two columns Target_<=50K and Target_>50K.
df = pd.get_dummies(data=df)

In [5]:
# Preview the encoded frame: the categorical columns are now indicator columns
df.head(n=5)

Unnamed: 0,Age,Education_10th,Education_11th,Education_12th,Education_1st-4th,Education_5th-6th,Education_7th-8th,Education_9th,Education_Assoc-acdm,Education_Assoc-voc,...,Relationship_Wife,Race_Amer-Indian-Eskimo,Race_Asian-Pac-Islander,Race_Black,Race_Other,Race_White,Sex_Female,Sex_Male,Target_<=50K,Target_>50K
0,39,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,1,0,1,1,0
1,50,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,1,0,1,1,0
2,38,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,1,0,1,1,0
3,53,0,1,0,0,0,0,0,0,0,...,0,0,0,1,0,0,0,1,1,0
4,28,0,0,0,0,0,0,0,0,0,...,1,0,0,1,0,0,1,0,1,0


In [6]:
# Separate the features (X) from the label (y).
# Both one-hot target columns are removed from X so the label cannot leak
# into the inputs.
X = df.drop(columns=["Target_<=50K", "Target_>50K"])
# y is the "<=50K" indicator, so 1 means a salary of at most 50K and 0 means
# a salary above 50K.
y = df.loc[:, "Target_<=50K"]

In [8]:
# Run our library's decision-tree helper. It returns two values: the test
# labels and the predictions for them.
# NOTE(review): presumably it splits X/y into train and test sets and fits the
# tree internally -- confirm in lfsklearn (no random seed is passed here).
y_test, y_hat_test = lf.DecisionTreeClassifier(X, y)

In [9]:
# Evaluate the predictions against the test data.
# In the confusion matrix, rows are the true classes and columns are the
# predicted classes. Class 1 is "<=50K", assuming y_test is a subset of the
# y built from Target_<=50K.
conf_matrix = metrics.confusion_matrix(y_test, y_hat_test)
class_report = metrics.classification_report(y_test, y_hat_test)
print(conf_matrix)
print(class_report)

# Overall fraction of correct predictions (shown as the cell's output)
metrics.accuracy_score(y_test, y_hat_test)

[[1020  969]
 [ 480 5672]]
              precision    recall  f1-score   support

           0       0.68      0.51      0.58      1989
           1       0.85      0.92      0.89      6152

   micro avg       0.82      0.82      0.82      8141
   macro avg       0.77      0.72      0.74      8141
weighted avg       0.81      0.82      0.81      8141



0.82201203783319