Dataset is https://www.kaggle.com/datasets/gabrielsantello/cars-purchase-decision-dataset

# Processing data

In [13]:
# import libraries
import seaborn as sns
from matplotlib import pyplot as plt
from sklearn import datasets
from sklearn.tree import DecisionTreeClassifier 
from sklearn import tree
import pandas as pd
import dtreeviz

In [14]:
# Load the car purchase dataset into a DataFrame.
# The path is relative, so car_data.csv must sit in the notebook's working directory.
df = pd.read_csv("car_data.csv")

In [15]:
# Summary statistics (count, mean, std, min, quartiles, max) for the numeric columns
df.describe()

Unnamed: 0,User ID,Age,AnnualSalary,Purchased
count,1000.0,1000.0,1000.0,1000.0
mean,500.5,40.106,72689.0,0.402
std,288.819436,10.707073,34488.341867,0.490547
min,1.0,18.0,15000.0,0.0
25%,250.75,32.0,46375.0,0.0
50%,500.5,40.0,72000.0,0.0
75%,750.25,48.0,90000.0,1.0
max,1000.0,63.0,152500.0,1.0


In [16]:
# Count rows that are exact duplicates of an earlier row (0 means every row is unique)
df.duplicated().sum()

0

In [17]:
# Count missing (NaN) values in each column
df.isna().sum()

User ID         0
Gender          0
Age             0
AnnualSalary    0
Purchased       0
dtype: int64

In [18]:
# Preview the first five rows of the raw data (Gender is still a string here)
df.head()

Unnamed: 0,User ID,Gender,Age,AnnualSalary,Purchased
0,385,Male,35,20000,0
1,681,Male,40,43500,0
2,353,Male,49,74000,0
3,895,Male,40,107500,1
4,661,Male,25,79000,0


In [19]:
# Convert the categorical Gender column into integer codes so the
# decision tree can consume it
from sklearn.preprocessing import LabelEncoder

variables = ['Gender']
encoder = LabelEncoder()

# Encode each listed column in turn; the encoder is re-fitted per column
for column_name in variables:
    df[column_name] = encoder.fit_transform(df[column_name])

In [20]:
# Confirm that Gender now holds integer codes instead of strings
df.head()

Unnamed: 0,User ID,Gender,Age,AnnualSalary,Purchased
0,385,1,35,20000,0
1,681,1,40,43500,0
2,353,1,49,74000,0
3,895,1,40,107500,1
4,661,1,25,79000,0


# X/y split

In [21]:
# Separate the feature matrix from the target column.
# 'User ID' is an arbitrary row identifier, not a property of the customer.
# Leaving it in X lets the tree split on ID values, which only memorises
# individual rows and cannot generalise, so it is dropped together with the target.
X = df.drop(columns=['User ID', 'Purchased'])
y = df['Purchased']

# Decision tree and analysis

In [22]:
# Fit the decision tree classifier with otherwise default hyper-parameters.
# scikit-learn permutes the features at every split, so without a fixed seed
# the fitted tree (and the diagrams saved from it) can differ between runs.
# Fixing random_state keeps the notebook reproducible under Restart & Run All.
clf = DecisionTreeClassifier(random_state=42)
model = clf.fit(X, y)

In [23]:
# Requires the dtreeviz package (pip install dtreeviz)
import dtreeviz

# Wrap the fitted tree together with its training data for visualisation.
# feature_names is passed as a plain list of strings (the type dtreeviz
# documents) rather than a pandas Index.
viz_model = dtreeviz.model(clf,
                           X_train=X, y_train=y,
                           feature_names=list(X.columns),
                           target_name='Decision',
                           class_names=["No", "Yes"])

# Render the tree and save it as an SVG file.
# Note: this is not practical with large decision trees!
# Adjust the scale if you need a bigger picture (for example 2.5).
viz_model.view(scale=1.5).save("dt_cars_test.svg")



Analysis:
1. People usually buy a car when they are older than 44.5 years
2. Otherwise, the annual salary should be more than 90 000
3. Age is the most important factor

In [24]:
from sklearn.tree import export_graphviz
import subprocess

# Export the fitted decision tree to a Graphviz DOT file
export_graphviz(clf,
                out_file='tree.dot',
                feature_names=list(X.columns),
                class_names=["No", "Yes"],
                filled=True, rounded=True, node_ids=True)

# Convert DOT to SVG with the Graphviz `dot` executable (must be on PATH).
# check=True raises CalledProcessError when the conversion fails, instead of
# silently returning a non-zero code and leaving a missing or stale SVG.
completed = subprocess.run(
    ['dot', '-Tsvg', 'tree.dot', '-o', 'dt_cars_test_sk.svg'],
    check=True,
)

# Show the exit status as the cell output (0 on success)
completed.returncode

0

Analysis:
1. Age is more important than salary
2. However, salary is not independent of age (higher age --> higher salary)
3. The data of the two classes overlaps