In [474]:
import pandas as pd
from sklearn.tree import DecisionTreeClassifier
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
from sklearn import tree

## Loading data

In [475]:
# Read the raw records from the CSV file in the working directory and display them.
df = pd.read_csv(filepath_or_buffer='vehicles.csv')
df

Unnamed: 0,Age,Gender,Income,Favorite Transport
0,5,female,,bicycle
1,8,male,,scooter
2,10,female,,bicycle
3,14,male,,metro
4,16,male,,metro
5,18,female,,metro
6,20,male,200.0,scooter
7,22,female,500.0,scooter
8,23,male,300.0,scooter
9,25,female,800.0,metro


## Cleaning

In [476]:
# Rows with no Income value (NaN after loading) get 0.0 so the column is fully numeric.
df['Income'] = df['Income'].fillna(value=0.0)
# Show the first eight rows to check the filled values.
df.head(n=8)

Unnamed: 0,Age,Gender,Income,Favorite Transport
0,5,female,0.0,bicycle
1,8,male,0.0,scooter
2,10,female,0.0,bicycle
3,14,male,0.0,metro
4,16,male,0.0,metro
5,18,female,0.0,metro
6,20,male,200.0,scooter
7,22,female,500.0,scooter


## Encoding

In [477]:
# List the distinct labels present in the Gender column.
pd.unique(df['Gender'])

array(['female', 'male'], dtype=object)

In [478]:
# Replace the Gender strings with integer codes.
# The fitted encoder is kept in `gender_encoder` instead of being discarded:
# `gender_encoder.classes_` records which label received which integer, and
# `gender_encoder.transform(...)` can encode new data with the same mapping.
gender_encoder = LabelEncoder()
df['Gender'] = gender_encoder.fit_transform(df['Gender'])
df.head(10)

Unnamed: 0,Age,Gender,Income,Favorite Transport
0,5,0,0.0,bicycle
1,8,1,0.0,scooter
2,10,0,0.0,bicycle
3,14,1,0.0,metro
4,16,1,0.0,metro
5,18,0,0.0,metro
6,20,1,200.0,scooter
7,22,0,500.0,scooter
8,23,1,300.0,scooter
9,25,0,800.0,metro


## Model

In [479]:
# Feature matrix: every column of df except the prediction target.
X = df.drop(labels='Favorite Transport', axis='columns')
X.head(n=3)

Unnamed: 0,Age,Gender,Income
0,5,0,0.0
1,8,1,0.0
2,10,0,0.0


In [480]:
# Target vector: the transport label the classifier is trained to predict.
y = df.loc[:, 'Favorite Transport']
y.head(n=5)

0    bicycle
1    scooter
2    bicycle
3      metro
4      metro
Name: Favorite Transport, dtype: object

In [481]:
# Fit a decision tree on the complete dataset (all rows of X / y).
# random_state is fixed because the tree permutes the features at every split;
# without a seed, ties between equally good splits can be resolved differently
# on each run, which makes the fitted tree (and its predictions) non-reproducible.
model = DecisionTreeClassifier(random_state=42)
model.fit(X, y)
model

## Prediction

In [482]:
# Three hand-written samples to classify.
# Gender is given as the integer codes seen in the encoded frame (female -> 0, male -> 1).
test_df = pd.DataFrame(
    data=dict(
        Age=[12, 30, 75],
        Gender=[0, 1, 0],
        Income=[0.0, 35000, 300],
    )
)
test_df

Unnamed: 0,Age,Gender,Income
0,12,0,0.0
1,30,1,35000.0
2,75,0,300.0


In [483]:
# Predicted transport label for each hand-written sample in test_df.
model.predict(X=test_df)

array(['bicycle', 'helicopter', 'scooter'], dtype=object)

## Exporting to the DOT file

In [484]:
# Write the fitted tree to a Graphviz DOT file.
# Feature and class labels are taken from the training frame and from the
# fitted estimator instead of being hard-coded / recomputed from y, so the
# labels in the exported graph always line up with what `model` was trained on
# (model.classes_ is the class order the tree itself uses).
tree.export_graphviz(
    model,
    out_file='decision_tree_model.dot',
    feature_names=list(X.columns),
    class_names=[str(label) for label in model.classes_],
    filled=True,
)

## Evaluation

In [485]:
# Hold out 20% of the rows for evaluation.
# random_state pins the shuffle so the same rows land in the test set on every
# run; without it the split (and therefore the reported accuracy) changes each
# time the notebook is executed.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)
X_test

Unnamed: 0,Age,Gender,Income
15,47,0,6000.0
4,16,1,0.0
2,10,0,0.0
25,75,1,30000.0
8,23,1,300.0
6,20,1,200.0


In [486]:
# Train a fresh tree on the training split only, so the held-out rows stay unseen.
# NOTE: this rebinds `model`, replacing the estimator that was fitted on the
# full dataset earlier in the notebook.
# random_state is fixed so the fitted tree is the same on every run.
model = DecisionTreeClassifier(random_state=42)
model.fit(X_train, y_train)

In [487]:
# Predicted transport label for each held-out row of X_test.
predictions = model.predict(X=X_test)
predictions

array(['car', 'metro', 'scooter', 'helicopter', 'scooter', 'metro'],
      dtype=object)

In [488]:
# Fraction of held-out rows whose predicted label equals the true label.
model_accuracy = accuracy_score(y_true=y_test, y_pred=predictions)
model_accuracy

0.6666666666666666