# Module 2: Decision Trees

In [1]:
import pandas as pd
import altair as alt

In [2]:
# Load the toy quiz2 classification dataset from the course repository,
# then preview its first rows.
classification_df = pd.read_csv(
    "https://raw.githubusercontent.com/UBC-MDS/DSCI_571_sup-learn-1/master/lectures/data/quiz2-grade-toy-classification.csv"
)
classification_df.head()

Unnamed: 0,ml_experience,class_attendance,lab1,lab2,lab3,lab4,quiz1,quiz2
0,1,1,92,93,84,91,92,A+
1,1,0,94,90,80,83,91,not A+
2,0,0,78,85,83,80,80,not A+
3,0,1,91,94,92,91,89,A+
4,0,1,77,83,90,92,85,A+


In [3]:
# Feature matrix: every column except the target column "quiz2".
X = classification_df.drop("quiz2", axis="columns")
X.head()

Unnamed: 0,ml_experience,class_attendance,lab1,lab2,lab3,lab4,quiz1
0,1,1,92,93,84,91,92
1,1,0,94,90,80,83,91
2,0,0,78,85,83,80,80
3,0,1,91,94,92,91,89
4,0,1,77,83,90,92,85


In [4]:
# Target vector: the "quiz2" column on its own.
y = classification_df.loc[:, "quiz2"]
y.head()

0        A+
1    not A+
2    not A+
3        A+
4        A+
Name: quiz2, dtype: object

In [5]:
# Build a binarized copy of the features: each lab/quiz1 grade becomes 1 when it
# is at least 90 and 0 otherwise. X itself is left untouched.
X_binary = X.copy()
columns = ["lab1", "lab2", "lab3", "lab4", "quiz1"]
# One vectorized comparison over all grade columns replaces the per-element
# Python lambda; the boolean result is cast to int64 so the columns hold 0/1.
X_binary[columns] = (X_binary[columns] >= 90).astype("int64")
X_binary.head()

Unnamed: 0,ml_experience,class_attendance,lab1,lab2,lab3,lab4,quiz1
0,1,1,1,1,0,1,1
1,1,0,1,1,0,0,1
2,0,0,0,0,0,0,0
3,0,1,1,1,1,1,0
4,0,1,0,0,1,1,0


In [6]:
from sklearn.tree import DecisionTreeClassifier

In [7]:
# Create an unfitted decision tree classifier using the default hyperparameters.
# NOTE(review): no random_state is passed; the scikit-learn docs state that ties
# between equally good splits may be broken differently from run to run, so
# consider fixing a seed if the exact tree must be reproducible.
model = DecisionTreeClassifier()

In [8]:
# Train the tree on the binarized features; the trailing semicolon suppresses
# the cell's output.
model.fit(X_binary, y);

In [9]:
# Predict the label of every row the model was just trained on.
model.predict(X_binary)

array(['A+', 'not A+', 'not A+', 'A+', 'A+', 'not A+', 'A+', 'not A+',
       'not A+', 'A+', 'A+', 'A+', 'A+', 'A+', 'A+', 'not A+', 'A+',
       'not A+', 'not A+', 'not A+', 'A+'], dtype=object)

In [10]:
# Score the model on its own training data (X_binary, y).
model.score(X_binary, y)

0.9047619047619048

### Decision Trees with continuous features

In [11]:
# Look again at the original (non-binarized) features.
X.head()

Unnamed: 0,ml_experience,class_attendance,lab1,lab2,lab3,lab4,quiz1
0,1,1,92,93,84,91,92
1,1,0,94,90,80,83,91
2,0,0,78,85,83,80,80
3,0,1,91,94,92,91,89
4,0,1,77,83,90,92,85


In [12]:
# Rebind `model` to a fresh, unfitted classifier with default hyperparameters.
model = DecisionTreeClassifier()

In [13]:
# Train on the original, non-binarized feature values. The trailing semicolon
# keeps the fitted estimator's repr out of the cell output, matching the other
# fit cells in this notebook.
model.fit(X, y);

In [14]:
# Predict the label of every training row using the original feature values.
model.predict(X)

array(['A+', 'not A+', 'not A+', 'A+', 'A+', 'not A+', 'A+', 'not A+',
       'not A+', 'not A+', 'A+', 'A+', 'A+', 'A+', 'not A+', 'not A+',
       'A+', 'not A+', 'not A+', 'not A+', 'A+'], dtype=object)

In [15]:
# Keep only the two features "lab4" and "quiz1".
X_subset = X.loc[:, ["lab4", "quiz1"]]
X_subset.head()

Unnamed: 0,lab4,quiz1
0,91,92
1,83,91
2,80,80
3,91,89
4,92,85


In [16]:
# Fit a tree limited to max_depth=1 on the two-feature subset.
depth = 1
model = DecisionTreeClassifier(max_depth=depth)
# Trailing semicolon keeps the fitted estimator's repr out of the cell output,
# matching the other fit cells in this notebook.
model.fit(X_subset, y);

In [17]:
# Load the Canada/USA cities dataset (longitude, latitude, country) and preview it.
df = pd.read_csv(
    "https://raw.githubusercontent.com/UBC-MDS/DSCI_571_sup-learn-1/master/lectures/data/canada_usa_cities.csv"
)
df.head()

Unnamed: 0,longitude,latitude,country
0,-130.0437,55.9773,USA
1,-134.4197,58.3019,USA
2,-123.078,48.9854,USA
3,-122.7436,48.9881,USA
4,-122.2691,48.9951,USA


In [18]:
# Scatter plot of the cities: longitude on x, latitude on y, coloured by country
# (Canada in red, USA in blue), with fixed axis ranges and no grid lines.
chart1 = (
    alt.Chart(df)
    .mark_circle(size=20, opacity=0.6)
    .encode(
        x=alt.X(
            'longitude:Q',
            scale=alt.Scale(domain=[-140, -40]),
            axis=alt.Axis(grid=False),
        ),
        y=alt.Y(
            'latitude:Q',
            scale=alt.Scale(domain=[20, 60]),
            axis=alt.Axis(grid=False),
        ),
        color=alt.Color(
            'country:N',
            scale=alt.Scale(domain=['Canada', 'USA'], range=['red', 'blue']),
        ),
    )
)

chart1

### Parameters & Hyperparameters

* **Parameters** are automatically learned by the algorithm during training
* **Hyperparameters** are set by the user before training, based on:
    * expert knowledge
    * heuristics, or
    * systematic/automated optimization

**Hyperparameters of `DecisionTreeClassifier`**

* **criterion**: default = "gini"
    * The function to measure the quality of a split. Supported criteria are "gini" for the Gini impurity and "entropy" for the information gain.
    
* **splitter**: default = "best"
    * The strategy used to choose the split at each node. Supported strategies are "best" to choose the best split and "random" to choose the best random split.
    
* **max_depth**: default=None
    * The maximum depth of the tree. If None, then nodes are expanded until all leaves are pure or until all leaves contain fewer than min_samples_split samples.
    
* **min_samples_split**: default=2
    * The minimum number of samples required to split an internal node

In [19]:
# Show the first three rows of the feature matrix.
X.head(3)

Unnamed: 0,ml_experience,class_attendance,lab1,lab2,lab3,lab4,quiz1
0,1,1,92,93,84,91,92
1,1,0,94,90,80,83,91
2,0,0,78,85,83,80,80


In [20]:
# Tree capped at max_depth=1, trained on all features (fit returns the estimator).
model = DecisionTreeClassifier(max_depth=1).fit(X, y)

In [21]:
# Tree capped at max_depth=2, trained on all features (fit returns the estimator).
model2 = DecisionTreeClassifier(max_depth=2).fit(X, y)

In [22]:
# Tree capped at max_depth=3, trained on all features (fit returns the estimator).
model3 = DecisionTreeClassifier(max_depth=3).fit(X, y)

In [23]:
# Score of the max_depth=1 tree on the data it was trained on.
model.score(X, y)

0.7619047619047619

In [24]:
# Score of the max_depth=2 tree on the data it was trained on.
model2.score(X, y)

0.8571428571428571

In [25]:
# Score of the max_depth=3 tree on the data it was trained on.
model3.score(X, y)

0.9523809523809523

In [26]:
# Tree capped at max_depth=5, then scored on the data it was trained on.
model4 = DecisionTreeClassifier(max_depth=5).fit(X, y)
model4.score(X, y)

1.0

In [27]:
# Tree with min_samples_split=2, then scored on the data it was trained on.
model5 = DecisionTreeClassifier(min_samples_split=2).fit(X, y)
model5.score(X, y)

1.0

In [28]:
# Tree with min_samples_split=4, then scored on the data it was trained on.
model6 = DecisionTreeClassifier(min_samples_split=4).fit(X, y)
model6.score(X, y)

0.9523809523809523

In [29]:
# Tree with min_samples_split=10, then scored on the data it was trained on.
model7 = DecisionTreeClassifier(min_samples_split=10).fit(X, y)
model7.score(X, y)

0.9047619047619048

### Decision Tree Regressor

In [30]:
# Load the regression version of the toy dataset, where quiz2 is a numeric grade,
# and preview its first rows.
regression_df = pd.read_csv(
    "https://raw.githubusercontent.com/UBC-MDS/DSCI_571_sup-learn-1/master/lectures/data/quiz2-grade-toy-regression.csv"
)
regression_df.head()

Unnamed: 0,ml_experience,class_attendance,lab1,lab2,lab3,lab4,quiz1,quiz2
0,1,1,92,93,84,91,92,90
1,1,0,94,90,80,83,91,84
2,0,0,78,85,83,80,80,82
3,0,1,91,94,92,91,89,92
4,0,1,77,83,90,92,85,90


In [31]:
# Rebind X to the regression features: every column except the target "quiz2".
X = regression_df.drop("quiz2", axis="columns")

In [32]:
# Rebind y to the regression target: the "quiz2" column.
y = regression_df.loc[:, "quiz2"]

In [33]:
from sklearn.tree import DecisionTreeRegressor

In [34]:
# Regression tree capped at max_depth=4, trained on the regression data
# (fit returns the fitted estimator, so it can be chained).
depth = 4
reg_model = DecisionTreeRegressor(max_depth=depth).fit(X, y)

In [35]:
# Select the row labelled 0; the list inside .loc keeps the result a one-row
# DataFrame rather than a Series.
X.loc[[0]]

Unnamed: 0,ml_experience,class_attendance,lab1,lab2,lab3,lab4,quiz1
0,1,1,92,93,84,91,92


In [36]:
# Predict quiz2 for that single example.
reg_model.predict(X.loc[[0]])

array([90.])

In [37]:
# Predict quiz2 for every training row, attach the predictions as a new
# "predicted_quiz2" column, and report the score on the training data.
predicted_grades = reg_model.predict(X)
regression_df = regression_df.assign(predicted_quiz2=predicted_grades)
print(f"R^2 score on the training data:{round(reg_model.score(X, y), 3)}")

R^2 score on the training data:1.0


In [38]:
# Compare the true quiz2 grades with the predicted_quiz2 column side by side.
regression_df.head()

Unnamed: 0,ml_experience,class_attendance,lab1,lab2,lab3,lab4,quiz1,quiz2,predicted_quiz2
0,1,1,92,93,84,91,92,90,90.0
1,1,0,94,90,80,83,91,84,84.0
2,0,0,78,85,83,80,80,82,82.0
3,0,1,91,94,92,91,89,92,92.0
4,0,1,77,83,90,92,85,90,90.0


### Generalization

In [39]:
# Return to the classification dataset loaded at the top of the notebook.
classification_df.head()

Unnamed: 0,ml_experience,class_attendance,lab1,lab2,lab3,lab4,quiz1,quiz2
0,1,1,92,93,84,91,92,A+
1,1,0,94,90,80,83,91,not A+
2,0,0,78,85,83,80,80,not A+
3,0,1,91,94,92,91,89,A+
4,0,1,77,83,90,92,85,A+


In [40]:
# Rebind X and y to the classification data (they last held the regression data).
X = classification_df.drop("quiz2", axis="columns")
y = classification_df.loc[:, "quiz2"]

In [41]:
# Rebuild the two-feature subset ("lab4", "quiz1") from the classification features.
X_subset = X.loc[:, ["lab4", "quiz1"]]
X_subset.head()

Unnamed: 0,lab4,quiz1
0,91,92
1,83,91
2,80,80
3,91,89
4,92,85


In [42]:
# max_depth=1 tree on the two-feature subset, scored on the data it was trained on.
depth = 1
model = DecisionTreeClassifier(max_depth=depth).fit(X_subset, y)
model.score(X_subset, y)

0.7142857142857143

In [43]:
# max_depth=2 tree on the two-feature subset, scored on the data it was trained on.
depth = 2
model = DecisionTreeClassifier(max_depth=depth).fit(X_subset, y)
model.score(X_subset, y)

0.8095238095238095

In [44]:
# max_depth=4 tree on the two-feature subset, scored on the data it was trained on.
depth = 4
model = DecisionTreeClassifier(max_depth=depth).fit(X_subset, y)
model.score(X_subset, y)

0.9523809523809523

In [45]:
# max_depth=10 tree on the two-feature subset, scored on the data it was trained on.
depth = 10
model = DecisionTreeClassifier(max_depth=depth).fit(X_subset, y)
model.score(X_subset, y)

1.0