# Module 3: Splitting, Cross-Validation and the Fundamental Tradeoff

In [1]:
import pandas as pd
import altair as alt

In [2]:
# Load the Canada/USA cities CSV straight from the course repository
# (needs network access).
cities_df = pd.read_csv(
    "https://raw.githubusercontent.com/UBC-MDS/DSCI_571_sup-learn-1/master/lectures/data/canada_usa_cities.csv"
)

In [3]:
# Preview the first five rows of the raw data.
cities_df.head(n=5)

Unnamed: 0,longitude,latitude,country
0,-130.0437,55.9773,USA
1,-134.4197,58.3019,USA
2,-123.078,48.9854,USA
3,-122.7436,48.9881,USA
4,-122.2691,48.9951,USA


In [5]:
# Feature table: every column except the target label.
X = cities_df.drop(columns="country")

In [6]:
# Target: the country label for each city.
y = cities_df.loc[:, "country"]

In [9]:
from sklearn.model_selection import train_test_split

# Hold out 20% of the rows as the test set; the fixed random_state keeps the
# split identical from run to run.
(X_train, X_test, y_train, y_test) = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=123,
)

In [10]:
# Peek at the first three held-out feature rows.
X_test.iloc[:3]

Unnamed: 0,longitude,latitude
172,-64.8001,46.098
175,-82.4066,42.9746
181,-111.3885,56.7292


In [11]:
# Peek at the matching first three held-out labels.
y_test.iloc[:3]

172    Canada
175    Canada
181    Canada
Name: country, dtype: object

In [12]:
# Tabulate the shape of the full data and of each split to sanity-check the
# train/test proportions.
shape_dict = {
    "Data portion": ["X", "y", "X_train", "y_train", "X_test", "y_test"],
    "Shape": [
        portion.shape
        for portion in (X, y, X_train, y_train, X_test, y_test)
    ],
}
shape_df = pd.DataFrame(data=shape_dict)
shape_df

Unnamed: 0,Data portion,Shape
0,X,"(209, 2)"
1,y,"(209,)"
2,X_train,"(167, 2)"
3,y_train,"(167,)"
4,X_test,"(42, 2)"
5,y_test,"(42,)"


Or split the data into training/testing first and then split X and y:

In [13]:
# Split whole rows first (same 80/20 proportion and seed as before) ...
train_df, test_df = train_test_split(
    cities_df,
    test_size=0.2,
    random_state=123,
)

# ... then separate features from the target within each portion.
X_train = train_df.drop(columns=["country"])
y_train = train_df["country"]
X_test = test_df.drop(columns=["country"])
y_test = test_df["country"]

train_df.head()

Unnamed: 0,longitude,latitude,country
160,-76.4813,44.2307,Canada
127,-81.2496,42.9837,Canada
169,-66.058,45.2788,Canada
188,-73.2533,45.3057,Canada
187,-67.9245,47.1652,Canada


In [15]:
# Scatter plot of the training cities only, coloured by country.
chart_cities = (
    alt.Chart(train_df)
    .mark_circle(size=20, opacity=0.6)
    .encode(
        x=alt.X('longitude:Q', scale=alt.Scale(domain=[-140, -40])),
        y=alt.Y('latitude:Q', scale=alt.Scale(domain=[20, 60])),
        color=alt.Color(
            'country:N',
            scale=alt.Scale(domain=['Canada', 'USA'], range=['red', 'blue']),
        ),
    )
)
chart_cities

In [17]:
from sklearn.tree import DecisionTreeClassifier

# Decision tree with default hyperparameters (no depth limit set),
# fitted on the training split only.
model = DecisionTreeClassifier()
model.fit(X=X_train, y=y_train)

In [18]:
# Accuracy on the same data the model was fitted on.
print(f"Train score: {round(model.score(X_train, y_train), 2)}")

Train score: 1.0


In [19]:
# Accuracy on the held-out test split. The value is computed from
# X_test / y_test, so it is labelled as the test score (the label previously
# read "Train score", which misdescribed the number being printed).
print("Test score: " + str(round(model.score(X_test, y_test), 2)))

Train score: 0.74


### Cross Validation

The goal of cross-validation is to obtain a better estimate of test score than just using a single validation set.

In [20]:
# Rebuild the features and target from the full dataset for this section.
X = cities_df.drop(columns="country")
y = cities_df.loc[:, "country"]

In [21]:
# Same 80/20 split and seed as earlier; cross-validation below only ever
# touches the training portion.
(X_train, X_test, y_train, y_test) = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=123,
)

In [22]:
from sklearn.model_selection import cross_val_score

# 5-fold cross-validation of a depth-4 tree on the training split;
# the result holds one validation score per fold.
model = DecisionTreeClassifier(max_depth=4)
cv_score = cross_val_score(estimator=model, X=X_train, y=y_train, cv=5)
cv_score

array([0.76470588, 0.82352941, 0.78787879, 0.78787879, 0.84848485])

In [23]:
# Repeat with 10 folds using the same depth-4 model.
cv_score = cross_val_score(estimator=model, X=X_train, y=y_train, cv=10)
cv_score

array([0.76470588, 0.82352941, 0.70588235, 0.94117647, 0.82352941,
       0.82352941, 0.70588235, 0.9375    , 0.9375    , 0.9375    ])

In [25]:
# Collapse the per-fold validation scores into a single average.
cv_score.mean()

0.8400735294117647

In [26]:
from sklearn.model_selection import cross_validate

In [27]:
# cross_validate also reports timings and, with return_train_score=True,
# the training score for every fold.
scores = cross_validate(
    estimator=model,
    X=X_train,
    y=y_train,
    cv=10,
    return_train_score=True,
)

In [28]:
# Display the raw cross_validate result.
scores

{'fit_time': array([0.00357771, 0.00269294, 0.00267792, 0.00399995, 0.00915813,
        0.00963807, 0.01424313, 0.00946093, 0.00699639, 0.00843978]),
 'score_time': array([0.0020442 , 0.00338221, 0.00440931, 0.00265718, 0.00627303,
        0.00596595, 0.0091691 , 0.01229   , 0.00454974, 0.01330042]),
 'test_score': array([0.76470588, 0.82352941, 0.70588235, 0.94117647, 0.82352941,
        0.82352941, 0.70588235, 0.9375    , 0.9375    , 0.9375    ]),
 'train_score': array([0.91333333, 0.90666667, 0.90666667, 0.9       , 0.90666667,
        0.91333333, 0.92      , 0.90066225, 0.90066225, 0.90066225])}

In [30]:
# Same results as a table: one row per fold.
pd.DataFrame(data=scores)

Unnamed: 0,fit_time,score_time,test_score,train_score
0,0.003578,0.002044,0.764706,0.913333
1,0.002693,0.003382,0.823529,0.906667
2,0.002678,0.004409,0.705882,0.906667
3,0.004,0.002657,0.941176,0.9
4,0.009158,0.006273,0.823529,0.906667
5,0.009638,0.005966,0.823529,0.913333
6,0.014243,0.009169,0.705882,0.92
7,0.009461,0.01229,0.9375,0.900662
8,0.006996,0.00455,0.9375,0.900662
9,0.00844,0.0133,0.9375,0.900662


In [31]:
# Column-wise average over the folds.
pd.DataFrame(scores).mean(axis=0)

fit_time       0.007088
score_time     0.006404
test_score     0.840074
train_score    0.906865
dtype: float64

In [32]:
# Mean validation score computed directly with cross_val_score, for
# comparison with the mean test_score from cross_validate.
cross_val_score(estimator=model, X=X_train, y=y_train, cv=10).mean()

0.8400735294117647

In [33]:
# Column-wise spread over the folds (pandas' default sample standard deviation).
pd.DataFrame(scores).std(axis=0)

fit_time       0.003802
score_time     0.003939
test_score     0.094993
train_score    0.006822
dtype: float64

### Underfitting and Overfitting

3 types of scores:

* **score_train**: training score (or mean train score from cross-validation)
* **score_valid**: validation score (or mean validation score from cross-validation)
* **score_test**: test score

**Overfitting**: model is overly specific to the training data (training score is significantly higher than testing score)

In [35]:
# Tree with no depth limit: compare the mean training score with the mean
# validation score over 10 folds.
model = DecisionTreeClassifier()
scores = cross_validate(
    estimator=model,
    X=X_train,
    y=y_train,
    cv=10,
    return_train_score=True,
)
print(f"Train score: {round(scores['train_score'].mean(), 2)}")
print(f"Validation score: {round(scores['test_score'].mean(), 2)}")

Train score: 1.0
Validation score: 0.82


**Underfitting**: model is too simple

In [36]:
# Depth-1 tree (a single split): compare the mean training score with the
# mean validation score over 10 folds.
model = DecisionTreeClassifier(max_depth=1)
scores = cross_validate(
    estimator=model,
    X=X_train,
    y=y_train,
    cv=10,
    return_train_score=True,
)
print(f"Train score: {round(scores['train_score'].mean(), 2)}")
print(f"Validation score: {round(scores['test_score'].mean(), 2)}")

Train score: 0.83
Validation score: 0.81


### The Fundamental Tradeoff

As model complexity goes up, score_train goes up and the gap between the two scores (score_train - score_valid) tends to go up.

In [46]:
# Sweep max_depth from 1 to 19 and, for each depth, record the mean training
# score and mean cross-validation score over 10 folds.
results_dict = {"depth": [], "mean_train_score": [], "mean_cv_score": []}

for depth in range(1, 20):
    model = DecisionTreeClassifier(max_depth=depth)
    scores = cross_validate(
        estimator=model,
        X=X_train,
        y=y_train,
        cv=10,
        return_train_score=True,
    )
    results_dict["depth"].append(depth)
    results_dict["mean_train_score"].append(scores["train_score"].mean())
    results_dict["mean_cv_score"].append(scores["test_score"].mean())

results_df = pd.DataFrame(data=results_dict)
results_df

Unnamed: 0,depth,mean_train_score,mean_cv_score
0,1,0.834349,0.809926
1,2,0.844989,0.804044
2,3,0.862967,0.804412
3,4,0.906865,0.840074
4,5,0.918848,0.845956
5,6,0.930817,0.815074
6,7,0.954115,0.833824
7,8,0.972066,0.821324
8,9,0.979382,0.827574
9,10,0.994013,0.809191


In [47]:
# Rank depths by mean cross-validation score (best first) and show the top row.
(
    results_df
    .sort_values(by='mean_cv_score', ascending=False)
    .iloc[0]
)

depth               5.000000
mean_train_score    0.918848
mean_cv_score       0.845956
Name: 4, dtype: float64

In [48]:
# Reshape to long format (one row per depth/split pair) for plotting.
# Note: this overwrites results_df, so from here on it has the columns
# depth / split / score instead of one column per score type.
results_df = pd.melt(
    results_df,
    id_vars=['depth'],
    value_vars=['mean_train_score', 'mean_cv_score'],
    var_name='split',
    value_name='score',
)
results_df

Unnamed: 0,depth,split,score
0,1,mean_train_score,0.834349
1,2,mean_train_score,0.844989
2,3,mean_train_score,0.862967
3,4,mean_train_score,0.906865
4,5,mean_train_score,0.918848
5,6,mean_train_score,0.930817
6,7,mean_train_score,0.954115
7,8,mean_train_score,0.972066
8,9,mean_train_score,0.979382
9,10,mean_train_score,0.994013


In [49]:
# Line chart of mean train vs. cross-validation score against tree depth.
chart1 = (
    alt.Chart(results_df)
    .mark_line()
    .encode(
        x=alt.X('depth:Q', axis=alt.Axis(title="Tree Depth")),
        y=alt.Y('score:Q', scale=alt.Scale(domain=[.80, 1.00])),
        color=alt.Color(
            'split:N',
            scale=alt.Scale(
                domain=['mean_train_score', 'mean_cv_score'],
                range=['teal', 'gold'],
            ),
        ),
    )
)
chart1

In [41]:
# Pick the tree depth with the highest mean cross-validation score.
#
# When the notebook is run top to bottom, results_df has already been melted
# into long format (columns: depth, split, score), so it no longer has a
# 'mean_cv_score' column and sorting on that column raises a KeyError.
# Both layouts are handled so the cell works whichever form results_df is in.
if 'mean_cv_score' in results_df.columns:
    # Wide layout: one column per score type.
    cv_ranked = results_df.sort_values('mean_cv_score', ascending=False)
else:
    # Long layout: keep only the cross-validation rows, then rank by score.
    cv_ranked = (
        results_df[results_df['split'] == 'mean_cv_score']
        .sort_values('score', ascending=False)
    )

# Cast to int so the value can be passed straight to max_depth.
best_depth = int(cv_ranked.iloc[0]['depth'])
best_depth

5.0

In [43]:
# Refit on the full training split with the depth chosen by cross-validation,
# then score on the held-out test split.
model = DecisionTreeClassifier(max_depth=int(best_depth))
model.fit(X=X_train, y=y_train)
print(f"score on test set: {round(model.score(X_test, y_test), 2)}")

score on test set: 0.83


### The Golden Rule:

The test data cannot influence the training phase in any way.