# Module 3: Splitting, Cross-Validation and the Fundamental Tradeoff

In [1]:
import pandas as pd
import altair as alt

In [2]:
# Load the Canada/USA cities CSV directly from its GitHub URL.
cities_df = pd.read_csv(
    "https://raw.githubusercontent.com/UBC-MDS/DSCI_571_sup-learn-1/"
    "master/lectures/data/canada_usa_cities.csv"
)

In [3]:
# Peek at the first rows to see which columns the dataset provides.
cities_df.head()

Unnamed: 0,longitude,latitude,country
0,-130.0437,55.9773,USA
1,-134.4197,58.3019,USA
2,-123.078,48.9854,USA
3,-122.7436,48.9881,USA
4,-122.2691,48.9951,USA


In [5]:
# Feature table: every column of the dataset except the "country" target.
X = cities_df.drop("country", axis="columns")

In [6]:
# Target vector: the "country" label of every city.
y = cities_df.loc[:, "country"]

In [9]:
from sklearn.model_selection import train_test_split

# Hold out 20% of the rows as a test set; the fixed seed keeps the
# partition identical from one run to the next.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=123,
)

In [10]:
# Show the first three rows of the held-out test features.
X_test.head(3)

Unnamed: 0,longitude,latitude
172,-64.8001,46.098
175,-82.4066,42.9746
181,-111.3885,56.7292


In [11]:
# Show the first three held-out test labels.
y_test.head(3)

172    Canada
175    Canada
181    Canada
Name: country, dtype: object

In [12]:
# Summarise the shape of the full data and of each train/test portion.
# The names and the objects live in one mapping so they cannot drift apart.
portions = {
    "X": X,
    "y": y,
    "X_train": X_train,
    "y_train": y_train,
    "X_test": X_test,
    "y_test": y_test,
}
shape_dict = {
    "Data portion": list(portions),
    "Shape": [portion.shape for portion in portions.values()],
}
shape_df = pd.DataFrame(shape_dict)
shape_df

Unnamed: 0,Data portion,Shape
0,X,"(209, 2)"
1,y,"(209,)"
2,X_train,"(167, 2)"
3,y_train,"(167,)"
4,X_test,"(42, 2)"
5,y_test,"(42,)"


Alternatively, split the whole dataframe into training and test portions first, and then separate the features (`X`) from the target (`y`) within each portion:

In [13]:
# Split the complete dataframe (features and target together) 80/20,
# using the same seed as before.
train_df, test_df = train_test_split(
    cities_df,
    test_size=0.2,
    random_state=123,
)

# Within each portion, separate the feature columns from the "country" target.
X_train = train_df.drop(columns=["country"])
y_train = train_df["country"]
X_test = test_df.drop(columns=["country"])
y_test = test_df["country"]

train_df.head()

Unnamed: 0,longitude,latitude,country
160,-76.4813,44.2307,Canada
127,-81.2496,42.9837,Canada
169,-66.058,45.2788,Canada
188,-73.2533,45.3057,Canada
187,-67.9245,47.1652,Canada


In [15]:
# Scatter plot of the training cities: position is longitude/latitude,
# colour is the country label (Canada in red, USA in blue).
chart_cities = (
    alt.Chart(train_df)
    .mark_circle(size=20, opacity=0.6)
    .encode(
        x=alt.X("longitude:Q", scale=alt.Scale(domain=[-140, -40])),
        y=alt.Y("latitude:Q", scale=alt.Scale(domain=[20, 60])),
        color=alt.Color(
            "country:N",
            scale=alt.Scale(domain=["Canada", "USA"], range=["red", "blue"]),
        ),
    )
)
chart_cities

In [17]:
from sklearn.tree import DecisionTreeClassifier

# Fit a decision tree on the training portion only.
# random_state is pinned (to the same seed used for the train/test split)
# because scikit-learn permutes the features at random while searching for
# splits; without a seed the fitted tree, and therefore the scores computed
# from it, can change from one run to the next.
model = DecisionTreeClassifier(random_state=123)
model.fit(X_train, y_train)

In [18]:
# Accuracy on the data the tree was fitted on, rounded to two decimals.
train_accuracy = round(model.score(X_train, y_train), 2)
print("Train score: " + str(train_accuracy))

Train score: 1.0


In [19]:
# Accuracy on the held-out test set, rounded to two decimals.
# The label says "Test" because this cell scores X_test / y_test; the earlier
# "Train score: " label was a copy-paste slip that made the two results
# indistinguishable in the output.
print("Test score: " + str(round(model.score(X_test, y_test), 2)))

Train score: 0.74
