# Module 4: Similarity-Based Approaches to Supervised Learning

In [1]:
import pandas as pd
import altair as alt
import numpy as np
from sklearn.model_selection import train_test_split

In [2]:
# Load the Canada/USA cities data and hold out 20% of the rows as a test set.
CITIES_CSV_URL = "https://raw.githubusercontent.com/UBC-MDS/DSCI_571_sup-learn-1/master/lectures/data/canada_usa_cities.csv"
cities_df = pd.read_csv(CITIES_CSV_URL)
train_df, test_df = train_test_split(
    cities_df,
    test_size=0.2,
    random_state=123,  # fixed seed so the split is reproducible
)
train_df.head()

Unnamed: 0,longitude,latitude,country
160,-76.4813,44.2307,Canada
127,-81.2496,42.9837,Canada
169,-66.058,45.2788,Canada
188,-73.2533,45.3057,Canada
187,-67.9245,47.1652,Canada


### Terminology

Analogy-based models find the examples in the training set that are most similar to the test example.

**Dimensions**: number of features

* 20: low dimensional
* 1000: medium dimensional
* 100,000: high dimensional

**Feature Vectors**: a vector composed of feature values associated with an example.

In [3]:
# Feature vector of the first training example: drop the target column,
# take the first row and round its values to 2 decimals.
first_example = train_df.drop(columns=["country"]).iloc[0]
first_example.round(2).to_numpy()

array([-76.48,  44.23])

### Distance

In [4]:
# Scatter plot of the training cities (longitude vs. latitude), coloured by country.
x_axis = alt.X('longitude:Q', scale=alt.Scale(domain=[-140, -40]))
y_axis = alt.Y('latitude:Q', scale=alt.Scale(domain=[20,60]))
country_colour = alt.Color(
    'country:N',
    scale=alt.Scale(domain=['Canada', 'USA'], range=['red', 'blue']),
)
cities_viz = (
    alt.Chart(train_df)
    .mark_circle(size=20, opacity=0.6)
    .encode(x_axis, y_axis, country_colour)
)
cities_viz

Select two points:

In [5]:
# Pick two cities at random (fixed seed) and keep only their coordinates.
coords_only = cities_df.drop(columns=['country'])
two_cities = coords_only.sample(2, random_state=42)
two_cities

Unnamed: 0,longitude,latitude
30,-66.9843,44.8607
171,-80.2632,43.1408


Subtract the two cities:

In [6]:
# Per-feature difference between the second and the first selected city.
two_cities.iloc[1].sub(two_cities.iloc[0])

longitude   -13.2789
latitude     -1.7199
dtype: float64

Square the differences:

In [7]:
# Square each coordinate difference.
coord_diff = two_cities.iloc[1] - two_cities.iloc[0]
coord_diff ** 2

longitude    176.329185
latitude       2.958056
dtype: float64

Sum them up:

In [8]:
# Sum of the squared coordinate differences (the squared Euclidean distance).
squared_diffs = (two_cities.iloc[1] - two_cities.iloc[0]) ** 2
squared_diffs.sum()

179.28724121999983

Square root:

In [9]:
# Euclidean distance: square root of the summed squared differences.
sum_of_squares = ((two_cities.iloc[1] - two_cities.iloc[0]) ** 2).sum()
np.sqrt(sum_of_squares)

13.389818565611703

In [10]:
from sklearn.metrics.pairwise import euclidean_distances

In [11]:
# Pairwise Euclidean distances between the two selected cities; the
# off-diagonal entries are the distance computed by hand in the cells above.
euclidean_distances(two_cities)

array([[ 0.        , 13.38981857],
       [13.38981857,  0.        ]])

### Finding the nearest neighbour

In [12]:
# Pairwise Euclidean distances between all training cities.
# Columns are selected as (longitude, latitude) to match the order used for
# the query-point and NearestNeighbors cells; Euclidean distance itself does
# not depend on the column order.
dists = euclidean_distances(train_df[["longitude", "latitude"]])
dists

array([[ 0.        ,  4.92866046, 10.47586257, ..., 45.36619339,
         3.13968038,  9.58476504],
       [ 4.92866046,  0.        , 15.36399019, ..., 40.48484175,
         1.80868018, 14.45684087],
       [10.47586257, 15.36399019,  0.        , ..., 55.83947468,
        13.60621684,  0.94361393],
       ...,
       [45.36619339, 40.48484175, 55.83947468, ...,  0.        ,
        42.23325838, 54.93872568],
       [ 3.13968038,  1.80868018, 13.60621684, ..., 42.23325838,
         0.        , 12.70774745],
       [ 9.58476504, 14.45684087,  0.94361393, ..., 54.93872568,
        12.70774745,  0.        ]])

In [13]:
# Shape of the pairwise-distance matrix: one row and one column per training city.
dists.shape

(167, 167)

In [14]:
# Preview the top-left 6x6 corner of the distance matrix (rows/columns 0..5).
dists_frame = pd.DataFrame(dists)
dists_frame.iloc[:6, :6]

Unnamed: 0,0,1,2,3,4,5
0,0.0,4.92866,10.475863,3.402295,9.046,44.329135
1,4.92866,0.0,15.36399,8.326614,13.965788,39.839439
2,10.475863,15.36399,0.0,7.19535,2.653738,54.549042
3,3.402295,8.326614,7.19535,0.0,5.643921,47.391337
4,9.046,13.965788,2.653738,5.643921,0.0,52.532333
5,44.329135,39.839439,54.549042,47.391337,52.532333,0.0


In [15]:
# Replace the zero self-distances on the diagonal with infinity so that a
# later argmin over a row cannot select the city itself.
# Note: this modifies `dists` in place.
np.fill_diagonal(dists, np.inf)
pd.DataFrame(dists).loc[:5,:5]

Unnamed: 0,0,1,2,3,4,5
0,inf,4.92866,10.475863,3.402295,9.046,44.329135
1,4.92866,inf,15.36399,8.326614,13.965788,39.839439
2,10.475863,15.36399,inf,7.19535,2.653738,54.549042
3,3.402295,8.326614,7.19535,inf,5.643921,47.391337
4,9.046,13.965788,2.653738,5.643921,inf,52.532333
5,44.329135,39.839439,54.549042,47.391337,52.532333,inf


Feature vector for city 0:

In [16]:
# First row of the training set (positional index 0), including its label.
train_df.iloc[0]

longitude   -76.4813
latitude     44.2307
country       Canada
Name: 160, dtype: object

Distances from city 0 to the first 5 cities in the training set (the first entry is city 0 itself, hence `inf`):

In [17]:
# First five entries of row 0 of the distance matrix.
dists[0, :5]

array([        inf,  4.92866046, 10.47586257,  3.40229467,  9.04600003])

In [18]:
# Same city as above, selected with a list so the result is a one-row DataFrame.
train_df.iloc[[0]]

Unnamed: 0,longitude,latitude,country
160,-76.4813,44.2307,Canada


In [19]:
# Position (within the training set) of the city closest to city 0.
dists[0].argmin()

157

In [20]:
# Show the nearest neighbour of city 0. The position is computed from the
# distance matrix rather than hard-coded, so the cell stays correct if the
# data or the split changes.
nearest_idx = np.argmin(dists[0])
train_df.iloc[[nearest_idx]]

Unnamed: 0,longitude,latitude,country
96,-76.3019,44.211,Canada


In [21]:
# Distance from city 0 to its nearest neighbour. The position is looked up
# with argmin instead of a hard-coded index copied from an earlier output.
nearest_idx = np.argmin(dists[0])
dists[0][nearest_idx]

0.18047839205805613

Finding the distances to a query point

In [22]:
# A single query location as a nested list (one row, two values), given in
# the same (longitude, latitude) order as the training columns used below.
query_point = [[-80, 25]]

In [23]:
# Distance from every training city to the query point (one column, one row per city).
# Note: this rebinds `dists`, replacing the pairwise matrix computed earlier.
train_coords = train_df[["longitude", "latitude"]]
dists = euclidean_distances(train_coords, query_point)
dists[:5]

array([[19.54996348],
       [18.02706204],
       [24.60912622],
       [21.39718237],
       [25.24111312]])

In [24]:
# Position of the training city closest to the query point.
dists.argmin()

147

In [25]:
# Smallest distance from the query point to any training city, as a Python float.
dists.min().item()

3.8383922936564634

In [26]:
from sklearn.neighbors import NearestNeighbors
# Same lookup with scikit-learn: fit on the training coordinates, then ask
# for the single nearest neighbour of the query point defined above.
# kneighbors returns (distances, indices).
nn = NearestNeighbors(n_neighbors=1).fit(train_df[['longitude', 'latitude']])
nn.kneighbors(query_point)



(array([[3.83839229]]), array([[147]]))

In [28]:
#dists = euclidean_distances(X_train[:3])
#dists

In [29]:
# Display the training set (the rendered output below is truncated with "...").
train_df

Unnamed: 0,longitude,latitude,country
160,-76.4813,44.2307,Canada
127,-81.2496,42.9837,Canada
169,-66.0580,45.2788,Canada
188,-73.2533,45.3057,Canada
187,-67.9245,47.1652,Canada
...,...,...,...
17,-76.3305,44.1255,USA
98,-74.7287,45.0184,Canada
66,-121.4944,38.5816,USA
126,-79.5656,43.6436,Canada


In [30]:
# Separate the features from the target; "country" is the label column.
X_train, X_test = (split.drop(columns=["country"]) for split in (train_df, test_df))
y_test = test_df["country"]

In [31]:
# Intentional failure, kept as a demonstration: X_test.iloc[1] is a 1-D
# Series, and kneighbors rejects it with the ValueError shown in the output
# ("Expected 2D array, got 1D array instead").
nn = NearestNeighbors(n_neighbors=5)
nn.fit(X_train);
nn.kneighbors(X_test.iloc[1])



ValueError: Expected 2D array, got 1D array instead:
array=[-82.4066  42.9746].
Reshape your data either using array.reshape(-1, 1) if your data has a single feature or array.reshape(1, -1) if it contains a single sample.

In [33]:
# Selecting a row with a scalar position gives a 1-D object.
X_test.iloc[1].shape

(2,)

In [34]:
# Selecting with a list keeps the row as a 2-D, one-row frame.
X_test.iloc[[1]].shape

(1, 2)

In [35]:
# Five nearest training cities for the second test city, passed as a
# one-row (2-D) frame. kneighbors returns (distances, indices).
nn = NearestNeighbors(n_neighbors=5).fit(X_train)
second_test_city = X_test.iloc[[1]]
nn.kneighbors(second_test_city)

(array([[0.03461517, 0.90722048, 0.90722048, 0.90970871, 0.90970871]]),
 array([[100,  39,  77, 130,  87]]))

### K Nearest Neighbours

In [36]:
# Build a small 30-city training set and pick one of its cities to classify.
# NOTE(review): the sample is drawn from the full `cities_df`, not from
# `train_df`, so it may contain rows that are also in `test_df`. A later cell
# scores a model fitted on this X_train against X_test/y_test, which could
# make that test score optimistic -- confirm whether sampling from `train_df`
# was intended.
small_train_df = cities_df.sample(30, random_state=90)
X_train = small_train_df.drop(columns=["country"])
y_train = small_train_df["country"]
one_city = small_train_df.sample(1, random_state=44)
one_city

Unnamed: 0,longitude,latitude,country
144,-104.6173,50.4488,Canada


In [37]:
from sklearn.neighbors import KNeighborsClassifier

# k = 1: predict the label of `one_city` from its single closest training city.
one_city_features = one_city.drop(columns=["country"])
neigh = KNeighborsClassifier(n_neighbors=1).fit(X_train, y_train)
neigh.predict(one_city_features)

array(['Canada'], dtype=object)

In [38]:
# k = 3: prediction from the three closest training cities.
one_city_features = one_city.drop(columns=["country"])
neigh = KNeighborsClassifier(n_neighbors=3).fit(X_train, y_train)
neigh.predict(one_city_features)

array(['Canada'], dtype=object)

In [39]:
# k = 9: prediction from the nine closest training cities.
one_city_features = one_city.drop(columns=["country"])
neigh = KNeighborsClassifier(n_neighbors=9).fit(X_train, y_train)
neigh.predict(one_city_features)

array(['USA'], dtype=object)

In [40]:
# Fit a 1-NN classifier on the small training set and report its training accuracy.
train_labels = y_train.to_numpy()
model = KNeighborsClassifier(n_neighbors=1).fit(X_train, train_labels)
model.score(X_train, y_train)

1.0

In [41]:
# Accuracy on X_test / y_test, which still refer to the features and labels
# taken from `test_df` in an earlier cell.
model.score(X_test, y_test)

0.7380952380952381

### Choosing K (n_neighbors)

In [42]:
# Start afresh: reload the data and hold out 10% of the rows for the final test.
cities_df = pd.read_csv("https://raw.githubusercontent.com/UBC-MDS/DSCI_571_sup-learn-1/master/lectures/data/canada_usa_cities.csv")
X, y = cities_df.drop(columns=["country"]), cities_df["country"]
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.1,
    random_state=123,  # fixed seed so the split is reproducible
)

# Baseline: training accuracy of a 1-NN classifier.
model = KNeighborsClassifier(n_neighbors=1).fit(X_train, y_train.to_numpy())
model.score(X_train, y_train)

1.0

In [44]:
from sklearn.model_selection import cross_validate

In [45]:
# Cross-validate a k-NN classifier with k = 1, keeping the per-fold train scores.
k = 1
knn1 = KNeighborsClassifier(n_neighbors=k)
scores = cross_validate(
    knn1,
    X_train,
    y_train,
    return_train_score=True,
)
pd.DataFrame(scores)

Unnamed: 0,fit_time,score_time,test_score,train_score
0,0.008027,0.017333,0.710526,1.0
1,0.002899,0.185673,0.684211,1.0
2,0.003427,0.013709,0.842105,1.0
3,0.008745,0.005402,0.702703,1.0
4,0.002258,0.009968,0.837838,1.0


In [46]:
# Cross-validate a k-NN classifier with k = 100, keeping the per-fold train scores.
k = 100
knn1 = KNeighborsClassifier(n_neighbors=k)
scores = cross_validate(
    knn1,
    X_train,
    y_train,
    return_train_score=True,
)
pd.DataFrame(scores)

Unnamed: 0,fit_time,score_time,test_score,train_score
0,0.003092,0.08068,0.605263,0.6
1,0.005422,0.016014,0.605263,0.6
2,0.006784,0.038697,0.605263,0.6
3,0.006945,0.039679,0.594595,0.602649
4,0.005669,0.009191,0.594595,0.602649


In [47]:
# Sweep k over 1, 6, ..., 46 and record the mean train and cross-validation
# accuracy for each value. The key order fixes the column order of results_df.
results_dict = {"n_neighbors": [], "mean_train_score": [], "mean_cv_score": []}

for k in range(1, 50, 5):
    knn = KNeighborsClassifier(n_neighbors=k)
    scores = cross_validate(knn, X_train, y_train, return_train_score=True)
    results_dict["n_neighbors"].append(k)
    results_dict["mean_train_score"].append(scores["train_score"].mean())
    results_dict["mean_cv_score"].append(scores["test_score"].mean())

results_df = pd.DataFrame(results_dict)
results_df

Unnamed: 0,n_neighbors,mean_train_score,mean_cv_score
0,1,1.0,0.755477
1,6,0.831135,0.792603
2,11,0.819152,0.802987
3,16,0.801863,0.782219
4,21,0.777934,0.76643
5,26,0.755364,0.723613
6,31,0.743391,0.707681
7,36,0.728777,0.707681
8,41,0.706128,0.681223
9,46,0.694155,0.660171


In [56]:
# Reshape the per-k results to long format (one row per k and split) for plotting.
# The long frame gets its own name: rebinding `results_df` here would remove the
# `mean_cv_score` column that the following cell sorts by, so the notebook
# would fail when run top to bottom.
results_long_df = pd.DataFrame(results_dict).melt(
    id_vars=['n_neighbors'],
    value_vars=['mean_train_score', 'mean_cv_score'],
    var_name='split',
    value_name='score',
)

# Line chart of train vs. cross-validation accuracy against the number of neighbours.
chart1 = alt.Chart(results_long_df, width = 500, height = 300).mark_line().encode(
         alt.X('n_neighbors:Q', axis=alt.Axis(title="Number of Neighbours")),
         alt.Y('score:Q'),
         alt.Color('split:N', scale=alt.Scale(domain=['mean_train_score',
                                                     'mean_cv_score'],
                                             range=['teal', 'gold'])))
chart1

In [48]:
# Rank the candidate k values by mean cross-validation score (best first).
# The wide table is rebuilt from `results_dict` because `results_df` is
# rebound to a long-format frame (without a `mean_cv_score` column) by the
# plotting cell, which would make a direct sort raise a KeyError.
results_wide_df = pd.DataFrame(results_dict)
sorted_results_df = results_wide_df.sort_values("mean_cv_score", ascending=False)
sorted_results_df

Unnamed: 0,n_neighbors,mean_train_score,mean_cv_score
2,11,0.819152,0.802987
1,6,0.831135,0.792603
3,16,0.801863,0.782219
4,21,0.777934,0.76643
0,1,1.0,0.755477
5,26,0.755364,0.723613
6,31,0.743391,0.707681
7,36,0.728777,0.707681
8,41,0.706128,0.681223
9,46,0.694155,0.660171


In [49]:
# The best k is the n_neighbors value in the top row of the sorted results.
best_k = sorted_results_df["n_neighbors"].iloc[0]
best_k

11

In [51]:
# Refit on the full training split with the selected k and report test accuracy.
knn = KNeighborsClassifier(n_neighbors=best_k).fit(X_train, y_train)
test_accuracy = knn.score(X_test, y_test)
print("Test accuracy:", round(test_accuracy, 3))

Test accuracy: 0.905


* K-NN usually works well when the number of dimensions is small.

### K Nearest Neighbors Regressor

In [57]:
# Synthetic regression feature: 50 evenly spaced lengths in [0, 2] with a
# little Gaussian jitter (seeded for reproducibility).
np.random.seed(0)
n = 50
X_1 = np.linspace(0, 2, n) + np.random.randn(n) * 0.01
X = pd.DataFrame(X_1.reshape(-1, 1), columns=['length'])
X.head()

Unnamed: 0,length
0,0.017641
1,0.044818
2,0.09142
3,0.144858
4,0.181941


In [58]:
# Synthetic target: 5 * length plus non-negative noise (absolute value of a
# Gaussian draw, scaled by 2).
noise = np.abs(np.random.randn(n, 1)) * 2
y = pd.DataFrame(noise + X_1.reshape(-1, 1) * 5, columns=['weight'])
y.head()

Unnamed: 0,weight
0,1.879136
1,0.997894
2,1.47871
3,3.085554
4,0.966069


In [59]:
# Hold out 20% of the synthetic data, then plot the training points.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=123
)

# Put length and weight side by side so the chart can read both columns.
source = pd.concat([X_train, y_train], axis=1)

scatter = (
    alt.Chart(source, width=500, height=300)
    .mark_point(filled=True, color='green')
    .encode(alt.X('length:Q'), alt.Y('weight:Q'))
)

scatter

In [60]:
from sklearn.neighbors import KNeighborsRegressor

In [61]:
# 1-NN regressor with uniformly weighted neighbours.
knnr = KNeighborsRegressor(n_neighbors=1, weights="uniform").fit(X_train, y_train)

In [64]:
# Predict the weight for every training example with the fitted regressor.
predicted = knnr.predict(X_train)
predicted[:5] # first 5 predictions

array([[ 4.57636104],
       [13.20245224],
       [ 3.03671796],
       [10.74123618],
       [ 1.82820801]])

In [65]:
# 10-NN regressor; every neighbour contributes equally to the prediction.
knnr = KNeighborsRegressor(n_neighbors=10, weights="uniform").fit(X_train, y_train)

In [66]:
# Training score of the uniform-weighted 10-NN regressor (for scikit-learn
# regressors, score() reports R^2).
knnr.score(X_train, y_train)

0.9254540554756747

In [67]:
# 10-NN regressor with weights="distance" instead of uniform neighbour weights.
knnr = KNeighborsRegressor(n_neighbors=10, weights="distance").fit(X_train, y_train)

In [69]:
# A perfect training score signals overfitting rather than a perfect model --
# presumably because, with distance weighting, each training point is its own
# zero-distance neighbour and so reproduces its own target.
knnr.score(X_train, y_train) # overfitting

1.0

**Pros & Cons of K-Nearest Neighbors:**

Pros:
* Easy to understand, interpret
* Simple hyperparameter k controlling the fundamental tradeoff
* Can learn very complex functions given enough data
* Lazy learning: takes almost no time to fit (the work is deferred to prediction time)

Cons:
* Can potentially be very slow during prediction time
* Test accuracy is often not as good as that of more modern approaches
* You should scale your features

### Support Vector Machines (SVMs) with RBF Kernel

(smooth boundary lines)

In [71]:
# Re-split the cities data 80/20 and separate the features from the "country" label.
train_df, test_df = train_test_split(cities_df, test_size=0.2, random_state=123)
X_train, X_test = train_df.drop(columns = ["country"]), test_df.drop(columns = ["country"])
y_train, y_test = train_df["country"], test_df["country"]

In [72]:
from sklearn.svm import SVC

In [73]:
# Cross-validate an SVC with gamma=0.01, keeping the per-fold train scores.
svm = SVC(gamma=0.01)
scores = cross_validate(
    svm,
    X_train,
    y_train,
    return_train_score=True,
)
pd.DataFrame(scores)

Unnamed: 0,fit_time,score_time,test_score,train_score
0,0.060607,0.016741,0.823529,0.842105
1,0.008343,0.003062,0.823529,0.842105
2,0.010665,0.008434,0.727273,0.858209
3,0.009831,0.008936,0.787879,0.843284
4,0.006363,0.002203,0.939394,0.80597


In [74]:
# Mean cross-validation score of the SVM across the folds.
svm_cv_score = scores['test_score'].mean()
svm_cv_score

0.8203208556149733

In [75]:
# Cross-validate a 5-NN classifier on the same split for comparison with the SVM.
# NOTE(review): the explicit fit looks unnecessary for the CV scores, since
# cross_validate fits its own copies of the estimator per fold -- confirm
# before removing it, in case a later cell relies on `knn` being fitted.
knn = KNeighborsClassifier(n_neighbors=5).fit(X_train, y_train)
scores = cross_validate(
    knn,
    X_train,
    y_train,
    return_train_score=True,
)
pd.DataFrame(scores)

Unnamed: 0,fit_time,score_time,test_score,train_score
0,0.008759,0.073694,0.852941,0.849624
1,0.007958,0.01204,0.764706,0.834586
2,0.003488,0.010512,0.727273,0.850746
3,0.009926,0.176671,0.787879,0.858209
4,0.160662,0.030157,0.878788,0.813433


In [77]:
# Mean cross-validation score of the 5-NN classifier, rounded to 3 decimals.
knn_mean_cv = scores['test_score'].mean()
knn_cv_score = knn_mean_cv.round(3)
knn_cv_score

0.802

In [79]:
# SVM mean cross-validation score rounded to 3 decimals, for comparison with knn_cv_score.
svm_cv_score.round(3)

0.82

In [80]:
from sklearn.svm import SVR

Hyperparameters of SVM are:

* 'gamma': controls the complexity of a model (higher gamma means higher complexity)
* 'C': affects the fundamental tradeoff (higher C means higher complexity)