# Module 4: Similarity-Based Approaches to Supervised Learning

In [12]:
import pandas as pd
import altair as alt
import numpy as np
from sklearn.model_selection import train_test_split

In [2]:
cities_df = pd.read_csv("https://raw.githubusercontent.com/UBC-MDS/DSCI_571_sup-learn-1/master/lectures/data/canada_usa_cities.csv")
train_df, test_df = train_test_split(cities_df, test_size=0.2, random_state=123)
train_df.head()

Unnamed: 0,longitude,latitude,country
160,-76.4813,44.2307,Canada
127,-81.2496,42.9837,Canada
169,-66.058,45.2788,Canada
188,-73.2533,45.3057,Canada
187,-67.9245,47.1652,Canada


### Terminology

Analogy-based models find examples that are most similar to the text example in the training set.

**Dimensions**: number of features

* 20: low dimensional
* 1000: medium dimensional
* 100,000: high dimensional

**Feature Vectors**: a vector composed of feature values associated with an example.

In [3]:
train_df.drop(columns=["country"]).iloc[0].round(2).to_numpy()

array([-76.48,  44.23])

### Distance

In [6]:
cities_viz = alt.Chart(train_df).mark_circle(size=20, opacity=0.6).encode(
    alt.X('longitude:Q', scale=alt.Scale(domain=[-140, -40])),
    alt.Y('latitude:Q', scale=alt.Scale(domain=[20,60])),
    alt.Color('country:N', scale=alt.Scale(domain=['Canada', 'USA'],
                                           range=['red', 'blue']))
)
cities_viz

Select two points:

In [9]:
two_cities = cities_df.sample(2, random_state=42).drop(columns=['country'])
two_cities

Unnamed: 0,longitude,latitude
30,-66.9843,44.8607
171,-80.2632,43.1408


Subtract the two cities:

In [10]:
two_cities.iloc[1] - two_cities.iloc[0]

longitude   -13.2789
latitude     -1.7199
dtype: float64

Square the differences:

In [11]:
(two_cities.iloc[1] - two_cities.iloc[0])**2

longitude    176.329185
latitude       2.958056
dtype: float64

Sum them up:

In [14]:
((two_cities.iloc[1] - two_cities.iloc[0])**2).sum()

179.28724121999983

Square root:

In [15]:
np.sqrt(((two_cities.iloc[1] - two_cities.iloc[0])**2).sum())

13.389818565611703

In [16]:
from sklearn.metrics.pairwise import euclidean_distances

In [17]:
euclidean_distances(two_cities)

array([[ 0.        , 13.38981857],
       [13.38981857,  0.        ]])

### Finding the nearest neighbour

In [18]:
dists = euclidean_distances(train_df[["latitude", "longitude"]])
dists

array([[ 0.        ,  4.92866046, 10.47586257, ..., 45.36619339,
         3.13968038,  9.58476504],
       [ 4.92866046,  0.        , 15.36399019, ..., 40.48484175,
         1.80868018, 14.45684087],
       [10.47586257, 15.36399019,  0.        , ..., 55.83947468,
        13.60621684,  0.94361393],
       ...,
       [45.36619339, 40.48484175, 55.83947468, ...,  0.        ,
        42.23325838, 54.93872568],
       [ 3.13968038,  1.80868018, 13.60621684, ..., 42.23325838,
         0.        , 12.70774745],
       [ 9.58476504, 14.45684087,  0.94361393, ..., 54.93872568,
        12.70774745,  0.        ]])

In [19]:
dists.shape

(167, 167)

In [20]:
pd.DataFrame(dists).loc[:5,:5]

Unnamed: 0,0,1,2,3,4,5
0,0.0,4.92866,10.475863,3.402295,9.046,44.329135
1,4.92866,0.0,15.36399,8.326614,13.965788,39.839439
2,10.475863,15.36399,0.0,7.19535,2.653738,54.549042
3,3.402295,8.326614,7.19535,0.0,5.643921,47.391337
4,9.046,13.965788,2.653738,5.643921,0.0,52.532333
5,44.329135,39.839439,54.549042,47.391337,52.532333,0.0


In [21]:
np.fill_diagonal(dists, np.inf)
pd.DataFrame(dists).loc[:5,:5]

Unnamed: 0,0,1,2,3,4,5
0,inf,4.92866,10.475863,3.402295,9.046,44.329135
1,4.92866,inf,15.36399,8.326614,13.965788,39.839439
2,10.475863,15.36399,inf,7.19535,2.653738,54.549042
3,3.402295,8.326614,7.19535,inf,5.643921,47.391337
4,9.046,13.965788,2.653738,5.643921,inf,52.532333
5,44.329135,39.839439,54.549042,47.391337,52.532333,inf


Feature vector for city 0:

In [22]:
train_df.iloc[0]

longitude   -76.4813
latitude     44.2307
country       Canada
Name: 160, dtype: object

Distances from city 0 to 5 other cities:

In [23]:
dists[0][:5]

array([        inf,  4.92866046, 10.47586257,  3.40229467,  9.04600003])

In [24]:
train_df.iloc[[0]]

Unnamed: 0,longitude,latitude,country
160,-76.4813,44.2307,Canada


In [25]:
np.argmin(dists[0])

157

In [26]:
train_df.iloc[[157]]

Unnamed: 0,longitude,latitude,country
96,-76.3019,44.211,Canada


In [27]:
dists[0][157]

0.18047839205805613

Finding the distances to a query point

In [28]:
query_point = [[-80, 25]]

In [33]:
dists = euclidean_distances(train_df[["longitude", "latitude"]], query_point)
dists[0:5]

array([[19.54996348],
       [18.02706204],
       [24.60912622],
       [21.39718237],
       [25.24111312]])

In [34]:
np.argmin(dists)

147

In [35]:
dists[np.argmin(dists)].item()

3.8383922936564634

In [38]:
from sklearn.neighbors import NearestNeighbors
nn = NearestNeighbors(n_neighbors=1)
nn.fit(train_df[['longitude', 'latitude']]);
nn.kneighbors([[-80, 25]])



(array([[3.83839229]]), array([[147]]))

In [39]:
dists = euclidean_distances(X_train[:3])
dists

NameError: name 'X_train' is not defined

In [40]:
train_df

Unnamed: 0,longitude,latitude,country
160,-76.4813,44.2307,Canada
127,-81.2496,42.9837,Canada
169,-66.0580,45.2788,Canada
188,-73.2533,45.3057,Canada
187,-67.9245,47.1652,Canada
...,...,...,...
17,-76.3305,44.1255,USA
98,-74.7287,45.0184,Canada
66,-121.4944,38.5816,USA
126,-79.5656,43.6436,Canada


In [42]:
X_train = train_df.drop(columns=["country"])
X_test = test_df.drop(columns=["country"])

In [43]:
nn = NearestNeighbors(n_neighbors=5)
nn.fit(X_train);
nn.kneighbors(X_test.iloc[1])



ValueError: Expected 2D array, got 1D array instead:
array=[-82.4066  42.9746].
Reshape your data either using array.reshape(-1, 1) if your data has a single feature or array.reshape(1, -1) if it contains a single sample.

In [44]:
X_test.iloc[1].shape

(2,)

In [45]:
X_test.iloc[[1]].shape

(1, 2)

In [46]:
nn = NearestNeighbors(n_neighbors=5)
nn.fit(X_train);
nn.kneighbors(X_test.iloc[[1]])

(array([[0.03461517, 0.90722048, 0.90722048, 0.90970871, 0.90970871]]),
 array([[100,  39,  77, 130,  87]]))