# Section 1: Imports, Functions, and Constants

In [1]:
# Library Import Statements

import pandas as pd
import numpy as np
from sklearn.preprocessing import normalize


In [2]:
def euclidean_dist_calc(test, train):
    """
    Compute the Euclidean distance between each
    point in test and all points in train.
    """
    num_test = test.shape[0]
    num_train = train.shape[0]
    dists = np.zeros((num_test, num_train)) 
    sum_squares_train = np.sum(np.square(train), axis=1)
    sum_squares_test = np.sum(np.square(test), axis=1)
    sum_combo = np.dot(test, train.T) * -2
    dists = np.sqrt(((sum_combo + sum_squares_train).T + sum_squares_test).T)
    return dists

In [3]:
def manhattan_dist_calc(test, train):
    """
    Compute the Manhattan distance between each
    point in test and all points in train.
    """
    num_test = test.shape[0]
    num_train = train.shape[0]
    dists = np.zeros((num_test, num_train)) 
    for i in range(num_test):
        test_point = test[i]
        result_matrix = train - test_point
        result_matrix = np.absolute(result_matrix)
        result_vector = np.sum(result_matrix, axis=1)
        dists[i] = result_vector
    return dists

In [4]:
def compute_distances(test, train, metric):
    """
    Computes distances according to either 
    Euclidean or Manhattan distance metrics given
    a test set and a training set.
    
    The input metric must either be "Euclidean" or 
    "Manhattan".
    """
    if metric == "Euclidean":
        dists = euclidean_dist_calc(test, train)
    elif metric == "Manhattan":
        dists = manhattan_dist_calc(test, train)
    else:
        raise ValueError("Invalid value d% for metric; must be Euclidean or Manhattan as a string." % metric)
    return dists

In [5]:
k = 1

# Section 2: Make Train & Test Sets

In [6]:
# Data Import Cell
url = "http://archive.ics.uci.edu/ml/machine-learning-databases/00471/Data_for_UCI_named.csv"
data = pd.read_csv(url, sep=",")
data = data.tail(5000)
data.head()

Unnamed: 0,tau1,tau2,tau3,tau4,p1,p2,p3,p4,g1,g2,g3,g4,stab,stabf
5000,5.825547,6.153305,9.215683,2.644971,4.749492,-1.491278,-1.382048,-1.876166,0.528459,0.155706,0.156222,0.458131,-0.023089,stable
5001,2.750088,8.767781,4.517367,1.373404,2.700511,-0.58731,-1.190554,-0.922647,0.471845,0.428823,0.596576,0.249577,-0.021277,stable
5002,6.253534,6.625686,0.613047,7.550439,4.305593,-1.52939,-1.415248,-1.360955,0.376768,0.686419,0.598896,0.408225,0.009619,unstable
5003,1.800725,1.185765,1.515843,8.576087,4.255226,-1.270079,-1.970055,-1.015092,0.34115,0.623442,0.247956,0.653949,-0.038621,stable
5004,7.15043,4.837233,3.244408,2.089166,4.539624,-1.981831,-1.375972,-1.181821,0.31928,0.072775,0.842072,0.577839,-0.027978,stable


In [7]:
# Convert Class labels to numbers
# 0 = stable
# 1 = unstable


data.loc[(data['stabf'] == 'stable'), 'stabf'] = 0
data.loc[(data['stabf'] == 'unstable'), 'stabf'] = 1
data['stabf'] = pd.to_numeric(data['stabf'])
data['stabf']

5000    0
5001    0
5002    1
5003    0
5004    0
       ..
9995    1
9996    0
9997    0
9998    1
9999    1
Name: stabf, Length: 5000, dtype: int64

In [8]:
# Split dataframe into testing and training sets
train_data = data.iloc[:3000]
test_data = data.iloc[3000:]
test_data

Unnamed: 0,tau1,tau2,tau3,tau4,p1,p2,p3,p4,g1,g2,g3,g4,stab,stabf
8000,5.306905,2.452792,5.414825,4.964973,2.853706,-0.952709,-1.391630,-0.509366,0.160440,0.958669,0.867021,0.183520,0.026496,1
8001,7.745244,4.311693,7.893386,0.917804,3.457962,-1.425012,-1.291345,-0.741605,0.576198,0.851929,0.172012,0.788637,-0.004614,0
8002,5.279096,5.458030,4.123039,9.860768,2.441075,-1.028213,-0.790346,-0.622516,0.462567,0.287156,0.252946,0.663104,0.021423,1
8003,1.596736,5.744044,5.888295,2.434392,3.762364,-1.019351,-0.869846,-1.873167,0.769447,0.733218,0.096916,0.159050,-0.033336,0
8004,7.214450,2.265906,3.889059,9.012663,3.675266,-0.722557,-1.249246,-1.703463,0.837425,0.599088,0.975245,0.772472,0.061052,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
9995,2.930406,9.487627,2.376523,6.187797,3.343416,-0.658054,-1.449106,-1.236256,0.601709,0.779642,0.813512,0.608385,0.023892,1
9996,3.392299,1.274827,2.954947,6.894759,4.349512,-1.663661,-0.952437,-1.733414,0.502079,0.567242,0.285880,0.366120,-0.025803,0
9997,2.364034,2.842030,8.776391,1.008906,4.299976,-1.380719,-0.943884,-1.975373,0.487838,0.986505,0.149286,0.145984,-0.031810,0
9998,9.631511,3.994398,2.757071,7.821347,2.514755,-0.966330,-0.649915,-0.898510,0.365246,0.587558,0.889118,0.818391,0.037789,1


# Section 3: Split Labels from Train & Test Sets, Numpyize Them

In [9]:
# Save labels for train and test sets
train_labels = train_data['stabf'].to_numpy()
test_labels = test_data['stabf'].to_numpy()
test_labels

array([1, 0, 1, ..., 0, 1, 1], dtype=int64)

In [10]:
# Drop class column from train and test sets
train = train_data.drop('stabf', axis=1)
test = test_data.drop('stabf', axis=1)
test

Unnamed: 0,tau1,tau2,tau3,tau4,p1,p2,p3,p4,g1,g2,g3,g4,stab
8000,5.306905,2.452792,5.414825,4.964973,2.853706,-0.952709,-1.391630,-0.509366,0.160440,0.958669,0.867021,0.183520,0.026496
8001,7.745244,4.311693,7.893386,0.917804,3.457962,-1.425012,-1.291345,-0.741605,0.576198,0.851929,0.172012,0.788637,-0.004614
8002,5.279096,5.458030,4.123039,9.860768,2.441075,-1.028213,-0.790346,-0.622516,0.462567,0.287156,0.252946,0.663104,0.021423
8003,1.596736,5.744044,5.888295,2.434392,3.762364,-1.019351,-0.869846,-1.873167,0.769447,0.733218,0.096916,0.159050,-0.033336
8004,7.214450,2.265906,3.889059,9.012663,3.675266,-0.722557,-1.249246,-1.703463,0.837425,0.599088,0.975245,0.772472,0.061052
...,...,...,...,...,...,...,...,...,...,...,...,...,...
9995,2.930406,9.487627,2.376523,6.187797,3.343416,-0.658054,-1.449106,-1.236256,0.601709,0.779642,0.813512,0.608385,0.023892
9996,3.392299,1.274827,2.954947,6.894759,4.349512,-1.663661,-0.952437,-1.733414,0.502079,0.567242,0.285880,0.366120,-0.025803
9997,2.364034,2.842030,8.776391,1.008906,4.299976,-1.380719,-0.943884,-1.975373,0.487838,0.986505,0.149286,0.145984,-0.031810
9998,9.631511,3.994398,2.757071,7.821347,2.514755,-0.966330,-0.649915,-0.898510,0.365246,0.587558,0.889118,0.818391,0.037789


In [11]:
# Convert train and test to numpy arrays
train = train.to_numpy()
test = test.to_numpy()
test

array([[ 5.30690458e+00,  2.45279249e+00,  5.41482459e+00, ...,
         8.67021410e-01,  1.83519882e-01,  2.64962906e-02],
       [ 7.74524391e+00,  4.31169345e+00,  7.89338602e+00, ...,
         1.72011877e-01,  7.88636816e-01, -4.61427241e-03],
       [ 5.27909591e+00,  5.45802967e+00,  4.12303948e+00, ...,
         2.52945554e-01,  6.63103928e-01,  2.14226453e-02],
       ...,
       [ 2.36403419e+00,  2.84203025e+00,  8.77639096e+00, ...,
         1.49286458e-01,  1.45984032e-01, -3.18098881e-02],
       [ 9.63151069e+00,  3.99439760e+00,  2.75707093e+00, ...,
         8.89118346e-01,  8.18391326e-01,  3.77888091e-02],
       [ 6.53052662e+00,  6.78178990e+00,  4.34969522e+00, ...,
         3.78760930e-01,  9.42630833e-01,  4.52633082e-02]])

In [12]:
# normalize the datasets
train = normalize(train, axis=0)
test = normalize(test, axis=0)
test

array([[ 0.01997457,  0.00929616,  0.02046053, ...,  0.03290911,
         0.00680405,  0.01478897],
       [ 0.0291522 ,  0.01634145,  0.02982605, ...,  0.00652897,
         0.02923891, -0.00257547],
       [ 0.01986991,  0.0206861 ,  0.01557937, ...,  0.00960093,
         0.02458474,  0.0119571 ],
       ...,
       [ 0.00889795,  0.01077138,  0.03316259, ...,  0.00566639,
         0.00541239, -0.01775477],
       [ 0.03625189,  0.01513889,  0.0104179 , ...,  0.03374783,
         0.03034206,  0.02109192],
       [ 0.02458015,  0.02570319,  0.01643582, ...,  0.01437644,
         0.03494827,  0.02526383]])

# Section 4: Analysis

In [13]:
# Compute Distance Matrix

dists = compute_distances(test, train, "Euclidean")
#dists = compute_distances(test, train, "Manhattan")
dists

array([[0.05656039, 0.04742277, 0.03528381, ..., 0.06292082, 0.0555769 ,
        0.0355984 ],
       [0.03771883, 0.04687068, 0.0450504 , ..., 0.05494742, 0.04898223,
        0.04444321],
       [0.04538253, 0.04727916, 0.03121922, ..., 0.03755693, 0.05127272,
        0.04494746],
       ...,
       [0.0379053 , 0.04752331, 0.05298324, ..., 0.05169922, 0.048019  ,
        0.04054628],
       [0.06163712, 0.05883515, 0.0375313 , ..., 0.0623689 , 0.05840329,
        0.04991057],
       [0.0580277 , 0.06039776, 0.03919261, ..., 0.05632557, 0.06536506,
        0.05633403]])

In [14]:
# Make Predictions

num_test = dists.shape[0]
y_pred = np.zeros(num_test)
for i in range(num_test):
    nearest_neighbors_indices = np.argsort(dists[i])
    closest_y = np.zeros(k)
    for m in range(k):
        closest_y[m] = train_labels[nearest_neighbors_indices[m]]
    labels = []
    label_counts = []
    for label in closest_y:
        if label not in labels:
          labels.append(label)
          label_counts.append(1)
        else:
          label_counts[labels.index(label)] = label_counts[labels.index(label)] + 1
    mode_labels = np.argsort(label_counts)[::-1]
    y_pred[i] = labels[mode_labels[0]]
y_pred

array([1., 1., 1., ..., 0., 1., 1.])

In [15]:
# Compute and print the fraction of correctly predicted examples
num_correct = np.sum(y_pred == test_labels)
accuracy = float(num_correct) / num_test
print('Got %d / %d correct => accuracy: %f' % (num_correct, num_test, accuracy))

Got 1868 / 2000 correct => accuracy: 0.934000


# Section 5: Reminders and Task Description

## DOs
- DO Run the k-value cell in Section 1 and the cells in Section 4 individually to test each parameter set (k-value and distance metric).
- DO Edit the cell for k-value in Section 1.
- DO Edit the cell that calculates the distances using one of two distance metrics in Section 4.
- DO Take Notes on paper.

## DO NOTs
- DO NOT Run All.
- DO NOT Edit any cells other than the two specified.
- DO NOT Move cells.
- DO NOT Delete cells.
- DO NOT Add cells.
- DO NOT Take Notes in the notebook.

## Task Description Reminder
You are tasked with tuning the parameters for k-value and distance metric to find the best k-value for each distance metric and then to determine which distance metric, with its optimal k-value, results in the highest accuracy (most accurate model). To do this, you will have to evaluate each combination.

## Lorem Ipsum
Sed ut perspiciatis, unde omnis iste natus error sit voluptatem accusantium doloremque laudantium, totam rem aperiam eaque ipsa, quae ab illo inventore veritatis et quasi architecto beatae vitae dicta sunt, explicabo. Nemo enim ipsam voluptatem, quia voluptas sit, aspernatur aut odit aut fugit, sed quia consequuntur magni dolores eos, qui ratione voluptatem sequi nesciunt, neque porro quisquam est, qui dolorem ipsum, quia dolor sit amet consectetur adipiscing velit, sed quia non numquam do eius modi tempora incididunt, ut labore et dolore magnam aliquam quaerat voluptatem. Ut enim ad minima veniam, quis nostrumd exercitationem ullam corporis suscipit laboriosam, nisi ut aliquid ex ea commodi consequatur? DQuis autem vel eum irure reprehenderit, qui in ea voluptate velit esse, quam nihil molestiae consequatur, vel illum, qui dolorem eum fugiat, quo voluptas nulla pariatur? At vero eos et accusamus et iusto odio dignissimos ducimus, qui blanditiis praesentium voluptatum deleniti atque corrupti, quos dolores et quas molestias excepturi sint, obcaecati cupiditate non provident, similique sunt in culpa, qui officia deserunt mollitia animi, id est laborum et dolorum fuga. Et harum quidem rerum facilis est et expedita distinctio. Nam libero tempore, cum soluta nobis est eligendi optio, cumque nihil impedit, quo minus id, quod maxime placeat, facere possimus, omnis voluptas assumenda est, omnis dolor repellendus. Temporibus autem quibusdam et aut officiis debitis aut rerum necessitatibus saepe eveniet, ut et voluptates repudiandae sint et molestiae non recusandae. Itaque earum rerum hic tenetur a sapiente delectus, ut aut reiciendis voluptatibus maiores alias consequatur aut perferendis doloribus asperiores repellat.

# Irrelevant Text
This is more irrelevant text padding the notebook. Don't read this any further. It's pointless. Why are you still reading this? Focus on the task at hand.