# Lab 3: Feature Representation for Car Data
In this lab, the goal is to classify car models into either having >28 miles per gallon, or <28 miles per gallon.

In [121]:
import numpy as np
import pandas as pd

cars_pd = pd.read_csv('auto-mpg.csv')
print(cars_pd.head())

   mpg  cylinders  displacement  horsepower  weight  acceleration  model_year  \
0   -1          8         304.0         193    4732          18.5          70   
1   -1          8         307.0         200    4376          15.0          70   
2   -1          8         360.0         215    4615          14.0          70   
3   -1          8         318.0         210    4382          13.5          70   
4   -1          8         350.0         180    3664          11.0          73   

   origin          car_name  
0       1          hi 1200d  
1       1         chevy c20  
2       1         ford f250  
3       1        dodge d200  
4       1  oldsmobile omega  


# Preprocessing
- Remove unused data points
- Extract outcomes
- Standardize numerical data
- Encode origin country using one hot encoding
- Separate data into train and test set

In [122]:
# standardize a 2d array
def standardize(arr, axis=0):
    mean = np.mean(arr, axis=axis)
    std = np.std(arr, axis=axis)
    arr = (arr - mean)/std
    return arr

# one hot encoding of discrete number column
def one_hot_encoding(arr):
    arr = arr.astype(int)
    codes = np.zeros((arr.size, np.max(arr) + 1))
    codes[np.arange(arr.size).astype(int), arr.astype(int)] = 1    
    return codes

In [123]:
cars_arr = cars_pd.to_numpy()
np.random.shuffle(cars_arr)
# get outcomes
outcomes = np.array([cars_arr[:, 0]])
cars_arr = np.delete(cars_arr, 0, axis=1)

# remove useless rows: car_name
cars_arr = np.delete(cars_arr, -1, axis=1)

# set type of data to np.float64
cars_arr = cars_arr.astype(np.float64)

# remove origin from array for one hot encoding
origin_col = cars_arr[:, -1]
cars_arr = np.delete(cars_arr, -1, axis=1)

origin_col -= 1
one_hot_origin =one_hot_encoding(origin_col)

# standardize
std_cars_arr = standardize(cars_arr.astype(np.float64), axis=0)

# append one hot origin encoding to std_cars_arr
data = np.concatenate((std_cars_arr, one_hot_origin), axis=1)

# Build train and test set
data_len = data.shape[0]
split_idx = int(data_len*0.9)

train_data = data[:split_idx, :].T
train_outcomes = outcomes[:, :split_idx]

test_data = data[split_idx:, :].T
test_outcomes = outcomes[:, split_idx:]

train_data_unprocessed = cars_arr[:split_idx, :].T
test_data_unprocessed = cars_arr[split_idx:, :].T

In [124]:
print("shape of training data", train_data.shape)
print("shape of training outcomes", train_outcomes.shape)
print("shape of test data", test_data.shape)
print("shape of test outcomes", test_outcomes.shape)

shape of training data (9, 352)
shape of training outcomes (1, 352)
shape of test data (9, 40)
shape of test outcomes (1, 40)


# Model fitting
Computing accuracy of perceptron for data that's been processed and unprocessed 

In [125]:
def averaged_perceptron(data, labels, params={}, hook=None):
    # if T not in params, default to 100
    data_dim, data_amt = data.shape
    T = params.get('T', 100)
    th = np.zeros((data_dim, 1))
    th0 = np.zeros((1,1))
    ths = np.zeros((data_dim, 1))
    th0s = np.zeros((1,1))
    
    for t in range(T):
        for i_d, d in enumerate(data.T):
            l = labels[0, i_d]
            d = np.array([d])
            if l*(d.dot(th)+th0) <= 0:
                th += l*d.T
                th0 += l
            ths += th
            th0s += th0
        if hook:
            hook((ths/(data_amt*(t+1)), th0s/(data_amt*(t+1))))
    return (ths/(data_amt*T), th0s/(data_amt*T))

def compute_score(th, th0, test_data, test_outcomes):
    test_data.T.dot()


def y(x, th, th0):
    '''
    x is dimension d by 1
    th is dimension d by 1
    th0 is a scalar
    return a 1 by 1 matrix
    '''
    return np.dot(np.transpose(th), x) + th0

def positive(x, th, th0):
    '''
    x is dimension d by 1
    th is dimension d by 1
    th0 is dimension 1 by 1
    return 1 by 1 matrix of +1, 0, -1
    '''
    return np.sign(y(x, th, th0))

def score(data, labels, th, th0):
    '''
    data is dimension d by n
    labels is dimension 1 by n
    ths is dimension d by 1
    th0s is dimension 1 by 1
    return 1 by 1 matrix of integer indicating number of data points correct for
    each separator.
    '''
    return np.sum(positive(data, th, th0) == labels)

def eval_classifier(learner, data_train, labels_train, data_test, labels_test):
    th, th0 = learner(data_train, labels_train)
    test_score = score(data_test, labels_test, th, th0)
    return test_score/(labels_test.shape[1])

In [126]:
th, th0 = averaged_perceptron(train_data, train_outcomes)
th_unprocessed, th0_unprocessed = averaged_perceptron(train_data_unprocessed, train_outcomes)

# Test
Test perceptron

In [127]:
test_score = score(test_data, test_outcomes, th, th0)
test_score_unprocessed = score(test_data_unprocessed, test_outcomes, th_unprocessed, th0_unprocessed)
print("Test score for processed data: ", test_score/test_data.shape[1])
print("Test score for unprocessed data: ", test_score_unprocessed/test_data.shape[1])

Test score for processed data:  0.925
Test score for unprocessed data:  0.85
