# Homework 6

# Problem 2

We have to classify points via linear regression on a nonlinearly transformed data set.


## Import libraries and read data

Let's first import libraries and read data into a pandas dataframe.

In [1]:
import pandas as pd        # for reading in the data
import numpy as np

## A. Read in training set

We read in the in-sample data.

In [2]:
# Load the in-sample data into the dataframe df_train:
# - column names are supplied explicitly via `names`
# - columns are separated by runs of whitespace; `sep` is a regular expression,
#   so it is written as a raw string (a plain '\s' is an invalid escape
#   sequence and triggers a warning on recent Python versions)
# - every column is parsed as float64
df_train = pd.read_csv('in.dta.txt', names=["x1", "x2", "y"], sep=r'\s+', dtype=np.float64)
print('The first 5 rows of the table:\n')
print(df_train.head(5))
print()

# alternatively read data with :
# data = np.loadtxt('in.dta.txt', dtype=np.float64)
# x1 = data[:,0]
# print(x1)
# via our mentor Jonathan @ypeels on the discussion forums


# Examine data: one row per data point, one column each for x1, x2 and y
rows, col = df_train.shape
print("The table has {0} rows and {1} columns.".format(rows, col))
print("So we have N = {0} data points (x1,x2) with classification y.".format(rows))

The first 5 rows of the table:

         x1        x2    y
0 -0.779470  0.838221  1.0
1  0.155635  0.895377  1.0
2 -0.059908 -0.717780  1.0
3  0.207596  0.758933  1.0
4 -0.195983 -0.375487 -1.0

The table has 35 rows and 3 columns.
So we have N = 35 data points (x1,x2) with classification y.


## Implement Linear Regression

- Create feature matrix $\mathbf{Z}$ with a nonlinear transformation.
- Apply linear regression to get $\mathbf{\tilde{w}} = (Z^T Z)^{-1} Z^T \mathbf{y}$, where $\mathbf{y}$ is the vector with the classifications of each point $\textbf{x} = (x_1, x_2)$.
- Classify points via $\text{sign}(\mathbf{Z}\mathbf{\tilde{w}})$ which is a vector.
- Compute the error $E_{in}$ as the fraction of points that are misclassified. This is done by counting the mismatches between $\mathbf{y}$ and $\text{sign}(\mathbf{Z}\mathbf{\tilde{w}})$ and dividing that number by the sample size.

In [3]:
def problem_2_linear_regression(dataframe):
    '''
    Fit a linear regression classifier on nonlinearly transformed features.

    Each point (x1, x2) is mapped to the feature vector
        (1, x1, x2, x1^2, x2^2, x1*x2, |x1 - x2|, |x1 + x2|),
    the weights are obtained with the pseudo-inverse, w_tilde = Z^+ y,
    and the points are then classified via sign(Z w_tilde).

    Parameters:
        dataframe -- pandas dataframe with the columns 'x1', 'x2' and 'y'.
                     The labels in 'y' are compared against the output of
                     np.sign, so they are expected to be +1 / -1.

    Returns the tuple (error, w_tilde):
        error   -- fraction of rows of `dataframe` whose label differs from
                   sign(Z w_tilde)
        w_tilde -- weight vector with 8 entries, one per feature

    Raises:
        ValueError -- if `dataframe` has no rows.
    '''

    N = dataframe.shape[0]
    if N == 0:
        # Without data there is nothing to fit and the error fraction
        # would be a division by zero.
        raise ValueError("dataframe must contain at least one data point")

    # Use data from the pandas dataframe
    x1 = np.asarray(dataframe['x1'], dtype=np.float64)
    x2 = np.asarray(dataframe['x2'], dtype=np.float64)
    y = np.asarray(dataframe['y'], dtype=np.float64)

    # feature matrix Z: one row per data point, one column per feature
    Z = np.column_stack((np.ones(N), x1, x2,
                         x1**2, x2**2, x1*x2,
                         np.absolute(x1-x2), np.absolute(x1+x2)))

    # see lecture 3, slide 17
    # np.linalg.pinv equals (Z^T Z)^{-1} Z^T whenever Z^T Z is invertible, but
    # it stays well defined when Z is rank-deficient (e.g. fewer than 8
    # distinct points), where explicitly inverting Z^T Z fails or is unstable.
    Z_dagger = np.linalg.pinv(Z)

    # Use linear regression to get weight vector
    w_tilde = np.dot(Z_dagger, y)

    # classification error: the mean of the boolean mismatch mask is the
    # fraction of misclassified points
    error = np.mean(y != np.sign(np.dot(Z, w_tilde)))
    return (error, w_tilde)

## Compute in-sample classification error $E_{in}$

In [4]:
# Fit on the training set. problem_2_linear_regression() hands back the
# tuple (E_in, w_tilde); w_tilde is kept so the same weights can be applied
# to other data later on.
E_in, w_tilde = problem_2_linear_regression(dataframe=df_train)

in_sample_message = 'The in-sample classification error is E_in = {0}'
print(in_sample_message.format(E_in))

The in-sample classification error is E_in = 0.02857142857142857


## B. Read in test set

We read in the out-of-sample data.

In [5]:
# Load the out-of-sample data into the dataframe df_test:
# - column names are supplied explicitly via `names`
# - columns are separated by runs of whitespace; `sep` is a regular expression,
#   so it is written as a raw string (a plain '\s' is an invalid escape
#   sequence and triggers a warning on recent Python versions)
# - every column is parsed as float64
df_test = pd.read_csv('out.dta.txt', names=["x1", "x2", "y"], sep=r'\s+', dtype=np.float64)
print('The first 5 rows of the table:\n')
print(df_test.head(5))
print()


# Examine data: one row per data point, one column each for x1, x2 and y
rows, col = df_test.shape
print("The table has {0} rows and {1} columns.".format(rows, col))
print("So we have N = {0} data points (x1,x2) with classification y.".format(rows))

The first 5 rows of the table:

         x1        x2    y
0 -0.106006 -0.081467 -1.0
1  0.177930 -0.345951 -1.0
2  0.102162  0.718258  1.0
3  0.694078  0.623397 -1.0
4  0.023541  0.727432  1.0

The table has 250 rows and 3 columns.
So we have N = 250 data points (x1,x2) with classification y.


## Compute out-of-sample classification error $E_{out}$

In [6]:
# Pull the three test-set columns out of the dataframe as numpy arrays
x1, x2, y = (np.array(df_test[column]) for column in ('x1', 'x2', 'y'))
N = len(df_test)

# feature matrix Z, built with the same nonlinear transformation that was
# used for the training data: the features are stacked as rows and then
# transposed so that each row of Z belongs to one data point
features = (np.ones(N), x1, x2,
            x1**2, x2**2, x1*x2,
            np.abs(x1-x2), np.abs(x1+x2))
Z = np.array(features).T

# Out-of-sample error: fraction of test points whose predicted sign
# (using the weights w_tilde learned on the training set) disagrees with y
predictions = np.sign(Z.dot(w_tilde))
E_out = sum(y != predictions) / N
print('The out-of-sample classification error is E_out = {0}'.format(E_out))

The out-of-sample classification error is E_out = 0.084


# Pick answer

As per the problem statement, we use the Euclidean distance to determine which of the possible answers is closest to our computed values.

In [7]:
# Candidate answers (E_in, E_out) from the problem statement
choices = [(0.03, 0.08), (0.03, 0.10), (0.04, 0.09), (0.04, 0.11), (0.05, 0.10)]
computed_values = (E_in, E_out)

print("Our computed values are (E_in, E_out) = ", computed_values, "\n")

# Nothing picked yet: the sentinel is far larger than any distance between
# two error pairs, so the first candidate always replaces it.
pick_choice, min_distance = None, 2**64

# Walk through the candidates, report each distance and remember the
# first candidate with the smallest distance seen so far.
target = np.array(computed_values)
for choice in choices:
    distance = np.linalg.norm(np.array(choice) - target)
    print("choice=", choice, "\tEuclidian distance:", distance)
    if distance < min_distance:
        pick_choice, min_distance = choice, distance


print("\nWe pick:", pick_choice)

Our computed values are (E_in, E_out) =  (0.028571428571428571, 0.084000000000000005) 

choice= (0.03, 0.08) 	Euclidian distance: 0.00424744821352
choice= (0.03, 0.1) 	Euclidian distance: 0.0160636489107
choice= (0.04, 0.09) 	Euclidian distance: 0.0129078365692
choice= (0.04, 0.11) 	Euclidian distance: 0.0284009197896
choice= (0.05, 0.1) 	Euclidian distance: 0.0267429181928

We pick: (0.03, 0.08)


# Result

The correct answer is **2[a]** (0.03, 0.08).