In [1]:
import numpy as np
import scipy
from liblinearutil import *

# Data Preprocessing

In [2]:
# data loading
traindta = np.loadtxt("hw4_train.dat", dtype=np.float, delimiter=' ')
train_x = traindta[:, 0:6]
train_y = traindta[:, 6]

In [3]:
# data loading
testdta = np.loadtxt("hw4_test.dat", dtype=np.float, delimiter=' ')
test_x = testdta[:, 0:6]
test_y = testdta[:, 6]

In [4]:
# feature tranformation
from sklearn.preprocessing import PolynomialFeatures
poly = PolynomialFeatures(2)
train_tx = poly.fit_transform(train_x)
test_tx = poly.fit_transform(test_x)

In [5]:
np.shape(train_tx)

(200, 28)

In [6]:
np.shape(test_tx)

(300, 28)

# Relation of $\lambda$ and C

Set $\dfrac{\lambda}{N} = \dfrac{1}{2}$.
Set $C = \dfrac{1}{N}$.

Therefore, $C = \dfrac{1}{2\lambda}$.

$\log{\lambda} \in \{−4,−2,0,2,4\} \rightarrow C \in \{5000,50,0.5,0.005,0.00005\}$

# Q16

In [19]:
c = [5000,50,0.5,0.005,0.00005]
e = 0.000001

for i in c:
    
    prob = problem(train_y, train_tx)
    param = parameter('-s 0 -c {c} -e {e}'.format(c = i, e = e))
    m = train(prob,param)
    p_labs, p_acc, p_vals = predict(test_y, test_tx, m)
    print("When c =", i,"The error in test data is", 100-p_acc[0])

Accuracy = 86.6667% (260/300) (classification)
When c = 5000 The error in test data is 13.333333333333329
Accuracy = 87% (261/300) (classification)
When c = 50 The error in test data is 13.0
Accuracy = 80.6667% (242/300) (classification)
When c = 0.5 The error in test data is 19.333333333333343
Accuracy = 74.3333% (223/300) (classification)
When c = 0.005 The error in test data is 25.66666666666667
Accuracy = 51.6667% (155/300) (classification)
When c = 5e-05 The error in test data is 48.33333333333333


In [80]:
# SKLEARN VERSION
from sklearn.linear_model import LogisticRegression

c = [5000,50,0.5,0.005,0.00005]
e = 0.000001

for i in c:
    
    lr_model = LogisticRegression(C = i, penalty = 'l2', tol = e, solver ='liblinear')
    w = lr_model.fit(train_tx, train_y)
    print(w.score(test_tx, test_y))

0.8666666666666667
0.8733333333333333
0.82
0.7466666666666667
0.52


Thus, when $C = 50$ ($\log\lambda = -2$), the 0/1 accuracy is highest. [b]

# Q17

In [23]:
c = [5000,50,0.5,0.005,0.00005]
e = 0.000001

for i in c:
    
    prob = problem(train_y, train_tx)
    param = parameter('-s 0 -c {c} -e {e}'.format(c = i, e = e))
    m = train(prob,param)
    p_labs, p_acc, p_vals = predict(train_y, train_tx, m)
    print("When c =", i,"The error in train data is", 100-p_acc[0])
    ACC, MSE, SCC = evaluations(train_y, p_labs)
    print("Error:",100-ACC)

Accuracy = 91% (182/200) (classification)
When c = 5000 The error in train data is 9.0
Error: 9.0
Accuracy = 90% (180/200) (classification)
When c = 50 The error in train data is 10.0
Error: 10.0
Accuracy = 87% (174/200) (classification)
When c = 0.5 The error in train data is 13.0
Error: 13.0
Accuracy = 80.5% (161/200) (classification)
When c = 0.005 The error in train data is 19.5
Error: 19.5
Accuracy = 46.5% (93/200) (classification)
When c = 5e-05 The error in train data is 53.5
Error: 53.5


In [81]:
# SKLEARN VERSION
from sklearn.linear_model import LogisticRegression

c = [5000,50,0.5,0.005,0.00005]
e = 0.000001

for i in c:
    
    lr_model = LogisticRegression(C = i, penalty = 'l2', tol = e, solver ='liblinear')
    w = lr_model.fit(train_tx, train_y)
    print(w.score(train_tx, train_y))

0.91
0.905
0.875
0.805
0.465


Thus, when $C = 5000$ ($\log\lambda = -4$), the 0/1 accuracy is highest. [a]

# Q18

In [16]:
subtrain_tx = train_tx[0:120,:]
subval_tx = train_tx[120:,:]
subtrain_y = train_y[0:120]
subval_y = train_y[120:]

In [24]:
c = [5000,50,0.5,0.005,0.00005]
e = 0.000001

for i in c:
    
    prob = problem(subtrain_y, subtrain_tx)
    param = parameter('-s 0 -c {c} -e {e}'.format(c = i, e = e))
    m = train(prob,param)
    p_labs, p_acc, p_vals = predict(subval_y, subval_tx, m)
    p_labs2, p_acc2, p_vals2 = predict(test_y, test_tx, m)
    print("When c =", i,"The err in val data is", 100-p_acc[0],"The err in test data is", 100-p_acc2[0])

Accuracy = 80% (64/80) (classification)
Accuracy = 82.3333% (247/300) (classification)
When c = 5000 The err in val data is 20.0 The err in test data is 17.666666666666657
Accuracy = 86.25% (69/80) (classification)
Accuracy = 85.6667% (257/300) (classification)
When c = 50 The err in val data is 13.75 The err in test data is 14.333333333333329
Accuracy = 76.25% (61/80) (classification)
Accuracy = 76% (228/300) (classification)
When c = 0.5 The err in val data is 23.75 The err in test data is 24.0
Accuracy = 73.75% (59/80) (classification)
Accuracy = 76.3333% (229/300) (classification)
When c = 0.005 The err in val data is 26.25 The err in test data is 23.66666666666667
Accuracy = 42.5% (34/80) (classification)
Accuracy = 51.6667% (155/300) (classification)
When c = 5e-05 The err in val data is 57.5 The err in test data is 48.33333333333333


In [82]:
# SKLEARN VERSION
from sklearn.linear_model import LogisticRegression

c = [5000,50,0.5,0.005,0.00005]
e = 0.000001

for i in c:
    
    lr_model = LogisticRegression(C = i, penalty = 'l2', tol = e, solver ='liblinear')
    w = lr_model.fit(subtrain_tx, subtrain_y)
    print(w.score(subval_tx, subval_y), w.score(test_tx, test_y))

0.8125 0.8266666666666667
0.8625 0.8633333333333333
0.775 0.7766666666666666
0.7625 0.7666666666666667
0.425 0.5166666666666667


Cause $\text{Min}$ $E_{val} = 13.75$ whose $\log\lambda = -2$, by the $w^-$ the validation data indicates, the corresponding $E_{out}(w^-) = 14.3$. [e]

# Q19

In [26]:
c = 50
e = 0.000001

prob = problem(train_y, train_tx)
param = parameter('-s 0 -c {c} -e {e}'.format(c = c, e = e))
m = train(prob,param)
p_labs, p_acc, p_vals = predict(test_y, test_tx, m)
print("When c =", c,"The error in test data is", 100-p_acc[0])

Accuracy = 87% (261/300) (classification)
When c = 50 The error in test data is 13.0


Using $\log\lambda = -2$, by the $w$ the whole training data indicates, the corresponding $E_{out}(w) = 13.0\%$. [d]

# Q20

In [72]:
c = [5000,50,0.5,0.005,0.00005]
e = 0.000001


f1x = train_tx[0:40,:]
f1y = train_y[0:40]
f2x = train_tx[40:80,:]
f2y = train_y[40:80]
f3x = train_tx[80:120,:]
f3y = train_y[80:120]
f4x = train_tx[120:160,:]
f4y = train_y[120:160]
f5x = train_tx[160:200,:]
f5y = train_y[160:200]

fx = [f1x,f2x,f3x,f4x,f5x]
fy = [f1y,f2y,f3y,f4y,f5y]

In [74]:
for i in c:
    ttlerr = 0
    for k in range(5):
        tempx = np.concatenate((fx[(k+1)%5],fx[(k+2)%5],fx[(k+3)%5],fx[(k+4)%5]), axis=0)
        tempy = np.concatenate((fy[(k+1)%5],fy[(k+2)%5],fy[(k+3)%5],fy[(k+4)%5]), axis=0)
        prob = problem(tempy,tempx)
        param = parameter('-s 0 -c {c} -e {e}'.format(c = i, e = e))
        m = train(prob,param)
        p_labs, p_acc, p_vals = predict(fy[k], fx[k], m)
        ttlerr += 100-p_acc[0]
    print("When c =", i,"The error in CV is", ttlerr/5)

Accuracy = 87.5% (35/40) (classification)
Accuracy = 77.5% (31/40) (classification)
Accuracy = 95% (38/40) (classification)
Accuracy = 77.5% (31/40) (classification)
Accuracy = 90% (36/40) (classification)
When c = 5000 The error in CV is 14.5
Accuracy = 85% (34/40) (classification)
Accuracy = 80% (32/40) (classification)
Accuracy = 95% (38/40) (classification)
Accuracy = 85% (34/40) (classification)
Accuracy = 95% (38/40) (classification)
When c = 50 The error in CV is 12.0
Accuracy = 80% (32/40) (classification)
Accuracy = 90% (36/40) (classification)
Accuracy = 90% (36/40) (classification)
Accuracy = 80% (32/40) (classification)
Accuracy = 82.5% (33/40) (classification)
When c = 0.5 The error in CV is 15.5
Accuracy = 77.5% (31/40) (classification)
Accuracy = 92.5% (37/40) (classification)
Accuracy = 85% (34/40) (classification)
Accuracy = 75% (30/40) (classification)
Accuracy = 80% (32/40) (classification)
When c = 0.005 The error in CV is 18.0
Accuracy = 42.5% (17/40) (classificati

In [83]:
#in-built cross validation

c = [5000,50,0.5,0.005,0.00005]
e = 0.000001

for i in c:
    
    prob = problem(train_y, train_tx)
    param = parameter('-s 0 -c {c} -e {e} -v 5'.format(c = i, e = e))
    m = train(prob,param)

Cross Validation Accuracy = 86%
Cross Validation Accuracy = 88.5%
Cross Validation Accuracy = 85.5%
Cross Validation Accuracy = 81.5%
Cross Validation Accuracy = 50.5%


Hence, $\text{Min}\ E_{VC} = 12\%$. [c]