In [1]:
import pandas as pd
import numpy as np
from pandas import Series,DataFrame

data = pd.read_csv("https://archive.ics.uci.edu/ml/machine-learning-databases/adult/adult.data",header=None)
data.columns = ['age','workclass','fnlwgt','education','education-num','marital-status','occupation','relationship','race','sex','capital-gain','capital-loss','hours-per-week','native-country','salary-class']
occ = data['occupation'].value_counts()
wor = data['workclass'].value_counts()
capacity = wor.sum() #32561

In [2]:
# prediction
# data extraction：5 continuous features
# age, education-num, capital-gain, capital-loss, hours-per-week
age = data['age']
eduNum = data['education-num']
capGain = data['capital-gain']
capLoss = data['capital-loss']
hpw = data['hours-per-week']
salary = data['salary-class']
assert ' ?' not in age.value_counts().index  # no missing data
assert ' ?' not in eduNum.value_counts().index
assert ' ?' not in hpw.value_counts().index
assert ' ?' not in capGain.value_counts().index
assert ' ?' not in capLoss.value_counts().index
assert ' ?' not in salary.value_counts().index
age = age.tolist()
eduNum = eduNum.tolist()
capGain = capGain.tolist()
capLoss = capLoss.tolist()
hpw = hpw.tolist()
salary = salary.tolist()
scale_half = int(capacity/2)

age_std = np.std(np.array(age[:scale_half]))
eduNum_std = np.std(np.array(eduNum[:scale_half]))
capGain_std = np.std(np.array(capGain[:scale_half]))
capLoss_std = np.std(np.array(capLoss[:scale_half]))
hpw_std = np.std(np.array(hpw[:scale_half]))
print "Standard deviation of age is", age_std
print "Standard deviation of eduNum is", eduNum_std
print "Standard deviation of capGain is", capGain_std
print "Standard deviation of capLoss is", capLoss_std
print "Standard deviation of hpw is", hpw_std

X = []
Y = []
for i in range(capacity):
    X.append([age[i],eduNum[i],hpw[i]])
    Y.append(salary[i] == ' >50K')
    
# SVM Classification
import scipy.optimize
import random
from sklearn import svm


X_train = X[:scale_half]
Y_train = Y[:scale_half]
X_valid = X[scale_half:]
Y_valid = Y[scale_half:]

c = 0.1
clf = svm.SVC(C=c, kernel='linear')
clf.fit(X_train, Y_train)

train_predictions = clf.predict(X_train)
valid_predictions = clf.predict(X_valid)

train_correct = 0
for i in range(scale_half):
    if train_predictions[i] == Y_train[i]:
        train_correct = train_correct + 1
train_acc = train_correct/float(scale_half)
valid_correct = 0
for i in range(scale_half):
    if valid_predictions[i] == Y_valid[i]:
        valid_correct = valid_correct + 1
valid_acc = valid_correct/float(scale_half)
print "C=",c,"Accuracy of the predictor on the train data is:",train_acc
print "C=",c,"Accuracy of the predictor on the valid data is:",valid_acc


Standard deviation of age is 13.651859975568133
Standard deviation of eduNum is 2.5550951067314753
Standard deviation of capGain is 7314.405625313393
Standard deviation of capLoss is 401.25526866261686
Standard deviation of hpw is 12.289313389422963
C= 0.1 Accuracy of the predictor on the train data is: 0.760626535627
C= 0.1 Accuracy of the predictor on the valid data is: 0.757800982801


In [3]:
# Logistic Regression
import scipy.optimize
import random
from sklearn import svm
from math import exp
from math import log
import numpy

def inner(x,y):
    return sum([x[i]*y[i] for i in range(len(x))])

def sigmoid(x):
    return 1.0 / (1 + exp(-x))

# NEGATIVE Log-likelihood
def f(theta, X, y, lam):
    loglikelihood = 0
    for i in range(len(X)):
        logit = inner(X[i], theta)
        loglikelihood -= log(1 + exp(-logit))
        if not(y[i]==True):
            loglikelihood -= logit
    for k in range(len(theta)):
        loglikelihood -= lam * theta[k]*theta[k]
    #print ("ll =", loglikelihood)
    return -loglikelihood

# NEGATIVE Derivative of log-likelihood
def fprime(theta, X, y, lam):
    dl = [0.0]*len(theta)
    for k in range(len(theta)): 
        for i in range(len(X)):
            logit = inner(X[i], theta)
            dl[k] += X[i][k]*(exp(-logit))/(1 + exp(-logit))
            if not(y[i]==True):
                dl[k] -= X[i][k]
        dl[k] -= 2*lam*theta[k]
  # Negate the return value since we're doing gradient *ascent*
    return numpy.array([-x for x in dl])  

X = []
Y = []
for i in range(capacity):
    X.append([age[i],eduNum[i],capGain[i],capLoss[i],hpw[i]])
    Y.append(salary[i] == ' >50K')

scale_half = int(capacity/2)
X_train = X[:scale_half]
Y_train = Y[:scale_half]
X_valid = X[scale_half:]
Y_valid = Y[scale_half:]
theta,l,info = scipy.optimize.fmin_l_bfgs_b(f, [0]*len(X[0]), fprime, args = (X_train, Y_train, 1.0))
print "Final log likelihood =", -l

YPre_train = []
for i in range(scale_half):
    x_theta = 0
    for j in range(len(theta)):
        x_theta += X_train[i][j]*theta[j]
    if x_theta > 0:
        YPre_train.append(1)
    else:
        YPre_train.append(0)
count = 0
for i in range(scale_half):
    if Y_train[i] == YPre_train[i]:
        count = count + 1
acc = count/float(scale_half)
print "Accuracy of predictor in train data = ", acc

YPre_valid = []
for i in range(scale_half):
    x_theta = 0
    for j in range(len(theta)):
        x_theta += X_valid[i][j]*theta[j]
    if x_theta > 0:
        YPre_valid.append(1)
    else:
        YPre_valid.append(0)
count = 0
for i in range(scale_half):
    if Y_valid[i] == YPre_valid[i]:
        count = count + 1
acc = count/float(scale_half)
print "Accuracy of predictor in valid data = ", acc

Final log likelihood = -8613.946116166113
Accuracy of predictor in train data =  0.797235872236
Accuracy of predictor in valid data =  0.797604422604


In [4]:
# Accuracy with lambda
for lam in [0,0.01,0.1,1,10,100,1000,10000,100000,1000000]:
    theta,l,info = scipy.optimize.fmin_l_bfgs_b(f, [0]*len(X[0]), fprime, args = (X_train, Y_train, lam))
    #print "Final log likelihood =", -l
    print "lambda = ",lam, "coefficient vector theta: ",theta
    
    YPre_train = []
    for i in range(scale_half):
        x_theta = 0
        for j in range(len(theta)):
            x_theta += X_train[i][j]*theta[j]
        if x_theta > 0:
            YPre_train.append(1)
        else:
            YPre_train.append(0)
    count = 0
    for i in range(scale_half):
        if Y_train[i] == YPre_train[i]:
            count = count + 1
    acc = count/float(scale_half)
    print "lambda = ", lam, "Accuracy of predictor in train data = ", acc

    YPre_valid = []
    for i in range(scale_half):
        x_theta = 0
        for j in range(len(theta)):
            x_theta += X_valid[i][j]*theta[j]
        if x_theta > 0:
            YPre_valid.append(1)
        else:
            YPre_valid.append(0)
    count = 0
    for i in range(scale_half):
        if Y_valid[i] == YPre_valid[i]:
            count = count + 1
    acc = count/float(scale_half)
    print "lambda = ", lam, "Accuracy of predictor in valid data = ", acc

lambda =  0 coefficient vector theta:  [-0.00718288 -0.0422981   0.0003254   0.00080449 -0.01441198]
lambda =  0 Accuracy of predictor in train data =  0.797235872236
lambda =  0 Accuracy of predictor in valid data =  0.797604422604
lambda =  0.01 coefficient vector theta:  [-0.00718289 -0.04229807  0.0003254   0.00080449 -0.01441198]
lambda =  0.01 Accuracy of predictor in train data =  0.797235872236
lambda =  0.01 Accuracy of predictor in valid data =  0.797604422604
lambda =  0.1 coefficient vector theta:  [-0.00718291 -0.04229781  0.0003254   0.00080449 -0.01441202]
lambda =  0.1 Accuracy of predictor in train data =  0.797235872236
lambda =  0.1 Accuracy of predictor in valid data =  0.797604422604
lambda =  1 coefficient vector theta:  [-0.00718313 -0.04229529  0.0003254   0.00080449 -0.01441239]
lambda =  1 Accuracy of predictor in train data =  0.797235872236
lambda =  1 Accuracy of predictor in valid data =  0.797604422604
lambda =  10 coefficient vector theta:  [-0.00718588 

In [5]:
def train(lam):
    theta,_,_ = scipy.optimize.fmin_l_bfgs_b(f, [0]*len(X[0]), fprime, pgtol = 10, args = (X_train, Y_train, lam))           
    return theta

def performance(theta, X, y):
    scores = [inner(theta,x) for x in X]
    predictions = [s > 0 for s in scores]
    correct = [(a==b) for (a,b) in zip(predictions,y)]
    acc = sum(correct) * 1.0 / len(correct)
    return acc

#BER Calculation
def ber(theta, X, y):
    scores = [inner(theta, x) for x in X]
    predictions = [s > 0 for s in scores]
    TP = [(a==1 and b==1) for (a,b) in zip(predictions, y)]
    TN = [(a==0 and b==0) for (a,b) in zip(predictions, y)]
    FP = [(a==1 and b==0) for (a,b) in zip(predictions, y)]
    FN = [(a==0 and b==1) for (a,b) in zip(predictions, y)]
    tp = sum(TP)
    tn = sum(TN)
    fp = sum(FP)
    fn = sum(FN)
    print("TP = "+str(tp)+" TN = "+str(tn)+" FP = "+str(fp)+" FN = "+str(fn))
    BER = 1-0.5*(float(fn)/(tp+fn)+float(tn)/(tn+fp))
    return BER

lam = 1.0

Theta_1 = train(lam)
acc1 = performance(Theta_1, X_train, Y_train)
acc2 = performance(Theta_1, X_valid, Y_valid)

print ("lambda = " + str(lam) + ":\taccuracy of training set = " + str(acc1))
print ("lambda = " + str(lam) + ":\taccuracy of valid set = " + str(acc2))

BER_1 = ber(Theta_1, X_valid, Y_valid)
print("BER of classifier on valid set is "+str(BER_1))

lambda = 1.0:	accuracy of training set = 0.7972358722358722
lambda = 1.0:	accuracy of valid set = 0.797616854001597
TP = 1051 TN = 11935 FP = 402 FN = 2893
BER of classifier on valid set is 0.149532818706


In [6]:
import pandas as pd
import numpy as np
from pandas import Series,DataFrame

data = pd.read_csv("adult.test.txt",header=None)
data.columns = ['age','workclass','fnlwgt','education','education-num','marital-status','occupation','relationship','race','sex','capital-gain','capital-loss','hours-per-week','native-country','salary-class']
occ = data['occupation'].value_counts()
wor = data['workclass'].value_counts()
capacity = wor.sum() 
print "size of test set: ", capacity
# prediction
# data extraction：5 continuous features
# age, education-num, capital-gain, capital-loss, hours-per-week
age = data['age']
eduNum = data['education-num']
capGain = data['capital-gain']
capLoss = data['capital-loss']
hpw = data['hours-per-week']
salary = data['salary-class']
assert ' ?' not in age.value_counts().index  # no missing data
assert ' ?' not in eduNum.value_counts().index
assert ' ?' not in hpw.value_counts().index
assert ' ?' not in capGain.value_counts().index
assert ' ?' not in capLoss.value_counts().index
assert ' ?' not in salary.value_counts().index
age = age.tolist()
eduNum = eduNum.tolist()
capGain = capGain.tolist()
capLoss = capLoss.tolist()
hpw = hpw.tolist()
salary = salary.tolist()
X_test = []
Y_test = []
for i in range(capacity):
    X_test.append([age[i],eduNum[i],capGain[i],capLoss[i],hpw[i]])
    Y_test.append(salary[i] == ' >50K')

# Logistic Regression
import scipy.optimize
import random
from sklearn import svm
from math import exp
from math import log
import numpy

def inner(x,y):
    return sum([x[i]*y[i] for i in range(len(x))])

def sigmoid(x):
    return 1.0 / (1 + exp(-x))

# NEGATIVE Log-likelihood
def f(theta, X, y, lam):
    loglikelihood = 0
    for i in range(len(X)):
        logit = inner(X[i], theta)
        loglikelihood -= log(1 + exp(-logit))
        if not(y[i]==True):
            loglikelihood -= logit
    for k in range(len(theta)):
        loglikelihood -= lam * theta[k]*theta[k]
    #print ("ll =", loglikelihood)
    return -loglikelihood

# NEGATIVE Derivative of log-likelihood
def fprime(theta, X, y, lam):
    dl = [0.0]*len(theta)
    for k in range(len(theta)): 
        for i in range(len(X)):
            logit = inner(X[i], theta)
            dl[k] += X[i][k]*(exp(-logit))/(1 + exp(-logit))
            if not(y[i]==True):
                dl[k] -= X[i][k]
        dl[k] -= 2*lam*theta[k]
  # Negate the return value since we're doing gradient *ascent*
    return numpy.array([-x for x in dl])  
    
for lam in [0,0.01,0.1,1,10,100,1000,10000,100000,1000000]:
    theta,l,info = scipy.optimize.fmin_l_bfgs_b(f, [0]*len(X[0]), fprime, args = (X_train, Y_train, lam))
    #print "Final log likelihood =", -l
    print "lambda = ",lam, "coefficient vector theta: ",theta

    YPre_test = []
    for i in range(capacity):
        x_theta = 0
        for j in range(len(theta)):
            x_theta += X_test[i][j]*theta[j]
        if x_theta > 0:
            YPre_test.append(1)
        else:
            YPre_test.append(0)
    count = 0
    for i in range(capacity):
        if Y_test[i] == YPre_test[i]:
            count = count + 1
    acc = count/float(capacity)
    print "lambda = ", lam, "Accuracy of predictor in test data = ", acc

size of test set:  16281
lambda =  0 coefficient vector theta:  [-0.00718288 -0.0422981   0.0003254   0.00080449 -0.01441198]
lambda =  0 Accuracy of predictor in test data =  0.912658927584
lambda =  0.01 coefficient vector theta:  [-0.00718289 -0.04229807  0.0003254   0.00080449 -0.01441198]
lambda =  0.01 Accuracy of predictor in test data =  0.912658927584
lambda =  0.1 coefficient vector theta:  [-0.00718291 -0.04229781  0.0003254   0.00080449 -0.01441202]
lambda =  0.1 Accuracy of predictor in test data =  0.912658927584
lambda =  1 coefficient vector theta:  [-0.00718313 -0.04229529  0.0003254   0.00080449 -0.01441239]
lambda =  1 Accuracy of predictor in test data =  0.912658927584
lambda =  10 coefficient vector theta:  [-0.00718588 -0.04227433  0.00032539  0.00080445 -0.0144141 ]
lambda =  10 Accuracy of predictor in test data =  0.912658927584
lambda =  100 coefficient vector theta:  [-0.00720956 -0.04203687  0.00032538  0.00080438 -0.01444753]
lambda =  100 Accuracy of pred