In [10]:
import pandas as pd
import numpy as np
from pandas import Series,DataFrame

data = pd.read_csv("https://archive.ics.uci.edu/ml/machine-learning-databases/adult/adult.data",header=None)
data.columns = ['age','workclass','fnlwgt','education','education-num','marital-status','occupation','relationship','race','sex','capital-gain','capital-loss','hours-per-week','native-country','salary-class']
occ = data['occupation'].value_counts()
wor = data['workclass'].value_counts()
capacity = wor.sum() #32561

# data extraction：6 continuous features
# age, education-num, capital-gain, capital-loss, hours-per-week,fnlwgt
age = data['age']
eduNum = data['education-num']
capGain = data['capital-gain']
capLoss = data['capital-loss']
hpw = data['hours-per-week']
fnl = data['fnlwgt']
sex = data['sex']
salary = data['salary-class']
assert ' ?' not in age.value_counts().index  # no missing data
assert ' ?' not in eduNum.value_counts().index
assert ' ?' not in hpw.value_counts().index
assert ' ?' not in capGain.value_counts().index
assert ' ?' not in capLoss.value_counts().index
assert ' ?' not in fnl.value_counts().index
assert ' ?' not in sex.value_counts().index
assert ' ?' not in salary.value_counts().index
age = age.tolist()
eduNum = eduNum.tolist()
capGain = capGain.tolist()
capLoss = capLoss.tolist()
hpw = hpw.tolist()
fnl = fnl.tolist()
sex = sex.tolist()
salary = salary.tolist()
fn = [f/1000 for f in fnl]

In [11]:
# Logistic Regression
import scipy.optimize
import random
from sklearn import svm
from math import exp
from math import log
import numpy

def inner(x,y):
    return sum([x[i]*y[i] for i in range(len(x))])

def sigmoid(x):
    return 1.0 / (1 + exp(-x))

# NEGATIVE Log-likelihood
def f(theta, X, y, lam):
    loglikelihood = 0
    for i in range(len(X)):
        logit = inner(X[i], theta)
        loglikelihood -= log(1 + exp(-logit))
        if not(y[i]==True):
            loglikelihood -= logit
    for k in range(len(theta)):
        loglikelihood -= lam * theta[k]*theta[k]
    #print ("ll =", loglikelihood)
    return -loglikelihood

# NEGATIVE Derivative of log-likelihood
def fprime(theta, X, y, lam):
    dl = [0.0]*len(theta)
    for k in range(len(theta)): 
        for i in range(len(X)):
            logit = inner(X[i], theta)
            dl[k] += X[i][k]*(exp(-logit))/(1 + exp(-logit))
            if not(y[i]==True):
                dl[k] -= X[i][k]
        dl[k] -= 2*lam*theta[k]
  # Negate the return value since we're doing gradient *ascent*
    return numpy.array([-x for x in dl])  

X = []
Y = []
for i in range(capacity):
    X.append([age[i],eduNum[i],capGain[i],capLoss[i],hpw[i],fn[i],sex[i]==' Male'])
    Y.append(salary[i] == ' >50K')

scale_half = int(capacity/2)
X_train = X[:scale_half]
Y_train = Y[:scale_half]
X_valid = X[scale_half:]
Y_valid = Y[scale_half:]
theta,l,info = scipy.optimize.fmin_l_bfgs_b(f, [0]*len(X[0]), fprime, args = (X_train, Y_train, 1.0))
print "Final log likelihood =", -l

YPre_train = []
for i in range(scale_half):
    x_theta = 0
    for j in range(len(theta)):
        x_theta += X_train[i][j]*theta[j]
    if x_theta > 0:
        YPre_train.append(1)
    else:
        YPre_train.append(0)
count = 0
for i in range(scale_half):
    if Y_train[i] == YPre_train[i]:
        count = count + 1
acc = count/float(scale_half)
print "Accuracy of predictor in train data = ", acc

YPre_valid = []
for i in range(scale_half):
    x_theta = 0
    for j in range(len(theta)):
        x_theta += X_valid[i][j]*theta[j]
    if x_theta > 0:
        YPre_valid.append(1)
    else:
        YPre_valid.append(0)
count = 0
for i in range(scale_half):
    if Y_valid[i] == YPre_valid[i]:
        count = count + 1
acc = count/float(scale_half)
print "Accuracy of predictor in valid data = ", acc

Final log likelihood = -8329.14535628004
Accuracy of predictor in train data =  0.796744471744
Accuracy of predictor in valid data =  0.799447174447


In [12]:
# Accuracy with lambda
for lam in [0,0.01,0.1,1,10,100,1000,10000,100000,1000000]:
    theta,l,info = scipy.optimize.fmin_l_bfgs_b(f, [0]*len(X[0]), fprime, args = (X_train, Y_train, lam))
    #print "Final log likelihood =", -l
    print "lambda = ",lam, "coefficient vector theta: ",theta

    YPre_train = []
    for i in range(scale_half):
        x_theta = 0
        for j in range(len(theta)):
            x_theta += X_train[i][j]*theta[j]
        if x_theta > 0:
            YPre_train.append(1)
        else:
            YPre_train.append(0)
    count = 0
    for i in range(scale_half):
        if Y_train[i] == YPre_train[i]:
            count = count + 1
    acc = count/float(scale_half)
    print "lambda = ", lam, "Accuracy of predictor in train data = ", acc

    YPre_valid = []
    for i in range(scale_half):
        x_theta = 0
        for j in range(len(theta)):
            x_theta += X_valid[i][j]*theta[j]
        if x_theta > 0:
            YPre_valid.append(1)
        else:
            YPre_valid.append(0)
    count = 0
    for i in range(scale_half):
        if Y_valid[i] == YPre_valid[i]:
            count = count + 1
    acc = count/float(scale_half)
    print "lambda = ", lam, "Accuracy of predictor in valid data = ", acc

lambda =  0 coefficient vector theta:  [-6.35349623e-03 -1.50877962e-02  3.20492440e-04  7.73964831e-04
 -1.48217915e-02 -3.85340929e-03  5.72306857e-01]
lambda =  0 Accuracy of predictor in train data =  0.796805896806
lambda =  0 Accuracy of predictor in valid data =  0.799385749386
lambda =  0.01 coefficient vector theta:  [-6.35237566e-03 -1.50825774e-02  3.20490729e-04  7.74022304e-04
 -1.48224820e-02 -3.85363078e-03  5.72214665e-01]
lambda =  0.01 Accuracy of predictor in train data =  0.796805896806
lambda =  0.01 Accuracy of predictor in valid data =  0.799385749386
lambda =  0.1 coefficient vector theta:  [-6.35346976e-03 -1.50777664e-02  3.20493825e-04  7.74106621e-04
 -1.48186682e-02 -3.85413412e-03  5.72065511e-01]
lambda =  0.1 Accuracy of predictor in train data =  0.796805896806
lambda =  0.1 Accuracy of predictor in valid data =  0.799385749386
lambda =  1 coefficient vector theta:  [-6.35032827e-03 -1.50748478e-02  3.20522006e-04  7.73677630e-04
 -1.47905294e-02 -3.851

In [13]:
# test set
import pandas as pd
import numpy as np
from pandas import Series,DataFrame

data = pd.read_csv("adult.test.txt",header=None)
data.columns = ['age','workclass','fnlwgt','education','education-num','marital-status','occupation','relationship','race','sex','capital-gain','capital-loss','hours-per-week','native-country','salary-class']
occ = data['occupation'].value_counts()
wor = data['workclass'].value_counts()
capacity = wor.sum()
print "size of test set: ", capacity

# data extraction：6 continuous features
# age, education-num, capital-gain, capital-loss, hours-per-week,fnlwgt
age = data['age']
eduNum = data['education-num']
capGain = data['capital-gain']
capLoss = data['capital-loss']
hpw = data['hours-per-week']
fnl = data['fnlwgt']
sex = data['sex']
salary = data['salary-class']
assert ' ?' not in age.value_counts().index  # no missing data
assert ' ?' not in eduNum.value_counts().index
assert ' ?' not in hpw.value_counts().index
assert ' ?' not in capGain.value_counts().index
assert ' ?' not in capLoss.value_counts().index
assert ' ?' not in fnl.value_counts().index
assert ' ?' not in sex.value_counts().index
assert ' ?' not in salary.value_counts().index
age = age.tolist()
eduNum = eduNum.tolist()
capGain = capGain.tolist()
capLoss = capLoss.tolist()
hpw = hpw.tolist()
fnl = fnl.tolist()
sex = sex.tolist()
salary = salary.tolist()
fn = [f/1000 for f in fnl]

size of test set:  16281


In [14]:
# Logistic Regression
import scipy.optimize
import random
from sklearn import svm
from math import exp
from math import log
import numpy

def inner(x,y):
    return sum([x[i]*y[i] for i in range(len(x))])

def sigmoid(x):
    return 1.0 / (1 + exp(-x))

# NEGATIVE Log-likelihood
def f(theta, X, y, lam):
    loglikelihood = 0
    for i in range(len(X)):
        logit = inner(X[i], theta)
        loglikelihood -= log(1 + exp(-logit))
        if not(y[i]==True):
            loglikelihood -= logit
    for k in range(len(theta)):
        loglikelihood -= lam * theta[k]*theta[k]
    #print ("ll =", loglikelihood)
    return -loglikelihood

# NEGATIVE Derivative of log-likelihood
def fprime(theta, X, y, lam):
    dl = [0.0]*len(theta)
    for k in range(len(theta)): 
        for i in range(len(X)):
            logit = inner(X[i], theta)
            dl[k] += X[i][k]*(exp(-logit))/(1 + exp(-logit))
            if not(y[i]==True):
                dl[k] -= X[i][k]
        dl[k] -= 2*lam*theta[k]
  # Negate the return value since we're doing gradient *ascent*
    return numpy.array([-x for x in dl])  

X_test = []
Y_test = []
for i in range(capacity):
    X_test.append([age[i],eduNum[i],capGain[i],capLoss[i],hpw[i],fn[i],sex[i]==' Male'])
    Y_test.append(salary[i] == ' >50K')
    
for lam in [0,0.01,0.1,1,10,100,1000,10000,100000,1000000]:
    theta,l,info = scipy.optimize.fmin_l_bfgs_b(f, [0]*len(X[0]), fprime, args = (X_train, Y_train, lam))
    #print "Final log likelihood =", -l
    print "lambda = ",lam, "coefficient vector theta: ",theta

    YPre_test = []
    for i in range(capacity):
        x_theta = 0
        for j in range(len(theta)):
            x_theta += X_test[i][j]*theta[j]
        if x_theta > 0:
            YPre_test.append(1)
        else:
            YPre_test.append(0)
    count = 0
    for i in range(capacity):
        if Y_test[i] == YPre_test[i]:
            count = count + 1
    acc = count/float(capacity)
    print "lambda = ", lam, "Accuracy of predictor in test data = ", acc


lambda =  0 coefficient vector theta:  [-6.35349623e-03 -1.50877962e-02  3.20492440e-04  7.73964831e-04
 -1.48217915e-02 -3.85340929e-03  5.72306857e-01]
lambda =  0 Accuracy of predictor in test data =  0.913887353357
lambda =  0.01 coefficient vector theta:  [-6.35237566e-03 -1.50825774e-02  3.20490729e-04  7.74022304e-04
 -1.48224820e-02 -3.85363078e-03  5.72214665e-01]
lambda =  0.01 Accuracy of predictor in test data =  0.913887353357
lambda =  0.1 coefficient vector theta:  [-6.35346976e-03 -1.50777664e-02  3.20493825e-04  7.74106621e-04
 -1.48186682e-02 -3.85413412e-03  5.72065511e-01]
lambda =  0.1 Accuracy of predictor in test data =  0.913887353357
lambda =  1 coefficient vector theta:  [-6.35032827e-03 -1.50748478e-02  3.20522006e-04  7.73677630e-04
 -1.47905294e-02 -3.85182770e-03  5.69800551e-01]
lambda =  1 Accuracy of predictor in test data =  0.913948774645
lambda =  10 coefficient vector theta:  [-6.27019415e-03 -1.49447435e-02  3.20439672e-04  7.74021801e-04
 -1.46054