python实现logistic regression

In [282]:
import math
import os
import time
import random
import numpy as np
from multiprocessing import Process, Array, Lock, Value

In [373]:
def lordData(path):
    data = []
    label = []
    with open(path, 'r') as fr:
        for line in fr.readlines():
            index = line.find(' ')
            data.append(line[index + 1 : ])
            label.append(int(line[0]))
    return data, label

def parseData(data):
    dataDictionary = {}
    features = data.strip().split() #先去除换行符等，而后根据空格切片
    for item in features:
        index,fea = item.split(':') #解析每个index的特征
        dataDictionary[int(index)] = fea
        #print dataDictionary
    return dataDictionary  #返回稀疏特征

def normalization(dataDict):
    #对数据进行归一化
    data_sum = sum(float(data) for i,data in dataDict.items())
    data_sum_sq = sum(float(data)*float(data) for i,data in dataDict.items())
    data_mean = float(data_sum / 201)  #均值
    data_mean_sq = float(data_sum_sq / 201) #平方的均值
    data_var = data_mean_sq - data_mean * data_mean #方差
    data_std = math.sqrt(data_var)
    #print data_mean
    #print data_mean_sq
    #print data_std
    result = {}
    for i in range(202):
        if i == 0:
            continue
        if dataDict.has_key(i):
            result[i] = float((float(dataDict[i]) - data_mean) / data_std)
        else:
            result[i] = float((0 - data_mean) / data_std)
    return result
    
def sigmoid(x):
    if x > 30:
        return 1.0
    if x < -30:
        return 0.0
    return 1.0/(1 + math.exp(-x))

def predict(weights, dataDict):
    z = 0.0
    for index in dataDict:
        z = z + weights[index] * dataDict[index]
    z = z + weights[0] #加上常数项
    return z

def calAcc(weights, val_data, val_label, iter):
    hit = 0
    num = len(val_data)
    for i in range(num):
        dataDict = val_data[i]
        z = predict(weights, dataDict)
        if ((z >= 0) and val_label[i] == 1) or ((z < 0) and val_label[i] == 0):
            hit = hit + 1
        acc = float(hit) / num
    print 'iter', iter, ': accuracy = ', hit, '/', num, '=', acc
        
    
def LogisticRegression(tdata, tlabel, val_data, val_label, param = {'learning_rate':0.01, 'batch_size':1000, 'max_iter':1000}):
    #获取参数学习率，若无设置，默认为0.01
    if param.has_key('learning_rate'):
        learning_rate = param['learning_rate']
    else:
        learning_rate = 0.01
    #获取参数batchsize，若无设置，默认为1000
    if param.has_key('batch_size'):
        batch = param['batch_size']
    else:
        batch = 1000
    #获取参数最大迭代数，若无设置，默认为1000
    if param.has_key('max_iter'):
        max_iter = param['max_iter']
    else:
        max_iter = 1000
    
    #获取特征的系数，共有201个特征，加上常数项，因此有202个需要学习的系数
    #这里采用随机初始化为[0,1)间的值
    weights = [random.random() for i in range(202)]
    totalTime = 0.0
    #通过minibatch随机梯度下降进行系数的更新
    for iter in range(max_iter):
        start = time.time()
        #rindex = int(random.uniform(0, len(tdata) - batch))
        rindex = int(0 + iter * batch)
        while (rindex >= len(tdata)):
            rindex = rindex - len(tdata)
        #weights_temp = [weights[i] for i in range(202)]
        error = np.zeros(201)
        loss = 0.0
        for i in range(batch):
            dataDict = tdata[rindex + i]
            if (tlabel[rindex + i] == 1):
                loss = loss - math.log(sigmoid(predict(weights, dataDict)))
            else:
                loss = loss - math.log(1 - sigmoid(predict(weights, dataDict))) 
            e_temp = sigmoid(predict(weights, dataDict)) - tlabel[rindex + i] 
            for index in dataDict:
                error[index - 1] = error[index - 1] + e_temp * dataDict[index]
        #计算loss
        loss = loss / batch
        print 'loss:',loss
        for index in dataDict:
            weights[index] = weights[index] - learning_rate * (float(1)/batch) * error[index - 1]
        end = time.time()
        totalTime = totalTime + float(end - start)
        if ((iter + 1) % 10 == 0):
            calAcc(weights, val_data, val_label, iter + 1)
    print 'iter cost:', float(totalTime) / max_iter, 's in average.'
    return weights

In [52]:
tdata, tlabel = lordData("./train.txt")

In [139]:
temp = parseData(tdata[0])
t = normalization(temp)

In [255]:
train_data = []
for i in range(100000):
    temp1 = parseData(tdata[i])
    temp2 = normalization(temp1)
    train_data.append(temp2)

In [276]:
w = LogisticRegression(train_data, tlabel,val_data, val_label)

loss: 1.71067992121
loss: 1.63632032552
loss: 1.55157230346
loss: 1.48540930206
loss: 1.48630348971
loss: 1.30982765716
loss: 1.2477162819
loss: 1.18068695413
loss: 1.12925550802
loss: 1.14982796815
loss: 1.10711832589
loss: 1.06516425072
loss: 1.05120964256
loss: 1.02281088489
loss: 1.02205683334
loss: 0.973219044153
loss: 0.964416915587
loss: 0.954565216854
loss: 0.923673390111
loss: 0.93291751954
loss: 0.866751145971
loss: 0.888044014319
loss: 0.798085451823
loss: 0.848373541407
loss: 0.851887309743
loss: 0.793308528491
loss: 0.847830500216
loss: 0.811557383334
loss: 0.754539282711
loss: 0.772566531104
loss: 0.736589920222
loss: 0.770809051595
loss: 0.764995125496
loss: 0.750135394852
loss: 0.747155032633
loss: 0.726490136932
loss: 0.708239486074
loss: 0.703318005474
loss: 0.661785965625
loss: 0.706274630568
loss: 0.662052185434
loss: 0.681580191999
loss: 0.672145707115
loss: 0.647304750962
loss: 0.668906591981
loss: 0.669552569043
loss: 0.629549315485
loss: 0.632892881467
loss: 0.6

loss: 0.567900315149
loss: 0.579932034508
loss: 0.592152478494
loss: 0.598192112055
loss: 0.557148776955
loss: 0.600866694948
loss: 0.614875790894
loss: 0.55045059924
loss: 0.582639713547
loss: 0.574811113987
loss: 0.578044187149
loss: 0.575428122311
loss: 0.5760894501
iter 400 : accuracy =  686 / 1000 = 0.686
loss: 0.586736323096
loss: 0.582069746165
loss: 0.525878314508
loss: 0.595156214479
loss: 0.565568749987
loss: 0.555255224663
loss: 0.579614426961
loss: 0.59229745512
loss: 0.581751704539
loss: 0.555925354276
loss: 0.584145446183
loss: 0.573672236316
loss: 0.575355035581
loss: 0.58466410873
loss: 0.558591722287
loss: 0.566219906168
loss: 0.555937066601
loss: 0.584797400575
loss: 0.59027678261
loss: 0.599098790381
loss: 0.555907369091
loss: 0.603361250601
loss: 0.586606682531
loss: 0.556314217904
loss: 0.571462349235
loss: 0.546312010205
loss: 0.575182294476
loss: 0.612683442943
loss: 0.546985580078
loss: 0.570972443189
loss: 0.566666913742
loss: 0.583943547863
loss: 0.60048202948

loss: 0.547260699065
loss: 0.571831274755
loss: 0.548424677195
loss: 0.545742479896
loss: 0.563120629304
loss: 0.563229536521
loss: 0.590209814383
loss: 0.583896217571
loss: 0.56322769495
loss: 0.577890529835
loss: 0.579186508085
loss: 0.570235404244
loss: 0.550940521655
loss: 0.569719304358
loss: 0.523472085711
loss: 0.571179065804
loss: 0.553970550726
loss: 0.567938084565
loss: 0.580200089585
loss: 0.582831329138
loss: 0.544742039673
loss: 0.58610505726
loss: 0.597422060779
loss: 0.542143563293
loss: 0.573108529352
loss: 0.566495647024
loss: 0.568190894275
loss: 0.56722861332
loss: 0.566444736496
iter 800 : accuracy =  690 / 1000 = 0.69
loss: 0.577022954361
loss: 0.564408311457
loss: 0.523120023352
loss: 0.579438938944
loss: 0.559233985765
loss: 0.546044604073
loss: 0.570226804036
loss: 0.584354211662
loss: 0.572377496775
loss: 0.551979507791
loss: 0.568701215752
loss: 0.566053560394
loss: 0.565879890241
loss: 0.570966243286
loss: 0.551618008471
loss: 0.553855253409
loss: 0.549666297

In [219]:
len(w)

202

In [259]:
math.log(10)

2.302585092994046

In [270]:
val_data = []
val_label = []
for i in range(200000,201000):
    temp1 = parseData(tdata[i])
    temp2 = normalization(temp1)
    val_data.append(temp2)
    val_label.append(tlabel[i])

In [272]:
len(val_label)

1000

In [376]:
def MultiLR(tdata, tlabel, val_data, val_label, param = {'learning_rate':0.01, 'batch_size':100000, 'max_iter':10, 'n_jobs':10}):
    #获取参数学习率，若无设置，默认为0.01
    if param.has_key('learning_rate'):
        learning_rate = param['learning_rate']
    else:
        learning_rate = 0.01
    #获取参数batchsize，若无设置，默认为1000
    if param.has_key('batch_size'):
        batch = param['batch_size']
    else:
        batch = 1000
    #获取参数最大迭代数，若无设置，默认为1000
    if param.has_key('max_iter'):
        max_iter = param['max_iter']
    else:
        max_iter = 1000
    #获取进程数，若无设置，默认为10
    if param.has_key('n_jobs'):
        n_jobs = param['n_jobs']
    else:
        n_jobs = 2
        
    weights = [random.random() for i in range(202)]
    num_of_jobs = batch / n_jobs
    totalTime = 0.0
    for iter in range(max_iter):
        start = time.time()
        lock = Lock()
        rindex = int(0 + iter * batch)
        while (rindex >= len(tdata)):
            rindex = rindex - len(tdata)
        error = Array('f', np.zeros(201))
        loss = Value('f', 0.0)
        #开启多进程
        processes = []
        for i in range(0, batch, num_of_jobs):
            if i + num_of_jobs > batch:
                continue
            process = Process(target=cal_loss, \
                args=(weights, error, loss, tdata[rindex+i:rindex+i+num_of_jobs], tlabel[rindex+i:rindex+i+num_of_jobs], lock))
            processes.append(process)
        
        #启动多进程
        for i in range(len(processes)):
            processes[i].start()
            
        #等待多进程结束
        for i in range(len(processes)):
            processes[i].join()
            #print 'Process ', i, 'ended.'
            
        losses = loss.value / batch
        print 'losses:',losses
        for index in range(201):
            weights[index + 1] = weights[index + 1] - learning_rate * (float(1)/batch) * error[index]
        end = time.time()
        totalTime = totalTime + float(end - start)
        if ((iter + 1) % 10 == 0):
            calAcc(weights, val_data, val_label, iter + 1)
    print 'iter cost:', float(totalTime) / max_iter, 's in average.'
    return weights

def cal_loss(weights, error, loss, tdata, tlabel, lock):
    losstemp = loss.value
    error_temp = [error[i] for i in range(201)]
        #print "error_temp",error_temp
    for i in range(len(tdata)):
        dataDict = tdata[i]
        if (tlabel[i] == 10):
            losstemp = losstemp - math.log(sigmoid(predict(weights, dataDict)))
        else:
            losstemp = losstemp - math.log(1 - sigmoid(predict(weights, dataDict))) 
        e_temp = sigmoid(predict(weights, dataDict)) - tlabel[i] 
        for index in dataDict:
            error_temp[index - 1] = error_temp[index - 1] + e_temp * dataDict[index]
    with lock:
        loss.value += losstemp
        for i in range(201):
            error[i] += error_temp[i]

In [378]:
w1 = MultiLR(train_data, tlabel,val_data, val_label, param = {'learning_rate':0.01, 'batch_size':100000, 'max_iter':100, 'n_jobs':10})

losses: 5.8211075
losses: 5.538529375
losses: 4.3342103125
losses: 3.0888340625
losses: 2.580300625
losses: 1.98774203125
losses: 1.90831296875
losses: 1.80111078125
losses: 1.757274375
losses: 1.7442921875
iter 10 : accuracy =  592 / 1000 = 0.592
losses: 1.7469315625
losses: 1.75758640625
losses: 1.772221875
losses: 1.78860953125
losses: 1.80549421875
losses: 1.82217078125
losses: 1.83825265625
losses: 1.73792640625
losses: 1.867426875
losses: 1.88084640625
iter 20 : accuracy =  653 / 1000 = 0.653
losses: 1.89343109375
losses: 1.905191875
losses: 1.916159375
losses: 1.92637375
losses: 1.93587984375
losses: 1.9447228125
losses: 1.9529471875
losses: 1.96059453125
losses: 1.9677046875
losses: 1.97431484375
iter 30 : accuracy =  667 / 1000 = 0.667
losses: 1.98045828125
losses: 1.986166875
losses: 1.991469375
losses: 1.9963925
losses: 2.00096046875
losses: 2.00519640625
losses: 2.00912078125
losses: 2.01275234375
losses: 2.0161096875
losses: 2.01920859375
iter 40 : accuracy =  672 / 1000 =

In [379]:
w = LogisticRegression(train_data, tlabel,val_data, val_label, param = {'learning_rate':0.01, 'batch_size':100000, 'max_iter':100})

loss: 2.23651151833
loss: 2.17199044125
loss: 2.11114709575
loss: 2.05251778177
loss: 1.99528135642
loss: 1.93897950488
loss: 1.88336228751
loss: 1.82830291174
loss: 1.77375045862
loss: 1.71970363535
iter 10 : accuracy =  443 / 1000 = 0.443
loss: 1.66619637689
loss: 1.61329023199
loss: 1.56107063795
loss: 1.50964532115
loss: 1.4591436293
loss: 1.40971586544
loss: 1.36153180415
loss: 1.31477763885
loss: 1.26965072604
loss: 1.22635174257
iter 20 : accuracy =  457 / 1000 = 0.457
loss: 1.18507430828
loss: 1.14599274163
loss: 1.10924930175
loss: 1.07494280905
loss: 1.04312066365
loss: 1.01377581462
loss: 0.986849230155
loss: 0.962237207554
loss: 0.939801898539
loss: 0.919383036107
iter 30 : accuracy =  539 / 1000 = 0.539
loss: 0.90080906432
loss: 0.883906468571
loss: 0.868506780433
loss: 0.854451270692
loss: 0.841593665537
loss: 0.829801345803
loss: 0.818955483201
loss: 0.808950495491
loss: 0.799693110741
loss: 0.791101244663
iter 40 : accuracy =  597 / 1000 = 0.597
loss: 0.783102825038
los