In [7]:
from IPython.core.interactiveshell import InteractiveShell
# Show the value of every top-level expression in a cell, not only the last one.
InteractiveShell.ast_node_interactivity = "all"

from collections import Counter
import numpy as np
import matplotlib.pyplot as plt
from functools import reduce
import pdir as pr

In [13]:
def loadDataSet(filePath):
    '''Read a whitespace-separated data set from a text file.

    Every non-blank line holds one sample: all fields but the last are parsed
    as float features, and the last field is the label. A label of '?' is kept
    as the string '?'; any other label is converted to float.

    Fields may be separated by any run of spaces or tabs, and blank lines
    (for example a trailing empty line) are skipped.

    Args:
        filePath: path of the text file to read.

    Returns:
        A tuple (data, label) of numpy arrays built from the parsed features
        and labels. Both are empty when the file contains no samples.

    Raises:
        ValueError: if a feature, or a label other than '?', is not a number.
    '''
    data, label = [], []
    with open(filePath) as f:
        for line in f:
            fields = line.split()
            if not fields:
                # Blank line: nothing to parse.
                continue
            data.append([float(value) for value in fields[:-1]])
            tag = fields[-1]
            label.append(tag if tag == '?' else float(tag))
    # Summary of what was loaded.
    print("data dimension of dataset：", len(data[0]) if data else 0)
    print("number of sample in data :", len(data))
    print("label frequency:", dict(Counter(label)))
    return np.array(data), np.array(label)

class LogisticRegression:
    '''Binary logistic regression trained with batch gradient descent.

    A constant 1 is prepended to every sample, so the first weight is the
    bias term. After fit() the weights are stored in self.w as a column
    vector of shape (n_features + 1, 1); getW() returns them as a flat vector.

    Only plain ndarrays are used internally. The np.mat alias that an earlier
    version relied on no longer exists in NumPy 2.0.
    '''

    def __addOne2Samples(self, dataSet):
        '''Prepend a constant 1 (the bias input) to every sample.'''
        ones = np.ones(len(dataSet))
        return np.column_stack((ones, dataSet))

    def sigmoid(self, x):
        '''Logistic function 1 / (1 + exp(-x)), evaluated without overflow.

        Args:
            x: scalar or array-like of real values.

        Returns:
            A numpy scalar for scalar input, otherwise an ndarray with the
            same shape as x.
        '''
        z = np.asarray(x, dtype=float)
        # exp is only taken of non-positive values, so it can never overflow.
        decay = np.exp(-np.abs(z))
        out = np.where(z >= 0, 1.0 / (1.0 + decay), decay / (1.0 + decay))
        # Indexing with () turns a 0-d result into a scalar and leaves
        # higher-dimensional arrays untouched.
        return out[()]

    def fit(self, trainSet, label, eta=1e-3, maxRunTimes=100, reduceEta=True):
        '''Learn the weight vector w from a training set.

        Args:
            trainSet: 2-D array-like, one sample per row.
            label: array-like of 0/1 labels, one per sample.
            eta: base learning rate.
            maxRunTimes: number of gradient-descent iterations.
            reduceEta: when true, iteration i (counting from 0) uses the step
                size eta / (1 + i) + 1e-7 instead of a constant eta.

        Raises:
            ValueError: if trainSet and label hold different numbers of samples.
        '''
        samples = self.__addOne2Samples(np.asarray(trainSet, dtype=float))
        targets = np.asarray(label, dtype=float).reshape(-1, 1)
        if samples.shape[0] != targets.shape[0]:
            raise ValueError(
                "trainSet has %d samples but label has %d entries"
                % (samples.shape[0], targets.shape[0]))
        # Start from an all-zero weight vector (bias included).
        self.w = np.zeros((samples.shape[1], 1))
        eta_reduced = eta
        for i in range(maxRunTimes):
            if reduceEta:
                eta_reduced = eta / (1 + i) + 1e-7
            # Gradient over the whole training set: X^T (sigmoid(X w) - y).
            gradient = samples.T.dot(self.sigmoid(samples.dot(self.w)) - targets)
            self.w -= eta_reduced * gradient

    def __apply(self, x):
        '''Classify one sample x that already carries the leading constant 1.'''
        w = self.getW()
        return 1 if self.sigmoid(np.dot(w, x)) > 0.5 else 0

    def apply(self, otherSet):
        '''Classify every sample of otherSet with the trained weights.

        Args:
            otherSet: 2-D array-like, one sample per row (without the bias column).

        Returns:
            1-D float array with one entry per sample: 1.0 where the predicted
            probability exceeds 0.5, otherwise 0.0.
        '''
        samples = self.__addOne2Samples(np.asarray(otherSet, dtype=float))
        probabilities = self.sigmoid(samples.dot(self.getW()))
        return (probabilities > 0.5).astype(float)

    def getW(self):
        '''Return the trained weights as a 1-D array, bias first.'''
        return np.array(self.w)[:, 0]
    
    
# Train with a single gradient step (maxRunTimes=1) at learning rate eta=1,
# then show the predictions for the test samples and the learned weights.
# NOTE(review): trainSet, trainSet_label and testSet are assigned by the
# In [10] cell that appears further down in this file; that cell has to be
# run before this one.
LR = LogisticRegression()
LR.fit(trainSet, trainSet_label, eta=1, maxRunTimes=1)
LR.apply(testSet)
LR.getW()

array([ 0.,  0.,  0.])

array([-0.50000005, -1.50000015, -5.0000005 ,  2.0000002 ,  3.50000035,
        0.50000005, -2.50000025])

In [10]:
# Load the training and test sets. The relative paths use forward slashes so
# that they resolve on POSIX systems as well as on Windows.
trainSet, trainSet_label = loadDataSet('./data/train.txt')
trainSet
# NOTE: "testSet_lable" is a misspelling of "label"; the name is kept as-is
# because other cells may refer to it.
testSet, testSet_lable = loadDataSet('./data/test.txt')
testSet

data dimension of dataset： 6
number of sample in data : 5
label frequency: {0.0: 3, 1.0: 2}


array([[ 4.,  6.,  2.,  3.,  4.,  5.],
       [ 3.,  1.,  6.,  8.,  9.,  3.],
       [ 6.,  5.,  2.,  0.,  3.,  4.],
       [ 5.,  2.,  7.,  3.,  5.,  1.],
       [ 1.,  2.,  5.,  1.,  6.,  0.]])

data dimension of dataset： 6
number of sample in data : 3
label frequency: {'?': 3}


array([[ 7.,  2.,  5.,  2.,  6.,  1.],
       [ 1.,  2.,  1.,  1.,  5.,  2.],
       [ 7.,  6.,  2.,  1.,  5.,  7.]])