In [1]:
from IPython.core.interactiveshell import InteractiveShell
# Display the value of every top-level expression in a cell, not only the last one.
# Cells further down rely on this to show several results from one cell.
InteractiveShell.ast_node_interactivity = "all"

from collections import Counter
import numpy as np
import matplotlib.pyplot as plt

In [8]:
def loadDataSet(filePath):
    '''Read a comma-separated data file into feature rows and labels.

    Every non-blank line is split on ","; all fields except the last are
    converted to float and form one sample, and the last field is the label.
    A label of '?' is kept as the string '?'; any other label is converted
    to float.

    Each parsed row is printed as it is read, followed by a short summary
    (sample dimension, number of samples, label frequency).

    Args:
        filePath: path of the text file to read.

    Returns:
        (data, label): ``data`` is a list of lists of float, ``label`` is a
        list whose items are floats or the string '?'. Both are empty when
        the file contains no samples.

    Raises:
        ValueError: if a feature, or a label other than '?', is not a valid
            float literal.
    '''
    data, label = [], []
    # Read the data set line by line
    with open(filePath) as f:
        for line in f:
            line = line.strip()
            # A blank line (e.g. an extra newline at the end of the file)
            # carries no sample; parsing it would fail on float('').
            if not line:
                continue
            temp = line.split(",")
            print(temp)
            data.append([float(i) for i in temp[:-1]])
            # '?' marks an unlabelled sample and is kept as-is
            label.append(temp[-1] if temp[-1] == '?' else float(temp[-1]))
    ##### print a summary of the data set #####
    # Guard against an empty data set: there is no first row to measure
    print("\ndata dimension of dataset：", len(data[0]) if data else 0)
    print("number of training data :", len(data))
    print("label frequency:", dict(Counter(label)))
    ##### end of summary #####
    return data, label

In [9]:
# Load the training samples and their labels; loadDataSet prints every parsed row and a summary.
trainSet, trainSet_label = loadDataSet("fri56train.csv")

['2.4104', '2.0597', '1.8863', '2.646', '3.4419', '3.4161', '-1']
['0', '0', '0.66582', '-1.3125', '5.4326', '5.605', '-1']
['4.625', '1.6094', '3.0584', '7.2627', '3.8202', '3.9627', '-1']
['7.2286', '5.5099', '4.4569', '7.5059', '0', '1.4612', '1']
['8.875', '-2.7671', '0', '2.1675', '-5.0326', '5.0541', '1']
['-4.6052', '3.7136', '-7.467', '1', '0', '-5.2147', '-1']
['-8.6378', '4.8647', '0', '2.6785', '4.7847', '4.7464', '-1']
['7.8943', '0', '-10.267', '1.0132', '4.0489', '-4.0372', '1']

data dimension of dataset： 6
number of training data : 8
label frequency: {1.0: 3, -1.0: 5}


In [13]:
# Load the test samples; the returned labels are discarded (the printed rows show them as '?').
testSet, _ = loadDataSet("fri56test.csv")

['1', '4.5468', '-4.5072', '4.585', '4.585', '3.3193', '?']
['2.3104', '4.6052', '6.5469', '4.6052', '3.4562', '4.6052', '?']

data dimension of dataset： 6
number of training data : 2
label frequency: {'?': 2}


In [12]:
class PLA_origin:
    '''Basic perceptron learning algorithm (PLA) for labels +1 / -1.

    The weight vector ``self.w`` stores the bias weight first (initialised
    from ``theta``), followed by one weight per feature (initialised from
    ``w0``). Every sample is extended with a leading constant 1 so that the
    bias takes part in the dot product.
    '''

    def __init__(self, w0, theta=0):
        '''Initialise the weight vector.

        Args:
            w0: initial feature weights, any 1-D sequence (list, tuple or
                ndarray).
            theta: initial bias weight, stored as ``w[0]``.
        '''
        # np.concatenate instead of ``[theta] + w0``: list "+" raises for
        # tuples and silently becomes element-wise addition for ndarrays,
        # which would drop the bias slot.
        features = np.asarray(w0, dtype='float64').ravel()
        self.w = np.concatenate(([theta], features)).astype('float64')

    def __addOne2Samples(self, dataSet):
        '''Return the samples as a 2-D array with a constant 1 prepended to each row.'''
        dataSet = np.array(dataSet)
        ones = np.ones(len(dataSet))
        return np.column_stack((ones, dataSet))

    def fit(self, trainSet, label, maxRunTimes=100):
        '''Train the weight vector ``w`` on the given samples and labels.

        Samples are visited in order; whenever the sign of ``sample . w``
        differs from the sample's label, ``w`` is updated in place by
        ``label * sample``. A dot product of exactly 0 has sign 0 and is
        therefore treated as a mistake.

        Args:
            trainSet: sequence of samples, each a sequence of feature values.
            label: one label (+1 or -1) per sample.
            maxRunTimes: maximum number of passes over the training set
                (an int).

        Raises:
            ValueError: if the number of labels differs from the number of
                samples.
        '''
        label = np.asarray(label, dtype='float64').ravel()
        # Prepend the constant 1 to every sample
        trainSet = self.__addOne2Samples(trainSet)
        if label.shape[0] != trainSet.shape[0]:
            raise ValueError(
                "trainSet has %d samples but label has %d entries"
                % (trainSet.shape[0], label.shape[0])
            )
        for _ in range(maxRunTimes):
            updated = False
            # Visit every sample and correct w on each misclassified one
            for sample, target in zip(trainSet, label):
                if np.sign(np.dot(sample, self.w)) != target:
                    self.w += target * sample
                    updated = True
            # A pass without any update means further passes cannot change
            # w either, so stopping here yields the same final weights.
            if not updated:
                break

    def apply(self, otherSet):
        '''Classify another data set with the trained ``w``.

        Args:
            otherSet: sequence of samples, each a sequence of feature values.

        Returns:
            1-D float array holding ``sign(sample . w)`` for every sample
            (+1, -1, or 0 for a sample exactly on the boundary).
        '''
        otherSet = self.__addOne2Samples(otherSet)
        # Nothing to classify: return an empty result instead of attempting
        # a matrix product on an array with no rows.
        if otherSet.shape[0] == 0:
            return np.zeros(0)
        return np.sign(otherSet @ self.w)

    def getW(self):
        '''Return the weight vector (the array held by the instance, not a copy).'''
        return self.w

############ smoke test ###################
# Tiny two-sample problem, small enough to check the learned weights by hand.
a = [[-4, -1], [0, 3]] # trainSet
b = [1,-1]             # label
c = [1, 1]             # w0
d = 1                  # theta
e = [[-2, 3]]          # otherSet

p = PLA_origin(w0=c, theta=d) # build the PLA instance
p.fit(a, b, maxRunTimes=10)      # train the PLA parameter w on the given samples and labels
print("最终训练得到的参数 w 为：", p.getW())
print("对测试数据集 %s 划分的结果为： %s " %(e, p.apply(e)))
############ end of smoke test ###################

最终训练得到的参数 w 为： [ 1. -3. -3.]
对测试数据集 [[-2, 3]] 划分的结果为： [-1.] 


In [15]:
# Train on the loaded training set, starting from all-ones feature weights and bias 1,
# with maxRunTimes=10, then classify the test samples.
pla = PLA_origin(w0=[1]*len(trainSet[0]), theta=1)
pla.fit(trainSet, trainSet_label, maxRunTimes=10)
ansLabel1 = pla.apply(testSet)
# Bare expressions: both values are shown because ast_node_interactivity is set to "all".
pla.getW()
ansLabel1

array([ -3.    ,  18.6843,  -0.9048,   4.2036,  -7.1952, -16.7374, -10.3606])

array([-1., -1.])