In [20]:
from IPython.core.interactiveshell import InteractiveShell
InteractiveShell.ast_node_interactivity = "all"

from collections import Counter
import numpy as np
import matplotlib.pyplot as plt
from functools import reduce
import pdir as pr
import pandas as pd
import os
from tqdm import tqdm, tnrange, tqdm_notebook

DF = pd.DataFrame
arr = np.array

# 读取数据集

In [21]:
dirPath = "E:\\Code\\_largeData\\Github--Open-Course-Learning--A04\\Project\\multiclass classification\\data preprocessed\\tf-idf"

trainSet = np.loadtxt(dirPath + '\\train.csv', delimiter=",")
validateSet = np.loadtxt(dirPath + '\\validate.csv', delimiter=",")
testSet = np.loadtxt(dirPath + '\\test.csv', delimiter=",")


trainSetLabel = np.loadtxt(dirPath + '\\train_label.csv', delimiter=",")
validateSetLabel = np.loadtxt(dirPath + '\\validate_label.csv', delimiter=",")

trainSet.shape, validateSet.shape, testSet.shape

trainSetLabel.shape, validateSetLabel.shape

((43766, 1644), (18756, 1644), (8671, 1644))

((43766,), (18756,))

In [22]:
Counter(trainSetLabel)
Counter(validateSetLabel)

Counter({0.0: 13606, 1.0: 18255, 2.0: 11905})

Counter({0.0: 5692, 1.0: 7802, 2.0: 5262})

## 实现OVO

由于有OVO和OVA两种方式可选，但是在三元分类上，OVO的效果从理论上应比OVA的好，因此直接选择OVO。

两者需要跑的子模型是一样的。

为适应LR的模型输入，需要把不同类别的标签转换为0和1.

In [23]:
mp = {"01":{0:0, 1:1}, "02":{0:0, 2:1}, "12":{1:0, 2:1}}
mp["01"]
mp["02"]
mp["12"]

{0: 0, 1: 1}

{0: 0, 2: 1}

{1: 0, 2: 1}

01

In [24]:
idx_01 = np.logical_or(trainSetLabel == 0, trainSetLabel == 1)
trainSet_01 = trainSet[idx_01]
trainSetLabel_01 = trainSetLabel[idx_01]

trainSet_01.shape, trainSetLabel_01.shape

np.place(trainSetLabel_01, trainSetLabel_01==0, 0)
np.place(trainSetLabel_01, trainSetLabel_01==1, 1)
Counter(trainSetLabel_01)

((31861, 1644), (31861,))

Counter({0.0: 13606, 1.0: 18255})

02

In [25]:
idx_02 = np.logical_or(trainSetLabel == 0, trainSetLabel == 2)
trainSet_02 = trainSet[idx_02]
trainSetLabel_02 = trainSetLabel[idx_02]

trainSet_02.shape, trainSetLabel_02.shape

np.place(trainSetLabel_02, trainSetLabel_02==0, 0)
np.place(trainSetLabel_02, trainSetLabel_02==2, 1)
Counter(trainSetLabel_02)

((25511, 1644), (25511,))

Counter({0.0: 13606, 1.0: 11905})

12

In [26]:
idx_12 = np.logical_or(trainSetLabel == 1, trainSetLabel == 2)
trainSet_12 = trainSet[idx_12]
trainSetLabel_12 = trainSetLabel[idx_12]

trainSet_12.shape, trainSetLabel_12.shape

np.place(trainSetLabel_12, trainSetLabel_12==1, 0)
np.place(trainSetLabel_12, trainSetLabel_12==2, 1)
Counter(trainSetLabel_12)

((30160, 1644), (30160,))

Counter({0.0: 18255, 1.0: 11905})

# 计算平均准确率

In [28]:
def calcAvg(predict, actual):
    all_labels = list(set(actual))
    tot = 0.0
    for i in all_labels:
        counter = Counter(predict[actual == i] == i)
        hit_num, miss_num = counter[True], counter[False]
        tot += hit_num / (hit_num + miss_num)
    return tot/len(all_labels)

calcAvg(arr([1,2,3]), arr([3,2,1]))

0.3333333333333333

# 绘制图像

In [29]:
def show_fig(testRange, avgs):
    fig,ax = plt.subplots() 
    fig.set_size_inches(10, 4)
    plt.plot(testRange, avgs, 'g')
    plt.xlabel('eta')
    plt.ylabel('avg(%)')
    plt.title("avg versus eta with Logistic Regression")
    plt.show()
    plt.close()

# 逻辑回归

In [108]:
class LogisticRegression:
    '''逻辑回归类实现'''
    
    def __addOne2Samples(self, dataSet):
        '''给每一个样本前加一个常数1'''
        ones = np.ones(len(dataSet))
        return np.column_stack((ones, dataSet))
    
    def sigmoid(self, x):
        '''sigmoid函数实现'''
        return 1/(1+np.exp(-1*x))
    
    def fit(self, trainSet, label, eta=1e-3, maxRunTimes=100):
        '''根据给定的训练集和标签训练PLA的参数 w '''
        ######## adaboost相关 ###########
        #初始化权重向量
        self.u_adaboost = np.ones(trainSet.shape[0])# 注意还是不要 /trainSet.shape[0]
        self.a_adaboost = [] #分类器的话语权
        self.w_adaboost = [] #分类器对应的模型权重向量
        self.train_adaboost = self.__addOne2Samples(trainSet)
        self.label_adaboost = label.copy()
        ######## adaboost相关 ###########
        #设置默认的 w 全为1
        self.w = np.mat(np.ones(trainSet.shape[1]+1)).reshape(-1,1)
        #给训练集中每一个样本前加一个常数1，并转换为numpy矩阵
        trainSet = np.mat(self.__addOne2Samples(trainSet))
        #将标签转换为numpy矩阵，并将其设置为只有一列的数据的矩阵
        label = np.mat(label).reshape(-1,1)
        
        step_adaboost = 10
        for i in tnrange(step_adaboost, desc="fit-top"):
            for i in tnrange(maxRunTimes, desc="fit", leave=False):
                eta_reduced = eta/(1+i)+1e-7
                #根据矩阵运算得到整个数据集每个维度梯度
                u_adaboost = np.mat(self.u_adaboost).reshape(-1,1)
                gradient = trainSet.T*(np.multiply(u_adaboost, self.sigmoid(trainSet*self.w) - label))
                #更新 w
                self.w -= eta_reduced*gradient
            ########################
            #计算子模型在u下在验证集上的错误率e，以及对应的a
            self.calc_train_error_and_update_u()
            ########################
        print("\n\n-------------------\n\n")
        
    def _apply(self, x):
        '''利用训练好的 w 对输入的向量x进行分类'''
        cnt = 0
        for index, w in enumerate(self.w_adaboost):
            predict = self.apply2single_sample(w, x)
            if predict == 0:
                predict = -1
            cnt += self.a_adaboost[index] * predict
        return 1 if cnt >=0 else 0
    
    def apply(self, otherSet):
        '''根据已训练出的 w 对其他数据集进行划分'''
        otherSet = self.__addOne2Samples(otherSet)
        outputLabel = np.zeros(otherSet.shape[0])
        for index in tnrange(len(otherSet), desc="apply"):
            outputLabel[index] = self._apply(otherSet[index])
        return outputLabel
    
    def apply2single_sample(self, w_mat, x):
        '''利用训练好的 w 对输入的向量x进行分类'''
        w = np.array(w_mat)[:,0] #转换为numpy向量，方便后续计算
        return 1 if self.sigmoid(np.dot(w, x)) > 0.5 else 0        
        
    def calc_train_error_and_update_u(self):
        '''计算当前w在训练集的误差并更新adaboost权重向量u'''
        errorCnt = 0
        wrongflag = []
        rightflag = []
        for index, sample in enumerate(self.train_adaboost):
            if self.apply2single_sample(self.w, sample) != self.label_adaboost[index]:
                errorCnt += 1
                wrongflag.append(index)
            else:
                rightflag.append(index)
        
        e = errorCnt/self.train_adaboost.shape[0]
        s = np.sqrt((1-e)/e)
        a = np.log(s)
        print(e, s, a)
        
        self.u_adaboost[wrongflag] *= s
        self.u_adaboost[rightflag] /= s
        #self.normalize_u() 不对权重归一化
        self.a_adaboost.append(a)
        self.w_adaboost.append(self.w)
    
    def normalize_u(self):
        '''归一化函数'''
        u_max, u_min = self.u_adaboost.max(), self.u_adaboost.min()
        if u_max == u_min:
            self.u_adaboost = np.ones(len(self.u_adaboost))
        else:
            self.u_adaboost = (self.u_adaboost - u_min)/(u_max - u_min) * len(self.u_adaboost) 
    
    def getW(self):
        return np.array(self.w)[:,0]

# 调参

暂调参数eta，暂定迭代次数为1000，不设置正则化项。

```
0.22745676532437775 1.84294327264 0.611363898312
0.21408618687423495 1.91598955636 0.65023422878
0.30121465114089324 1.52311904748 0.420760237294
0.4771978280656602 1.04669334385 0.045635998684
0.5935155833150246 0.82757218847 -0.189258938714
0.6605254072376887 0.716900485099 -0.332818241461
0.6924766956467154 0.666402219832 -0.405861857055
0.7072282728100185 0.643405487129 -0.440980135882
0.7151062427419101 0.631184291274 -0.460157396843
0.7188098301999309 0.625450671504 -0.469282814688



-------------------


0.15130727921288856 2.3683469673 0.862192229641
0.1288855787699424 2.59977305766 0.955424155703
0.17925600721257498 2.13976952067 0.760698122611
0.39841636940927444 1.22879495143 0.206033974868
0.5747716671239858 0.860128664193 -0.150673291398
0.6728862059503743 0.697234095479 -0.360634063091
0.7315667751166164 0.60574682685 -0.501293157681
0.7644937477950688 0.555027180415 -0.588738192714
0.7819764023362471 0.528025537231 -0.638610630477
0.7900121516208695 0.515561164634 -0.66249933148



-------------------


0.2980437665782493 1.53466997289 0.428315356548
0.2902519893899204 1.56373979828 0.447080258897
0.3538793103448276 1.35122987241 0.301015194325
0.4713859416445623 1.05896362214 0.0572907148858
0.5404840848806366 0.922059238704 -0.0811458072687
0.5842175066312998 0.843617958517 -0.170055542652
0.6143567639257295 0.792287013152 -0.232831562472
0.6321618037135278 0.762806326717 -0.270751111219
0.6436671087533157 0.744041735633 -0.295658149442
0.65 0.733799385705 -0.309519604203
```

In [109]:
bestEta = 0.053
bestRuntime = 550
LR_01, LR_02, LR_12 = LogisticRegression(), LogisticRegression(), LogisticRegression()
    
LR_01.fit(trainSet_01, trainSetLabel_01, eta=bestEta, maxRunTimes=bestRuntime)
LR_02.fit(trainSet_02, trainSetLabel_02, eta=bestEta, maxRunTimes=bestRuntime)
LR_12.fit(trainSet_12, trainSetLabel_12, eta=bestEta, maxRunTimes=bestRuntime)

0.22745676532437775 1.84294327264 0.611363898312
0.6981890085056967 0.65747769252 -0.419344443233
0.6967138507893663 0.659779877945 -0.415849017856
0.6938890806942657 0.664193137554 -0.409182301991
0.6892125168701547 0.671514170382 -0.398220160662
0.6862935877718841 0.676093448187 -0.39142397553
0.6841907033677537 0.67939738479 -0.386549072557
0.6826213866482533 0.681865761586 -0.382922471031
0.681365933272653 0.683842196158 -0.380028095347
0.6799849345594928 0.686018087944 -0.376851284339



-------------------


0.15130727921288856 2.3683469673 0.862192229641
0.7118889890635413 0.636170886507 -0.452288062256
0.7094194661126573 0.640002555526 -0.446283109627
0.7029516679079613 0.650056082409 -0.430696639185
0.7000509584100976 0.654574242838 -0.423770265631
0.695935086825291 0.660995933912 -0.414007590569
0.6938967504213869 0.664181146025 -0.409200356433
0.6918976127944808 0.66730843659 -0.404502916279
0.6906040531535416 0.669333787716 -0.401472407981
0.6897808788365803 0.670623404313 

In [110]:
def getTestLabel(dataSet):   
    ansLabel_01 = LR_01.apply(dataSet)
#     np.place(ansLabel_01, ansLabel_01==0, 0)
#     np.place(ansLabel_01, ansLabel_01==1, 1)
    
    ansLabel_02 = LR_02.apply(dataSet)
    
    np.place(ansLabel_02, ansLabel_02==0, 0)
    np.place(ansLabel_02, ansLabel_02==1, 2)
    
    ansLabel_12 = LR_12.apply(dataSet)
    np.place(ansLabel_12, ansLabel_12==0, 1)
    np.place(ansLabel_12, ansLabel_12==1, 2)    
    
    print("ansLabel_01", Counter(ansLabel_01))
    print("ansLabel_02", Counter(ansLabel_02))
    print("ansLabel_12", Counter(ansLabel_12))
    
    labels = np.vstack([ansLabel_01, ansLabel_02, ansLabel_12])
    print("labels:\n", labels)
    ansLabel = np.zeros(labels.shape[1])
    
    for i in tnrange(labels.shape[1], leave=False):
        t = Counter(labels[:, i]).most_common(2)
        #print(t)
        if t[0][1] == t[1][1]:
            ansLabel[i] = 1 #默认选择MID
        else:
            ansLabel[i] =  t[0][0]
    
    print(Counter(ansLabel))
    
    return ansLabel

calcAvg(getTestLabel(validateSet), validateSetLabel)




ansLabel_01 Counter({1.0: 11920, 0.0: 6836})
ansLabel_02 Counter({0.0: 9969, 2.0: 8787})
ansLabel_12 Counter({2.0: 18756})
labels:
 [[ 1.  1.  1. ...,  0.  1.  1.]
 [ 2.  0.  0. ...,  0.  2.  0.]
 [ 2.  2.  2. ...,  2.  2.  2.]]
Counter({2.0: 8787, 0.0: 5772, 1.0: 4197})


0.48857152283256183

In [102]:
ansLabel = getTestLabel(testSet)
ans = DF(ansLabel).replace(0, "LOW").replace(1, "MID").replace(2, "HIG")
ans.to_csv('.\\rank\\47_v1.csv', index=False, header=False)




ansLabel_01 Counter({1.0: 5463, 0.0: 3208})
ansLabel_02 Counter({2.0: 4601, 0.0: 4070})
ansLabel_12 Counter({2.0: 8671})
labels:
 [[ 0.  1.  1. ...,  0.  1.  0.]
 [ 0.  0.  2. ...,  0.  2.  0.]
 [ 2.  2.  2. ...,  2.  2.  2.]]
Counter({2.0: 4601, 0.0: 2716, 1.0: 1354})
