In [1]:
import math
import pandas as pd
import numpy as np
from gensim.models import word2vec
from sklearn.metrics import accuracy_score
import random
import time
from sklearn.metrics import classification_report

In [2]:
# Load the two gensim Word2Vec models stored under these file names in the
# working directory: one for the test split and one for the train split.
# NOTE(review): gensim's load is presumably pickle-based — only load model
# files that come from a trusted source.
model_DOC_test = word2vec.Word2Vec.load('DOC-test_model')
model_DOC_train = word2vec.Word2Vec.load('DOC-train_model')

In [3]:
def modelToVector(model, file):
    """Look up the embedding of every line of ``file`` in ``model``.

    Each line of the text file (with its newline removed) is used as a key.

    Parameters
    ----------
    model : gensim Word2Vec model, KeyedVectors, or any mapping
        Source of the vectors. When the object exposes a ``wv`` attribute
        (gensim Word2Vec), the lookup goes through ``model.wv``; otherwise
        ``model`` itself is indexed.
    file : str
        Path of a text file containing one key per line.

    Returns
    -------
    list
        One looked-up value per line, in file order.

    Raises
    ------
    KeyError
        If a line is not a key of the model (raised by the lookup itself).
    """
    # Indexing a Word2Vec object directly (model[word]) is deprecated in
    # gensim 3.x and no longer supported in gensim 4; the supported lookup
    # table lives on ``model.wv``. Objects without ``wv`` are used as-is.
    vectors = getattr(model, 'wv', model)

    # ``with`` guarantees the handle is closed even if reading or a lookup fails.
    with open(file, 'r') as f:
        return [vectors[line.strip('\n')] for line in f]

In [4]:
# Turn each split's token file into a 2-D array of embeddings, one row per line.
# NOTE(review): the test and train splits are embedded with two different
# Word2Vec models; presumably their vector spaces are not aligned with each
# other — confirm that this is intended before comparing results across splits.

# Test split
Vector_DOC_test = modelToVector(model_DOC_test, 'DOCSTART-test.txt')
array_DOC_test = np.array(Vector_DOC_test)

# Train split
Vector_DOC_train = modelToVector(model_DOC_train, 'DOCSTART-train.txt')
array_DOC_train = np.array(Vector_DOC_train)

  # Remove the CWD from sys.path while we load stuff.


In [5]:
# Read the header-less label files and keep both the raw value array
# (O_*) and an int32 copy (int_O_*) for each split.

# Test split
O_test = pd.read_table('O-test.txt', header=None).values
int_O_test = O_test.astype(np.int32)

# Train split
O_train = pd.read_table('O-train.txt', header=None).values
int_O_train = O_train.astype(np.int32)

In [6]:
array_DOC_test.shape

(97252, 64)

In [7]:
array_DOC_train.shape

(202388, 64)

In [8]:
int_O_test.shape

(97252, 1)

In [9]:
int_O_train.shape

(202388, 1)

In [10]:
int_O_test[0:5]

array([[0],
       [0],
       [1],
       [0],
       [0]], dtype=int32)

In [11]:
# Convert the int32 test-label array into nested Python lists (one inner list per row).
list_O_test = int_O_test.tolist()

In [12]:
list_O_test[0:5]

[[0], [0], [1], [0], [0]]

In [13]:
# Convert the int32 train-label array into nested Python lists (one inner list per row).
list_O_train = int_O_train.tolist()

In [14]:
# Flatten the nested [[label], [label], ...] list into one flat list of labels.
# A comprehension does this in linear time; sum(list_of_lists, []) builds a
# brand-new list on every addition and is quadratic in the number of rows.
iterList_O_test = [label for row in list_O_test for label in row]

In [15]:
iterList_O_test[0:5]

[0, 0, 1, 0, 0]

In [16]:
# Immutable copy of the flat test-label list.
tuple_O_test = tuple(iterList_O_test)

In [17]:
# Flatten the nested [[label], [label], ...] list into one flat list of labels.
# A comprehension does this in linear time; sum(list_of_lists, []) builds a
# brand-new list on every addition and is quadratic in the number of rows.
iterList_O_train = [label for row in list_O_train for label in row]

In [31]:
# Learned from: https://blog.csdn.net/wds2006sdo/article/details/53699778?utm_source=itdadao&utm_medium=referral
class Softmax(object):
    """Multinomial logistic (softmax) regression trained with plain SGD.

    After ``train`` has run:

    * ``self.k`` is the number of distinct labels seen in the training data;
    * ``self.w`` has shape ``(k, n_features + 1)``; the extra last column is
      the bias and pairs with a constant ``1`` appended to every sample.

    Labels are compared against the weight-row indices ``0 .. k-1``, so the
    training labels are expected to be exactly those integers.

    Parameters
    ----------
    learning_step : float
        SGD learning rate.
    max_iteration : int
        Number of single-sample SGD updates performed by ``train``.
    weight_lambda : float
        Weight-decay (L2) coefficient added to every gradient.
    log_interval : int
        Print ``loop <n>`` every ``log_interval`` updates; ``0`` or ``None``
        disables the progress output.
    """

    def __init__(self, learning_step=0.000001, max_iteration=50000,
                 weight_lambda=0.01, log_interval=100):
        self.learning_step = learning_step      # learning rate
        self.max_iteration = max_iteration      # maximum number of iterations
        self.weight_lambda = weight_lambda      # weight-decay coefficient
        self.log_interval = log_interval        # progress print period (0 = silent)

    def _probabilities(self, x):
        """Return the softmax distribution over the ``k`` classes for sample ``x``.

        ``x`` is a 1-D sample that already carries the trailing bias ``1``.
        """
        scores = np.asarray(np.dot(self.w[:self.k], x), dtype=float).ravel()
        # Subtracting the maximum leaves every ratio unchanged but keeps the
        # largest exponent at 0, so exp() can no longer overflow.
        exps = np.exp(scores - scores.max())
        return exps / exps.sum()

    def cal_e(self, x, l):
        '''
        Compute formula 3 of the blog post: exp(theta_l . x).

        This is the raw, unnormalised term; ``math.exp`` raises
        ``OverflowError`` when the score is too large for a float.
        '''

        theta_l = self.w[l]
        product = np.dot(theta_l, x)

        return math.exp(product)

    def cal_probability(self, x, j):
        '''
        Compute formula 2 of the blog post: P(y = j | x).

        Evaluated through the max-shifted softmax, so large scores do not
        overflow.
        '''

        return float(self._probabilities(x)[j])

    def cal_partial_derivative(self, x, y, j):
        '''
        Compute formula 1 of the blog post: the gradient of the regularised
        loss for one sample ``(x, y)`` with respect to the weight row ``j``.
        '''

        first = int(y == j)                         # indicator function 1{y == j}
        second = self.cal_probability(x, j)         # predicted probability of class j

        return -x * (first - second) + self.weight_lambda * self.w[j]

    def predict_(self, x):
        """Return the index of the highest-scoring class for one sample.

        ``x`` may be a column vector/matrix of shape ``(n_features + 1, 1)``
        or a 1-D array of length ``n_features + 1``.
        """
        scores = np.asarray(np.dot(self.w, x))
        if scores.ndim < 2:
            return int(np.argmax(scores))

        # argmax works on the flattened scores; integer division by the
        # column count recovers the row (class) that holds the maximum.
        return int(np.argmax(scores) // scores.shape[1])

    def train(self, features, labels):
        """Fit ``self.w`` with ``max_iteration`` single-sample SGD updates.

        Parameters
        ----------
        features : sequence of 1-D numeric sequences, all of the same length
        labels : sequence of int, same length as ``features``

        Every update draws one sample index with ``random.randint``, so call
        ``random.seed`` beforehand to make a run reproducible.
        """
        self.k = len(set(labels))

        self.w = np.zeros((self.k, len(features[0]) + 1))
        class_ids = np.arange(self.k)

        # Counter is named ``iteration`` (not ``time``) so that it does not
        # shadow the ``time`` module name inside this method.
        iteration = 0
        while iteration < self.max_iteration:
            if self.log_interval and iteration % self.log_interval == 0:
                print('loop %d' % iteration)
            iteration += 1

            index = random.randint(0, len(labels) - 1)
            y = labels[index]
            # Append the constant 1 that multiplies the bias column of ``w``.
            x = np.append(np.asarray(features[index], dtype=float), 1.0)

            # The class probabilities are computed once per sample and shared
            # by all rows. The whole gradient is formed from the current
            # weights before any row is modified (simultaneous update).
            probabilities = self._probabilities(x)
            indicator = (class_ids == y).astype(float)
            gradient = (-np.outer(indicator - probabilities, x)
                        + self.weight_lambda * self.w)

            self.w -= self.learning_step * gradient

    def predict(self, features):
        """Return the predicted class index for every row of ``features``.

        Parameters
        ----------
        features : 2-D array-like of shape ``(n_samples, n_features)``

        Returns
        -------
        list of int
            One class index per sample; an empty list for empty input.
        """
        samples = np.asarray(features, dtype=float)
        if samples.size == 0:
            return []

        # Add the bias column, then score every sample against every class in
        # a single matrix product instead of one np.matrix per sample.
        bias = np.ones((samples.shape[0], 1))
        scores = np.hstack([samples, bias]).dot(self.w.T)

        return scores.argmax(axis=1).tolist()

In [32]:
if __name__ == '__main__':
    # Driver: train the softmax classifier on the train split, predict the
    # test split, and print the wall-clock time of each stage followed by
    # the per-class classification report.

    print('Start read data')
    time_1 = time.time()

    # Nothing is read from disk here; the arrays and label lists built in the
    # earlier cells are only re-bound to the names used below.
    train_features, train_labels = array_DOC_train, iterList_O_train
    test_features, test_labels = array_DOC_test, iterList_O_test

    time_2 = time.time()
    print('read data cost '+ str(time_2 - time_1)+' second')

    # --- training ---
    print('Start training')
    p = Softmax()
    p.train(train_features, train_labels)
    time_3 = time.time()
    print('training cost '+ str(time_3 - time_2)+' second')

    # --- prediction ---
    print('Start predicting')
    test_predict = p.predict(test_features)
    time_4 = time.time()
    print('predicting cost ' + str(time_4 - time_3) +' second')

    # --- evaluation ---
    score = classification_report(test_labels,test_predict)
    print(score)

Start read data
read data cost 3.0994415283203125e-06 second
Start training
loop 0
loop 100
loop 200
loop 300
loop 400
loop 500
loop 600
loop 700
loop 800
loop 900
loop 1000
loop 1100
loop 1200
loop 1300
loop 1400
loop 1500
loop 1600
loop 1700
loop 1800
loop 1900
loop 2000
loop 2100
loop 2200
loop 2300
loop 2400
loop 2500
loop 2600
loop 2700
loop 2800
loop 2900
loop 3000
loop 3100
loop 3200
loop 3300
loop 3400
loop 3500
loop 3600
loop 3700
loop 3800
loop 3900
loop 4000
loop 4100
loop 4200
loop 4300
loop 4400
loop 4500
loop 4600
loop 4700
loop 4800
loop 4900
loop 5000
loop 5100
loop 5200
loop 5300
loop 5400
loop 5500
loop 5600
loop 5700
loop 5800
loop 5900
loop 6000
loop 6100
loop 6200
loop 6300
loop 6400
loop 6500
loop 6600
loop 6700
loop 6800
loop 6900
loop 7000
loop 7100
loop 7200
loop 7300
loop 7400
loop 7500
loop 7600
loop 7700
loop 7800
loop 7900
loop 8000
loop 8100
loop 8200
loop 8300
loop 8400
loop 8500
loop 8600
loop 8700
loop 8800
loop 8900
loop 9000
loop 9100
loop 9200
loop 9

  'precision', 'predicted', average, warn_for)
