In [40]:
from ai_base import List2CSV, CSV2List2
from gensim.models.doc2vec import Doc2Vec, TaggedDocument
from typing import List, Tuple
import csv
import numpy as np
def doc2vec(trainData: List[List], testData: List[List], savePath = "",fname = "doc2vec.model")-> Tuple[np.array, np.array]:
    documents = list(range(len(trainData)))
    for i, doc in enumerate(trainData):
        documents[i] = TaggedDocument(doc, [i])
    vec_sz=50
    model = Doc2Vec(documents, vector_size=vec_sz, window=2, min_count=1, workers=12)
    model.save(fname)
    trainX = np.zeros((len(trainData), vec_sz))
    testX = np.zeros((len(testData), vec_sz))
    for i, data in enumerate(trainData):
        trainX[i] = model.infer_vector(data)
    for i, data in enumerate(testData):
        testX[i] = model.infer_vector(data)
    List2CSV(savePath + "doc2vec" + "TrainSet" + str(vec_sz) + "D" + str(len(trainData)) + "L.csv", trainX)
    List2CSV(savePath + "doc2vec" + "TestSet" + str(vec_sz) + "D" + str(len(testData)) + "L.csv", testX)
    return (trainX, testX)

In [2]:
from collections import OrderedDict
def _getTFIDF(fdata: List[List], word_dict: OrderedDict) -> np.array:
    '''
    获取TF-IDF矩阵，并将每个单词及出现次数存储到word_dict中
    '''
    #首先获取文章数和单词向量
    #使用OrderedDict按单词出现的顺序生成单词列表
    #相比于使用list，好处在于每次判断word是否已经加入单词向量是log(n)复杂度
    #文章数
    D = len(fdata)
    if len(word_dict) is 0:
        #训练集
        for row in fdata:
            for word in row:
                if not word in word_dict:
                    word_dict[word] = 1
                else:
                    word_dict[word] += 1
        word_dict[None] = 0
    else:
        #验证集和测试集，丢弃未出现的单词
        word_dict = dict(zip(word_dict.keys(), [0 for _ in word_dict.values()]))
        for row in fdata:
            for word in row:
                if word in word_dict:
                    word_dict[word] += 1
                else:
                    word_dict[None] += 1
    #word_vec是单词向量
    word_vec = word_dict.keys()
    #word_order的键值是当前单词的序号，在生成TF矩阵时会用到
    word_order = dict(zip(word_vec,range(len(word_vec))))
    #生成TF矩阵
    TF = np.zeros((D,len(word_dict)))
    for i,row in enumerate(fdata):
        for word in row:
            if word in word_order:
                TF[i][word_order[word]] += 1
            else:
                TF[i][word_order[None]] += 1
        #每个文章中单词出现次数归一化
        TF[i] /= len(fdata[i])
    #生成IDF矩阵
    IDF = np.log2(D / (1 + np.array(list(word_dict.values()))))
    #生成TF-IDF矩阵
    TF_IDF = np.multiply(TF, IDF)
    return TF_IDF

In [3]:
def TFIDF(trainData: List[List], testData: List[List], savePath="")-> Tuple[np.array, np.array]:
    wdict = OrderedDict()
    trainX = _getTFIDF(trainData, wdict)
    testX = _getTFIDF(testData, wdict)
    List2CSV(savePath + "tfidf" + "TrainSet" + str(len(wdict)) + "D" + str(len(trainData)) + "L.csv", trainX)
    List2CSV(savePath + "tfidf" + "TestSet" + str(len(wdict)) + "D" + str(len(testData)) + "L.csv", testData)
    return trainX, testX


In [4]:
def _getOneHot(fdata: List[List], word_dict: OrderedDict) -> np.array:
    D = len(fdata)
    if len(word_dict) is 0:
        for row in fdata:
            for word in row:
                if not word in word_dict:
                    word_dict[word] = 1
        word_dict[None] = 0
    else:
        word_dict = dict(zip(word_dict.keys(), [0 for _ in word_dict.values()]))
        for row in fdata:
            for word in row:
                if word in word_dict:
                    word_dict[word] = 1
                else:
                    word_dict[None] = 1
    word_vec = word_dict.keys()
    word_order = dict(zip(word_vec,range(len(word_vec))))
    oneHot = np.zeros((D,len(word_dict)))
    for i,row in enumerate(fdata):
        for word in row:
            if word in word_order:
                oneHot[i][word_order[word]] = 1
            else:
                oneHot[i][word_order[None]] = 1
    return oneHot

In [5]:
def OneHot(trainData: List[List], testData: List[List], savePath="")-> Tuple[np.array, np.array]:
    wdict = OrderedDict()
    trainX = _getOneHot(trainData, wdict)
    testX = _getOneHot(testData, wdict)
    List2CSV(savePath + "onehot" + "TrainSet" + str(len(wdict)) + "D" + str(len(trainData)) + "L.csv", trainX)
    List2CSV(savePath + "onehot" + "TestSet" + str(len(wdict)) + "D" + str(len(testData)) + "L.csv", testData)
    return trainX, testX


In [6]:
# import csv
# ret = []
# with open('data/2/clean/trainDataclean.csv', 'r') as f:
#     reader = csv.reader(f)
#     for row in reader:
#         ret.append(list(row))
trainData = CSV2List2('data/2/clean/trainDataclean.csv')
testData = CSV2List2('data/2/clean/testDataclean.csv')
trainData = [[w.lower() for w in row] for row in trainData]
trainData = [[w.lower() for w in row] for row in testData]
len(trainData)

6000

In [7]:
trainX, testX = doc2vec(trainData, testData)

In [20]:
trainX2, testX2 = TFIDF(trainData, testData)

In [21]:
trainX3, testX3 = OneHot(trainData, testData)

In [8]:
sum(trainX3[0])

NameError: name 'trainX3' is not defined

In [9]:
len(trainX)

6000

In [11]:
import KNN
from ai_base import CSV2List2, List2CSV
import numpy as np
trainX = np.float_(CSV2List2('doc2vecTrainSet50D24000L.csv'))
trainYData = CSV2List2('data/2/clean/trainLabel.txt')
testData = np.float_(CSV2List2('doc2vecTestSet50D6000L.csv'))
len(testData)


6000

In [12]:
testData[0]

array([ 0.29104736,  0.0768804 , -0.16329062,  0.15835629,  0.08793257,
        0.20293267, -0.30032602, -0.08988713,  0.00514956,  0.18893704,
        0.33035704,  0.11677902,  0.0503263 , -0.15362824,  0.15530102,
        0.05387696, -0.54698908,  0.31815219,  0.11182838, -0.21462551,
       -0.18355298,  0.2321436 ,  0.23971845, -0.05463126, -0.16444041,
        0.02434071,  0.39980423,  0.04631198, -0.2933962 ,  0.36511028,
        0.26598665,  0.2677407 ,  0.11230476,  0.07896607, -0.32979521,
       -0.02836336, -0.15179099, -0.16036966,  0.17397907, -0.28041485,
       -0.10498746, -0.23027968,  0.10070526,  0.12655641,  0.1419463 ,
       -0.26576224,  0.53412449,  0.12394352,  0.05899602,  0.10648291])

In [13]:
KNNtrainX = np.array(trainX[0:20000])
KNNvaildX = np.array(trainX[20000:])


KNNtrainY = np.array(np.float_(trainYData[0:20000]))
KNNvaildY = np.array(np.float_(trainYData[20000:]))


In [14]:
KNNtrainY

array([[0.],
       [1.],
       [1.],
       ...,
       [1.],
       [0.],
       [1.]])

In [16]:
from ai_base import CSV2List2, List2CSV
List2CSV(data=KNNtrainX, filen='1')
List2CSV(data=KNNvaildX, filen='2')
List2CSV(data=KNNtrainY, filen='3')
List2CSV(data=KNNvaildY, filen='4')


In [13]:
from ai_base import CSV2List2, List2CSV
KNNtrainX = np.array(np.float_(CSV2List2('1')))
KNNvaildX = np.array(np.float_(CSV2List2('2')))
KNNtrainY = np.array(np.float_(CSV2List2('3')))
KNNvaildY = np.array(np.float_(CSV2List2('4')))

KNNtrainX = KNNtrainX[0:2000]
KNNvaildX = KNNvaildX[0:1000]
KNNtrainY = KNNtrainY[0:2000]
KNNvaildY = KNNvaildY[0:1000]

In [15]:
KNNvaildY.shape[1]

1

# KNN

In [18]:
import ai_base
import numpy as np
from typing import List, Tuple, Callable
from tqdm import tnrange, tqdm_notebook
from scipy.stats.stats import pearsonr
from time import time
from collections import OrderedDict

# 一范数
Dis1 = lambda v1, v2: np.linalg.norm(v1 - v2, 1)
# 二范数
Dis2 = lambda v1, v2: np.linalg.norm(v1 - v2, 2)
# 无穷范数
DisInf = lambda v1, v2: np.linalg.norm(v1 - v2, np.inf)
# 余弦距离（1-余弦相关度）
def DisCosine(v1, v2):
    t1 = np.dot(v1,v2)
    t2 = np.linalg.norm(v1)
    t3 = np.linalg.norm(v2)
    ret = 1 - t1 / (t2*t3)
    return ret
    
def DisInvNormAvg(distances: np.array, Y: np.array) -> np.array:
    '''
    按照归一化的距离倒数加权求和，返回均值
    '''
    # 如果训练集中有向量距离和待预测向量完全一致（距离为0）
    for idx, dis in enumerate(distances):
        if np.isclose(dis, 0):
            # 则直接返回该训练集向量对应的Y
            return Y[idx]
    # 求距离的倒数
    distances = np.array(1.0) / distances
    # 归一化
    s = np.sum(distances)
    distances = distances / s
    # 分别作为权值乘以K个最邻近的训练集向量对应的Y
    tmp = np.diag(distances) @ Y  
    # 加权后Y的个分量求和
    if len(tmp.shape) is 1:
        return tmp
    else:
        return np.sum(tmp,  axis = (0))

def classifyParseY(ydata: List[str], n: int)->np.array:
    '''
    Convert Y data from raw string list to matrix consisted of Y vectors
    e.g.
    ["anger", "disgust", ..., "surprise"] -> 
    |1, 0, 0, 0, 0, 0|
    |0, 1, 0, 0, 0, 0|
    |0, 0, ...,  0, 0|
    |0, 0, 0, 0, 1, 0|
    |0, 0, 0, 0, 0, 1|
    '''
    D = len(ydata)
    
    #fast hash ydata from strings ["anger", "disgust", ...] to [1, 2, ...]^T
    #ydata = np.array(ydata).reshape((-1,1))
    
    '''
    ymat is the column-wise repeat of ydata.
    e.g.
    |0|      |0, 0, 0, 0, 0, 0|
    |1|   -> |1, 1, 1, 1, 1, 1|
    ...      |................|
    |5|      |5, 5, 5, 5, 5, 5|
    ydata -> ymat
    '''
    ymat  = np.tile(ydata, (1, n))
    
    '''
    ycmp is a matrix of which each row is [0, 1, 2, 3, 4, 5]
    |0, 1, 2, 3, 4, 5|
    |0, 1, 2, 3, 4, 5|
    |................|
    |0, 1, 2, 3, 4, 5|
    '''
    ycmp  = np.tile(np.array(range(n)), (D, 1))
    return np.int_(np.equal(ymat, ycmp))

def KNN(trainSet: Tuple[np.array, np.array],
        testVec: np.array,
        DisFunc: Callable[[np.array, np.array], float],
        K: int,
        WeightFunc: Callable[[np.array, np.array], float]) -> np.array: 
    '''
    一个通用的KNN接口
    trainSet: 二元元组，第一个元素是训练集的X，第二个是Y
    testVec: 待预测向量
    DisFunc: 距离函数
    K: K值
    WeightFunc: 依据第一个参数list<距离>,对第二个参数list<Y值>进行加权，返回预测值
    '''
    #对于多个要预测的值，逐一预测
#     if len(testVec.shape) > 1:
#         n = len(testVec)
#         ret = list(range(n))
#         for i in tnrange(n):
#             ret[i] = KNN(trainSet, testVec[i], DisFunc, K, WeightFunc)
#         return np.array(ret)
#     else:
        #测量待预测向量到训练集中每个向量的距离
        #distances是一个list<tuple(index, distance)>
        
#     distances = list(enumerate(map(lambda trainVec: DisFunc(trainVec, testVec), trainSet[0])))
    
    test_sum = np.sum(np.square(testVec), axis=1)  # num_test x 1
    train_sum = np.sum(np.square(trainSet[0]), axis=1)  # num_train x 1
    inner_product = np.dot(testVec, trainSet[0].T)  # num_test x num_train
    dists = np.sqrt(-2 * inner_product + test_sum.reshape(-1, 1) + train_sum)  # broadcast
    n = len(testVec)
    ret = list(range(n))
    for i in tnrange(n):
        distances = list(enumerate(dists[i]))
        #依据距离从小到大排序
        distances.sort(key=lambda t: t[1])
        #获取最临近的K个训练样本的下标和对应的距离，输出值
        tmp = list(zip(*distances[:K]))
        kNearIdx = list(tmp[0])
        kNearDis = list(tmp[1])
        kNearY   = trainSet[1][kNearIdx, :]
        #对输出值根据距离加权作为预测输出
        ret[i] = WeightFunc(kNearDis, kNearY)

def get_regress(predictY, vaildY):
    r = [pearsonr(predictY[:, i], vaildY[:, i])[0] for i in range(vaildY.shape[1])]
    average = np.average(r)
    print("Correlation Coefficient: ", average)
    return average

def get_classify(predictY, vaildY):
    classifyY = np.zeros_like(predictY)
    for i, row in enumerate(predictY):
        m = 0
        idx = 0
        for j, v in enumerate(row):
            if v > m:
                m = v
                idx = j
        classifyY[i][idx] = 1
    ret = np.sum(np.logical_and(classifyY, vaildY)) / vaildY.shape[0]
    print("Classification Accuracy: ", ret)
    return ret

def autoTrain(trainSet: Tuple, vaildSet:Tuple):
    trainX, trainY = trainSet
    vaildX, vaildY = vaildSet
    print("Start training...")
    t = time()
    K_val = range(8, 14)
#     DisFuncs = {"Dis1": Dis1, "Dis2": Dis2, "DisInf": DisInf, "DisCosine": DisCosine}
    DisFuncs = {"Dis2": Dis2}
    results_reg = OrderedDict()
    results_cla = OrderedDict()
    for K in K_val:
        for dfname, DisFunc in DisFuncs.items():
            predictY = KNN((trainX,trainY), vaildX, DisFunc, K, DisInvNormAvg)
            cla_ret = get_classify(predictY, vaildY)
            reg_ret = get_regress(predictY, vaildY)
            results_reg[(K, dfname)] = cla_ret
            results_cla[(K, dfname)] = reg_ret
            print(K, dfname, ":", cla_ret, reg_ret)
    print("{} groups of argument tested, spent {}s".format(len(K_val) * len(DisFuncs), time() - t))
    return results

def vaild(trainSet: Tuple, vaildSet: Tuple, K, DisFunc):
    trainX, trainY = trainSet
    vaildX, vaildY = vaildSet
    predictY = KNN(trainSet,vaildX,DisFunc,K,DisInvNormAvg)
    cla_ret = get_classify(predictY, vaildY)
    reg_ret = get_regress(predictY, vaildY)
    print(pfname, K, dfname, ":", cla_ret, reg_ret)


# KNN end

In [19]:
autoTrain((KNNtrainX, KNNtrainY), (KNNvaildX, KNNvaildY))

Start training...


HBox(children=(IntProgress(value=0, max=4000), HTML(value='')))




TypeError: 'numpy.float64' object is not iterable