In [1]:
import numpy as np
import csv
from collections import OrderedDict
import copy
import pandas as pd
from time import time
from typing import Callable
from typing import Any
from typing import Dict, Tuple, List
import cProfile
import re
import os

In [2]:
t = time()

In [3]:
def readFile(filen: str) -> List[List]:
    '''
    读取文件内容
    由于首先需要获取文章数量和单词向量长度，才能计算TF矩阵
    因此要对文本内容进行两次遍历，为了避免两次读取磁盘文件，故先将文本内容保存到内存中的一个list
    '''
    fdata = []
    with open(filen) as fd:
        reader = csv.reader(fd, delimiter=' ')
        fdata = [list(row) for row in reader]
    return fdata

In [28]:
def getTFIDF(fdata: List[List]) -> np.array:
    '''
    获取TF-IDF矩阵
    '''
    #首先获取文章数和单词向量
    #使用OrderedDict按单词出现的顺序生成单词列表
    #相比于使用list，好处在于每次判断word是否已经加入单词向量是log(n)复杂度
    word_dict = OrderedDict() 
    #文章数
    D = 0
    for row in fdata:
        D += 1
        for word in row:
            if not word in word_dict:
                word_dict[word] = 1
            else:
                word_dict[word] += 1
    #word_vec是单词向量
    word_vec = word_dict.keys()
    #word_order的键值是当前单词的序号，在生成TF矩阵时会用到
    word_order = dict(zip(word_vec,range(len(word_vec))))
    #生成TF矩阵
    TF = np.zeros((D,len(word_dict)))
    for i,row in enumerate(fdata):
        for word in row:
            TF[i][word_order[word]] += 1
        #每个文章中单词出现次数归一化
        TF[i] /= len(fdata[i])
    #生成IDF矩阵
    IDF = np.log(D / (1 + np.array(list(word_dict.values()))))

    #生成TF-IDF矩阵
    TF_IDF = np.multiply(TF, IDF)
    return TF_IDF

In [37]:
getTFIDF([["b", "c"], ["a", "c"]])

array([[ 0.        , -0.20273255,  0.        ],
       [ 0.        , -0.20273255,  0.        ]])

In [5]:
semval = readFile('lab1_data/semeval_sliced.txt')
ret = getTFIDF(semval)
#ret = ret.tolist()
#rt = []
#for row in ret:
#    rr = []
#    for w in row:
#        if row != 0:
#           rr.append(w)
#    rt.append(rr)
#ret = np.array(rt)
##print(ret)
np.savetxt("15323032_LiXinrui_TFIDF.txt", ret, delimiter=" ", fmt="%6f")

what


In [6]:
def KNN_getTFIDF(fdata: List[List], word_dict: OrderedDict) -> np.array:
    '''
    获取TF-IDF矩阵，并将每个单词及出现次数存储到word_dict中
    '''
    #首先获取文章数和单词向量
    #使用OrderedDict按单词出现的顺序生成单词列表
    #相比于使用list，好处在于每次判断word是否已经加入单词向量是log(n)复杂度
    #文章数
    D = len(fdata)
    if len(word_dict) is 0:
        #训练集
        for row in fdata:
            for word in row:
                if not word in word_dict:
                    word_dict[word] = 1
                else:
                    word_dict[word] += 1
        word_dict[None] = 0
    else:
        #验证集和测试集，丢弃未出现的单词
        word_dict = dict(zip(word_dict.keys(), [0 for _ in word_dict.values()]))
        for row in fdata:
            for word in row:
                if word in word_dict:
                    word_dict[word] += 1
                else:
                    word_dict[None] += 1
    #word_vec是单词向量
    word_vec = word_dict.keys()
    #word_order的键值是当前单词的序号，在生成TF矩阵时会用到
    word_order = dict(zip(word_vec,range(len(word_vec))))
    #生成TF矩阵
    TF = np.zeros((D,len(word_dict)))
    for i,row in enumerate(fdata):
        for word in row:
            if word in word_order:
                TF[i][word_order[word]] += 1
            else:
                TF[i][word_order[None]] += 1
        #每个文章中单词出现次数归一化
        TF[i] /= len(fdata[i])
    #生成IDF矩阵
    IDF = np.log2(D / (1 + np.array(list(word_dict.values()))))
    #生成TF-IDF矩阵
    TF_IDF = np.multiply(TF, IDF)
    return TF_IDF

In [7]:
wdict = OrderedDict()
KNN_getTFIDF([['a','a'],['c','a']], wdict)


array([[-1. ,  0. ,  0. ],
       [-0.5,  0. ,  0. ]])

In [8]:
def getOneHot(fdata: List[List], word_dict: OrderedDict) -> np.array:
    D = len(fdata)
    if len(word_dict) is 0:
        for row in fdata:
            for word in row:
                if not word in word_dict:
                    word_dict[word] = 1
        word_dict[None] = 0
    else:
        word_dict = dict(zip(word_dict.keys(), [0 for _ in word_dict.values()]))
        for row in fdata:
            for word in row:
                if word in word_dict:
                    word_dict[word] = 1
                else:
                    word_dict[None] = 1
    word_vec = word_dict.keys()
    word_order = dict(zip(word_vec,range(len(word_vec))))
    oneHot = np.zeros((D,len(word_dict)))
    for i,row in enumerate(fdata):
        for word in row:
            if word in word_order:
                oneHot[i][word_order[word]] = 1
            else:
                oneHot[i][word_order[None]] = 1
    return oneHot

In [9]:
wdict = OrderedDict()
getOneHot([['a','a'],['c','a']], wdict)
getOneHot([['d','d']], wdict)

array([[0., 0., 1.]])

In [10]:
def DisN(vec1: np.array, vec2: np.array, N: Any) -> float:
    '''
    计算N-norm
    '''
    if(N < 1):
        raise ValueError("norm should be a positive integer or np.inf")
    if np.isinf(N):
        return np.max(np.fabs(vec1 - vec2))
    else:
        return np.power(np.sum(np.power(vec1 - vec2, N)), 1.0/N)

#Dis2 = lambda v1, v2: DisN(v1, v2, 2)
# 一范数
Dis1 = lambda v1, v2: np.linalg.norm(v1 - v2, 1)
# 二范数
Dis2 = lambda v1, v2: np.linalg.norm(v1 - v2, 2)
# 无穷范数
DisInf = lambda v1, v2: np.linalg.norm(v1 - v2, np.inf)
# 余弦距离（1-余弦相关度）
def DisCosine(v1, v2):
    t1 = np.dot(v1,v2)
    t2 = np.linalg.norm(v1)
    t3 = np.linalg.norm(v2)
    ret = 1 - t1 / (t2*t3)
    return ret
 

In [11]:
DisCosine([0,1],[0,0.5])

0.0

In [12]:
def DisInvNormAvg(distances: np.array, Y: np.array) -> np.array:
    '''
    按照归一化的距离倒数加权求和，返回均值
    '''
    # 如果训练集中有向量距离和待预测向量完全一致（距离为0）
    for idx, dis in enumerate(distances):
        if np.isclose(dis, 0):
            # 则直接返回该训练集向量对应的Y
            return Y[idx]
    # 求距离的倒数
    distances = np.array(1.0) / distances
    # 归一化
    s = np.sum(distances)
    distances = distances / s
    # 分别作为权值乘以K个最邻近的训练集向量对应的Y
    tmp = np.diag(distances) @ Y  
    # 加权后Y的个分量求和
    if len(tmp.shape) is 1:
        return tmp
    else:
        return np.sum(tmp,  axis = (0))

In [13]:
def KNN(trainSet: Tuple[np.array, np.array],
        testVec: np.array,
        DisFunc: Callable[[np.array, np.array], float],
        K: int,
        WeightFunc: Callable[[np.array, np.array], float]) -> np.array: 
    '''
    一个通用的KNN接口
    trainSet: 二元元组，第一个元素是训练集的X，第二个是Y
    testVec: 待预测向量
    DisFunc: 距离函数
    K: K值
    WeightFunc: 依据第一个参数list<距离>,对第二个参数list<Y值>进行加权，返回预测值
    '''
    #对于多个要预测的值，逐一预测
    if len(testVec.shape) > 1:
        return np.array([KNN(trainSet, vec, DisFunc, K, WeightFunc) for vec in testVec])
    else:
        #测量待预测向量到训练集中每个向量的距离
        #distances是一个list<tuple(index, distance)>
        distances = list(enumerate(map(lambda trainVec: DisFunc(trainVec, testVec), trainSet[0])))
        #依据距离从小到大排序
        distances.sort(key=lambda t: t[1])
        #获取最临近的K个训练样本的下标和对应的距离，输出值
        tmp = list(zip(*distances[:K]))
        kNearIdx = list(tmp[0])
        kNearDis = list(tmp[1])
        kNearY   = trainSet[1][kNearIdx, :]
        #对输出值根据距离加权作为预测输出
        return WeightFunc(kNearDis, kNearY)

In [14]:
def TestCast1():
    trainX = np.array([[10,2],[2,3],[3,5]])
    trainY = np.array([[1,1,1], [2,2,3], [3,3,5]])
    vaildX = np.array([[3,3]])
    return trainX, trainY, vaildX


In [38]:
trainX, trainY, vaildX = TestCast1()
KNN((trainX, trainY), vaildX, Dis2, 1, DisInvNormAvg)

array([[2., 2., 3.]])

In [15]:
def TestCast2():
    xfilen = 'lab1_data/X.txt'
    yfilen = 'lab1_data/Y.txt'
    xdata = readFile(xfilen)
    ydata = readFile(yfilen)
    x_set = getTFIDF(xdata)
    y_set = np.array([list(map(float, row)) for row in ydata])
    DIVIDE_RATE = 0.75
    train_D = int(np.ceil(x_set.shape[0] * DIVIDE_RATE))
    trainX = x_set[0:train_D, :]
    vaildX = x_set[train_D:, :]
    trainY = y_set[0:train_D, :]
    vaildY = y_set[train_D:, :]
    return trainX, trainY, vaildX

In [16]:
def Test(trainX, trainY, vaildX):
    t = time()
    cProfile.run('KNN((trainX, trainY), vaildX,Dis2,2,DisInvNormAvg)')
    print(time()-t) 
#trainX, trainY, vaildX = TestCast2()
#Test(trainX, trainY, vaildX)

In [17]:
def classifyReadFile(filen: str) -> Tuple[List[str], List[str]]:
    with open(filen) as fd:
        reader = csv.reader(fd, delimiter=',')
        train_data = [list(row) for row in reader]
        train_data = train_data[1:]
        tmp = list(zip(*train_data))
        xdata = [row.split() for row in list(tmp[0])]
        ydata = list(tmp[1])
        return xdata, ydata

def vectorizeData(xdata, ydata, xVecFunc, yVecFunc):
    return xVecFunc(xdata), yVecFunc(ydata)

def fastHashY(s: str) -> int:
    if s[0] is 'a': return 0
    if s[0] is 'd': return 1
    if s[0] is 'f': return 2
    if s[0] is 'j': return 3
    if s[1] is 'a': return 4
    return 5

def classifyParseY(ydata: List[str])->np.array:
    '''
    Convert Y data from raw string list to matrix consisted of Y vectors
    e.g.
    ["anger", "disgust", ..., "surprise"] -> 
    |1, 0, 0, 0, 0, 0|
    |0, 1, 0, 0, 0, 0|
    |0, 0, ...,  0, 0|
    |0, 0, 0, 0, 1, 0|
    |0, 0, 0, 0, 0, 1|
    '''
    D = len(ydata)
    
    #fast hash ydata from strings ["anger", "disgust", ...] to [1, 2, ...]^T
    ydata = np.array(list(map(fastHashY, ydata))).reshape((-1,1))
    
    '''
    ymat is the column-wise repeat of ydata.
    e.g.
    |0|      |0, 0, 0, 0, 0, 0|
    |1|   -> |1, 1, 1, 1, 1, 1|
    ...      |................|
    |5|      |5, 5, 5, 5, 5, 5|
    ydata -> ymat
    '''
    ymat  = np.tile(ydata, (1, 6))
    
    '''
    ycmp is a matrix of which each row is [0, 1, 2, 3, 4, 5]
    |0, 1, 2, 3, 4, 5|
    |0, 1, 2, 3, 4, 5|
    |................|
    |0, 1, 2, 3, 4, 5|
    '''
    ycmp  = np.tile(np.array(range(6)), (D, 1))
    return np.int_(np.equal(ymat, ycmp))



In [18]:
def do_classify(trainX, trainY, vaildX, vaildY, knnFunc):
    predictY = knnFunc(trainX, trainY, vaildX)
    classifyY = np.zeros_like(predictY)
    for i, row in enumerate(predictY):
        m = 0
        idx = 0
        for j, v in enumerate(row):
            if v > m:
                m = v
                idx = j
        classifyY[i][idx] = 1
    #print("Predicted Y")
    #print(classifyY)
    #print("Correct Y")
    #print(vaildY)
    ret = np.sum(np.logical_and(classifyY, vaildY)) / vaildX.shape[0]
    print("Classification Accuracy: ", ret)
    return ret

In [19]:
def regressReadFile(filen: str) -> Tuple[List[str], List[str]]:
    with open(filen) as fd:
        reader = csv.reader(fd, delimiter=',')
        train_data = [list(row) for row in reader]
        train_data = train_data[1:]
        xdata = [row[0].split() for row in train_data]
        ydata = [[row[i] for i in range(1, 7)] for row in train_data]
        return xdata, ydata

def regressParseY(ydata: List[List[str]]) -> np.array:
    if ydata[0][0] is '?':
        return np.zeros_like(ydata)
    return np.float_(np.array(ydata))

from scipy.stats.stats import pearsonr
#def pearsonr(X, Y):
#
#    X_bar = np.average(X)
#    Y_bar = np.average(Y)
#    X = np.subtract(X, X_bar)
#    Y = np.subtract(Y, Y_bar)
#    t1 = np.sum(np.dot(X, Y))
#    t2 = np.sum(np.power(X,2))
#    t3 = np.sum(np.power(Y,2))
#
#    ret = t1 / np.power(t2 * t3, 0.5)
#    return ret
def do_regress(trainX, trainY, vaildX, vaildY, knnFunc, save = False):
    predictY = knnFunc(trainX, trainY, vaildX)
    if save:
        np.savetxt("regress.csv", predictY, delimiter=",", fmt="%4f")
    r = [pearsonr(predictY[:, i], vaildY[:, i])[0] for i in range(6)]
    average = np.average(r)
    print("Correlation Coefficient: ", average)
    return average
    
    
train_filen = 'lab1_data/regression_dataset/train_set.csv'
vaild_filen = 'lab1_data/regression_dataset/validation_set.csv'
test_filen  = 'lab1_data/regression_dataset/test_set.csv'

In [20]:
#train_filen = 'lab1_data/classification_dataset/train_set.csv'
#vaild_filen = 'lab1_data/classification_dataset/validation_set.csv'
#test_filen  = 'lab1_data/classification_dataset/test_set.csv'
#
#trainX_data, trainY_data = classifyReadFile(train_filen)
#vaildX_data, vaildY_data = classifyReadFile(vaild_filen)
#
#ParseFuncs = {"OneHot": getOneHot, "TI-IDF": KNN_getTFIDF}
#K_val = range(1, 20)
#DisFuncs = {"Dis1": Dis1, "Dis2": Dis2, "DisInf": DisInf, "DisCosine": DisCosine}
#
#results = OrderedDict()
#
#for pfname, ParseFunc in ParseFuncs.items():
#    word_dict = OrderedDict()
#    def classifyParseX(fdata: List[List]): return ParseFunc(fdata, word_dict)
#    trainX, trainY = vectorizeData(trainX_data, trainY_data, classifyParseX, classifyParseY)
#    vaildX, vaildY = vectorizeData(vaildX_data, vaildY_data, classifyParseX, classifyParseY)
#    for K in K_val:
#        for dfname, DisFunc in DisFuncs.items():
#            print("ParseFunc = {}, K = {}, DisFunc = {}".format(pfname, K, dfname))
#            def knnFunc(trainX, trainY, vaildX): return KNN((trainX, trainY), vaildX, DisFunc, K, DisInvNormAvg)
#            ret = do_classify(trainX, trainY, vaildX, vaildY, knnFunc)
#            results[(pfname, K, dfname)] = ret

In [21]:
def autoTrain(train_filen, vaild_filen, ReadFileFunc, ParseYFunc, TrainFunc):
    print("Start training...")
    t = time()
    trainX_data, trainY_data = ReadFileFunc(train_filen)
    vaildX_data, vaildY_data = ReadFileFunc(vaild_filen)
    
    ParseFuncs = {"OneHot": getOneHot, "TI-IDF": KNN_getTFIDF}
    K_val = range(8, 14)
    DisFuncs = {"Dis1": Dis1, "Dis2": Dis2, "DisInf": DisInf, "DisCosine": DisCosine}
    #DisFuncs = {"DisCosine": DisCosine}
    results = OrderedDict()
    
    for pfname, ParseFunc in ParseFuncs.items():
        word_dict = OrderedDict()
        def ParseXFunc(fdata: List[List]): return ParseFunc(fdata, word_dict)
        trainX, trainY = vectorizeData(trainX_data, trainY_data, ParseXFunc, ParseYFunc)
        vaildX, vaildY = vectorizeData(vaildX_data, vaildY_data, ParseXFunc, ParseYFunc)
        for K in K_val:
            for dfname, DisFunc in DisFuncs.items():
                print("ParseFunc = {}, K = {}, DisFunc = {}".format(pfname, K, dfname))
                def knnFunc(trainX, trainY, vaildX): return KNN((trainX, trainY), vaildX, DisFunc, K, DisInvNormAvg)
                ret = TrainFunc(trainX, trainY, vaildX, vaildY, knnFunc)
                results[(pfname, K, dfname)] = ret
    print("{} groups of argument tested, spent {}s".format(len(ParseFuncs) * len(K_val) * len(DisFuncs), time() - t))
    return results

In [22]:
train_filen = 'lab1_data/regression_dataset/train_set.csv'
vaild_filen = 'lab1_data/regression_dataset/validation_set.csv'
test_filen  = 'lab1_data/regression_dataset/test_set.csv'
np.set_printoptions(threshold=np.nan)
regressResults = autoTrain(train_filen, vaild_filen, regressReadFile, regressParseY, do_regress)




Start training...
ParseFunc = OneHot, K = 8, DisFunc = Dis1
Correlation Coefficient:  0.2598822818387406
ParseFunc = OneHot, K = 8, DisFunc = Dis2
Correlation Coefficient:  0.24841844862532894
ParseFunc = OneHot, K = 8, DisFunc = DisInf
Correlation Coefficient:  0.1384238210551513
ParseFunc = OneHot, K = 8, DisFunc = DisCosine
Correlation Coefficient:  0.3509801140585453
ParseFunc = OneHot, K = 9, DisFunc = Dis1
Correlation Coefficient:  0.2574921244734181
ParseFunc = OneHot, K = 9, DisFunc = Dis2
Correlation Coefficient:  0.24619015576418102
ParseFunc = OneHot, K = 9, DisFunc = DisInf
Correlation Coefficient:  0.14026498579182584
ParseFunc = OneHot, K = 9, DisFunc = DisCosine
Correlation Coefficient:  0.35723121105176353
ParseFunc = OneHot, K = 10, DisFunc = Dis1
Correlation Coefficient:  0.2629518663576531
ParseFunc = OneHot, K = 10, DisFunc = Dis2
Correlation Coefficient:  0.25118330620673873
ParseFunc = OneHot, K = 10, DisFunc = DisInf
Correlation Coefficient:  0.14204846176177952


In [23]:
train_filen = 'lab1_data/classification_dataset/train_set.csv'
vaild_filen = 'lab1_data/classification_dataset/validation_set.csv'
test_filen  = 'lab1_data/classification_dataset/test_set.csv'
classifyResults = autoTrain(train_filen, vaild_filen, classifyReadFile, classifyParseY, do_classify)



Start training...
ParseFunc = OneHot, K = 8, DisFunc = Dis1
Classification Accuracy:  0.3504823151125402
ParseFunc = OneHot, K = 8, DisFunc = Dis2
Classification Accuracy:  0.3440514469453376
ParseFunc = OneHot, K = 8, DisFunc = DisInf
Classification Accuracy:  0.3729903536977492
ParseFunc = OneHot, K = 8, DisFunc = DisCosine
Classification Accuracy:  0.44694533762057875
ParseFunc = OneHot, K = 9, DisFunc = Dis1
Classification Accuracy:  0.38263665594855306
ParseFunc = OneHot, K = 9, DisFunc = Dis2
Classification Accuracy:  0.37942122186495175
ParseFunc = OneHot, K = 9, DisFunc = DisInf
Classification Accuracy:  0.3729903536977492
ParseFunc = OneHot, K = 9, DisFunc = DisCosine
Classification Accuracy:  0.4437299035369775
ParseFunc = OneHot, K = 10, DisFunc = Dis1
Classification Accuracy:  0.3954983922829582
ParseFunc = OneHot, K = 10, DisFunc = Dis2
Classification Accuracy:  0.3890675241157556
ParseFunc = OneHot, K = 10, DisFunc = DisInf
Classification Accuracy:  0.3729903536977492
Par

In [24]:
sorted(regressResults.items(), key=lambda kv: 0 if np.isnan(kv[1]) else kv[1] , reverse=True)

[(('TI-IDF', 9, 'DisCosine'), 0.4070312355638696),
 (('TI-IDF', 8, 'DisCosine'), 0.4053642423346923),
 (('TI-IDF', 10, 'DisCosine'), 0.4052144071304416),
 (('TI-IDF', 11, 'DisCosine'), 0.3982698654500274),
 (('TI-IDF', 12, 'DisCosine'), 0.394279061903327),
 (('TI-IDF', 13, 'DisCosine'), 0.38745096442658084),
 (('OneHot', 9, 'DisCosine'), 0.35723121105176353),
 (('OneHot', 8, 'DisCosine'), 0.3509801140585453),
 (('OneHot', 10, 'DisCosine'), 0.34626546659053203),
 (('OneHot', 11, 'DisCosine'), 0.32833459241003454),
 (('TI-IDF', 13, 'Dis1'), 0.3281013421794737),
 (('OneHot', 12, 'DisCosine'), 0.32312340116875865),
 (('OneHot', 13, 'DisCosine'), 0.32112232133753077),
 (('TI-IDF', 12, 'Dis1'), 0.3163261098155004),
 (('TI-IDF', 9, 'Dis1'), 0.3146291925970572),
 (('TI-IDF', 8, 'Dis1'), 0.3136233628472089),
 (('TI-IDF', 11, 'Dis1'), 0.3111875963025045),
 (('TI-IDF', 10, 'Dis1'), 0.3041146357566817),
 (('OneHot', 11, 'Dis1'), 0.2735819005847752),
 (('OneHot', 12, 'Dis1'), 0.26976977476136266),


In [25]:
sorted(classifyResults.items(), key=lambda kv: kv[1], reverse=True)

[(('TI-IDF', 10, 'DisCosine'), 0.4855305466237942),
 (('TI-IDF', 11, 'DisCosine'), 0.4855305466237942),
 (('TI-IDF', 13, 'DisCosine'), 0.4790996784565916),
 (('TI-IDF', 12, 'DisCosine'), 0.4694533762057878),
 (('TI-IDF', 8, 'DisCosine'), 0.4630225080385852),
 (('TI-IDF', 9, 'DisCosine'), 0.4533762057877814),
 (('OneHot', 8, 'DisCosine'), 0.44694533762057875),
 (('OneHot', 9, 'DisCosine'), 0.4437299035369775),
 (('OneHot', 10, 'DisCosine'), 0.4437299035369775),
 (('OneHot', 11, 'DisCosine'), 0.4405144694533762),
 (('OneHot', 12, 'DisCosine'), 0.4405144694533762),
 (('OneHot', 13, 'DisCosine'), 0.43729903536977494),
 (('OneHot', 13, 'Dis1'), 0.43086816720257237),
 (('OneHot', 13, 'Dis2'), 0.4212218649517685),
 (('OneHot', 11, 'Dis1'), 0.4180064308681672),
 (('OneHot', 12, 'Dis1'), 0.4180064308681672),
 (('OneHot', 11, 'Dis2'), 0.40836012861736337),
 (('OneHot', 12, 'Dis2'), 0.40836012861736337),
 (('TI-IDF', 13, 'Dis1'), 0.40514469453376206),
 (('TI-IDF', 10, 'Dis1'), 0.40192926045016075

In [26]:
train_filen = 'lab1_data/regression_dataset/train_set.csv'
vaild_filen = 'lab1_data/regression_dataset/validation_set.csv'
test_filen  = 'lab1_data/regression_dataset/test_set.csv'

ReadFileFunc = regressReadFile
ParseYFunc = regressParseY
TrainFunc = do_regress

ParseFunc = KNN_getTFIDF
DisFunc = DisCosine
K = 9

trainX_data, trainY_data = ReadFileFunc(train_filen)
vaildX_data, vaildY_data = ReadFileFunc(test_filen)
word_dict = OrderedDict()
def ParseXFunc(fdata: List[List]): return ParseFunc(fdata, word_dict)
trainX, trainY = vectorizeData(trainX_data, trainY_data, ParseXFunc, ParseYFunc)
vaildX, vaildY = vectorizeData(vaildX_data, vaildY_data, ParseXFunc, ParseYFunc)
def knnFunc(trainX, trainY, vaildX): return KNN((trainX, trainY), vaildX, DisFunc, K, DisInvNormAvg)
ret = TrainFunc(trainX, trainY, vaildX, vaildY, knnFunc, True)

TypeError: cannot perform reduce with flexible type