In [3]:
import numpy as np
import csv
from collections import OrderedDict
import copy
import pandas as pd
from time import time
from typing import Callable
from typing import Any
from typing import Dict, Tuple, List
import cProfile
import re
import os
from pycallgraph import PyCallGraph
from pycallgraph.output import GraphvizOutput


In [4]:
t = time()

In [5]:
def readFile(filen: str) -> List[List]:
    '''
    读取文件内容
    由于首先需要获取文章数量和单词向量长度，才能计算TF矩阵
    因此要对文本内容进行两次遍历，为了避免两次读取磁盘文件，故先将文本内容保存到内存中的一个list
    '''
    fdata = []
    with open(filen) as fd:
        reader = csv.reader(fd, delimiter=' ')
        fdata = [list(row) for row in reader]
    return fdata

In [6]:
def getTFIDF(fdata: List[List]) -> np.array:
    '''
    获取TF-IDF矩阵
    '''
    #首先获取文章数和单词向量
    #使用OrderedDict按单词出现的顺序生成单词列表
    #相比于使用list，好处在于每次判断word是否已经加入单词向量是log(n)复杂度
    word_dict = OrderedDict() 
    #文章数
    D = 0
    for row in fdata:
        D += 1
        for word in row:
            if not word in word_dict:
                word_dict[word] = 1
            else:
                word_dict[word] += 1
    #word_vec是单词向量
    word_vec = word_dict.keys()
    #word_order的键值是当前单词的序号，在生成TF矩阵时会用到
    word_order = dict(zip(word_vec,range(len(word_vec))))
    #生成TF矩阵
    TF = np.zeros((D,len(word_dict)))
    for i,row in enumerate(fdata):
        for word in row:
            TF[i][word_order[word]] += 1
        #每个文章中单词出现次数归一化
        TF[i] /= len(fdata[i])
    #生成IDF矩阵
    IDF = np.log2(D / (1 + np.array(list(word_dict.values()))))
    #生成TF-IDF矩阵
    TF_IDF = np.multiply(TF, IDF)
    return TF_IDF

In [7]:

#TF_IDF
print(time()-t)

0.06311440467834473


In [8]:
def KNN_getTFIDF(fdata: List[List], word_dict: OrderedDict) -> np.array:
    '''
    获取TF-IDF矩阵，并将每个单词及出现次数存储到word_dict中
    '''
    #首先获取文章数和单词向量
    #使用OrderedDict按单词出现的顺序生成单词列表
    #相比于使用list，好处在于每次判断word是否已经加入单词向量是log(n)复杂度
    #文章数
    D = len(fdata)
    if len(word_dict) is 0:
        #训练集
        for row in fdata:
            for word in row:
                if not word in word_dict:
                    word_dict[word] = 1
                else:
                    word_dict[word] += 1
        word_dict[None] = 0
    else:
        #验证集和测试集，丢弃未出现的单词
        word_dict = dict(zip(word_dict.keys(), [0 for _ in word_dict.values()]))
        for row in fdata:
            for word in row:
                if word in word_dict:
                    word_dict[word] += 1
                else:
                    word_dict[None] += 1
    #word_vec是单词向量
    word_vec = word_dict.keys()
    #word_order的键值是当前单词的序号，在生成TF矩阵时会用到
    word_order = dict(zip(word_vec,range(len(word_vec))))
    #生成TF矩阵
    TF = np.zeros((D,len(word_dict)))
    for i,row in enumerate(fdata):
        for word in row:
            if word in word_order:
                TF[i][word_order[word]] += 1
            else:
                TF[i][word_order[None]] += 1
        #每个文章中单词出现次数归一化
        TF[i] /= len(fdata[i])
    #生成IDF矩阵
    IDF = np.log2(D / (1 + np.array(list(word_dict.values()))))
    #生成TF-IDF矩阵
    TF_IDF = np.multiply(TF, IDF)
    np.savetxt("foo.csv", TF_IDF, delimiter=",")
    return TF_IDF

In [9]:
wdict = OrderedDict()
KNN_getTFIDF([['a','a'],['c','a']], wdict)


array([[-1. ,  0. ,  0. ],
       [-0.5,  0. ,  0. ]])

In [10]:
def getOneHot(fdata: List[List], word_dict: OrderedDict) -> np.array:
    D = len(fdata)
    if len(word_dict) is 0:
        for row in fdata:
            for word in row:
                if not word in word_dict:
                    word_dict[word] = 1
        word_dict[None] = 0
    else:
        word_dict = dict(zip(word_dict.keys(), [0 for _ in word_dict.values()]))
        for row in fdata:
            for word in row:
                if word in word_dict:
                    word_dict[word] = 1
                else:
                    word_dict[None] = 1
    word_vec = word_dict.keys()
    word_order = dict(zip(word_vec,range(len(word_vec))))
    oneHot = np.zeros((D,len(word_dict)))
    for i,row in enumerate(fdata):
        for word in row:
            if word in word_order:
                oneHot[i][word_order[word]] = 1
            else:
                oneHot[i][word_order[None]] = 1
    return oneHot

In [11]:
wdict = OrderedDict()
getOneHot([['a','a'],['c','a']], wdict)
getOneHot([['d','d']], wdict)

array([[0., 0., 1.]])

In [12]:
def DisN(vec1: np.array, vec2: np.array, N: Any) -> float:
    '''
    计算N-norm
    '''
    if(N < 1):
        raise ValueError("norm should be a positive integer or np.inf")
    if np.isinf(N):
        return np.max(np.fabs(vec1 - vec2))
    else:
        return np.power(np.sum(np.power(vec1 - vec2, N)), 1.0/N)

#Dis2 = lambda v1, v2: DisN(v1, v2, 2)
Dis1 = lambda v1, v2: np.linalg.norm(v1 - v2, 1)
Dis2 = lambda v1, v2: np.linalg.norm(v1 - v2, 2)
DisInf = lambda v1, v2: np.linalg.norm(v1 - v2, np.inf)
def DisCosine(v1, v2):
    t1 = np.dot(v1,v2)
    t2 = np.linalg.norm(v1)
    t3 = np.linalg.norm(v2)
    ret = 1 - t1 / (t2*t3)
    return ret
 

In [13]:
DisCosine([0,1],[0,0.5])

0.0

In [14]:
def DisInvNormAvg(distances: np.array, Y: np.array) -> np.array:
    '''
    将距离倒数归一化，返回均值
    '''
    for idx, dis in enumerate(distances):
        if np.isclose(dis, 0):
            return Y[idx]
    distances = np.array(1.0) / distances
    s = np.sum(distances)
    distances = distances / s
    tmp = np.diag(distances) @ Y  
    if len(tmp.shape) is 1:
        return tmp
    else:
        return np.sum(tmp,  axis = (0))

In [15]:
def KNN(trainSet: Tuple[np.array, np.array],
        testVec: np.array,
        DisFunc: Callable[[np.array, np.array], float],
        K: int,
        WeightFunc: Callable[[np.array, np.array], float]) -> np.array: 
    '''
    一个通用的KNN接口
    trainSet: 二元元组，第一个元素是训练集的X，第二个是Y
    testVec: 待预测向量
    DisFunc: 距离函数
    K: K值
    WeightFunc: 依据第一个参数list<距离>,对第二个参数list<Y值>进行加权，返回预测值
    '''
    #对于多个要预测的值，逐一预测
    if len(testVec.shape) > 1:
        return np.array([KNN(trainSet, vec, DisFunc, K, WeightFunc) for vec in testVec])
    else:
        #测量待预测向量到训练集中每个向量的距离
        #distances是一个list<tuple(index, distance)>
        distances = list(enumerate(map(lambda trainVec: DisFunc(trainVec, testVec), trainSet[0])))
        #依据距离从小到大排序
        distances.sort(key=lambda t: t[1])
        #获取最临近的K个训练样本的下标和对应的距离，输出值
        tmp = list(zip(*distances[:K]))
        kNearIdx = list(tmp[0])
        kNearDis = list(tmp[1])
        kNearY   = trainSet[1][kNearIdx, :]
        #对输出值根据距离加权作为预测输出
        return WeightFunc(kNearDis, kNearY)

In [16]:
def TestCast1():
    trainX = np.array([[10,2],[2,3],[3,5]])
    trainY = np.array([[1,1,1], [2,2,3], [3,3,5]])
    vaildX = np.array([[3,3]])
    return trainX, trainY, vaildX


In [17]:
def TestCast2():
    xfilen = 'lab1_data/X.txt'
    yfilen = 'lab1_data/Y.txt'
    xdata = readFile(xfilen)
    ydata = readFile(yfilen)
    x_set = getTFIDF(xdata)
    y_set = np.array([list(map(float, row)) for row in ydata])
    DIVIDE_RATE = 0.75
    train_D = int(np.ceil(x_set.shape[0] * DIVIDE_RATE))
    trainX = x_set[0:train_D, :]
    vaildX = x_set[train_D:, :]
    trainY = y_set[0:train_D, :]
    vaildY = y_set[train_D:, :]
    return trainX, trainY, vaildX

In [18]:
def Test(trainX, trainY, vaildX):
    t = time()
    cProfile.run('KNN((trainX, trainY), vaildX,Dis2,2,DisInvNormAvg)')
    print(time()-t) 
#trainX, trainY, vaildX = TestCast2()
#Test(trainX, trainY, vaildX)

In [19]:
def classifyReadFile(filen: str) -> Tuple[List[str], List[str]]:
    with open(filen) as fd:
        reader = csv.reader(fd, delimiter=',')
        train_data = [list(row) for row in reader]
        train_data = train_data[1:]
        tmp = list(zip(*train_data))
        xdata = [row.split() for row in list(tmp[0])]
        ydata = list(tmp[1])
        return xdata, ydata

def vectorizeData(xdata, ydata, xVecFunc, yVecFunc):
    return xVecFunc(xdata), yVecFunc(ydata)

def fastHashY(s: str) -> int:
    if s[0] is 'a': return 0
    if s[0] is 'd': return 1
    if s[0] is 'f': return 2
    if s[0] is 'j': return 3
    if s[1] is 'a': return 4
    return 5

def classifyParseY(ydata: List[str])->np.array:
    '''
    Convert Y data from raw string list to matrix consisted of Y vectors
    e.g.
    ["anger", "disgust", ..., "surprise"] -> 
    |1, 0, 0, 0, 0, 0|
    |0, 1, 0, 0, 0, 0|
    |0, 0, ...,  0, 0|
    |0, 0, 0, 0, 1, 0|
    |0, 0, 0, 0, 0, 1|
    '''
    D = len(ydata)
    
    #fast hash ydata from strings ["anger", "disgust", ...] to [1, 2, ...]^T
    ydata = np.array(list(map(fastHashY, ydata))).reshape((-1,1))
    
    '''
    ymat is the column-wise repeat of ydata.
    e.g.
    |0|      |0, 0, 0, 0, 0, 0|
    |1|   -> |1, 1, 1, 1, 1, 1|
    ...      |................|
    |5|      |5, 5, 5, 5, 5, 5|
    ydata -> ymat
    '''
    ymat  = np.tile(ydata, (1, 6))
    
    '''
    ycmp is a matrix of which each row is [0, 1, 2, 3, 4, 5]
    |0, 1, 2, 3, 4, 5|
    |0, 1, 2, 3, 4, 5|
    |................|
    |0, 1, 2, 3, 4, 5|
    '''
    ycmp  = np.tile(np.array(range(6)), (D, 1))
    return np.int_(np.equal(ymat, ycmp))



In [20]:
def do_classify(trainX, trainY, vaildX, vaildY, knnFunc):
    predictY = knnFunc(trainX, trainY, vaildX)
    classifyY = np.zeros_like(predictY)
    for i, row in enumerate(predictY):
        m = 0
        idx = 0
        for j, v in enumerate(row):
            if v > m:
                m = v
                idx = j
        classifyY[i][idx] = 1
    #print("Predicted Y")
    #print(classifyY)
    #print("Correct Y")
    #print(vaildY)
    ret = np.sum(np.logical_and(classifyY, vaildY)) / vaildX.shape[0]
    print("Classification Accuracy: ", ret)
    return ret

In [21]:
def regressReadFile(filen: str) -> Tuple[List[str], List[str]]:
    with open(filen) as fd:
        reader = csv.reader(fd, delimiter=',')
        train_data = [list(row) for row in reader]
        train_data = train_data[1:]
        xdata = [row[0].split() for row in train_data]
        ydata = [[row[i] for i in range(1, 7)] for row in train_data]
        return xdata, ydata

def regressParseY(ydata: List[List[str]]) -> np.array:
    return np.float_(np.array(ydata))

#from scipy.stats.stats import pearsonr
def pearsonr(X, Y):

    X_bar = np.average(X)
    Y_bar = np.average(Y)
    X = np.subtract(X, X_bar)
    Y = np.subtract(Y, Y_bar)
    t1 = np.sum(np.dot(X, Y))
    t2 = np.sum(np.power(X,2))
    t3 = np.sum(np.power(Y,2))

    ret = t1 / np.power(t2 * t3, 0.5)
    return ret
def do_regress(trainX, trainY, vaildX, vaildY, knnFunc, save = False):
    predictY = knnFunc(trainX, trainY, vaildX)
    if save:
        np.savetxt("regress.csv", predictY, delimiter=",", fmt="%4f")
    r = [pearsonr(predictY[:, i], vaildY[:, i]) for i in range(6)]
    average = np.average(r)
    print("Correlation Coefficient: ", average)
    return average
    
    
train_filen = 'lab1_data/regression_dataset/train_set.csv'
vaild_filen = 'lab1_data/regression_dataset/validation_set.csv'
test_filen  = 'lab1_data/regression_dataset/test_set.csv'

In [22]:
#train_filen = 'lab1_data/classification_dataset/train_set.csv'
#vaild_filen = 'lab1_data/classification_dataset/validation_set.csv'
#test_filen  = 'lab1_data/classification_dataset/test_set.csv'
#
#trainX_data, trainY_data = classifyReadFile(train_filen)
#vaildX_data, vaildY_data = classifyReadFile(vaild_filen)
#
#ParseFuncs = {"OneHot": getOneHot, "TI-IDF": KNN_getTFIDF}
#K_val = range(1, 20)
#DisFuncs = {"Dis1": Dis1, "Dis2": Dis2, "DisInf": DisInf, "DisCosine": DisCosine}
#
#results = OrderedDict()
#
#for pfname, ParseFunc in ParseFuncs.items():
#    word_dict = OrderedDict()
#    def classifyParseX(fdata: List[List]): return ParseFunc(fdata, word_dict)
#    trainX, trainY = vectorizeData(trainX_data, trainY_data, classifyParseX, classifyParseY)
#    vaildX, vaildY = vectorizeData(vaildX_data, vaildY_data, classifyParseX, classifyParseY)
#    for K in K_val:
#        for dfname, DisFunc in DisFuncs.items():
#            print("ParseFunc = {}, K = {}, DisFunc = {}".format(pfname, K, dfname))
#            def knnFunc(trainX, trainY, vaildX): return KNN((trainX, trainY), vaildX, DisFunc, K, DisInvNormAvg)
#            ret = do_classify(trainX, trainY, vaildX, vaildY, knnFunc)
#            results[(pfname, K, dfname)] = ret

In [23]:
def autoTrain(train_filen, vaild_filen, ReadFileFunc, ParseYFunc, TrainFunc):
    print("Start training...")
    t = time()
    trainX_data, trainY_data = ReadFileFunc(train_filen)
    vaildX_data, vaildY_data = ReadFileFunc(vaild_filen)
    
    ParseFuncs = {"OneHot": getOneHot, "TI-IDF": KNN_getTFIDF}
    K_val = range(1, 20)
    DisFuncs = {"Dis1": Dis1, "Dis2": Dis2, "DisInf": DisInf, "DisCosine": DisCosine}
    #DisFuncs = {"DisCosine": DisCosine}
    results = OrderedDict()
    
    for pfname, ParseFunc in ParseFuncs.items():
        word_dict = OrderedDict()
        def ParseXFunc(fdata: List[List]): return ParseFunc(fdata, word_dict)
        trainX, trainY = vectorizeData(trainX_data, trainY_data, ParseXFunc, ParseYFunc)
        vaildX, vaildY = vectorizeData(vaildX_data, vaildY_data, ParseXFunc, ParseYFunc)
        for K in K_val:
            for dfname, DisFunc in DisFuncs.items():
                print("ParseFunc = {}, K = {}, DisFunc = {}".format(pfname, K, dfname))
                def knnFunc(trainX, trainY, vaildX): return KNN((trainX, trainY), vaildX, DisFunc, K, DisInvNormAvg)
                ret = TrainFunc(trainX, trainY, vaildX, vaildY, knnFunc)
                results[(pfname, K, dfname)] = ret
    print("{} groups of argument tested, spent {}s".format(len(ParseFuncs) * len(K_val) * len(DisFuncs), time() - t))
    return results

In [24]:
train_filen = 'lab1_data/regression_dataset/train_set.csv'
vaild_filen = 'lab1_data/regression_dataset/validation_set.csv'
test_filen  = 'lab1_data/regression_dataset/test_set.csv'
np.set_printoptions(threshold=np.nan)
regressResults = autoTrain(train_filen, vaild_filen, regressReadFile, regressParseY, do_regress)




Start training...
ParseFunc = OneHot, K = 1, DisFunc = Dis1
Correlation Coefficient:  0.19479721872198139
ParseFunc = OneHot, K = 1, DisFunc = Dis2
Correlation Coefficient:  0.19479721872198139
ParseFunc = OneHot, K = 1, DisFunc = DisInf


KeyboardInterrupt: 

In [None]:
train_filen = 'lab1_data/classification_dataset/train_set.csv'
vaild_filen = 'lab1_data/classification_dataset/validation_set.csv'
test_filen  = 'lab1_data/classification_dataset/test_set.csv'
classifyResults = autoTrain(train_filen, vaild_filen, classifyReadFile, classifyParseY, do_classify)



In [None]:
sorted(regressResults.items(), key=lambda kv: 0 if np.isnan(kv[1]) else kv[1] , reverse=True)


In [None]:
sorted(classifyResults.items(), key=lambda kv: kv[1], reverse=True)

In [25]:
graphviz = GraphvizOutput(output_file='filter_none.png')
with PyCallGraph(output=graphviz):

    train_filen = 'lab1_data/regression_dataset/train_set.csv'
    vaild_filen = 'lab1_data/regression_dataset/validation_set.csv'
    test_filen  = 'lab1_data/regression_dataset/test_set.csv'

    ReadFileFunc = regressReadFile
    ParseYFunc = regressParseY
    TrainFunc = do_regress

    ParseFunc = KNN_getTFIDF
    DisFunc = DisCosine
    K = 9

    trainX_data, trainY_data = ReadFileFunc(train_filen)
    vaildX_data, vaildY_data = ReadFileFunc(vaild_filen)
    word_dict = OrderedDict()
    def ParseXFunc(fdata: List[List]): return ParseFunc(fdata, word_dict)
    trainX, trainY = vectorizeData(trainX_data, trainY_data, ParseXFunc, ParseYFunc)
    vaildX, vaildY = vectorizeData(vaildX_data, vaildY_data, ParseXFunc, ParseYFunc)
    def knnFunc(trainX, trainY, vaildX): return KNN((trainX, trainY), vaildX, DisFunc, K, DisInvNormAvg)
    ret = TrainFunc(trainX, trainY, vaildX, vaildY, knnFunc, True)

Correlation Coefficient:  0.40703123556386966
