In [11]:
    from ai_base import List2CSV, CSV2List2
    from gensim.models.doc2vec import Doc2Vec, TaggedDocument
    from typing import List, Tuple
    import csv
    import numpy as np
def doc2vec(trainData: List[List], testData: List[List], savePath = "",fname = "doc2vec.model")-> Tuple[np.array, np.array]:
    """
    Build Doc2Vec features for the training and test documents.

    Every training document is wrapped in a TaggedDocument whose single tag
    is its position in trainData. A Doc2Vec model is constructed from those
    tagged documents, saved to `fname`, and then used to infer one vector
    per document in both trainData and testData.

    Parameters:
        trainData: token lists used to build the model.
        testData:  token lists that are only embedded with the model.
        savePath:  prefix prepended to the two feature-file names. It is
                   NOT applied to the model file.
        fname:     path handed to model.save().

    Returns:
        (trainX, testX): arrays of shape (len(trainData), 50) and
        (len(testData), 50).

    Side effects:
        Saves the model and passes both feature matrices to List2CSV
        (presumably a CSV writer from ai_base -- not visible here).
    """
    dim = 50
    tagged_docs = [TaggedDocument(tokens, [idx]) for idx, tokens in enumerate(trainData)]
    model = Doc2Vec(tagged_docs, vector_size=dim, window=2, min_count=1, workers=12)
    model.save(fname)

    def _infer_all(corpus):
        # One row per document, filled in corpus order.
        matrix = np.zeros((len(corpus), dim))
        for row_idx, tokens in enumerate(corpus):
            matrix[row_idx] = model.infer_vector(tokens)
        return matrix

    # Training documents are inferred before test documents.
    trainX = _infer_all(trainData)
    testX = _infer_all(testData)

    train_file = savePath + "doc2vec" + "TrainSet" + str(dim) + "D" + str(len(trainData)) + "L.csv"
    test_file = savePath + "doc2vec" + "TestSet" + str(dim) + "D" + str(len(testData)) + "L.csv"
    List2CSV(train_file, trainX)
    List2CSV(test_file, testX)
    return trainX, testX

IndentationError: expected an indented block (<ipython-input-11-8eb530577754>, line 7)

In [None]:
from collections import OrderedDict
def _getTFIDF(fdata: List[List], word_dict: OrderedDict) -> np.array:
    '''
    获取TF-IDF矩阵，并将每个单词及出现次数存储到word_dict中
    '''
    #首先获取文章数和单词向量
    #使用OrderedDict按单词出现的顺序生成单词列表
    #相比于使用list，好处在于每次判断word是否已经加入单词向量是log(n)复杂度
    #文章数
    D = len(fdata)
    if len(word_dict) is 0:
        #训练集
        for row in fdata:
            for word in row:
                if not word in word_dict:
                    word_dict[word] = 1
                else:
                    word_dict[word] += 1
        word_dict[None] = 0
    else:
        #验证集和测试集，丢弃未出现的单词
        word_dict = dict(zip(word_dict.keys(), [0 for _ in word_dict.values()]))
        for row in fdata:
            for word in row:
                if word in word_dict:
                    word_dict[word] += 1
                else:
                    word_dict[None] += 1
    #word_vec是单词向量
    word_vec = word_dict.keys()
    #word_order的键值是当前单词的序号，在生成TF矩阵时会用到
    word_order = dict(zip(word_vec,range(len(word_vec))))
    #生成TF矩阵
    TF = np.zeros((D,len(word_dict)))
    for i,row in enumerate(fdata):
        for word in row:
            if word in word_order:
                TF[i][word_order[word]] += 1
            else:
                TF[i][word_order[None]] += 1
        #每个文章中单词出现次数归一化
        TF[i] /= len(fdata[i])
    #生成IDF矩阵
    IDF = np.log2(D / (1 + np.array(list(word_dict.values()))))
    TF = np.float16(TF)
    IDF = np.float16(IDF)
    #生成TF-IDF矩阵
    TF_IDF = np.multiply(TF, IDF)
    return TF_IDF

In [None]:
def TFIDF(trainData: List[List], testData: List[List], savePath="")-> Tuple[np.array, np.array]:
    """
    Compute TF-IDF features for the training and test documents.

    The vocabulary is built from `trainData` by the first _getTFIDF call
    and reused for `testData` by the second.

    Parameters:
        trainData: list of documents (lists of words) defining the vocabulary.
        testData:  list of documents encoded with that vocabulary.
        savePath:  prefix prepended to the two output file names.

    Returns:
        (trainX, testX): the two TF-IDF matrices.

    Side effects:
        Passes both matrices to List2CSV (presumably a CSV writer from
        ai_base -- not visible here). File names embed the vocabulary size
        and the number of documents.
    """
    wdict = OrderedDict()
    trainX = _getTFIDF(trainData, wdict)
    testX = _getTFIDF(testData, wdict)
    vocab_size = str(len(wdict))
    List2CSV(savePath + "tfidf" + "TrainSet" + vocab_size + "D" + str(len(trainData)) + "L.csv", trainX)
    # Write the computed test features; the raw token lists in `testData`
    # were being written here by mistake.
    List2CSV(savePath + "tfidf" + "TestSet" + vocab_size + "D" + str(len(testData)) + "L.csv", testX)
    return trainX, testX


In [None]:
def _getOneHot(fdata: List[List], word_dict: OrderedDict) -> np.array:
    D = len(fdata)
    if len(word_dict) is 0:
        for row in fdata:
            for word in row:
                if not word in word_dict:
                    word_dict[word] = 1
        word_dict[None] = 0
    else:
        word_dict = dict(zip(word_dict.keys(), [0 for _ in word_dict.values()]))
        for row in fdata:
            for word in row:
                if word in word_dict:
                    word_dict[word] = 1
                else:
                    word_dict[None] = 1
    word_vec = word_dict.keys()
    word_order = dict(zip(word_vec,range(len(word_vec))))
    oneHot = np.zeros((D,len(word_dict)), dtype=np.int8)
    for i,row in enumerate(fdata):
        for word in row:
            if word in word_order:
                oneHot[i][word_order[word]] = 1
            else:
                oneHot[i][word_order[None]] = 1
    return oneHot

In [None]:
def OneHot(trainData: List[List], testData: List[List], savePath="")-> Tuple[np.array, np.array]:
    """
    Compute binary bag-of-words features for the training and test documents.

    The vocabulary is built from `trainData` by the first _getOneHot call
    and reused for `testData` by the second.

    Parameters:
        trainData: list of documents (lists of words) defining the vocabulary.
        testData:  list of documents encoded with that vocabulary.
        savePath:  prefix prepended to the two output file names. The
                   literal string "-1" disables saving.

    Returns:
        (trainX, testX): the two binary matrices.

    Side effects:
        Unless savePath == "-1", passes both matrices to List2CSV
        (presumably a CSV writer from ai_base -- not visible here).
    """
    wdict = OrderedDict()
    trainX = _getOneHot(trainData, wdict)
    testX = _getOneHot(testData, wdict)
    if savePath != "-1":
        vocab_size = str(len(wdict))
        List2CSV(savePath + "onehot" + "TrainSet" + vocab_size + "D" + str(len(trainData)) + "L.csv", trainX)
        # Write the computed test features; the raw token lists in
        # `testData` were being written here by mistake.
        List2CSV(savePath + "onehot" + "TestSet" + vocab_size + "D" + str(len(testData)) + "L.csv", testX)
    return trainX, testX


In [12]:
# Earlier hand-rolled loader, kept for reference only (superseded by the
# CSV2List2 calls below):
# import csv
# ret = []
# with open('data/2/clean/trainDataclean.csv', 'r') as f:
#     reader = csv.reader(f)
#     for row in reader:
#         ret.append(list(row))
# Load the cleaned training and test documents.
# Presumably each becomes a list of rows, one token list per document --
# CSV2List2 comes from ai_base and is not visible here; confirm.
trainData = CSV2List2('data/2/clean/trainDataclean.csv')
testData = CSV2List2('data/2/clean/testDataclean.csv')
# Optional lower-casing, currently disabled. The second line must assign to
# testData, otherwise the training rows would be overwritten with test rows.
# trainData = [[w.lower() for w in row] for row in trainData]
# testData = [[w.lower() for w in row] for row in testData]


In [71]:
# Sanity check: number of training documents loaded.
len(trainData)

24000

In [7]:
# Doc2Vec features for both splits; also saves the model file and passes
# both feature matrices to List2CSV as side effects.
trainX, testX = doc2vec(trainData, testData)

In [9]:
# TF-IDF features for both splits, using the vocabulary built from trainData.
trainX2, testX2 = TFIDF(trainData, testData)

In [13]:
# Binary bag-of-words features for both splits, using the vocabulary built
# from trainData.
trainX3, testX3 = OneHot(trainData, testData)

In [17]:
# Sanity check: one feature row per training document.
len(trainX3)

24000