In [1]:
import numpy as np
import csv
from collections import OrderedDict
import copy
import pandas as pd
from time import time
from typing import Callable
from typing import Any
from typing import Dict, Tuple, List
import cProfile
import re
import os

In [2]:
# Record a wall-clock timestamp (seconds since the epoch) when this cell runs.
# `t` is not read by any cell visible here — presumably used to time the run; confirm against later cells.
t = time()

In [3]:
def readFile(filen: str) -> List[List]:
    '''
    Read a space-delimited text file into memory.

    Building the TF matrix first requires the number of documents and the
    vocabulary size, so the text has to be traversed twice. To avoid reading
    the file from disk twice, the whole content is loaded into a list once.

    Parameters:
        filen: path of the file to read.

    Returns:
        One list of string fields per line of the file, split on single
        spaces by ``csv.reader``.

    Raises:
        OSError: if the file cannot be opened.
    '''
    # newline='' is what the csv module requires of file objects: it lets the
    # reader do its own line-ending handling, so line breaks embedded in
    # quoted fields are preserved instead of being rewritten by the text layer.
    with open(filen, newline='') as fd:
        # csv.reader already yields each row as a fresh list of strings.
        return list(csv.reader(fd, delimiter=' '))

In [28]:
def getTFIDF(fdata: List[List]) -> np.ndarray:
    '''
    Compute the TF-IDF matrix of a tokenized corpus.

    Parameters:
        fdata: list of documents, each document being a list of words.

    Returns:
        A float array of shape (number of documents, number of distinct
        words). Columns follow the order in which words first appear in the
        corpus. Entry [i, j] is TF[i, j] * IDF[j], where TF[i, j] is the
        count of word j in document i divided by the length of document i,
        and IDF[j] = log(D / (1 + number of documents containing word j)).
        A document with no words yields an all-zero row.
    '''
    # Assign every distinct word a column index in first-appearance order.
    # Dict membership tests are O(1) on average, unlike scanning a list.
    word_order = {}
    for row in fdata:
        for word in row:
            if word not in word_order:
                word_order[word] = len(word_order)

    # Number of documents.
    D = len(fdata)
    vocab_size = len(word_order)

    TF = np.zeros((D, vocab_size))
    # doc_freq[j] = number of documents in which word j occurs at least once.
    doc_freq = np.zeros(vocab_size)
    for i, row in enumerate(fdata):
        for word in row:
            TF[i, word_order[word]] += 1
        # IDF is defined on document frequency, so a word counts once per
        # document no matter how many times it is repeated inside it.
        doc_freq += (TF[i] > 0)
        # Normalize raw counts by document length; an empty document keeps
        # its all-zero row instead of dividing by zero and producing NaN.
        if row:
            TF[i] /= len(row)

    # Smoothed inverse document frequency.
    IDF = np.log(D / (1 + doc_freq))

    # IDF broadcasts across the rows of TF.
    return np.multiply(TF, IDF)

In [37]:
# Sanity check on a two-document toy corpus; the bare expression makes the resulting matrix the cell's output.
getTFIDF([["b", "c"], ["a", "c"]])

array([[ 0.        , -0.20273255,  0.        ],
       [ 0.        , -0.20273255,  0.        ]])

In [5]:
# Load the corpus from disk and compute its TF-IDF matrix.
semval = readFile('lab1_data/semeval_sliced.txt')
ret = getTFIDF(semval)
# NOTE(review): a disabled experiment that used to sit here tried to drop zero
# entries from every row before saving (it tested `row != 0` where `w != 0` was
# presumably intended). It was dead code; the full dense matrix is written as-is.
# Write one matrix row per line, values separated by a single space, formatted with "%6f".
np.savetxt("15323032_LiXinrui_TFIDF.txt", ret, delimiter=" ", fmt="%6f")

what
