# 作業目標：搭建一個 TF-IDF 模型

### Reference:
https://towardsdatascience.com/natural-language-processing-feature-engineering-using-tf-idf-e8b9d00e7e76

In [1]:
import nltk
# Two short example documents; together they form the corpus used below.
documentA = 'the man went out for a walk'
documentB = 'the children sat around the fire'

## 1. 首先我們做 tokenize，並取出所有文件中的單詞

In [2]:
# Split each document into its list of word tokens.
tokenize_A = nltk.word_tokenize(documentA)
tokenize_B = nltk.word_tokenize(documentB)

# Vocabulary: every distinct word that occurs in any of the documents.
uniqueWords = set(tokenize_A) | set(tokenize_B)

In [3]:
uniqueWords

{'a',
 'around',
 'children',
 'fire',
 'for',
 'man',
 'out',
 'sat',
 'the',
 'walk',
 'went'}

## 2. 計算每個文件中，所有 uniqueWords 出現的次數

In [10]:
# One counter dict per document, with every vocabulary word starting at zero.
numOfWordsA = {vocab_word: 0 for vocab_word in uniqueWords}
numOfWordsB = {vocab_word: 0 for vocab_word in uniqueWords}

numOfWordsA, numOfWordsB

({'children': 0,
  'for': 0,
  'man': 0,
  'the': 0,
  'sat': 0,
  'a': 0,
  'out': 0,
  'around': 0,
  'fire': 0,
  'went': 0,
  'walk': 0},
 {'children': 0,
  'for': 0,
  'man': 0,
  'the': 0,
  'sat': 0,
  'a': 0,
  'out': 0,
  'around': 0,
  'fire': 0,
  'went': 0,
  'walk': 0})

In [11]:
# Tally the occurrences of each token in the counter dict of its own document.
for counts, tokens in ((numOfWordsA, tokenize_A), (numOfWordsB, tokenize_B)):
    for token in tokens:
        counts[token] += 1

In [12]:
numOfWordsA

{'children': 0,
 'for': 1,
 'man': 1,
 'the': 1,
 'sat': 0,
 'a': 1,
 'out': 1,
 'around': 0,
 'fire': 0,
 'went': 1,
 'walk': 1}

In [13]:
numOfWordsB

{'children': 1,
 'for': 0,
 'man': 0,
 'the': 2,
 'sat': 1,
 'a': 0,
 'out': 0,
 'around': 1,
 'fire': 1,
 'went': 0,
 'walk': 0}

## 3. 定義 function: 計算 TF

In [15]:
def computeTF(wordDict, tokenize_item):
    """Compute the term frequency (TF) of every word for one document.

    Args:
        wordDict: dict mapping each word to the number of times it occurs
            in the document.
        tokenize_item: the tokenized document; only its length (the total
            number of tokens) is used.

    Returns:
        A dict mapping every word of ``wordDict`` to
        ``count / len(tokenize_item)``. When ``tokenize_item`` is empty,
        every word maps to 0.0 instead of raising ZeroDivisionError.
    """
    total_tokens = len(tokenize_item)  # number of tokens in the document
    if total_tokens == 0:
        # An empty document contains no occurrences of anything.
        return {word: 0.0 for word in wordDict}
    # TF = occurrences of the word in the document / total tokens in the document
    return {word: count / total_tokens for word, count in wordDict.items()}

## 4. 定義 function: 計算 IDF

In [16]:
def computeIDF(documentsDict):
    """Compute the inverse document frequency (IDF) of every word.

    Args:
        documentsDict: a list with one word-count dict per document
            (word -> number of occurrences in that document).

    Returns:
        A dict mapping every word seen as a key in any document to
        ``log(N / df)``, where ``N`` is the number of documents and ``df``
        is the number of documents in which the word's count is > 0.
        A word whose count is 0 in every document gets 0.0 (instead of
        raising ZeroDivisionError). An empty document list yields ``{}``.
    """
    import math
    N = len(documentsDict)  # total number of documents

    # Document frequency: in how many documents does each word occur?
    # Keys are collected from every document (not only the first one) so
    # that documents with differing vocabularies do not raise KeyError.
    docFreq = {}
    for document in documentsDict:
        for word, val in document.items():
            docFreq.setdefault(word, 0)
            if val > 0:
                docFreq[word] += 1

    idfDict = {}
    for word, df in docFreq.items():
        # IDF = log(total documents / documents containing the word);
        # guard df == 0 so a never-occurring word cannot divide by zero.
        idfDict[word] = math.log(N / df) if df > 0 else 0.0
    return idfDict

## 5. 定義 function: 計算 TF-IDF

In [17]:
def computeTFIDF(tf_item, idfs):
    """Combine term frequencies with IDF weights into TF-IDF scores.

    Args:
        tf_item: dict mapping each word to its term frequency in one document.
        idfs: dict mapping each word to its inverse document frequency; it
            must contain every word present in ``tf_item``.

    Returns:
        A dict mapping every word of ``tf_item`` to ``tf * idf``.
    """
    return {term: freq * idfs[term] for term, freq in tf_item.items()}

## 6. 計算

In [18]:
# IDF weights are shared by the whole corpus, so compute them once.
idfs = computeIDF([numOfWordsA, numOfWordsB])

# Document A: term frequencies, then TF-IDF scores.
tfA = computeTF(numOfWordsA, tokenize_A)
tfidfA = computeTFIDF(tfA, idfs)

# Document B: term frequencies, then TF-IDF scores.
tfB = computeTF(numOfWordsB, tokenize_B)
tfidfB = computeTFIDF(tfB, idfs)

In [19]:
tfidfA

{'children': 0.0,
 'for': 0.09902102579427789,
 'man': 0.09902102579427789,
 'the': 0.0,
 'sat': 0.0,
 'a': 0.09902102579427789,
 'out': 0.09902102579427789,
 'around': 0.0,
 'fire': 0.0,
 'went': 0.09902102579427789,
 'walk': 0.09902102579427789}

In [22]:
tfidfB

{'children': 0.11552453009332421,
 'for': 0.0,
 'man': 0.0,
 'the': 0.0,
 'sat': 0.11552453009332421,
 'a': 0.0,
 'out': 0.0,
 'around': 0.11552453009332421,
 'fire': 0.11552453009332421,
 'went': 0.0,
 'walk': 0.0}