In [17]:
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
import nltk
# Fetch the NLTK 'stopwords' corpus; the stopwords.words(...) call further
# down in this notebook needs it to be available locally.
nltk.download('stopwords')

[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/kennethshinn/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


True

In [5]:
# The whole corpus for this walkthrough: two short, lower-case sentences.
documentA = 'the man went out for a walk'
documentB = 'the children sat around the fire'

In [6]:
# Tokenise each document on whitespace.
# split() with no argument collapses runs of whitespace and ignores leading or
# trailing blanks, so a stray double space can never produce an empty-string
# "word" (which split(' ') would). For the two documents above the result is
# identical to split(' ').
bagOfWordsA = documentA.split()
bagOfWordsB = documentB.split()

In [9]:
# Vocabulary: every distinct token that occurs in either document.
# It is built here, before being displayed, because this cell sits above the
# cell that assigns uniqueWords; without this line a fresh-kernel
# "Restart & Run All" fails with NameError at this point.
uniqueWords = set(bagOfWordsA).union(set(bagOfWordsB))
uniqueWords

{'a',
 'around',
 'children',
 'fire',
 'for',
 'man',
 'out',
 'sat',
 'the',
 'walk',
 'went'}

In [8]:
# Vocabulary of the corpus: the set union of both documents' tokens.
uniqueWords = set(bagOfWordsA) | set(bagOfWordsB)

In [10]:
# Per-document word counts over the shared vocabulary. Every vocabulary word
# starts at 0, so both dicts end up with exactly the same keys.
numOfWordsA = {term: 0 for term in uniqueWords}
for word in bagOfWordsA:
    numOfWordsA[word] = numOfWordsA[word] + 1

numOfWordsB = {term: 0 for term in uniqueWords}
for word in bagOfWordsB:
    numOfWordsB[word] = numOfWordsB[word] + 1

In [12]:
# Inspect document A's counts: words that occur only in document B are still
# present as keys, with a count of 0.
numOfWordsA

{'man': 1,
 'fire': 0,
 'out': 1,
 'walk': 1,
 'children': 0,
 'sat': 0,
 'went': 1,
 'a': 1,
 'the': 1,
 'for': 1,
 'around': 0}

In [20]:
# Peek at one of the stop-word lists shipped with the NLTK corpus.
# NOTE(review): the sample documents are English and the returned list is not
# used by any later cell shown here -- confirm 'german' is intended.
# NOTE(review): this import sits mid-notebook; consider moving it to the
# import cell at the top so dependencies are visible in one place.
from nltk.corpus import stopwords
stopwords.words('german')

['aber',
 'alle',
 'allem',
 'allen',
 'aller',
 'alles',
 'als',
 'also',
 'am',
 'an',
 'ander',
 'andere',
 'anderem',
 'anderen',
 'anderer',
 'anderes',
 'anderm',
 'andern',
 'anderr',
 'anders',
 'auch',
 'auf',
 'aus',
 'bei',
 'bin',
 'bis',
 'bist',
 'da',
 'damit',
 'dann',
 'der',
 'den',
 'des',
 'dem',
 'die',
 'das',
 'dass',
 'daß',
 'derselbe',
 'derselben',
 'denselben',
 'desselben',
 'demselben',
 'dieselbe',
 'dieselben',
 'dasselbe',
 'dazu',
 'dein',
 'deine',
 'deinem',
 'deinen',
 'deiner',
 'deines',
 'denn',
 'derer',
 'dessen',
 'dich',
 'dir',
 'du',
 'dies',
 'diese',
 'diesem',
 'diesen',
 'dieser',
 'dieses',
 'doch',
 'dort',
 'durch',
 'ein',
 'eine',
 'einem',
 'einen',
 'einer',
 'eines',
 'einig',
 'einige',
 'einigem',
 'einigen',
 'einiger',
 'einiges',
 'einmal',
 'er',
 'ihn',
 'ihm',
 'es',
 'etwas',
 'euer',
 'eure',
 'eurem',
 'euren',
 'eurer',
 'eures',
 'für',
 'gegen',
 'gewesen',
 'hab',
 'habe',
 'haben',
 'hat',
 'hatte',
 'hatten',
 '

In [21]:
def computeTF(wordDict, bagOfWords):
    """Compute the term frequency of every word for one document.

    Args:
        wordDict: Mapping of word -> number of occurrences in the document.
        bagOfWords: The document's tokens; only its length is used, as the
            normalising denominator.

    Returns:
        A new dict mapping each key of ``wordDict`` to
        ``count / len(bagOfWords)``. When ``bagOfWords`` is empty every
        frequency is ``0.0`` rather than raising ``ZeroDivisionError``.
    """
    totalWords = len(bagOfWords)
    if totalWords == 0:
        # An empty document contains no terms, so every frequency is zero.
        return {word: 0.0 for word in wordDict}
    return {word: count / float(totalWords) for word, count in wordDict.items()}

In [22]:
# Term frequencies per document: each count divided by that document's length.
tfA = computeTF(numOfWordsA, bagOfWordsA)
tfB = computeTF(numOfWordsB, bagOfWordsB)

In [28]:
# Sanity check: every token of a document is in the vocabulary, so each
# document's term frequencies must add up to 1 (up to float rounding).
# A bare expression that is not the last line of a cell is silently discarded,
# so both documents are checked with assertions instead.
assert abs(sum(tfA.values()) - 1.0) < 1e-9
assert abs(sum(tfB.values()) - 1.0) < 1e-9
# The last expression is what the cell displays.
sum(tfB.values())

0.9999999999999999

In [30]:
def computeIDF(documents):
    """Compute the inverse document frequency of every word in a corpus.

    Args:
        documents: List with one dict per document, each mapping
            word -> count in that document. A word is considered present in
            a document when its count is greater than 0.

    Returns:
        Dict mapping every word that appears as a key in any document to
        ``log(N / df)``, where ``N`` is the number of documents and ``df`` is
        the number of documents containing the word. A word contained in no
        document gets ``0.0`` (instead of raising ``ZeroDivisionError``), and
        an empty corpus yields ``{}``.
    """
    import math

    N = len(documents)

    # Document frequency per word. Keys are collected from every document,
    # not only the first, so documents with differing key sets are handled
    # without a KeyError.
    docFreq = {}
    for document in documents:
        for word, val in document.items():
            docFreq.setdefault(word, 0)
            if val > 0:
                docFreq[word] += 1

    return {
        word: math.log(N / float(df)) if df > 0 else 0.0
        for word, df in docFreq.items()
    }


In [31]:
# IDF over the two-document corpus, computed from the per-document counts.
idfs = computeIDF([numOfWordsA, numOfWordsB])

In [32]:
# 'the' is the only word found in both documents, so its IDF is log(2/2) = 0;
# every other word occurs in exactly one document and gets log(2/1).
idfs

{'man': 0.6931471805599453,
 'fire': 0.6931471805599453,
 'out': 0.6931471805599453,
 'walk': 0.6931471805599453,
 'children': 0.6931471805599453,
 'sat': 0.6931471805599453,
 'went': 0.6931471805599453,
 'a': 0.6931471805599453,
 'the': 0.0,
 'for': 0.6931471805599453,
 'around': 0.6931471805599453}

In [34]:
def computeTFIDF(tfBagOfWords, idfs):
    """Weight one document's term frequencies by the corpus IDF values.

    Args:
        tfBagOfWords: Mapping of word -> term frequency for one document.
        idfs: Mapping of word -> inverse document frequency; it must contain
            every word of ``tfBagOfWords`` (a missing word raises KeyError).

    Returns:
        A new dict mapping each word of ``tfBagOfWords`` to ``tf * idf``.
    """
    return {term: tf * idfs[term] for term, tf in tfBagOfWords.items()}

In [35]:
# TF-IDF per document, then stacked into a table: one row per document
# (A first, then B) and one column per vocabulary word.
tfidfA = computeTFIDF(tfA, idfs)
tfidfB = computeTFIDF(tfB, idfs)
df = pd.DataFrame([tfidfA, tfidfB])


In [36]:
# Hand-computed TF-IDF table. 'the' scores 0 in both rows because its IDF is 0.
df

Unnamed: 0,man,fire,out,walk,children,sat,went,a,the,for,around
0,0.099021,0.0,0.099021,0.099021,0.0,0.0,0.099021,0.099021,0.0,0.099021,0.0
1,0.0,0.115525,0.0,0.0,0.115525,0.115525,0.0,0.0,0.0,0.0,0.115525


In [37]:
# Run the same two documents through scikit-learn's TfidfVectorizer for
# comparison with the hand-computed table.
vectorizer = TfidfVectorizer()
vectors = vectorizer.fit_transform([documentA, documentB])
# get_feature_names() was deprecated in scikit-learn 1.0 and removed in 1.2;
# get_feature_names_out() is its replacement. It returns a NumPy array, so
# .tolist() keeps feature_names a plain list of str as before.
feature_names = vectorizer.get_feature_names_out().tolist()
dense = vectors.todense()
denselist = dense.tolist()
# NOTE: this rebinds `df`, replacing the hand-computed table built earlier.
df = pd.DataFrame(denselist, columns=feature_names)

In [38]:
# scikit-learn's TF-IDF table. The values differ from the hand-computed one:
# per the TfidfVectorizer documentation its defaults smooth the IDF, apply L2
# normalisation to each row, and tokenise with a pattern that skips
# one-character tokens (hence no 'a' column and a non-zero score for 'the').
df

Unnamed: 0,around,children,fire,for,man,out,sat,the,walk,went
0,0.0,0.0,0.0,0.42616,0.42616,0.42616,0.0,0.303216,0.42616,0.42616
1,0.407401,0.407401,0.407401,0.0,0.0,0.0,0.407401,0.579739,0.0,0.0
