In [1]:
import os
from sklearn.feature_extraction.text import CountVectorizer, TfidfTransformer
import pandas as pd
import json

In [2]:
# Relative path of the directory that holds the text documents to analyse.
path = "./docs"

In [3]:
# Read every regular file found in `path` (sorted by file name) into memory.
#   docs      -- list with the full text of each document, in sorted file-name order
#   docs_dict -- maps the label 'doc_<i>' to the file name whose text is docs[i]
docs_dict = {}
docs = []

# Keep regular files only: a sub-directory inside `path` would make open() raise.
# Filtering before enumerate() keeps the doc_<i> labels contiguous and aligned
# with the positions in `docs`.
filenames = [
    name
    for name in sorted(os.listdir(path))
    if os.path.isfile(os.path.join(path, name))
]

for i, filename in enumerate(filenames):
    # Explicit encoding so the decoded text (and hence the vocabulary) does not
    # depend on the platform's locale default; undecodable bytes are replaced
    # instead of aborting the whole load.
    with open(os.path.join(path, filename), encoding='utf-8', errors='replace') as file:
        docs.append(file.read())
    docs_dict['doc_' + str(i)] = filename

In [4]:
# Show which 'doc_<i>' label was assigned to which file name.
print(docs_dict)

{'doc_0': 'Douglas Adams - Hitchhikers Trilogy - Hitchhikers Guide to the Galaxy.txt', 'doc_1': 'Douglas Adams - Hitchhikers Trilogy - Mostly Harmless.txt', 'doc_2': 'Douglas Adams - Hitchhikers Trilogy - Restaurant End of the Universe.txt', 'doc_3': 'Douglas Adams - Hitchhikers Trilogy - So Long, and Thanks for All the Fish.txt', 'doc_4': 'Douglas Adams - The Long Dark Tea Time of the Soul.txt', 'doc_5': 'Edwin Arnold - Guliver of Mars.txt', 'doc_6': 'Jane Austen - Northanger Abbey.txt', 'doc_7': 'Jane Austen - Persuasion.txt', 'doc_8': 'Jane Austen - Pride and Prejudice.txt', 'doc_9': 'Poul Anderson - In Memoriam.txt'}


In [5]:
# Term-count vectorizer, created with the library's default settings (no arguments passed).
vectorizer = CountVectorizer()

In [6]:
# TF-IDF weighting step, also created with default settings; it is fitted further down on the count matrix.
transformer = TfidfTransformer()

In [7]:
# Learn the vocabulary from all documents and build the count matrix
# (one row per document, one column per vocabulary term).
count = vectorizer.fit_transform(docs)

In [8]:
# Display first 20 cols from count matrix
# The sparse matrix exposes .shape directly, and slicing before .toarray() converts
# only the 20 displayed columns instead of densifying the whole matrix twice.
print(count.shape)
print(count[:, :20].toarray())

(10, 23138)
[[ 0  1  0  1  0  1  0  0  0  0  0  0  0  0  0  1  0  2  0  0]
 [ 0  2  1  0  0  1  0  0  0  0  0  0  0  0  0  1  0  1  0  0]
 [ 0  0  0  0  0  1  0  1  0  0  0  0  0  0  0  1  0  1  0  0]
 [ 0  0  0  0  0  2  1  0  0  0  0  0  0  0  0  2  0  1  1  1]
 [ 1  1  0  0  0  1  3  0  2  1  1 12  1 14  5  1  3  1  0  0]
 [ 0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0]
 [ 0  0  0  0  0  1  0  0  0  0  0  0  0  0  0  1  0  1  0  0]
 [ 0  0  0  0  0  1  0  0  0  0  0  0  0  0  0  1  0  1  0  0]
 [ 0  9  0  0  0  5  1  0  0  0  0  0  0  0  0  3  0  3  0  0]
 [ 0  4  0  0  1  3  2  0  0  0  0  0  0  0  0  1  0  0  0  0]]


In [9]:
# Vectorizer vocabulary (term -> index) rebuilt as a dict whose keys are in sorted term order
vocabulary = {
    term: vectorizer.vocabulary_[term]
    for term in sorted(vectorizer.vocabulary_)
}

In [10]:
# Print the first 50 (term, index) pairs of the sorted vocabulary as a dict
print(dict(list(vocabulary.items())[:50]))

{'00': 0, '000': 1, '004': 2, '03758': 3, '040700': 4, '10': 5, '100': 6, '1001': 7, '1061': 8, '1076': 9, '1078': 10, '1080': 11, '1081': 12, '1082': 13, '1086': 14, '11': 15, '1105': 16, '12': 17, '121': 18, '123': 19, '124': 20, '126': 21, '127': 22, '13': 23, '132': 24, '14': 25, '15': 26, '15th': 27, '16': 28, '17': 29, '1760': 30, '1784': 31, '1785': 32, '1787': 33, '1789': 34, '1791': 35, '1797': 36, '18': 37, '1800': 38, '1803': 39, '1806': 40, '1810': 41, '1811': 42, '1812': 43, '1813': 44, '1814': 45, '1816': 46, '1818': 47, '1870': 48, '18th': 49}


In [11]:
# Write the sorted vocabulary to 'vocabulary.json' as indented JSON
with open('vocabulary.json', 'w') as file:
    json.dump(obj=vocabulary, fp=file, indent=4)

In [12]:
# Fit the TF-IDF transformer on the raw counts and re-weight them in one step;
# the result keeps the same (documents x terms) shape as `count`.
tfidf_matrix = transformer.fit_transform(count)

In [13]:
# Display first 6 cols from tf-idf matrix
# Read the shape from the sparse matrix and slice before .toarray(), so only the
# 6 displayed columns are converted instead of densifying the whole matrix twice.
print(tfidf_matrix.shape)
print(tfidf_matrix[:, :6].toarray())

(10, 23138)
[[0.         0.00038137 0.         0.00064223 0.         0.00026008]
 [0.         0.00053582 0.00045116 0.         0.         0.0001827 ]
 [0.         0.         0.         0.         0.         0.00021665]
 [0.         0.         0.         0.         0.         0.00053189]
 [0.00038013 0.00022573 0.         0.         0.         0.00015394]
 [0.         0.         0.         0.         0.         0.        ]
 [0.         0.         0.         0.         0.         0.0001583 ]
 [0.         0.         0.         0.         0.         0.000143  ]
 [0.         0.0013111  0.         0.         0.         0.00049673]
 [0.         0.01541767 0.         0.         0.00649088 0.0078856 ]]


In [14]:
# transform tf-idf matrix for df
# Transpose to (terms x documents) and convert to a dense array for the DataFrame.
# .toarray() yields a plain ndarray; .todense() returned a numpy.matrix, a class
# that NumPy no longer recommends using. The values are identical.
tfidf_npmatrix = tfidf_matrix.T.toarray()

In [15]:
# Vocabulary terms as reported by the fitted vectorizer; used below as the DataFrame index
# (one row label per term).
feature_names = vectorizer.get_feature_names_out()

In [16]:
# docs names for df as cols
# Take the labels in dict insertion order, which is the order the documents were
# read and therefore the document order of the matrices. sorted() would compare the
# labels as strings ('doc_10' < 'doc_2') and mislabel the columns as soon as there
# are more than 10 documents.
docs_names = list(docs_dict)

In [17]:
# TF-IDF weights as a DataFrame: one row per vocabulary term, one column per document label
df = pd.DataFrame(
    data=tfidf_npmatrix,
    index=feature_names,
    columns=docs_names,
)
df

Unnamed: 0,doc_0,doc_1,doc_2,doc_3,doc_4,doc_5,doc_6,doc_7,doc_8,doc_9
00,0.000000,0.000000,0.000000,0.0,0.000380,0.0,0.0,0.0,0.000000,0.000000
000,0.000381,0.000536,0.000000,0.0,0.000226,0.0,0.0,0.0,0.001311,0.015418
004,0.000000,0.000451,0.000000,0.0,0.000000,0.0,0.0,0.0,0.000000,0.000000
03758,0.000642,0.000000,0.000000,0.0,0.000000,0.0,0.0,0.0,0.000000,0.000000
040700,0.000000,0.000000,0.000000,0.0,0.000000,0.0,0.0,0.0,0.000000,0.006491
...,...,...,...,...,...,...,...,...,...,...
zoom,0.000546,0.000384,0.000000,0.0,0.000000,0.0,0.0,0.0,0.000000,0.000000
zowee,0.000000,0.000000,0.000535,0.0,0.000000,0.0,0.0,0.0,0.000000,0.000000
zwingler,0.000000,0.000902,0.000000,0.0,0.000000,0.0,0.0,0.0,0.000000,0.000000
zz,0.001927,0.000000,0.000000,0.0,0.000000,0.0,0.0,0.0,0.000000,0.000000
