In [1]:
## Tutorial inspired by https://programminghistorian.org/en/lessons/analyzing-documents-with-tfidf
# by Matthew Lavin
#
# We will analyze the first 3 chapters and the preface from the following book:
# The Little World of Don Camillo, by Giovannino Guareschi (1950)
# It also became a movie in 1963
#
# You can locate the entire text of this gem of a book at:
#     https://archive.org/stream/TheLittleWorldOfDonCamillo/doncamillolittleworld_djvu.txt

In [2]:
from pathlib import Path

# Collect every chapter file under data/txt whose name starts with "lit".
# rglob() yields paths in filesystem order, which differs between machines;
# sorting pins the document order so that row i of the TF-IDF matrix (and the
# later score_<i> columns) always refers to the same file.
all_txt_files = sorted(
    file for file in Path("data/txt").rglob("lit*.txt") if file.is_file()
)
# counts the length of the list
n_files = len(all_txt_files)
print(n_files)

4


In [3]:
# Preview (at most) the first 16 discovered paths.
all_txt_files[:16]

[PosixPath('data/txt/little-world-of-don-camillo-guareschi-c0.txt'),
 PosixPath('data/txt/little-world-of-don-camillo-guareschi-c1.txt'),
 PosixPath('data/txt/little-world-of-don-camillo-guareschi-c2.txt'),
 PosixPath('data/txt/little-world-of-don-camillo-guareschi-preface.txt')]

In [4]:
# Load the full text of every file, in the same order as all_txt_files, so
# that all_docs[i] is the content of all_txt_files[i].
# The encoding is given explicitly: a bare open() decodes with the platform's
# locale encoding, which makes the notebook's results machine-dependent.
all_docs = [txt_file.read_text(encoding="utf-8") for txt_file in all_txt_files]

In [5]:
# Import CountVectorizer and TfidfVectorizer from scikit-learn.
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer

In [6]:
# Raw term counts (no IDF weighting) for the loaded documents, using
# CountVectorizer's default settings.
countvectorizer = CountVectorizer()
counts = countvectorizer.fit_transform(all_docs) # matrix of token counts
# Optional inspection helpers (uncomment to use):
# print(countvectorizer.get_feature_names_out())  # vocabulary; same accessor the TF-IDF cells use
# print(counts.toarray())
# len(counts.toarray())     # number of rows
# len(counts.toarray()[0])  # number of columns

In [7]:
# TF-IDF configuration, one setting per line so each choice is easy to tune.
vectorizer = TfidfVectorizer(
    max_df=0.65,      # upper document-frequency cutoff, as a proportion of documents
    min_df=1,         # lower document-frequency cutoff, as an absolute document count
    stop_words=None,  # no stop-word list is applied
    use_idf=True,     # enable inverse-document-frequency reweighting
    norm=None,        # leave the scores un-normalised
)
# Learn the vocabulary/IDF weights and score every document in one pass.
transformed_documents = vectorizer.fit_transform(all_docs)

In [8]:
# Convert the vectorizer output to a dense array (one row per document).
transformed_documents_as_array = transformed_documents.toarray()
# Sanity check: the row count should equal the number of files in the file list.
transformed_documents_as_array.shape[0]
# To look at the scores of the first document instead: transformed_documents_as_array[0]

4

In [9]:
import pandas as pd

# Make the output folder if it doesn't already exist.
input_dir = Path("data/txt")
output_dir = Path("data/tf_idf_output")
output_dir.mkdir(parents=True, exist_ok=True)

# One CSV path per input file: same relative location under output_dir, with
# the suffix swapped to .csv. Built with pathlib rather than chained
# str.replace, which only works with "/" separators and would also rewrite
# any other ".txt" or "txt/" substring that happens to appear in a path.
output_filenames = [
    str(output_dir / txt_file.relative_to(input_dir).with_suffix(".csv"))
    for txt_file in all_txt_files
]

# Seed table: the vocabulary scored against the first document only, highest
# score first. The column is deliberately named 'score_y' (not 'score_0') so
# it cannot collide with the per-document 'score_<i>' columns that the next
# cell merges onto this frame.
feature_names = vectorizer.get_feature_names_out()
tf_idf_tuples = list(zip(feature_names, transformed_documents_as_array[0]))
tfidf = (
    pd.DataFrame.from_records(tf_idf_tuples, columns=['term', 'score_y'])
    .sort_values(by='score_y', ascending=False)
    .reset_index(drop=True)
)

tfidf

Unnamed: 0,term,score_y
0,forgive,9.581454
1,minister,7.665163
2,god,7.554128
3,those,6.043302
4,confession,5.748872
...,...,...
1128,hard,0.000000
1129,harvesters,0.000000
1130,hastened,0.000000
1131,hasty,0.000000


In [10]:
# Add one 'score_<i>' column per document to the seed table.
#
# The merge starts from the seed columns only, never from `tfidf` as a whole:
# that keeps the cell idempotent. Merging onto an already-merged table on a
# second run would produce 'score_0_x' / 'score_0_y' columns and break any
# later lookup of 'score_0'.
merged_scores = tfidf[['term', 'score_y']]

# The vocabulary is the same for every document, so fetch it once.
feature_names = vectorizer.get_feature_names_out()

for counter, doc in enumerate(transformed_documents_as_array):
    # construct a dataframe of (term, score) for this document, best score first
    score = 'score_' + str(counter)
    tf_idf_tuples = list(zip(feature_names, doc))
    one_doc_as_df = (
        pd.DataFrame.from_records(tf_idf_tuples, columns=['term', score])
        .sort_values(by=score, ascending=False)
        .reset_index(drop=True)
    )

    # Every per-document frame holds exactly the seed's vocabulary, so a left
    # join loses no terms. Unlike how='outer', it is documented to preserve
    # the left frame's row order, so the table stays sorted by the seed score
    # regardless of the installed pandas version.
    merged_scores = pd.merge(
        merged_scores, one_doc_as_df[['term', score]], on='term', how='left'
    )

    # To also write each document's scores to its own CSV, uncomment:
    # one_doc_as_df.to_csv(output_filenames[counter], index=False)

tfidf = merged_scores

In [11]:
# Show each term next to its per-document scores.
tfidf.loc[:, ['term', 'score_0', 'score_1', 'score_2', 'score_3']]

Unnamed: 0,term,score_0,score_1,score_2,score_3
0,forgive,9.581454,0.000000,0.000000,0.000000
1,minister,7.665163,0.000000,0.000000,0.000000
2,god,7.554128,1.510826,0.000000,0.000000
3,those,6.043302,3.021651,0.000000,0.000000
4,confession,5.748872,0.000000,0.000000,0.000000
...,...,...,...,...,...
1128,hard,0.000000,1.510826,0.000000,1.510826
1129,harvesters,0.000000,0.000000,0.000000,1.916291
1130,hastened,0.000000,1.916291,0.000000,0.000000
1131,hasty,0.000000,0.000000,1.916291,0.000000


In [13]:
# Rank the vocabulary by its score in document 2, highest first.
tfidf.sort_values('score_2', ascending=False)

Unnamed: 0,term,score_y,score_0,score_1,score_2,score_3
704,castellino,0.000000,0.000000,0.000000,11.497744,0.000000
293,bicycle,1.510826,1.510826,0.000000,9.064954,0.000000
871,bells,0.000000,0.000000,0.000000,7.665163,0.000000
254,gone,1.510826,1.510826,0.000000,6.043302,0.000000
566,sight,0.000000,0.000000,0.000000,5.748872,0.000000
...,...,...,...,...,...,...
395,perhaps,0.000000,0.000000,0.000000,0.000000,3.832581
396,prodding,0.000000,0.000000,1.916291,0.000000,0.000000
397,proofs,0.000000,0.000000,0.000000,0.000000,1.916291
398,oppresses,0.000000,0.000000,0.000000,0.000000,1.916291
