In [7]:
from pathlib import Path
from sklearn.feature_extraction.text import TfidfVectorizer
import pandas as pd

# Collect every .txt file under the current directory (recursively), in
# sorted order so that later per-document results line up deterministically.
all_txt_files = sorted(Path(".").rglob("*.txt"))

# Report how many candidate files were discovered.
n_files = len(all_txt_files)
print(n_files)

# Read each discovered file. The path of every non-empty document is recorded
# alongside its text: empty files are skipped, so indexing the results by
# position in `all_txt_files` would attach scores to the wrong filename.
non_empty_files = []
all_docs = []
for txt_file in all_txt_files:
    txt_file_as_string = txt_file.read_text(encoding='utf-8').strip()
    if txt_file_as_string:
        non_empty_files.append(txt_file)
        all_docs.append(txt_file_as_string)

# Raw (un-normalised) TF-IDF; terms occurring in more than 65% of the
# documents are dropped via max_df.
vectorizer = TfidfVectorizer(max_df=0.65, min_df=1, stop_words=None, use_idf=True, norm=None)
transformed_documents = vectorizer.fit_transform(all_docs)

# Dense matrix: one row per non-empty document, one column per term.
transformed_documents_as_array = transformed_documents.toarray()
print(len(transformed_documents_as_array))

output_dir = Path("./tf_idf_output")
output_dir.mkdir(parents=True, exist_ok=True)

# Mirror each input's relative path under the output directory, swapping only
# the final suffix. pathlib never renders a leading "./", so rewriting that
# prefix with str.replace leaves the path unchanged and the CSVs end up next
# to the source files instead of in the output directory.
output_filenames = [
    str(output_dir / txt_file.with_suffix(".csv"))
    for txt_file in non_empty_files
]

# The vocabulary is identical for every document; look it up once.
feature_names = vectorizer.get_feature_names_out()

for output_filename, doc in zip(output_filenames, transformed_documents_as_array):
    one_doc_as_df = pd.DataFrame(
        {'term': feature_names, 'score': doc}
    ).sort_values(by='score', ascending=False).reset_index(drop=True)
    # Inputs may live in subdirectories; make sure the mirrored one exists.
    Path(output_filename).parent.mkdir(parents=True, exist_ok=True)
    one_doc_as_df.to_csv(output_filename, index=False)


2
2
