In [4]:
import spacy
import pandas as pd
import numpy as np

In [32]:
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer

In [23]:
from pathlib import Path
import os
import pickle
import collections

In [6]:
from tqdm.notebook import tqdm

In [7]:
doc_paths = []
for p in Path("Navigations_headed_xml/Parsed_texts").iterdir():
    if p.is_file() and p.suffix == ".txt":
        doc_paths.append(p)

print(f"{len(doc_paths)} documents found.")

1478 documents found.


In [11]:
docs = dict()
print("Reading docs...")
for idoc, doc_path in enumerate(tqdm(doc_paths)):
    with open(doc_path, encoding='utf-8') as f:
        docs[str(doc_path)] = f.read()

Reading docs...


  0%|          | 0/1478 [00:00<?, ?it/s]

In [14]:
nlp = spacy.load("en_core_web_sm")

In [17]:
# Parse documents using spacy (takes a long time)
docs_parsed = {}
failed = set()
print("Parsing documents...")
for docid, doc in tqdm(docs.items()):
    doc_data = dict()
    try:
        parsed = nlp(doc)
    except:
        failed.add(docid)
        continue
    tokens = [t.text for t in parsed]
    doc_data = {
        'nlp': parsed,
        'tokens': tokens,
    }
    docs_parsed[docid] = doc_data

Parsing documents...


  0%|          | 0/1478 [00:00<?, ?it/s]

In [21]:
# out_dir = Path("out")
# out_dir.mkdir(parents=True, exist_ok=True)

# docs_parsed_pickle_path = out_dir / "docs_parsed.pkl"
# with open(docs_parsed_pickle_path, 'wb') as f:
#     pickle.dump(docs_parsed, f)

In [30]:
print(f"Failed (due to max character constraint): {100*len(failed)/len(docs_parsed):.02f}%")

Failed (due to max character constraint): 3.00%


In [24]:
all_tokens = []
for docid, doc_data in docs_parsed.items():
    all_tokens.extend(doc_data.get('tokens', []))

In [25]:
tokens_counter = collections.Counter(all_tokens)

In [43]:
print("Possible stop words (according to spacy's tokenization):")
tokens_counter.most_common(50)

Possible stop words (according to spacy's tokenization):


[(',', 1637379),
 ('\n', 1160476),
 ('the', 1069110),
 ('of', 725657),
 ('\n ', 667841),
 ('and', 616908),
 ('.', 605133),
 ('to', 447912),
 ('\n\n\n', 358586),
 ('in', 308045),
 ('a', 265736),
 ('that', 237415),
 (';', 169826),
 ('is', 154570),
 ('they', 146924),
 ('it', 140903),
 ('with', 140477),
 ('for', 137755),
 ('as', 133202),
 ('which', 132064),
 ('his', 126772),
 ('by', 124783),
 ('their', 116962),
 ('was', 104789),
 (':', 104692),
 ('be', 103423),
 ('he', 99726),
 ('or', 98578),
 ('-', 94394),
 ('not', 92209),
 ('are', 87668),
 ('all', 87564),
 ('this', 85589),
 ('The', 84625),
 ('them', 84446),
 ('I', 83582),
 ('at', 80482),
 ('but', 78917),
 ('so', 78707),
 ('from', 72892),
 ('had', 63138),
 ('were', 57621),
 ('him', 56444),
 ('\n\n', 56428),
 ('on', 54204),
 ('there', 52342),
 ('have', 51814),
 ('great', 49389),
 ('we', 48372),
 ('(', 47899)]

In [34]:
corpus = list(docs.values())
cvect = CountVectorizer(analyzer='word', ngram_range=(1, 1))
X2 = cvect.fit_transform(corpus)
cvect.get_feature_names_out()

array(['00', '000', '0000', ..., 'שוכ', '⅔ds', 'ↂoooooocc'],
      shape=(504643,), dtype=object)

In [35]:
len(cvect.get_feature_names_out())

504643

In [37]:
word_counts_a = X2.toarray().sum(axis=0)
print(f"{len(word_counts_a)=}")

len(word_counts_a)=504643


In [45]:
word_counts_dict_cvect = {}
for w, c in zip(cvect.get_feature_names_out(), word_counts_a):
    word_counts_dict_cvect[w] = int(c)
word_counter_cvect = collections.Counter(word_counts_dict_cvect)

In [46]:
print("Possible stop words (according to CountVectorizer's tokenization):")
word_counter_cvect.most_common(50)

Possible stop words (according to CountVectorizer's tokenization):


[('the', 2106543),
 ('of', 1361462),
 ('and', 1172443),
 ('to', 814277),
 ('in', 638896),
 ('that', 448192),
 ('they', 288660),
 ('it', 270896),
 ('for', 270587),
 ('with', 265306),
 ('his', 265115),
 ('is', 263694),
 ('which', 256269),
 ('as', 244332),
 ('by', 239069),
 ('was', 219228),
 ('he', 217418),
 ('their', 217089),
 ('this', 198884),
 ('be', 182763),
 ('but', 177923),
 ('or', 171654),
 ('at', 158589),
 ('all', 156574),
 ('not', 155101),
 ('them', 152792),
 ('are', 147399),
 ('so', 143913),
 ('from', 140662),
 ('had', 116940),
 ('on', 112914),
 ('were', 112282),
 ('there', 105736),
 ('him', 104682),
 ('great', 91372),
 ('one', 89436),
 ('we', 86567),
 ('other', 86216),
 ('have', 85336),
 ('an', 81125),
 ('who', 77205),
 ('some', 76057),
 ('being', 73118),
 ('our', 72947),
 ('any', 65462),
 ('into', 64279),
 ('no', 63226),
 ('then', 60727),
 ('these', 59319),
 ('king', 58853)]

## TF-IDF
- [TF-IDF — Term Frequency-Inverse Document Frequency](https://www.learndatasci.com/glossary/tf-idf-term-frequency-inverse-document-frequency/)

In [53]:
# Compute TF-IDF scores using TfidfVectorizer, without eliminating stop words
tf_idf_model  = TfidfVectorizer()
tf_idf_vector = tf_idf_model.fit_transform(corpus)

In [54]:
df_tf_idf = pd.DataFrame(tf_idf_vector.toarray(), columns=tf_idf_model.get_feature_names_out())
df_tf_idf.index = list(docs.keys())
display(df_tf_idf)

Unnamed: 0,00,000,0000,00000,000000,000008,0002,0005,001,00130,...,ωτd,ωτe,ϲεβ,ϲεπτιμια,יהוה,ישוכ,שוח,שוכ,⅔ds,ↂoooooocc
Navigations_headed_xml/Parsed_texts/A40568_parsed_text.txt,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
Navigations_headed_xml/Parsed_texts/A05339_parsed_text.txt,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
Navigations_headed_xml/Parsed_texts/A19729_footnotes.txt,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
Navigations_headed_xml/Parsed_texts/A59233_parsed_text.txt,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
Navigations_headed_xml/Parsed_texts/A32531_footnotes.txt,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
Navigations_headed_xml/Parsed_texts/A42631_footnotes.txt,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
Navigations_headed_xml/Parsed_texts/A20049_footnotes.txt,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
Navigations_headed_xml/Parsed_texts/A32172_footnotes.txt,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
Navigations_headed_xml/Parsed_texts/A08210_parsed_text.txt,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


### Reduce dimensions
- [Reduce Dimension of word-vectors from TFIDFVectorizer / CountVectorizer](https://stackoverflow.com/questions/61274499/reduce-dimension-of-word-vectors-from-tfidfvectorizer-countvectorizer)
- [How to reduce dimension for TfIdf / BOW vector?](https://www.reddit.com/r/MachineLearning/comments/30xo25/how_to_reduce_dimension_for_tfidf_bow_vector/)

In [58]:
target_dim = 1000
# https://scikit-learn.org/1.5/modules/generated/sklearn.feature_extraction.text.TfidfVectorizer.html
tf_idf_model2 = TfidfVectorizer(max_features=target_dim)
tf_idf_vector2 = tf_idf_model2.fit_transform(corpus)

df_tf_idf2 = pd.DataFrame(tf_idf_vector2.toarray(), columns=tf_idf_model2.get_feature_names_out())
df_tf_idf2.index = list(docs.keys())
display(df_tf_idf2)

Unnamed: 0,10,11,12,13,14,15,16,17,18,19,...,yeare,yeares,years,yeere,yeeres,yere,yet,you,young,your
Navigations_headed_xml/Parsed_texts/A40568_parsed_text.txt,0.006517,0.013044,0.019815,0.000000,0.007937,0.000000,0.000000,0.008285,0.000000,0.000000,...,0.000000,0.000000,0.0,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000
Navigations_headed_xml/Parsed_texts/A05339_parsed_text.txt,0.000708,0.000236,0.000598,0.000432,0.000575,0.001192,0.000864,0.000300,0.000888,0.000000,...,0.004930,0.006957,0.0,0.006458,0.004710,0.000512,0.020277,0.010760,0.001333,0.003320
Navigations_headed_xml/Parsed_texts/A19729_footnotes.txt,0.288380,0.226775,0.271416,0.351944,0.250876,0.286220,0.201111,0.130939,0.206619,0.249511,...,0.000000,0.000000,0.0,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000
Navigations_headed_xml/Parsed_texts/A59233_parsed_text.txt,0.010302,0.020621,0.020884,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,...,0.016554,0.016872,0.0,0.000000,0.000000,0.000000,0.020348,0.000000,0.000000,0.000000
Navigations_headed_xml/Parsed_texts/A32531_footnotes.txt,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,...,0.000000,0.000000,0.0,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
Navigations_headed_xml/Parsed_texts/A42631_footnotes.txt,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,...,0.000000,0.000000,0.0,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000
Navigations_headed_xml/Parsed_texts/A20049_footnotes.txt,0.001520,0.000000,0.000000,0.000928,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,...,0.000000,0.000000,0.0,0.012076,0.010113,0.000000,0.000000,0.001593,0.002862,0.000000
Navigations_headed_xml/Parsed_texts/A32172_footnotes.txt,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,...,0.000000,0.000000,0.0,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000
Navigations_headed_xml/Parsed_texts/A08210_parsed_text.txt,0.000472,0.002362,0.000478,0.000000,0.000575,0.000000,0.001728,0.000600,0.001183,0.002541,...,0.008341,0.008502,0.0,0.000833,0.000000,0.000000,0.035420,0.005441,0.002369,0.006905


In [60]:
reduced_tfidf_df_out_path = out_dir / f"tfidf_{target_dim}d.csv"
with open(reduced_tfidf_df_out_path, 'w', encoding='utf-8') as f:
    df_tf_idf2.to_csv(f)
print(f"Saved reduced TF-IDF CSV to: {reduced_tfidf_df_out_path}")

Saved reduced TF-IDF CSV to: out/tfidf_1000d.csv
