## Example

In [1]:
%pip install lxml


[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m A new release of pip is available: [0m[31;49m25.0.1[0m[39;49m -> [0m[32;49m25.1.1[0m
[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m To update, run: [0m[32;49mpip install --upgrade pip[0m
Note: you may need to restart the kernel to use updated packages.


In [2]:
from lxml import etree
from pathlib import Path

In [3]:
# Collect every English Perseus translation (XML) found anywhere under the
# "tlg0012" directory, materialised as a list so it can be iterated repeatedly.
corpus_dir = Path("tlg0012")
files = [*corpus_dir.glob("./**/*perseus-eng*.xml")]

In [4]:
# Namespace URIs used by the documents we parse.
TEI_NS = "http://www.tei-c.org/ns/1.0"
XML_NS = "http://www.w3.org/XML/1998/namespace"

# Prefix -> URI mapping handed to XPath queries so that expressions can
# use the short `tei:` and `xml:` prefixes.
NAMESPACES = dict(tei=TEI_NS, xml=XML_NS)

In [5]:
# For each XML file: pull out every text node that sits inside a
# <div subtype="card"> element, drop the whitespace-only nodes, and write
# the remaining text (one node per line) to a .txt file in the current
# working directory. Files that yield no text produce no output file.
for file in files:
    print(file)
    tree = etree.parse(file)
    text = tree.xpath("//tei:div[@subtype='card']//text()", namespaces=NAMESPACES)

    # Keep only the text nodes that contain something other than whitespace.
    cleaned_text = [t for t in text if t.strip() != ""]

    if len(cleaned_text) > 0:
        # `file.name` is the final path component regardless of the
        # platform's path separator (splitting on "/" only works on POSIX).
        out_path = Path(file.name).with_suffix(".txt")

        with open(out_path, "w") as f:
            f.write('\n'.join(cleaned_text))
    

tlg0012/tlg002/tlg0012.tlg002.perseus-eng3.xml


tlg0012/tlg002/tlg0012.tlg002.perseus-eng4.xml
tlg0012/tlg003/tlg0012.tlg003.perseus-eng1.xml
tlg0012/tlg001/tlg0012.tlg001.perseus-eng3.xml
tlg0012/tlg001/tlg0012.tlg001.perseus-eng4.xml


In [6]:
from collections import Counter

# The plain-text files written to the current directory by the extraction step.
text_files = list(Path(".").glob("tlg0012.tlg00*.perseus-eng*.txt"))

# Map each filename to a Counter of its lowercased, whitespace-separated words.
counts = {
    str(path): Counter(path.read_text().lower().split())
    for path in text_files
}


In [7]:
term = 'odysseus'

# Document frequency: the number of files whose word counts include `term`.
df_ulysses = sum(1 for word_counts in counts.values() if term in word_counts)

df_ulysses

4

In [8]:
%pip install nltk


[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m A new release of pip is available: [0m[31;49m25.0.1[0m[39;49m -> [0m[32;49m25.1.1[0m
[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m To update, run: [0m[32;49mpip install --upgrade pip[0m
Note: you may need to restart the kernel to use updated packages.


In [9]:
import nltk

# Download the data files NLTK needs for tokenization.
# The punkt tokenizer should be installed already (the output below reports
# both packages as up to date), but we download it just in case.
nltk.download("punkt")
# NOTE(review): "punkt_tab" is presumably the data package newer NLTK
# releases load for tokenization; confirm against the installed version.
# As the last expression in the cell, this call's return value is displayed.
nltk.download("punkt_tab")

[nltk_data] Downloading package punkt to /home/codespace/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package punkt_tab to
[nltk_data]     /home/codespace/nltk_data...
[nltk_data]   Package punkt_tab is already up-to-date!


True

In [10]:
# Bring in NLTK's word-level tokenizer function.
from nltk.tokenize import word_tokenize

# filename -> list of tokens; filled in by the loop below.
tokenized_texts = {}

# A lazy Path.glob() iterator over the .txt files created in this directory.
# Exercise: what does the new `[1-4]` segment of the pattern match?
text_files = Path(".").glob("tlg0012.tlg00*.perseus-eng[1-4].txt")

# Read and tokenize the files one at a time, so that `tokenized_texts`
# ends up as a dictionary of token lists keyed by filename.
for file in text_files:
    name = str(file)

    # Lowercasing is optional, but it removes some noise for our purposes
    # (e.g. "The" and "the" are counted as the same token).
    tokens = word_tokenize(file.read_text().lower())

    # Sanity check: report how many tokens each text produced, using
    # string interpolation to say which text we are looking at.
    print(f"There are {len(tokens)} tokens in {name}.")

    # File the token list under its filename.
    tokenized_texts[name] = tokens


There are 175611 tokens in tlg0012.tlg001.perseus-eng4.txt.
There are 152631 tokens in tlg0012.tlg002.perseus-eng3.txt.
There are 135463 tokens in tlg0012.tlg002.perseus-eng4.txt.
There are 200630 tokens in tlg0012.tlg001.perseus-eng3.txt.


In [11]:
from collections import Counter

# Using our `tokenized_texts` dictionary, we'll iterate
# through each key-value pair — remember, the keys are
# filenames and the values are lists of tokens.
# We'll get a count of the tokens by passing the list to
# `Counter`, then we'll change the value for that key to
# a dictionary with its own keys, `tokens` and `counts`.
#
# Two precautions against hidden notebook state:
#   * The per-file Counter is kept in `token_counts` rather than `counts`,
#     because `counts` is already the filename -> Counter dictionary built
#     by an earlier cell; rebinding it here would break that cell on re-run.
#   * If this cell is run a second time, the values are already the
#     {"tokens": ..., "counts": ...} dictionaries, so we unwrap the token
#     list first instead of counting the dictionary's keys.

for filename, tokens in tokenized_texts.items():
    if isinstance(tokens, dict):
        tokens = tokens["tokens"]

    token_counts = Counter(tokens)

    # Replacing the value of an existing key does not change the size of
    # the dictionary, so it is safe to do while iterating over it.
    tokenized_texts[filename] = {"tokens": tokens, "counts": token_counts}

In [12]:
# Spot check: the number of times the token "odysseus" occurs in this file.
tokenized_texts["tlg0012.tlg001.perseus-eng3.txt"]["counts"]["odysseus"]

128

In [13]:
# filename -> set of terms that occur in that file and in no other file.
non_universal_terms = {}

for filename, values in tokenized_texts.items():
    # Gather the vocabulary of every *other* file. The file itself is
    # skipped, otherwise subtracting would always leave the empty set.
    vocab_elsewhere = set()
    for other_file, other_values in tokenized_texts.items():
        if other_file == filename:
            continue
        vocab_elsewhere.update(other_values['counts'].keys())

    # Whatever survives the subtraction is found only in this file.
    non_universal_terms[filename] = set(values['counts'].keys()) - vocab_elsewhere

# Display `non_universal_terms` as a sanity check.
non_universal_terms

{'tlg0012.tlg001.perseus-eng4.txt': {'helmed',
  'whets',
  'distraction',
  'aisyetes',
  'thoe',
  'toppling',
  'sternness',
  'talaos',
  'northwest',
  'embank',
  'drowsy',
  'halcyon',
  'limnorea',
  'galley',
  'shirked',
  'sea-wrack',
  'spiteful',
  'wild-fowl',
  'filly',
  'rendering',
  'euphemos',
  'nestled',
  'bystanders',
  'mykalessos',
  'foments',
  'blithe',
  'myrtle-blossoms',
  'eueneus',
  'unattended',
  'spreads',
  'unfailingly',
  'reconciles',
  'plaiting',
  'gatherer',
  'apparel',
  'echios',
  'gravest',
  'forelands',
  'xanthos',
  'bench',
  'panoply',
  'autophonos',
  'wolf-skin',
  'flocking',
  'amphiklos',
  'echeklos',
  'stabled',
  'heavenwards',
  'winnowing-shovel',
  'eats',
  'thieves',
  'artifice',
  'casque',
  'spear-head',
  'forbearance',
  'needed',
  'insure',
  'opposed',
  'notions',
  'hangs',
  'relent',
  'befriend',
  'head-dress',
  'couples',
  'eyed',
  'phausios',
  'weapon',
  'amaze',
  'prothoon',
  'coronea',
  '

## "Your Turn"

In [15]:
# We'll need the log function to calculate the IDF for each term.
from math import log10

# The two terms under study, named once so the lookups below cannot drift
# apart from the description.
term1 = "spiteful"
term2 = "northwest"

df_term1 = 0
df_term2 = 0

# Calculate the DF (document frequency) for each term.
# We iterate through the dictionary, and then simply
# count the number of files in which we find each term.
# Both terms were picked from the terms that occur in only one file,
# so we should expect DFs of 1 here.
for filename, values in tokenized_texts.items():
    if term1 in values['counts']:
        df_term1 += 1

    if term2 in values["counts"]:
        df_term2 += 1

# IDF divides by the DF, so a term found in no document has no defined IDF.
# Fail with an explanatory message rather than a bare ZeroDivisionError.
if df_term1 == 0 or df_term2 == 0:
    raise ValueError(
        "IDF is undefined for a term that appears in no document "
        f"(DF of {term1!r}: {df_term1}, DF of {term2!r}: {df_term2})"
    )

n_docs = len(tokenized_texts)

# IDF = log10(number of documents / number of documents containing the term)
idf_term1 = log10(n_docs / df_term1)
idf_term2 = log10(n_docs / df_term2)

print(idf_term1)
print(idf_term2)

0.6020599913279624
0.6020599913279624


In [18]:
# Now let's calculate the TF-IDF "score" for each term in each document.

# The two terms being scored. They must be the same terms whose IDFs
# (`idf_term1`, `idf_term2`) were calculated earlier. Naming them once here
# keeps the count lookups and the report labels in agreement (the report
# previously labelled the second term "halcyon" while scoring "northwest").
term1 = "spiteful"
term2 = "northwest"

# Once again, iterate through the dictionary.
for filename, values in tokenized_texts.items():
    # Get the total number of terms in each file — we'll
    # use this to calculate the relative frequency as our
    # TF.
    total_terms = len(values['tokens'])

    # Get the TF for each term in this file. A Counter returns 0 for a
    # term it has never seen, so absent terms simply get a TF of 0.
    tf_term1 = values['counts'][term1] / total_terms
    tf_term2 = values['counts'][term2] / total_terms

    # Remember, TF-IDF here is TF * IDF, where
    # IDF = log10(number of documents / DF).
    tf_idf_term1 = tf_term1 * idf_term1
    tf_idf_term2 = tf_term2 * idf_term2

    # Now we can report on the statistics for this file
    print(f"""In {filename}:
TF of {term1}: {tf_term1}
TF of {term2}: {tf_term2}
TF-IDF of {term1}: {tf_idf_term1}
TF-IDF of {term2}: {tf_idf_term2}
""")

In tlg0012.tlg001.perseus-eng4.txt:
TF of spiteful: 5.694404109082005e-06
TF of halcyon: 5.694404109082005e-06
TF-IDF of spiteful: 3.4283728885318253e-06
TF-IDF of halcyon: 3.4283728885318253e-06

In tlg0012.tlg002.perseus-eng3.txt:
TF of spiteful: 0.0
TF of halcyon: 0.0
TF-IDF of spiteful: 0.0
TF-IDF of halcyon: 0.0

In tlg0012.tlg002.perseus-eng4.txt:
TF of spiteful: 0.0
TF of halcyon: 0.0
TF-IDF of spiteful: 0.0
TF-IDF of halcyon: 0.0

In tlg0012.tlg001.perseus-eng3.txt:
TF of spiteful: 0.0
TF of halcyon: 0.0
TF-IDF of spiteful: 0.0
TF-IDF of halcyon: 0.0



I got both of the terms I used from more or less the beginning of the set for the first document. That set contains only terms that appear in no other file, so it makes sense that they only appear in the first document (although I did try to pick out words that could appear more than once).

In the future, I would probably print the term frequency before jumping straight into printing the IDF, to make sure that the term appears enough times to produce an interesting result.

TF-IDF tells us how often a term appears in an individual document relative to a larger collection of documents. So if a term appears frequently in one document but rarely in the others, it is probably especially notable there.

TF-IDF could also be useful when trying to determine certain words that are most unique to an individual text or author.