In [6]:
%pip install lxml


[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m A new release of pip is available: [0m[31;49m25.0.1[0m[39;49m -> [0m[32;49m25.1.1[0m
[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m To update, run: [0m[32;49mpip install --upgrade pip[0m
Note: you may need to restart the kernel to use updated packages.


In [7]:
from lxml import etree
from pathlib import Path

In [8]:
# Collect every English Perseus TEI file below tlg0012/, at any depth.
# glob() does not guarantee any particular order, so sort the result to make
# the processing order (and everything printed from it) reproducible.
files = sorted(Path("tlg0012").glob("./**/*perseus-eng*.xml"))

In [9]:
# Namespace URIs referenced by the XPath queries in this notebook.
TEI_NS = "http://www.tei-c.org/ns/1.0"
XML_NS = "http://www.w3.org/XML/1998/namespace"

# Prefix -> URI mapping, passed as ``namespaces=`` when evaluating XPath.
NAMESPACES = dict(tei=TEI_NS, xml=XML_NS)

In [10]:
# For each TEI file, dump the text found inside its "card" divisions to a
# plain-text file of the same base name in the current working directory.
for file in files:
    print(file)
    tree = etree.parse(file)
    # Every text node nested anywhere inside a <tei:div subtype="card">.
    text = tree.xpath("//tei:div[@subtype='card']//text()", namespaces=NAMESPACES)

    # Keep only the nodes that contain something other than whitespace.
    cleaned_text = [t for t in text if t.strip()]

    # Files without any matching text produce no output file at all.
    if cleaned_text:
        # ``file.name`` is the last path component regardless of the OS path
        # separator (splitting the string on "/" is not), and ``with_suffix``
        # swaps only the trailing extension.
        out_path = Path(file.name).with_suffix(".txt")
        # The translations contain non-ASCII characters (e.g. "koön"), so pin
        # the encoding instead of relying on the platform default.
        with open(out_path, "w", encoding="utf-8") as f:
            f.write("\n".join(cleaned_text))
    

tlg0012/tlg002/tlg0012.tlg002.perseus-eng3.xml
tlg0012/tlg002/tlg0012.tlg002.perseus-eng4.xml
tlg0012/tlg003/tlg0012.tlg003.perseus-eng1.xml
tlg0012/tlg001/tlg0012.tlg001.perseus-eng3.xml
tlg0012/tlg001/tlg0012.tlg001.perseus-eng4.xml


In [11]:
from collections import Counter

# Plain-text files in the working directory, in a stable (sorted) order.
text_files = sorted(Path(".").glob("tlg0012.tlg00*.perseus-eng*.txt"))

# Maps each file name to a Counter of its lower-cased, whitespace-separated
# words. Punctuation stays attached to the words with this naive split.
counts = {}

for t in text_files:
    # Decode as UTF-8 explicitly instead of relying on the platform default.
    with open(t, encoding="utf-8") as f:
        counts[str(t)] = Counter(f.read().lower().split())


In [12]:
term = 'odysseus'

# Document frequency: the number of counted files in which `term` occurs at
# least once.
df_ulysses = sum(1 for els in counts.values() if term in els)

df_ulysses

4

In [13]:
%pip install nltk


[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m A new release of pip is available: [0m[31;49m25.0.1[0m[39;49m -> [0m[32;49m25.1.1[0m
[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m To update, run: [0m[32;49mpip install --upgrade pip[0m
Note: you may need to restart the kernel to use updated packages.


In [14]:
import nltk

# Fetch the "punkt" and "punkt_tab" NLTK data packages into the local
# nltk_data directory (a no-op when they are already up to date).
# Presumably these are the models word_tokenize relies on — TODO confirm
# against the NLTK documentation.
nltk.download("punkt")
nltk.download("punkt_tab")

[nltk_data] Downloading package punkt to /home/codespace/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package punkt_tab to
[nltk_data]     /home/codespace/nltk_data...
[nltk_data]   Package punkt_tab is already up-to-date!


True

In [15]:
from nltk.tokenize import word_tokenize

# Maps each file name to the list of tokens NLTK extracts from its text.
tokenized_texts = {}

# sorted() turns the one-shot glob generator into a reusable list with a
# stable order, so the cell reports the files in the same order on every run.
text_files = sorted(Path(".").glob("tlg0012.tlg00*.perseus-eng[1-4].txt"))

for file in text_files:
    name = str(file)

    # Decode as UTF-8 explicitly instead of relying on the platform default.
    with open(file, encoding="utf-8") as f:
        text = f.read().lower()

    # The file handle is no longer needed once the text is in memory.
    tokens = word_tokenize(text)

    print(f"There are {len(tokens)} tokens in {name}.")

    tokenized_texts[name] = tokens

There are 175611 tokens in tlg0012.tlg001.perseus-eng4.txt.
There are 152631 tokens in tlg0012.tlg002.perseus-eng3.txt.
There are 135463 tokens in tlg0012.tlg002.perseus-eng4.txt.
There are 200630 tokens in tlg0012.tlg001.perseus-eng3.txt.


In [16]:
from collections import Counter

# Upgrade every entry from a bare token list to
# {"tokens": [...], "counts": Counter(...)}.
# Only the values of existing keys are replaced, so updating the dict while
# iterating over it is safe.
for filename, entry in tokenized_texts.items():
    # Make the cell safe to re-run: an entry that was already converted is a
    # dict, and counting it again would discard the real token counts.
    tokens = entry["tokens"] if isinstance(entry, dict) else entry

    # The Counter is deliberately not bound to the name `counts`, so the
    # per-file dict built earlier under that name is not overwritten.
    tokenized_texts[filename] = {"tokens": tokens, "counts": Counter(tokens)}

In [17]:
# Raw number of "odysseus" tokens in tlg0012.tlg001.perseus-eng3.txt.
eng3_entry = tokenized_texts["tlg0012.tlg001.perseus-eng3.txt"]
eng3_entry["counts"]["odysseus"]

128

In [18]:
from math import log10

# Document frequency (DF): the number of texts whose vocabulary contains the
# term at least once.
df_achilles = sum(
    1 for values in tokenized_texts.values() if "achilles" in values["counts"]
)
df_odysseus = sum(
    1 for values in tokenized_texts.values() if "odysseus" in values["counts"]
)

n_docs = len(tokenized_texts)

# Inverse document frequency: log10(N / DF). It is 0 for a term that occurs in
# every text. A term found in no text has DF == 0, which would raise
# ZeroDivisionError; its IDF is set to 0.0, matching the TF-IDF of 0 that such
# a term gets anyway (its TF is 0 everywhere).
idf_achilles = log10(n_docs / df_achilles) if df_achilles else 0.0
idf_odysseus = log10(n_docs / df_odysseus) if df_odysseus else 0.0

print(idf_achilles)

0.0


In [19]:
# Report TF and TF-IDF of both names for every tokenized text.
for filename, doc in tokenized_texts.items():
    n_tokens = len(doc['tokens'])
    term_counts = doc['counts']

    # Term frequency: occurrences of the term divided by the text's token count.
    tf_achilles = term_counts['achilles'] / n_tokens
    tf_odysseus = term_counts['odysseus'] / n_tokens

    # TF-IDF: the term frequency weighted by the corpus-wide IDF.
    tf_idf_achilles = idf_achilles * tf_achilles
    tf_idf_odysseus = idf_odysseus * tf_odysseus

    print(f"""In {filename}:
TF of achilles: {tf_achilles}
TF of odysseus: {tf_odysseus}
TF-IDF of achilles: {tf_idf_achilles}
TF-IDF of odysseus: {tf_idf_odysseus}
""")

In tlg0012.tlg001.perseus-eng4.txt:
TF of achilles: 0.002403038534032606
TF of odysseus: 0.0007061061095261686
TF-IDF of achilles: 0.0
TF-IDF of odysseus: 0.0

In tlg0012.tlg002.perseus-eng3.txt:
TF of achilles: 0.0001048279838302835
TF of odysseus: 0.0041603606082643765
TF-IDF of achilles: 0.0
TF-IDF of odysseus: 0.0

In tlg0012.tlg002.perseus-eng4.txt:
TF of achilles: 0.0001254955227626732
TF of odysseus: 0.0042816119530794386
TF-IDF of achilles: 0.0
TF-IDF of odysseus: 0.0

In tlg0012.tlg001.perseus-eng3.txt:
TF of achilles: 0.002043562777251657
TF of odysseus: 0.000637990330459054
TF-IDF of achilles: 0.0
TF-IDF of odysseus: 0.0



In [20]:
# Small demonstration: building a set from a list keeps one copy of each
# distinct value, so the repeated 1 and 3 collapse.
my_list = [1, 1, 2, 3, 3]

set(my_list)

{1, 2, 3}

In [21]:
# Maps each file name to the set of terms that occur in that file and in none
# of the other files (i.e. terms exclusive to it).
non_universal_terms = {}

for filename, values in tokenized_texts.items():
    # Combined vocabulary of every file except the current one.
    other_vocab = set().union(
        *(
            other_values['counts']
            for other_file, other_values in tokenized_texts.items()
            if other_file != filename
        )
    )

    non_universal_terms[filename] = set(values['counts']) - other_vocab

non_universal_terms

{'tlg0012.tlg001.perseus-eng4.txt': {'reeked',
  'survey',
  'backbone',
  'irritated',
  'cleaves',
  'whets',
  'shirked',
  'salvation',
  'cross-bar',
  'babel',
  'fee',
  'unite',
  'lykastos',
  'aesepos',
  'thymbra',
  'klonios',
  'limpid',
  'another-',
  'disastrous',
  'thyrsi',
  'ceos',
  'orneae',
  'searches',
  'task-master',
  'swerves',
  'koön',
  'consulting',
  'pant',
  'accident',
  'marshy',
  'filly',
  'odios',
  'moira',
  'wolf-skin',
  'kephissos',
  'opposed',
  'creek',
  'populous',
  'clutches',
  'phalanxes',
  'override',
  'flower-bespangled',
  'offend',
  'stichios',
  'foot-races',
  'stinted',
  'flute',
  'said-',
  'kapaneus',
  'ox-horn',
  'minister',
  'embroidering',
  'foolhardiness',
  'showering',
  'therapôn',
  'storming',
  'flecks',
  'aching',
  'mire',
  'couples',
  'bruise',
  'asteropaios',
  'wide-spreading',
  'ganymede',
  'menaces',
  'crouches',
  'unsay',
  'aithe',
  'astypylos',
  'winning-post',
  'tow',
  'three-legg

In [22]:
from math import log10

# Document frequency (DF): the number of texts whose vocabulary contains the
# term at least once.
df_ganymede = sum(
    1 for values in tokenized_texts.values() if "ganymede" in values["counts"]
)
df_trickster = sum(
    1 for values in tokenized_texts.values() if "trickster" in values["counts"]
)

n_docs = len(tokenized_texts)

# Inverse document frequency: log10(N / DF). A term found in no text has
# DF == 0, which would raise ZeroDivisionError; its IDF is set to 0.0, matching
# the TF-IDF of 0 that such a term gets anyway (its TF is 0 everywhere).
idf_ganymede = log10(n_docs / df_ganymede) if df_ganymede else 0.0
idf_trickster = log10(n_docs / df_trickster) if df_trickster else 0.0

In [23]:
# Report TF and TF-IDF of both terms for every tokenized text.
for filename, doc in tokenized_texts.items():
    n_tokens = len(doc['tokens'])
    term_counts = doc['counts']

    # Term frequency: occurrences of the term divided by the text's token count.
    tf_ganymede = term_counts['ganymede'] / n_tokens
    tf_trickster = term_counts['trickster'] / n_tokens

    # TF-IDF: the term frequency weighted by the corpus-wide IDF.
    tf_idf_ganymede = idf_ganymede * tf_ganymede
    tf_idf_trickster = idf_trickster * tf_trickster

    print(f"""In {filename}:
TF of ganymede: {tf_ganymede}
TF of trickster: {tf_trickster}
TF-IDF of ganymede: {tf_idf_ganymede}
TF-IDF of trickster: {tf_idf_trickster}
""")

In tlg0012.tlg001.perseus-eng4.txt:
TF of ganymede: 1.138880821816401e-05
TF of trickster: 1.7083212327246014e-05
TF-IDF of ganymede: 6.8567457770636506e-06
TF-IDF of trickster: 1.0285118665595476e-05

In tlg0012.tlg002.perseus-eng3.txt:
TF of ganymede: 0.0
TF of trickster: 0.0
TF-IDF of ganymede: 0.0
TF-IDF of trickster: 0.0

In tlg0012.tlg002.perseus-eng4.txt:
TF of ganymede: 0.0
TF of trickster: 0.0
TF-IDF of ganymede: 0.0
TF-IDF of trickster: 0.0

In tlg0012.tlg001.perseus-eng3.txt:
TF of ganymede: 0.0
TF of trickster: 0.0
TF-IDF of ganymede: 0.0
TF-IDF of trickster: 0.0



In [24]:
from math import log10

# Document frequency (DF): the number of texts whose vocabulary contains the
# term at least once.
df_humans = sum(
    1 for values in tokenized_texts.values() if "humans" in values["counts"]
)
df_seawater = sum(
    1 for values in tokenized_texts.values() if "seawater" in values["counts"]
)

n_docs = len(tokenized_texts)

# Inverse document frequency: log10(N / DF). A term found in no text has
# DF == 0, which would raise ZeroDivisionError; its IDF is set to 0.0, matching
# the TF-IDF of 0 that such a term gets anyway (its TF is 0 everywhere).
idf_humans = log10(n_docs / df_humans) if df_humans else 0.0
idf_seawater = log10(n_docs / df_seawater) if df_seawater else 0.0

# Report TF and TF-IDF of both terms for every tokenized text.
for filename, values in tokenized_texts.items():
    total_terms = len(values['tokens'])

    # Term frequency: occurrences of the term divided by the text's token count.
    tf_humans = values['counts']['humans'] / total_terms
    tf_seawater = values['counts']['seawater'] / total_terms

    tf_idf_humans = tf_humans * idf_humans
    tf_idf_seawater = tf_seawater * idf_seawater

    print(f"""In {filename}:
TF of humans: {tf_humans}
TF of seawater: {tf_seawater}
TF-IDF of humans: {tf_idf_humans}
TF-IDF of seawater: {tf_idf_seawater}
""")

In tlg0012.tlg001.perseus-eng4.txt:
TF of humans: 1.138880821816401e-05
TF of seawater: 0.0
TF-IDF of humans: 6.8567457770636506e-06
TF-IDF of seawater: 0.0

In tlg0012.tlg002.perseus-eng3.txt:
TF of humans: 0.0
TF of seawater: 6.551748989392718e-06
TF-IDF of humans: 0.0
TF-IDF of seawater: 3.944545939736766e-06

In tlg0012.tlg002.perseus-eng4.txt:
TF of humans: 0.0
TF of seawater: 0.0
TF-IDF of humans: 0.0
TF-IDF of seawater: 0.0

In tlg0012.tlg001.perseus-eng3.txt:
TF of humans: 0.0
TF of seawater: 0.0
TF-IDF of humans: 0.0
TF-IDF of seawater: 0.0

