In [5]:
%pip install lxml

Note: you may need to restart the kernel to use updated packages.


In [6]:
from lxml import etree
from pathlib import Path

In [7]:
files = list(Path("tlg0012").glob("./**/*perseus-eng*.xml"))

In [8]:
TEI_NS = "http://www.tei-c.org/ns/1.0"
XML_NS = "http://www.w3.org/XML/1998/namespace"

NAMESPACES = {
    "tei": TEI_NS,
    "xml": XML_NS,
}

In [9]:
for file in files:
    print(file)
    tree = etree.parse(file)
    text = tree.xpath(f"//tei:div[@subtype='card']//text()", namespaces=NAMESPACES)
    
    cleaned_text = []
    for t in text:
        if t.strip() != "":
            cleaned_text.append(t)

    if len(cleaned_text) > 0:
        with open(str(file).split("/")[-1].replace(".xml", ".txt"), "w+") as f:
            f.write('\n'.join(cleaned_text))
    

tlg0012/tlg002/tlg0012.tlg002.perseus-eng3.xml


tlg0012/tlg002/tlg0012.tlg002.perseus-eng4.xml
tlg0012/tlg003/tlg0012.tlg003.perseus-eng1.xml
tlg0012/tlg001/tlg0012.tlg001.perseus-eng3.xml
tlg0012/tlg001/tlg0012.tlg001.perseus-eng4.xml


In [25]:
from collections import Counter

text_files = list(Path(".").glob("tlg0012.tlg00*.perseus-eng*.txt"))

counts = {}

for t in text_files:
    name = str(t)

    with open(t) as f:
        text = f.read().lower().split()
        counts[name] = Counter(text)


In [47]:
df_odysseus = 0
df_achilles = 0

for words in counts.values():
    if "odysseus" in words:
        df_odysseus += 1
    if "achilles" in words:
        df_achilles+= 1

######
print(df_odysseus)
print(df_achilles)

from math import log10

n_docs = len(counts.keys())

idf_odysseus = log10(n_docs / df_odysseus)
idf_achilles = log10(n_docs / df_achilles)

print(idf_odysseus)
print(idf_achilles)

4
4
0.0
0.0


In [58]:

for filename, values in counts.items():

    # Get the total number of terms in each file — we'll
    # use this to calculate the relative frequency as our
    # TF.
    total_terms = len(values)

    # Get the TF for each term in this file.
    tf_achilles = values['achilles'] / total_terms
    tf_odysseus = values['odysseus'] / total_terms

    # Remember, the simplest version of TF-IDF is just
    # TF * 1/DF
    tf_idf_achilles = tf_achilles * idf_achilles
    tf_idf_odysseus = tf_odysseus * idf_odysseus

    # Now we can report on the statistics for this file
    print(f"""In {filename}:
TF of achilles: {tf_achilles}
TF of odysseus: {tf_odysseus}
TF-IDF of achilles: {tf_idf_achilles}
TF-IDF of odysseus: {tf_idf_odysseus}
""")

In tlg0012.tlg001.perseus-eng4.txt:
TF of achilles: 0.024394197260718047
TF of odysseus: 0.006483507577599482
TF-IDF of achilles: 0.0
TF-IDF of odysseus: 0.0

In tlg0012.tlg002.perseus-eng3.txt:
TF of achilles: 0.0001899335232668566
TF of odysseus: 0.041500474833808164
TF-IDF of achilles: 0.0
TF-IDF of odysseus: 0.0

In tlg0012.tlg002.perseus-eng4.txt:
TF of achilles: 0.0005608000747733433
TF of odysseus: 0.036825871576782875
TF-IDF of achilles: 0.0
TF-IDF of odysseus: 0.0

In tlg0012.tlg001.perseus-eng3.txt:
TF of achilles: 0.018137177253286868
TF of odysseus: 0.006098526849358466
TF-IDF of achilles: 0.0
TF-IDF of odysseus: 0.0



In [66]:
# The code above was testing the concepts on the common names. Now I will use TF-IDF to experiment on different words. 
from math import log10

experiment_list = ['the', 'season', 'fears', 'needed']

for term in experiment_list:
    df_term = 0

    for words in counts.values():
        if term in words:
            df_term += 1

    n_docs = len(counts.keys())

    idf_term = log10(n_docs / df_term)

    for filename, values in counts.items():

        total_terms = len(values)

        tf_term = values[term] / total_terms

        tf_idf_term = tf_term * idf_term

        # Now we can report on the statistics for this file
        print(f"""In {filename}:
    TF of {term}: {tf_term}
    TF-IDF of {term}: {tf_idf_term}
    """)

In tlg0012.tlg001.perseus-eng4.txt:
    TF of the: 0.8098711402868952
    TF-IDF of the: 0.0
    
In tlg0012.tlg002.perseus-eng3.txt:
    TF of the: 0.7776828110161443
    TF-IDF of the: 0.0
    
In tlg0012.tlg002.perseus-eng4.txt:
    TF of the: 0.5514534068604543
    TF-IDF of the: 0.0
    
In tlg0012.tlg001.perseus-eng3.txt:
    TF of the: 0.9798827815618565
    TF-IDF of the: 0.0
    
In tlg0012.tlg001.perseus-eng4.txt:
    TF of season: 0.00032417537887997406
    TF-IDF of season: 0.0
    
In tlg0012.tlg002.perseus-eng3.txt:
    TF of season: 0.0003798670465337132
    TF-IDF of season: 0.0
    
In tlg0012.tlg002.perseus-eng4.txt:
    TF of season: 0.0001869333582577811
    TF-IDF of season: 0.0
    
In tlg0012.tlg001.perseus-eng3.txt:
    TF of season: 0.0002376049421827974
    TF-IDF of season: 0.0
    
In tlg0012.tlg001.perseus-eng4.txt:
    TF of fears: 0.00016208768943998703
    TF-IDF of fears: 4.8793256449304025e-05
    
In tlg0012.tlg002.perseus-eng3.txt:
    TF of fears: 9

In [61]:
non_universal_terms = {}

for filename, values in counts.items():
    my_set = set(values.keys())

    for other_file, other_values in counts.items():
        # make sure we don't compare the file
        # to itself, otherwise the difference
        # will be the empty set
        if other_file != filename:
            my_set -= set(other_values.keys())
    
    # now push the remaining set of terms to the dictionary
    non_universal_terms[filename] = my_set

# log `non_universal_terms` as a sanity check
non_universal_terms

{'tlg0012.tlg001.perseus-eng4.txt': {'sipylos,',
  'needed',
  'blew.',
  'wearied,',
  'revealed.',
  'podagros,',
  'fears,',
  'dishonored',
  'arrow."',
  'crests,',
  'tents,',
  'conquering',
  'further,"',
  'peasants',
  'pellene',
  'thump',
  'walls?"',
  'rely',
  'argue',
  'outposts,',
  'bunches',
  'reconciliation,',
  'fight-',
  'kastianeira,',
  'king."',
  'allowances',
  'better-',
  'roasted,',
  'festival',
  'abuse.',
  'shamefacedness',
  'asios,',
  'maimalos,',
  'defeated',
  'offered.',
  'twixt',
  'dupe',
  'hoofs,',
  'plakos,',
  'dinners:',
  'proceed)',
  'myrmidon,',
  'cleonae;',
  'meriones.',
  'spleen',
  'engaged',
  'infuriated',
  'duck',
  'spears:',
  'full-flowing',
  'suing',
  'porridge',
  'thunder;',
  'recoil',
  'bowstring,',
  'firm:',
  'deplore;',
  '"quit',
  'ennomos.',
  'panthoos',
  'hang-dog',
  'sea-fight,',
  'boxwood',
  'train',
  "patroklos'",
  'agapenor',
  'said-',
  'greaved',
  '"pandaros,',
  'exults',
  'aisyetes,'

In [None]:
# Response to written questions:

# 1. During my analysis, one thing I noticed was that when a term never appears, I get a computational error because I attempt division by zero.
# this occurs because that term would have a document frequency of zero, meaning in calculating its inverse, we are dividing by zero. Thus, in
# order to address this problem, a small value would have to be added to all terms' DF. However, doing so would shift the log function graph,
# meaning that we need to create a new way to interpret the result of the IDF calculation since the DF is now different. 

# 2. After the TF IDF calculation I have code above that finds all the unique words. From that list I took "needed" which only appears in 
# tlg1 eng4. However, though this method works in finding words that appear only once, there are words that may appear twice or three times,
# but not four, which would still be useful to analyze. For example, I guessed the word "season" and it turned out that it appears in two of 
# the four corpuses. However, "season" wasn't included in the list generate by the unique-words code. Refining this in the future would mean
# dividing words into their DFs, instead of just DFs of only one or all.

# 3. To me, TF-IDF quantitatively tells me how much a word appears in a corpus but with lots of weighting. In other words, how much it actually
# matters in the context of everything/all things considered. 

# 4. I think somewhere where TF-IDF would be useful is analyzing American news webites. I think by finding specific usages of words and their
# importance, where each individual article acts as a document for a greater corpus for that specific news agency, then we can analyze how 
# their literature is politically aligned. This would reveal/bring to attention any implicit biases within these news agencies. 
# I think that would be a cool project. 