# Overall Aim
Get most common words in each verb form

In [1]:
# Generate the data
import os
import requests

# List of filenames to download
filenames = [
    "1Chr.xml", "1Kgs.xml", "1Sam.xml", "2Chr.xml", "2Kgs.xml", "2Sam.xml", "Amos.xml", "Dan.xml", "Deut.xml",
    "Eccl.xml", "Esth.xml", "Exod.xml", "Ezek.xml", "Ezra.xml", "Gen.xml", "Hab.xml", "Hag.xml", "Hos.xml",
    "Isa.xml", "Jer.xml", "Job.xml", "Joel.xml", "Jonah.xml", "Josh.xml", "Judg.xml", "Lam.xml", "Lev.xml",
    "Mal.xml", "Mic.xml", "Nah.xml", "Neh.xml", "Num.xml", "Obad.xml", "Prov.xml", "Ps.xml", "Ruth.xml",
    "Song.xml", "Zech.xml", "Zeph.xml"
]

# Base URL
base_url = "https://raw.githubusercontent.com/Clear-Bible/macula-hebrew/refs/heads/main/sources/OpenScriptures/xml/"

# Output directory
output_dir = "downloaded_files"
os.makedirs(output_dir, exist_ok=True)

# Download each file
for filename in filenames:
    url = base_url + filename
    response = requests.get(url)

    if response.status_code == 200:
        file_path = os.path.join(output_dir, filename)
        with open(file_path, "wb") as file:
            file.write(response.content)
        print(f"Downloaded: {filename}")
    else:
        print(f"Failed to download: {filename} (Status Code: {response.status_code})")

print("Download process completed.")


Downloaded: 1Chr.xml
Downloaded: 1Kgs.xml
Downloaded: 1Sam.xml
Downloaded: 2Chr.xml
Downloaded: 2Kgs.xml
Downloaded: 2Sam.xml
Downloaded: Amos.xml
Downloaded: Dan.xml
Downloaded: Deut.xml
Downloaded: Eccl.xml
Downloaded: Esth.xml
Downloaded: Exod.xml
Downloaded: Ezek.xml
Downloaded: Ezra.xml
Downloaded: Gen.xml
Downloaded: Hab.xml
Downloaded: Hag.xml
Downloaded: Hos.xml
Downloaded: Isa.xml
Downloaded: Jer.xml
Downloaded: Job.xml
Downloaded: Joel.xml
Downloaded: Jonah.xml
Downloaded: Josh.xml
Downloaded: Judg.xml
Downloaded: Lam.xml
Downloaded: Lev.xml
Downloaded: Mal.xml
Downloaded: Mic.xml
Downloaded: Nah.xml
Downloaded: Neh.xml
Downloaded: Num.xml
Downloaded: Obad.xml
Downloaded: Prov.xml
Downloaded: Ps.xml
Downloaded: Ruth.xml
Downloaded: Song.xml
Downloaded: Zech.xml
Downloaded: Zeph.xml
Download process completed.


In [2]:
# Analyse the Lemma frequencies
import xml.etree.ElementTree as ET
from collections import Counter

# Directory containing XML files
input_dir = "downloaded_files"

# Dictionary to store lemma occurrences
lemma_count = Counter()

# Function to remove namespaces from tag names
def strip_namespace(tag):
    return tag.split('}')[-1] if '}' in tag else tag

# Process each XML file in the directory
for filename in os.listdir(input_dir):
    if filename.endswith(".xml"):
        print(f"Looking at {filename}")  # Debugging output
        file_path = os.path.join(input_dir, filename)

        try:
            tree = ET.parse(file_path)
            root = tree.getroot()
        except ET.ParseError as e:
            print(f"Error parsing {filename}: {e}")
            continue

        # Find all <m> tags (handling namespaces if present)
        for elem in root.findall(".//{http://www.bibletechnologies.net/2003/OSIS/namespace}m"):
            # print(f"Looking at m tag: {ET.tostring(elem, encoding='unicode').strip()}")  # Debugging output
            stem = elem.get("stem")
            if stem == "hiphil":
                lemma = elem.get("lemma", "")
                lemma_cleaned = "".join(filter(str.isdigit, lemma))  # Remove non-numeric characters
                if lemma_cleaned:
                    lemma_count[lemma_cleaned] += 1

# Sort by frequency in descending order
sorted_lemmas = lemma_count.most_common(10)

# Print top 10 results
print("Top 10 Lemmas with 'hiphil' Stem:")
for lemma, count in sorted_lemmas:
    print(f"Lemma {lemma}: {count} times")




Looking at Isa.xml
Looking at Dan.xml
Looking at 1Kgs.xml
Looking at Zeph.xml
Looking at 1Sam.xml
Looking at Mic.xml
Looking at Song.xml
Looking at Josh.xml
Looking at Esth.xml
Looking at Judg.xml
Looking at Prov.xml
Looking at Lev.xml
Looking at Ruth.xml
Looking at Joel.xml
Looking at Eccl.xml
Looking at 2Sam.xml
Looking at Obad.xml
Looking at 2Chr.xml
Looking at Num.xml
Looking at Amos.xml
Looking at Lam.xml
Looking at 1Chr.xml
Looking at Neh.xml
Looking at 2Kgs.xml
Looking at Deut.xml
Looking at Zech.xml
Looking at Ps.xml
Looking at Ezek.xml
Looking at Nah.xml
Looking at Jer.xml
Looking at Hos.xml
Looking at Hag.xml
Looking at Ezra.xml
Looking at Exod.xml
Looking at Hab.xml
Looking at Jonah.xml
Looking at Gen.xml
Looking at Job.xml
Looking at Mal.xml
Top 10 Lemmas with 'hiphil' Stem:
Lemma 935: 549 times
Lemma 5221: 481 times
Lemma 7725: 358 times
Lemma 5046: 335 times
Lemma 3318: 276 times
Lemma 5927: 258 times
Lemma 5337: 191 times
Lemma 3467: 184 times
Lemma 7126: 177 times
Lemma