In [2]:
# Mount the user's Google Drive into the Colab filesystem at /content/drive.
# A later cell reads the corpus archive from /content/drive/MyDrive/..., so this must run first.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [3]:
# Refresh the apt package index, install the OpenJDK 17 JDK, and select it as the system `java`.
# NOTE(review): presumably Java is needed by passim's Spark backend — confirm.
# NOTE: stdout and stderr of the install and update-alternatives steps are sent to /dev/null,
# so a failed install is silent here; drop the redirection when debugging.
!apt-get update -qq
!apt-get install -y openjdk-17-jdk > /dev/null 2>&1
!update-alternatives --set java /usr/lib/jvm/java-17-openjdk-amd64/bin/java > /dev/null 2>&1

W: Skipping acquire of configured file 'main/source/Sources' as repository 'https://r2u.stat.illinois.edu/ubuntu jammy InRelease' does not seem to provide it (sources.list entry misspelt?)


In [4]:
!pip install git+https://github.com/dasmiq/passim.git -q


  Preparing metadata (setup.py) ... [?25l[?25hdone
  Preparing metadata (setup.py) ... [?25l[?25hdone
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m154.7/154.7 kB[0m [31m6.4 MB/s[0m eta [36m0:00:00[0m
[?25h  Building wheel for passim (setup.py) ... [?25l[?25hdone
  Building wheel for intervaltree (setup.py) ... [?25l[?25hdone


In [5]:
# Create an `output/` directory relative to the current working directory (no error if it exists).
# NOTE(review): no cell visible in this notebook writes into `output/` — confirm it is still needed.
!mkdir -p output

In [6]:
import json
import re
import time
import tarfile
import os

# Location of the corpus archive on the mounted Drive and the local directory to unpack it into.
tar_path = '/content/drive/MyDrive/passim-thesis/passim-data.tar.gz'
extract_to = '/content/data'

os.makedirs(extract_to, exist_ok=True)

# Unpack the gzip-compressed archive onto the local disk.
with tarfile.open(tar_path, 'r:gz') as tar:
    if hasattr(tarfile, 'data_filter'):
        # The 'data' filter refuses absolute paths, '..' traversal and special files,
        # and avoids the DeprecationWarning raised when no filter is given.
        tar.extractall(extract_to, filter='data')
    else:
        # Interpreters that predate extraction filters: fall back to the plain call.
        tar.extractall(extract_to)

# List extracted files
print("\nExtracted files:")
for root, dirs, files in os.walk(extract_to):
    for file in files:
        # Only the JSON corpus files are of interest; report each with its size in MiB.
        if file.endswith('.json'):
            filepath = os.path.join(root, file)
            size_mb = os.path.getsize(filepath) / (1024*1024)
            print(f"  {filepath} ({size_mb:.1f} MB)")

  tar.extractall(extract_to)



Extracted files:
  /content/data/bullinger-letters.json (37.0 MB)
  /content/data/patristic-sources.json (263.6 MB)


In [7]:
def normalize_latin(text):
    """
    Normalize Latin text so orthographic variants compare equal.

    Steps, in order:
      1. lowercase everything;
      2. map 'j' to 'i' and expand the ligatures 'æ' -> 'ae', 'œ' -> 'oe';
      3. delete quotation marks (guillemets, straight and typographic double quotes);
      4. replace sentence punctuation and brackets with a space;
      5. collapse every run of whitespace to a single space and trim the ends.

    Args:
        text: the string to normalize.

    Returns:
        The normalized, lowercase string.
    """
    # Lowercasing first means the uppercase forms (J, Æ, Œ) need no separate handling.
    text = text.lower()

    # Orthographic normalization
    for variant, canonical in (('j', 'i'), ('æ', 'ae'), ('œ', 'oe')):
        text = text.replace(variant, canonical)

    # Remove quotes: « » " plus the typographic double quotes U+201C..U+201F.
    # The curly pair U+201C/U+201D is spelled as escapes so it cannot be silently
    # flattened to ASCII quotes by an editor or export step.
    text = re.sub('[«»"\u201c\u201d\u201e\u201f]', '', text)

    # Replace punctuation with a space so adjacent words are not glued together.
    text = re.sub(r'[;:,.!?()\[\]{}]', ' ', text)

    # Normalize whitespace
    return ' '.join(text.split())

# Smoke-test the normalizer on a phrase that exercises lowercasing, the j -> i mapping
# and punctuation removal.
test = "Videmus duplicem statum non confusum, sed conjunctum"
normalized_test = normalize_latin(test)
print(f"\nTest normalization:")
print(f"  Original: {test}")
print(f"  Normalized: {normalized_test}")
# Fail loudly on a regression instead of relying on someone reading the printed output.
assert normalized_test == "videmus duplicem statum non confusum sed coniunctum", normalized_test


Test normalization:
  Original: Videmus duplicem statum non confusum, sed conjunctum
  Normalized: videmus duplicem statum non confusum sed coniunctum


In [8]:
bullinger_input = f'{extract_to}/bullinger-letters.json'
bullinger_output = '/content/bullinger-normalized.json'

# Stream the Bullinger file line by line (one JSON document per line), normalize each
# document's 'text', tag it with its corpus name, and write it back out as JSON lines.
start = time.time()
processed = 0

with open(bullinger_input, 'r', encoding='utf-8') as f_in:
    with open(bullinger_output, 'w', encoding='utf-8') as f_out:
        # filter(str.strip, ...) drops blank / whitespace-only lines; they are not counted.
        for line in filter(str.strip, f_in):
            doc = json.loads(line)

            # Documents without a 'text' field pass through untouched.
            if 'text' in doc:
                doc['text'] = normalize_latin(doc['text'])
            doc['corpus'] = 'bullinger'

            # print() appends the trailing newline that terminates each record.
            print(json.dumps(doc, ensure_ascii=False), file=f_out)
            processed += 1

            # Progress report every 1000 documents.
            if not processed % 1000:
                elapsed = time.time() - start
                rate = processed / elapsed
                print(f"  {processed:,} processed | {rate:.1f} docs/sec")

elapsed = time.time() - start
print(f"Bullinger complete: {processed:,} documents in {elapsed:.1f}s")

  1,000 processed | 2046.4 docs/sec
  2,000 processed | 1882.6 docs/sec
  3,000 processed | 1779.3 docs/sec
  4,000 processed | 1482.4 docs/sec
  5,000 processed | 1541.7 docs/sec
  6,000 processed | 1531.5 docs/sec
  7,000 processed | 1522.6 docs/sec
  8,000 processed | 1543.3 docs/sec
  9,000 processed | 1606.7 docs/sec
  10,000 processed | 1709.1 docs/sec
  11,000 processed | 1809.6 docs/sec
  12,000 processed | 1895.6 docs/sec
  13,000 processed | 1973.9 docs/sec
Bullinger complete: 13,114 documents in 6.6s


In [9]:
patristic_input = f'{extract_to}/patristic-sources.json'
patristic_output = '/content/patristic-normalized.json'

# Approximate number of documents in the patristic corpus; used only for the ETA display.
# NOTE(review): hard-coded estimate — update it if the corpus changes.
PATRISTIC_DOC_ESTIMATE = 115655

# Stream the patristic file line by line (one JSON document per line), normalize each
# document's 'text', tag it with its corpus name, and write it back out as JSON lines.
start = time.time()
processed = 0

with open(patristic_input, 'r', encoding='utf-8') as f_in, \
     open(patristic_output, 'w', encoding='utf-8') as f_out:

    for line in f_in:
        # Skip blank / whitespace-only lines; they are not counted.
        if not line.strip():
            continue

        doc = json.loads(line)

        # Documents without a 'text' field pass through untouched.
        if 'text' in doc:
            doc['text'] = normalize_latin(doc['text'])

        doc['corpus'] = 'patristic'

        f_out.write(json.dumps(doc, ensure_ascii=False) + '\n')
        processed += 1

        # Progress report every 5000 documents.
        if processed % 5000 == 0:
            elapsed = time.time() - start
            rate = processed / elapsed
            # Clamp at zero so a corpus larger than the estimate never shows a negative ETA.
            remaining = max(PATRISTIC_DOC_ESTIMATE - processed, 0) / rate / 60  # Approximate
            print(f"  {processed:,} processed | {rate:.1f} docs/sec | ~{remaining:.1f}min remaining")

elapsed = time.time() - start
print(f"Patristic complete: {processed:,} documents in {elapsed:.1f}s")

  5,000 processed | 5894.5 docs/sec | ~0.3min remaining
  10,000 processed | 6107.5 docs/sec | ~0.3min remaining
  15,000 processed | 6260.7 docs/sec | ~0.3min remaining
  20,000 processed | 6347.6 docs/sec | ~0.3min remaining
  25,000 processed | 6348.6 docs/sec | ~0.2min remaining
  30,000 processed | 6400.3 docs/sec | ~0.2min remaining
  35,000 processed | 6379.1 docs/sec | ~0.2min remaining
  40,000 processed | 5605.9 docs/sec | ~0.2min remaining
  45,000 processed | 4367.9 docs/sec | ~0.3min remaining
  50,000 processed | 4466.5 docs/sec | ~0.2min remaining
  55,000 processed | 4594.3 docs/sec | ~0.2min remaining
  60,000 processed | 4695.9 docs/sec | ~0.2min remaining
  65,000 processed | 4789.0 docs/sec | ~0.2min remaining
  70,000 processed | 4859.3 docs/sec | ~0.2min remaining
  75,000 processed | 4933.2 docs/sec | ~0.1min remaining
  80,000 processed | 4946.8 docs/sec | ~0.1min remaining
  85,000 processed | 4928.3 docs/sec | ~0.1min remaining
  90,000 processed | 4998.6 docs

In [10]:
combined_output = '/content/combined_normalized.json'

# Concatenate the two normalized JSON-lines files into one input file:
# Bullinger first, then patristic, with every line copied verbatim.
with open(combined_output, 'w', encoding='utf-8') as f_out:
    for part_path in (bullinger_output, patristic_output):
        with open(part_path, 'r', encoding='utf-8') as f_in:
            f_out.writelines(f_in)

# Report where the combined file lives and its size in MiB.
size_mb = os.path.getsize(combined_output) / 2**20
print(f"Combined file created: {combined_output}")
print(f"  Size: {size_mb:.1f} MB")


Combined file created: /content/combined_normalized.json
  Size: 294.7 MB


In [None]:
# Run passim on the combined JSON-lines file and write results to /content/output_passim_normalized.
# `--fields corpus` together with `--filterpairs "corpus != corpus2"` restricts comparisons to
# document pairs whose `corpus` values differ, i.e. Bullinger vs. patristic only.
# NOTE(review): presumably -n is the n-gram order, -m the minimum number of matching n-grams,
# -g the gap between passages, -a the minimum alignment length and --pcopy the alignment
# model's copy probability — confirm against `passim --help` for the installed version.
!passim /content/combined_normalized.json /content/output_passim_normalized \
  --fields corpus \
  --filterpairs "corpus != corpus2" \
  -n 25 -m 15 -g 20 -a 20 --pcopy 0.3