In [732]:
%load_ext autoreload
%autoreload 2



The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [733]:
from __future__ import annotations

In [741]:
from gothic_core import process_table, notebook_mode, save_table, get_df_from_doc, get_match_regex, get_pq_doc, LineIterable, register_dataset_seps
import re
import pandas as pd
from pyquery import PyQuery as pq
from typing import Optional
from itertools import chain

In [742]:
# Turn on gothic_core's notebook mode for this session.
# NOTE(review): the helper's exact effect is not visible in this notebook;
# confirm in gothic_core before relying on it.
notebook_mode(True)
# Show full cell contents instead of truncating long strings. `None` is the
# supported way to request "no limit"; the former sentinel -1 was deprecated
# in pandas 1.0 and is rejected by newer releases.
pd.set_option('display.max_colwidth', None)
# Widen the console representation so wide frames are not wrapped early.
pd.set_option('display.width', 1009)

In [743]:
# Separator labels passed to register_dataset_seps, keyed by dataset name.
# Keeping them in one table makes every dataset appear exactly once, so a
# dataset cannot be registered twice by a copy-pasted line.
DATASET_SEPS = {
    'got': ['ÜG', 'ÜE', 'Vw', 'Hw', 'Q', 'I', 'E', 'B', 'R', 'L', 'Son'],
    'edel': ['Vw', 'Hw', 'Q', 'I', 'E', 'L', 'Son', 'GB', 'BM', 'F'],
    'germ': ['RB', 'Vw', 'Hw', 'Q', 'I', 'E', 'W', 'B', 'L', 'Son'],
    'idg': ['RB', 'ÜG', 'Vw', 'Hw', 'E', 'W', 'L', 'Son'],
    'ae': ['ÜG', 'Vw', 'Hw', 'Q', 'I', 'E', 'W', 'R', 'L', 'Son'],
}

# Dict insertion order is preserved, so registration order matches the table.
for _dataset_name, _dataset_seps in DATASET_SEPS.items():
    register_dataset_seps(_dataset_name, _dataset_seps)

# Process Gothic dataset.

In [744]:
# Gothic data is not very structured, so the page is flattened to plain text
# and processed one line at a time; the first two lines are skipped.
got_html_path = '../data/koebler/got.html'
got_tsv_path = '../data/koebler/got.tsv'

doc = pq(filename=got_html_path, parser='html')
contents = doc.text().split('\n')[2:]
df = get_df_from_doc(contents, 'got')

# Tab-separated output without the index column.
df.to_csv(got_tsv_path, index=None, sep='\t')

# The original EDEL data uses the cp1252 encoding and is much more verbose.

In [745]:
# EDEL: the source file is opened with the cp1252 encoding name, wrapped in
# LineIterable, parsed with the 'edel' dataset settings and written as TSV.
edel_html_path = '../data/koebler/edel.htm'
edel_tsv_path = '../data/koebler/edel.tsv'

doc = get_pq_doc(edel_html_path, 'cp1252')
df = get_df_from_doc(LineIterable(doc), 'edel')

# Tab-separated output without the index column.
df.to_csv(edel_tsv_path, index=None, sep='\t')

# The germ dataset is similar to EDEL.

In [746]:
# germ: same pipeline as EDEL: open with the cp1252 encoding name, wrap in
# LineIterable, parse with the 'germ' dataset settings and write as TSV.
germ_html_path = '../data/koebler/germ.html'
germ_tsv_path = '../data/koebler/germ.tsv'

doc = get_pq_doc(germ_html_path, 'cp1252')
df = get_df_from_doc(LineIterable(doc), 'germ')

# Tab-separated output without the index column.
df.to_csv(germ_tsv_path, index=None, sep='\t')

# The idg dataset is similar too, but it uses two classes: MsoPlainText and MsoNormal.

In [747]:
# idg: open with the cp1252 encoding name, wrap in LineIterable, parse with
# the 'idg' dataset settings and write as TSV.
idg_html_path = '../data/koebler/idg.html'
idg_tsv_path = '../data/koebler/idg.tsv'

doc = get_pq_doc(idg_html_path, 'cp1252')
df = get_df_from_doc(LineIterable(doc), 'idg')

# Tab-separated output without the index column.
df.to_csv(idg_tsv_path, index=None, sep='\t')

# The ae dataset is similar.

In [748]:
# ae: open with the cp1252 encoding name, wrap in LineIterable, parse with
# the 'ae' dataset settings and write as TSV.
ae_html_path = '../data/koebler/ae.html'
ae_tsv_path = '../data/koebler/ae.tsv'

doc = get_pq_doc(ae_html_path, 'cp1252')
df = get_df_from_doc(LineIterable(doc), 'ae')

# Tab-separated output without the index column.
df.to_csv(ae_tsv_path, index=None, sep='\t')

# Old code.

In [16]:
# Legacy step kept under the "Old code" heading.
# NOTE(review): presumably converts the wikiling EDEL data at the first path
# into the TSV named by the second path; confirm against gothic_core.save_table.
save_table('../data/wikiling/edel', '../data/wikiling/edel.tsv')

In [7]:
# Legacy step: reads the same edel.tsv path that is passed as the second
# argument to save_table in the preceding cell.
# NOTE(review): the meaning of 'vorwort.got' is not visible here (TODO confirm).
# The execution counts (In [7] here, In [16] for save_table) show the two cells
# were last run out of order.
table = process_table('../data/wikiling/edel.tsv', 'vorwort.got')

In [None]:
# NOTE(review): get_proto_dict and EtymologicalDictionary are not imported in
# any visible cell, and this cell has no execution count (In [None]); unless
# they are defined elsewhere it raises NameError on a fresh kernel.
# TODO: import them or drop the cell.
pgmc_df = get_proto_dict('../data/entirePGMC_daughters.csv')
x = EtymologicalDictionary.from_dataframe(pgmc_df, 'got', 'de')

# Classes for matchers.

In [641]:
class _BaseMatcher:
    """Base class for matchers that find something in a processed string.

    A processed string is one where the vorwort notations have been replaced
    with special character sequences, i.e. ``#@!<stuff>!@#``. Subclasses set
    ``_pattern`` to a compiled regex; iterating over an instance yields the
    text of every non-overlapping match, from left to right.

    Class attributes:
        _pattern: compiled regex to search for; must be set by subclasses.
        _remove_notes: when true, every ``#@!...!@#`` note is deleted from the
            string before ``_pattern`` is applied.
    """

    # Compiled once at class creation instead of on every ``__iter__`` call.
    # ``.+?`` is non-greedy, so each note ends at the first closing ``!@#``;
    # ``.`` does not match newlines, so a note never spans lines.
    _note_pattern = re.compile(re.escape('#@!') + r'.+?' + re.escape('!@#'))
    _pattern = None
    _remove_notes = True

    def __init__(self, raw_string: str):
        """Store the processed string that will be searched."""
        self._raw_string = raw_string

    def __repr__(self):
        """Return the raw string itself rather than a constructor-style repr."""
        return self._raw_string

    def __iter__(self):
        """Yield the text of each ``_pattern`` match.

        Notes are stripped first when ``_remove_notes`` is true.

        Raises:
            NotImplementedError: if the concrete class never set ``_pattern``
                (raised on the first ``next()`` because this is a generator).
        """
        cls = type(self)
        if cls._pattern is None:
            # Fail with a clear message instead of the opaque
            # "'NoneType' object has no attribute 'finditer'".
            raise NotImplementedError(
                '{} must define a compiled _pattern'.format(cls.__name__))
        text = self._raw_string
        if cls._remove_notes:
            text = cls._note_pattern.sub('', text)
        for match in cls._pattern.finditer(text):
            yield match.group()
    
class AbbrMatcher(_BaseMatcher):
    """Matcher for abbreviation-like tokens: one or more word characters
    immediately followed by a literal period.

    ``_remove_notes`` is inherited as True from ``_BaseMatcher``, so the
    ``#@!...!@#`` notes are stripped before this pattern is applied.
    """
    
    _pattern = re.compile(r'\w+\.')
    
    
class NoteMatcher(_BaseMatcher):
    """Matcher that yields the ``#@!...!@#`` notes themselves.

    Note stripping is switched off here; otherwise the notes would be removed
    from the string before the pattern could find them.
    """

    _remove_notes = False
    # Opening marker, shortest possible body, closing marker.
    _pattern = re.compile(''.join([re.escape('#@!'), r'.+?', re.escape('!@#')]))

class PhraseMatcher(_BaseMatcher):
    """Matcher for runs of text that start with a word character and continue
    through any following word or whitespace characters.

    Because whitespace is part of the run, a match may end with trailing
    whitespace. Notes are stripped first (``_remove_notes`` is inherited as
    True from ``_BaseMatcher``).
    """
    
    _pattern = re.compile(r'\w[\w\s]*')

class CapitalizationMatcher(_BaseMatcher):
    """Matcher for runs that start with an ASCII capital letter and continue
    through any following word or whitespace characters.

    The lookbehind requires a non-word character directly before the capital.
    NOTE(review): this means a capital at the very start of the string is not
    matched, and ``[A-Z]`` excludes non-ASCII capitals such as Ä/Ö/Ü. Confirm
    both are intended.
    """
    
    _pattern = re.compile(r'(?<=\W)[A-Z][\w\s]*')

class GothicMatcher(NoteMatcher):
    """Matcher for the specific note ``#@!gotisch!@#``.

    Derives from NoteMatcher, so ``_remove_notes`` stays False and the notes
    are still present in the string when the pattern is applied.
    """

    # Opening marker, the literal word, closing marker.
    _pattern = re.compile(''.join([re.escape('#@!'), r'gotisch', re.escape('!@#')]))