In [1]:
import pandas as pd
import re
from collections import defaultdict

In [2]:
df = pd.read_csv('../data/hesperia_epigraphy.csv', sep=',', escapechar='\\', encoding='latin1')

In [3]:
df.head()

Unnamed: 0,id,REF MLH,REF. HESPERIA,YACIMIENTO,MUNICIPIO,MATERIAL,OBJETO,TIPO SOPORTE,signario paleohispÃ¡nico?,TEXTO,APARATO CRÃTICO,LENGUA
0,3,C.51.01.S1,GI.11.01,La Ciutadella de Roses,Girona,CERAMICA,RECIPIENTE,INDET.,LEVANTINO,obi,,IBERICO
1,4,C.03.03.S1,GI.08.05,El Mas Castellar,Girona,CERAMICA,RECIPIENTE,INDET.,LEVANTINO,$[---]takokoba[---]&,"$[---]takokoba[---]&, $tÃ¡kokÃ³ki& Moncunill, ...",IBERICO
2,5,C.03.04.S1,GI.08.11,El Mas Castellar,Girona,CERAMICA,RECIPIENTE,INDET.,LEVANTINO,[---]eÅ,,IBERICO
3,6,C.01.34.S1,GI.10.30,EmpÃºries,Girona,CERAMICA,RECIPIENTE,INDET.,LEVANTINO,[---]$ban&,"$ban&, $IN& Panosa $á¸¿n& ELM",IBERICO
4,7,C.01.25.S1,GI.10.21,EmpÃºries,Girona,CERAMICA,RECIPIENTE,PROPIEDAD,LEVANTINO,kans+,,IBERICO


In [4]:
df.columns

Index(['id', 'REF MLH', 'REF. HESPERIA', 'YACIMIENTO', 'MUNICIPIO', 'MATERIAL',
       'OBJETO', 'TIPO SOPORTE', 'signario paleohispÃ¡nico?', 'TEXTO',
       'APARATO CRÃTICO', 'LENGUA'],
      dtype='object')

In [5]:
def convert_encoding(series, old_encoding, new_encoding):
    """Convert the encoding of a given column `col_name`."""
    return series.str.encode(old_encoding).str.decode(new_encoding)

In [6]:
# Convert "LENGUA" encoding to utf8.
df['LENGUA'] = convert_encoding(df['LENGUA'], 'latin1', 'utf8')

In [7]:
# Check what languages there are.
df['LENGUA'].value_counts()

IBERICO               2095
INDET.                 160
CELTIBERICO            138
LATIN                   11
GRIEGO                  10
LUSITANO                 6
celtibérico?             4
no ha lugar              3
Celtibérico ?            3
Celtibérico?             2
Etrusco                  2
LATIN/IBERICO            2
celtibérico ?            2
PÚNICO                   2
Púnico                   2
N                        2
ninguna                  1
¿Pseudoescritura?        1
latín? o ibérico?        1
CELTA                    1
latín?                   1
IBÉRICO/LATIN?           1
paleovasco?              1
Celtibérico ??           1
ibérico/latín            1
Celtibérico/latino       1
Name: LENGUA, dtype: int64

In [8]:
# Convert the encoding of column "TEXTO" to utf8.
df['TEXTO'] = convert_encoding(df['TEXTO'], 'latin1', 'utf8')

In [9]:
# Get the subset of epigraphies that are Iberian.
iberico_mask = df['LENGUA'] == 'IBERICO'
non_na_mask = df['TEXTO'].notnull()

iberico = df[iberico_mask & non_na_mask].copy()

In [10]:
# Replace all whitespace tokens with ' ' except for '\n'.
iberico['TEXTO'] = iberico['TEXTO'].str.replace('\n', '<newline>').str.replace(r'\s', ' ').str.replace('<newline>', '\n')

# Clean stuff up. We go through the symbols and check if they should be kept.

In [11]:
def count(df_or_series, col_name=None, as_list=False):
    """Count all characters in the data and track which texts they are from."""
    if isinstance(df_or_series, pd.DataFrame):
        series = df_or_series[col_name]
    else:
        series = df_or_series
    cs = defaultdict(list) # "cs" stands for "Charset with Source".
    for text in series.values.reshape(-1):
        for c in set(text):
            cs[c].append(text)

    lst = sorted(cs.keys())
    if as_list:
        print(lst)
    else:
        print(''.join(lst))
    return cs

In [12]:
cs = count(iberico, 'TEXTO')


 #$%&()*+,-./0123456789:;?ABCDEFGHIJLMNOPQRSTUVWXY[]_abcdefghijklmnopqrstuvxz{}©·Íáéíóúńōŕś̵̠̣̱̲́̂ΑΔΕΙΚΛΝΠΩϡḅḇḍḳḵḷḻḿṁṃṇṉṛṟṣṭṯạẹịọụ‐’‡…∙│𐘃


### Get rid of inline comments.

In [13]:
pattern = re.compile('%.*?#', re.DOTALL) # The comment might span multiple lines.

In [14]:
tmp_text = iberico['TEXTO'].str.replace(pattern, ' ', regex=True)
cs = count(tmp_text)
iberico['tmp_text'] = tmp_text


 $&()*+,-./0123456789:;?ABCDEFGHIJLMNOQRSTVWXY[]_abcdefgiklmnoprstuv{}©·áéíóúńōŕś̵̠̣̱̲́̂ΑΔΕΙΚΛΝΠΩḅḇḍḳḵḷḻḿṁṃṇṉṛṟṣṭṯạẹịọụ‐’‡…∙│𐘃


### Replace "?" with "+" everywhere. Both indicates some missing character.

In [15]:
iberico['tmp_text'] = tmp_text = tmp_text.str.replace('?', '+', regex=False)

In [16]:
cs = count(tmp_text)


 $&()*+,-./0123456789:;ABCDEFGHIJLMNOQRSTVWXY[]_abcdefgiklmnoprstuv{}©·áéíóúńōŕś̵̠̣̱̲́̂ΑΔΕΙΚΛΝΠΩḅḇḍḳḵḷḻḿṁṃṇṉṛṟṣṭṯạẹịọụ‐’‡…∙│𐘃


### Replace special symbols with "?". We use "?" to indicate some special graphemes that experts are uncertain about.

In [17]:
# Special graphemes are represented by "S" or "G" followed by some integers. For instance, "S47", "G27".
special_pat = r'(?P<sym>[S|G]\d+)'
print(tmp_text[tmp_text.str.contains(special_pat, regex=True)].values)

['A:\n\n$bíśkíbíS47rsetí&:$S47ia&:$bineia& \n\n$bisbíturbiWtín& \n\n\nB:\n\n$śntarlabí+nkos&́:$bíS47rtuan&:$koikakiskítur& \n\n$ebiS56koraS56& \n\n'
 '++sinS41le[-]binkor:iuśtiS56:tarbinbaleś:kooo:aŕakotaŕ[-]laŕ:aianetinS41+talabin:ireba:aśS56:sinbitai:aia[ ]eia:boa:bíDkín:selkí:[-]eśeśanbiabiHkaŕesS56en'
 '  \n\n$+biśbaS56isS47kaS47&\n\n\n'
 '  kokaŕ\n\n  $karekar&:se .$kí&. .S47f. ker:lukeśiŕ:$akailtir .S47f.& kerai:$ir++riatuiaś&\n\n\n\n'
 '$]:S48 v rkibea[&'
 '$ati &$.S47f&. $uia+iskeunir&:$tinkan&:$berśtano&+ .S47f. a\n\n$ś+sebatitaŕ&:.S58. $itan+aŕ&'
 ' \n\n \n\n]śeliŕ:ututa:baśiŕ:tarakar\n\n]nki\n\n\n  \n\n$otalauki S48& S47f ŕ:$siel &:$S48rikan&\n\netaDŕeŕ:sosintikeŕka:nanban\n\n$baneśarikan &:etaŕ                                                                             urketiikeŕka:e S47f tiDŕ:laki \n\n\n  \n\nsaltulakokia kí (6)\n\n$berśiŕka kí& (2)$artakerka kí& (6)$G27 l S48 śtautinka kí& (7)berśiŕka a o (3) kí (1)\n\n$biurtakerka kí& (2)$S47a Dltirka kí& (5)saltulakokia

  return func(self, *args, **kwargs)


In [18]:
# Get all special graphems.
special_signs = set(tmp_text.str.extractall(special_pat).reset_index()['sym'])

In [19]:
# Replace them with "?".
for sign in special_signs:
    tmp_text = tmp_text.str.replace(sign, '?')

In [20]:
iberico['TEXTO'] = tmp_text

In [21]:
cs = count(tmp_text)


 $&()*+,-./0123456789:;?ABCDEFHIJLMNOQRSTVWXY[]_abcdefgiklmnoprstuv{}©·áéíóúńōŕś̵̠̣̱̲́̂ΑΔΕΙΚΛΝΠΩḅḇḍḳḵḷḻḿṁṃṇṉṛṟṣṭṯạẹịọụ‐’‡…∙│𐘃


### Unescape html stuff.

In [22]:
import html

In [23]:
iberico['TEXTO'] = tmp_text = tmp_text.apply(html.unescape)
cs = count(tmp_text)


 $&()*+,-./0123456789:<>?ABCDEFHIJLMNOQRSTVWXY[]_abcdefgiklmnoprstuv{}©·áéíóúńōŕś̵̠̣̱̲́̂ΑΔΕΙΚΛΝΠΩḅḇḍḳḵḷḻḿṁṃṇṉṛṟṣṭṯạẹịọụ‐’‡…∙│𐘃


### Replace alternative reading indicators that use "$...&" patterns.

In [24]:
pattern = re.compile(r'\$(.+?)&', re.DOTALL)

In [25]:
# Some alternative readings seem to be nested. Therefore we run it twice.
tmp_text = tmp_text.str.replace(pattern, r'\1', regex=True)
tmp_text = tmp_text.str.replace(pattern, r'\1', regex=True)

In [26]:
cs = count(tmp_text)


 $&()*+,-./0123456789:<>?ABCDEFHIJLMNOQRSTVWXY[]_abcdefgiklmnoprstuv{}©·áéíóúńōŕś̵̠̣̱̲́̂ΑΔΕΙΚΛΝΠΩḅḇḍḳḵḷḻḿṁṃṇṉṛṟṣṭṯạẹịọụ‐’‡…∙│𐘃


In [27]:
cs['$']

['    \n\n     kau(ŕ)ubastíki\n\n     kaiké(irkaiska)rta(u)tínen(obatil)\n\n       otanar :eŕebauśitirtéśierban(e)sitar\n\n       $kaisurarbitan:sakári(s)kéŕtaibataŕati\n\n       aité(rbaebaseŕ+er)ke+te+iba++tíl+\n\n\n  \n\n    abaŕśen:sorse:ertiketór:bitauketitóre\n\n    iunstírabatékaitíukáité(ba)i(sḿ)iltíŕbitúkáŕinar\n\n      uśtalaibi:etais : ekátir\n\n    abaŕtaŕike:iu[---]\n\n    iŕ(ba)ŕbekoekúatebakis\n\n      aba+(tu)+a+akú+\n\n\n   \n\n     ]rŕ:IIIIIIIIIIIIII:IIIIIII\n\n     ]++eba\n\n\n  \n\n]+aká :a IIIIII',
 '      ]tin:iunstír:tau+koteká [\n\n       ]tieśnitíŕatan:biuŕtikise:[\n\n       beŕisetítiatoká:nikokátoar[-]bai: \n\n       túŕkosbebon:uskáŕe:tieká & ultítikán:\n\n 5 eŕtebaśká:bintuŕkeská : abatutíkerká:uke\n\n  ++bo:tíŕatisukil:itíkotesun  kortínte:\n\n  tíekáa:sitíŕakáŕka+:nikokaiatai\n\n  is:beŕteike:ituŕutan:lebósba[+]ibon[\n\n        batíŕ[e+]káŕ[-]iteŕibon\n\n 10         tu\n\n\n      śalir:i[-]ba[---]:banteŕba\n\n  n:tínebeta[3\n\n4]n:itíte++ta[\n\n  salaker:

In [28]:
cs['&']

['barta  &',
 '      ]tin:iunstír:tau+koteká [\n\n       ]tieśnitíŕatan:biuŕtikise:[\n\n       beŕisetítiatoká:nikokátoar[-]bai: \n\n       túŕkosbebon:uskáŕe:tieká & ultítikán:\n\n 5 eŕtebaśká:bintuŕkeská : abatutíkerká:uke\n\n  ++bo:tíŕatisukil:itíkotesun  kortínte:\n\n  tíekáa:sitíŕakáŕka+:nikokaiatai\n\n  is:beŕteike:ituŕutan:lebósba[+]ibon[\n\n        batíŕ[e+]káŕ[-]iteŕibon\n\n 10         tu\n\n\n      śalir:i[-]ba[---]:banteŕba\n\n  n:tínebeta[3\n\n4]n:itíte++ta[\n\n  salaker:itiŕoketebon : iŕ$+[\n\n  ká:iunstíriká:sikite:basir[\n\n\n  kátúlatíen  \n\n\n']

### There is still a couple errors with alternative readings. Just delete these symbols altogether.

In [29]:
iberico['tmp_text'] = tmp_text = tmp_text.str.replace(r'[\$&]', '', regex=True)

In [30]:
cs = count(tmp_text)


 ()*+,-./0123456789:<>?ABCDEFHIJLMNOQRSTVWXY[]_abcdefgiklmnoprstuv{}©·áéíóúńōŕś̵̠̣̱̲́̂ΑΔΕΙΚΛΝΠΩḅḇḍḳḵḷḻḿṁṃṇṉṛṟṣṭṯạẹịọụ‐’‡…∙│𐘃


### Try to see what these brackets, parentheses and braces are enclosing.

In [31]:
def get_matches(series, pattern, regex=True):
    """Get all rows that match the pattern"""
    return series[series.str.contains(pattern, regex=regex)]

def get_matches_as_set(series, pattern):
    """Get all matches as set."""
    matches = series.apply(lambda s: re.findall(pattern, s))
    num_matches = matches.apply(len)
    return set(sum(matches[(num_matches > 0)].values, list()))

In [32]:
pattern_parent = re.compile(r'(\(.*?\))')

In [33]:
get_matches_as_set(tmp_text, pattern_parent)

{'(+)',
 '(-)',
 '(1)',
 '(10)',
 '(2)',
 '(3 bi)',
 '(3)',
 '(5)',
 '(6)',
 '(7)',
 '([)',
 '([--])',
 '([-])',
 '(a)',
 '(aspa)',
 '(b)',
 '(ba)',
 '(e)',
 '(irkaiska)',
 '(koŕ)',
 '(marca)',
 '(obatil)',
 '(rbaebaseŕ+er)',
 '(s)',
 '(sḿ)',
 '(tu)',
 '(u)',
 '(vac)',
 '(vacat)',
 '(ŕ)',
 '(ḿ)'}

In [34]:
pd.options.display.max_colwidth = 0 # NOTE Print out every thing in the column.

In [35]:
get_matches(tmp_text, '(+)', regex=False)

108     aŕka(+)ibe[\n\nbiuŕau[\n\neteso\n\n- - -]+s[ \n\n- - -]tar[  \n\n+biuŕso\n\n++  \n\nbiuŕtibaś \n\nlauŕto \n\neteśuŕ \n\nie I I I I[-]o \n\n -]te+[- -]++                                                                                                                                                                                                                
288      \n\nbalkar \n\n\n \n\naḅịḷạḳụs(+)istiŕ(+)lakea(+)banśako+o̲l̲a̲\n\naiuniltiŕte\n\n\n \n\nṇịŕ[.]ṛ́ekoneśanṣ́ại̱t̲e̲a̲ś̲a̲k̲a̲i̱b̲a̲r̲b̲e̲\n\n\n \n\nḅạḷ+ḳạḅạḳạ                                                                                                                                                                                        
878     si++++(+):++[ - - - ]+[\n\nbaidesir:bilosg+\n\nŕe:biosildun                                                                                                                                                                                                   

In [36]:
get_matches(tmp_text, '(-)', regex=False)

1129    ]teḿ̲b̲a̲+[--(-)]bant̲e̲:bantibat̲e̱[
Name: TEXTO, dtype: object

### Remove the parentheses that do not enclose some alphanumeric string.

In [37]:
pattern = re.compile(r'\((\W+)\)')
get_matches_as_set(tmp_text, pattern)

{'+', '-', '[', '[--]', '[-]'}

In [38]:
tmp_text = tmp_text.str.replace(pattern, r'\1', regex=True)

In [39]:
get_matches_as_set(tmp_text, pattern_parent)

{'(1)',
 '(10)',
 '(2)',
 '(3 bi)',
 '(3)',
 '(5)',
 '(6)',
 '(7)',
 '(a)',
 '(aspa)',
 '(b)',
 '(ba)',
 '(e)',
 '(irkaiska)',
 '(koŕ)',
 '(marca)',
 '(obatil)',
 '(rbaebaseŕ+er)',
 '(s)',
 '(sḿ)',
 '(tu)',
 '(u)',
 '(vac)',
 '(vacat)',
 '(ŕ)',
 '(ḿ)'}

### Parentheses with numbers in them are mostly likely from annotators that signify different places in the epigraphies. So are "(a)" and "(b)". We replace them with newlines instead.

In [40]:
patterns = ['1', '10', '2', '3 bi', '3', '5', '6', '7', "a", "b"]
for pattern in patterns:
    tmp_text = tmp_text.str.replace(f'({pattern})', r'\n', regex=False)

In [41]:
get_matches_as_set(tmp_text, pattern_parent)

{'(aspa)',
 '(ba)',
 '(e)',
 '(irkaiska)',
 '(koŕ)',
 '(marca)',
 '(obatil)',
 '(rbaebaseŕ+er)',
 '(s)',
 '(sḿ)',
 '(tu)',
 '(u)',
 '(vac)',
 '(vacat)',
 '(ŕ)',
 '(ḿ)'}

### "(marca)", "(vac)" and "(vacat)" seem to be from the annotators.

In [42]:
# Replace (vac) and (vacate) with whitespace.
tmp_text = tmp_text.str.replace('(marca)', '', regex=False)
tmp_text = tmp_text.str.replace('(vac)', '', regex=False)
tmp_text = tmp_text.str.replace('(vacat)', '', regex=False)

In [43]:
# Replace (a) and (b) with newlines.
tmp_text = tmp_text.str.replace('(a)', '\n', regex=False)
tmp_text = tmp_text.str.replace('(b)', '\n', regex=False)

In [44]:
get_matches(tmp_text, pattern_parent)

  return func(self, *args, **kwargs)


44        lu\n\n\n  (aspa)                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                     
55      :iŕekeśta(ḿ)nataŕśuekiarsinekun\n\nbaibaibar                                                                                                                                                                                                                                                                                                                                                                                                                                    

In [45]:
# I'm not sure about the last text as it contains many characters in parentheses. I remove this text from the dataset.
tmp_text.loc[1021] = ''

In [46]:
# Remove (aspa). It's probably a weird sign.
tmp_text = tmp_text.str.replace('(aspa)', '', regex=False)

In [47]:
get_matches(tmp_text, pattern_parent)

55     :iŕekeśta(ḿ)nataŕśuekiarsinekun\n\nbaibaibar                                         
240    tikirsikoŕ\n\nsakaŕi(koŕ) e-\n\nban:erirtan\n\naŕora:an-\n\nkonạụ[n]ịṇ\n\n------+
Name: TEXTO, dtype: object

In [48]:
# These two texts gets discarded as well.
tmp_text.loc[55] = ''
tmp_text.loc[240] = ''

In [49]:
cs = count(tmp_text)


 )*+,-./0123456789:<>?ABCDEFHIJLMNOQRSTVWXY[\]_abcdefgiklmnorstuv{}©·áéíóúńōŕś̵̠̣̱̲́̂ΑΔΕΙΚΛΝΠΩḅḇḍḳḵḷḻḿṁṃṇṉṛṟṣṭṯạẹịọụ‐’‡…∙│𐘃


### Deal with single ')' characters.

In [50]:
cs[')']

['a)\n\nkutui\n\nb)\n\nu',
 'a)\n\n\nui\n\n\nb)\n\nl',
 'a) o\n\nb) ka\n\nc) kika',
 'a) kebelkuŕe:eŕkunbase \n\nb) śaúśir[\n\nc) ]ṉeotiṉ+\n\nd) kebe',
 'a) e\n\nb) ere',
 'a) ḿ++\n\nb) ḿ++tá+',
 '1) onoisakebatiubiku\n\n\n2) ++++kutun',
 'a) baitolo\n\n\nb) baitolo']

In [51]:
# They seem to indicate different sections of texts. Replace them with newlines.
tmp_text = tmp_text.str.replace(r'[abcd12]\)', r'\n', regex=True)

In [52]:
cs = count(tmp_text)


 *+,-./0123456789:<>?ABCDEFHIJLMNOQRSTVWXY[\]_abcdefgiklmnorstuv{}©·áéíóúńōŕś̵̠̣̱̲́̂ΑΔΕΙΚΛΝΠΩḅḇḍḳḵḷḻḿṁṃṇṉṛṟṣṭṯạẹịọụ‐’‡…∙│𐘃


### What are these angle brackets.

In [53]:
get_matches(tmp_text, r'[<>]', regex=True)

302    ]b̲[   \n\n]o i̲b   \n\n]X X<: ibe  [-]+o̲+e̲[                                                 \n\n]o̲ŕbiloske:[-]SS̲X̲X<̲:[-][   \n\n]ŕike:X<X<:k̲ukebuŕke :<[                         5\n\n]X̲X̲ [:+] śaneke̲:X<X<̲:leŕsge [                           \n\n[-]eosdaŕke:><X<̲+[   \n\n]+++tigiś̲++[   \n\n[-]o̲[                                                                           
877      \n\niŕike:orti:gaŕokan:dadula:baśk:\n\nbuiśtiner:bagaŕok:sssX<:tuŕlbai\n\nluŕa:leguśegik:baseŕokeiunbaida:\n\nuŕke:basbidiŕbartin:iŕike:baseŕ\n\nokar:tebind:belagaśikauŕ:isbin                  5 \n\nai:asgandis:tagisgaŕok:binike                         \n\nbin:śalir:kidei:gaibigait:\n\n \n\nsakaŕiskeŕ\n\narnai:                                                                  
998    isbataŕis\n\nsere+e+\n\n>IIIIeriri+                                                                                                                                                                                      

In [54]:
# They are probably special signs. 

### Deal with dots.

In [55]:
get_matches(tmp_text, '.', regex=False)

124     selkití[.]+                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                
132       tales\n\n\n  a[.]tó\n\n\n  ta\n\n                                                                                                                                                                                                                         

In [56]:
# Dots in brackets seem to suggest missing characters here.
dots_in_brackets = re.compile(r'(?<=\[)\.+(?=\])')
tmp_text = tmp_text.str.replace(dots_in_brackets, lambda m: '-' * len(m.group()))

In [57]:
get_matches(tmp_text, '.', regex=False)

264        [---]nḿ[---]\n\n\n  bele+[-c.3-]nar \n\n\n   inti                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                       
355     a.ko                                                                                                                                                                                                                                                        

In [58]:
# Some patterns are used for an unspecified number of missing characters.
c_pattern = re.compile(r'\[-*c[\. ][\d/\w\s]+\-*]')

In [59]:
tmp_text = tmp_text.str.replace(c_pattern, r'[---]', regex=True)

In [60]:
get_matches(tmp_text, '.', regex=False)

355     a.ko                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                       
431     eteśike.ḿi                                                                                                                                                                                                                                                  

In [61]:
# Replace 1. and etc. with newlines -- they indicate different sections.
tmp_text = tmp_text.str.replace(r'\d+[\. ]', r'\n')

In [62]:
get_matches(tmp_text, '.', regex=False)

355     a.ko                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                       
431     eteśike.ḿi                                                                                                                                                                                                                                                  

In [63]:
# Not sure what the remaining dots are for, but they are mostly isolated. I'm removing them.
tmp_text = tmp_text.str.replace(".", '', regex=False)

In [64]:
iberico['tmp_text'] = tmp_text
cs = count(tmp_text)


 *+,-/0123456789:<>?ABCDEFHIJLMNOQRSTVWXY[\]_abcdefgiklmnorstuv{}©·áéíóúńōŕś̵̠̣̱̲́̂ΑΔΕΙΚΛΝΠΩḅḇḍḳḵḷḻḿṁṃṇṉṛṟṣṭṯạẹịọụ‐’‡…∙│𐘃


### Deal with brackets.

In [65]:
pattern_bracket = re.compile(r'\[.*?\]')

In [66]:
# If brackets contain only whitespaces and hyphens, convert all to hyphens.
tmp_text = tmp_text.str.replace(r'(?<=\[)[ \-+]+?(?=\])', lambda m: '-' * len(m.group()))
get_matches_as_set(tmp_text, pattern_bracket)

{'[  //  ]',
 '[  o  ]',
 '[ 5]',
 '[--- i]',
 '[--- s]',
 '[--- śal]',
 '[--------------------------------------------------]',
 '[----------------------]',
 '[------------------]',
 '[---------------]',
 '[-----------]',
 '[--------]',
 '[-------]',
 '[------]',
 '[-----]',
 '[----]',
 '[---]',
 '[--/]',
 '[--]',
 '[-]',
 '[1-2]',
 '[1]',
 '[2-3]',
 '[2]',
 '[3+]',
 '[3/4]',
 '[3]',
 '[4]',
 '[5]',
 '[7]',
 '[:+]',
 '[IIII]',
 '[I]',
 '[VS, -A ---]',
 '[a]',
 '[ar]',
 '[ba]',
 '[bakaś]',
 '[be]',
 '[bo]',
 '[c0-2]',
 '[c0-4]',
 '[c2-3]',
 '[c2-4]',
 '[c3-4]',
 '[c3-5]',
 '[c6-8]',
 '[c7-9]',
 '[c7]',
 '[e+]',
 '[e]',
 '[i]',
 '[leitaŕtin]',
 '[ltun]',
 '[n]',
 '[ne:]',
 '[ne]',
 '[r-]',
 '[s]',
 '[sa]',
 '[te]',
 '[ti]',
 '[tiri]',
 '[u]',
 '[ŕte]',
 '[ŕtin]',
 '[ś ---]',
 '[ś]',
 '[śale]',
 '[̣--]',
 '[̣̣2]',
 '[‐‐‐]',
 '[…]',
 '[│+]'}

In [67]:
get_matches_as_set(tmp_text, pattern_bracket)

{'[  //  ]',
 '[  o  ]',
 '[ 5]',
 '[--- i]',
 '[--- s]',
 '[--- śal]',
 '[--------------------------------------------------]',
 '[----------------------]',
 '[------------------]',
 '[---------------]',
 '[-----------]',
 '[--------]',
 '[-------]',
 '[------]',
 '[-----]',
 '[----]',
 '[---]',
 '[--/]',
 '[--]',
 '[-]',
 '[1-2]',
 '[1]',
 '[2-3]',
 '[2]',
 '[3+]',
 '[3/4]',
 '[3]',
 '[4]',
 '[5]',
 '[7]',
 '[:+]',
 '[IIII]',
 '[I]',
 '[VS, -A ---]',
 '[a]',
 '[ar]',
 '[ba]',
 '[bakaś]',
 '[be]',
 '[bo]',
 '[c0-2]',
 '[c0-4]',
 '[c2-3]',
 '[c2-4]',
 '[c3-4]',
 '[c3-5]',
 '[c6-8]',
 '[c7-9]',
 '[c7]',
 '[e+]',
 '[e]',
 '[i]',
 '[leitaŕtin]',
 '[ltun]',
 '[n]',
 '[ne:]',
 '[ne]',
 '[r-]',
 '[s]',
 '[sa]',
 '[te]',
 '[ti]',
 '[tiri]',
 '[u]',
 '[ŕte]',
 '[ŕtin]',
 '[ś ---]',
 '[ś]',
 '[śale]',
 '[̣--]',
 '[̣̣2]',
 '[‐‐‐]',
 '[…]',
 '[│+]'}

In [68]:
# If the brackets only contain normal characters, it means they are reconstructed. Restore reconstructed letters.
tmp_text = tmp_text.str.replace(r'\[([a-zŕśḷ]+)\]', r'\1')

In [69]:
get_matches_as_set(tmp_text, pattern_bracket)

{'[  //  ]',
 '[  o  ]',
 '[ 5]',
 '[--- i]',
 '[--- s]',
 '[--- śal]',
 '[--------------------------------------------------]',
 '[----------------------]',
 '[------------------]',
 '[---------------]',
 '[-----------]',
 '[--------]',
 '[-------]',
 '[------]',
 '[-----]',
 '[----]',
 '[---]',
 '[--/]',
 '[--]',
 '[-]',
 '[1-2]',
 '[1]',
 '[2-3]',
 '[2]',
 '[3+]',
 '[3/4]',
 '[3]',
 '[4]',
 '[5]',
 '[7]',
 '[:+]',
 '[IIII]',
 '[I]',
 '[VS, -A ---]',
 '[c0-2]',
 '[c0-4]',
 '[c2-3]',
 '[c2-4]',
 '[c3-4]',
 '[c3-5]',
 '[c6-8]',
 '[c7-9]',
 '[c7]',
 '[e+]',
 '[ne:]',
 '[r-]',
 '[ś ---]',
 '[̣--]',
 '[̣̣2]',
 '[‐‐‐]',
 '[…]',
 '[│+]'}

In [70]:
# This is one case of weird digit.
tmp_text = tmp_text.str.replace('[̣̣2]', '[2]', regex=False)

In [71]:
# Brackets with digits in them mean different number of missing characters.
pattern_digit = r'\[[\w\d\-\/\+ ]*\d+[\w\d\-\/\+ ]*\]'

In [72]:
def replace_as_hyphens(m):
    g = m.group()
    max_num = 0
    for c in g:
        try:
            c = int(c)
            max_num = max(max_num, c)
        except ValueError:
            pass
    return '-' * max_num

In [73]:
tmp_text = tmp_text.str.replace(pattern_digit, replace_as_hyphens)
get_matches_as_set(tmp_text, pattern_bracket)

{'[  //  ]',
 '[  o  ]',
 '[--- i]',
 '[--- s]',
 '[--- śal]',
 '[--------------------------------------------------]',
 '[----------------------]',
 '[------------------]',
 '[---------------]',
 '[-----------]',
 '[--------]',
 '[-------]',
 '[------]',
 '[-----]',
 '[----]',
 '[---]',
 '[--/]',
 '[--]',
 '[-]',
 '[:+]',
 '[IIII]',
 '[I]',
 '[VS, -A ---]',
 '[e+]',
 '[ne:]',
 '[r-]',
 '[ś ---]',
 '[̣--]',
 '[‐‐‐]',
 '[…]',
 '[│+]'}

In [74]:
# If there are more than three hyphens, reduce them to three instead -- they significant an unspecified number of missing characters.
tmp_text = tmp_text.str.replace(r'\[----+\]', r'[---]')

In [75]:
# If the brackets contain some normal characters, it means they are probably reconstructed. Restore reconstructed letters.
tmp_text = tmp_text.str.replace(r'\[([:\+,\-a-zA-Zŕśḷ̣ ]+)\]', r'\1')

In [76]:
get_matches_as_set(tmp_text, pattern_bracket)

{'[  //  ]', '[--/]', '[‐‐‐]', '[…]', '[│+]'}

In [77]:
# This looks like segmenters.
get_matches(tmp_text, '[│+]', regex=False)

2050      s---ŕ │ baiteski │ bilosebam │ boioi[│+] balesaika │\n\n  bebatiŕ │ itiŕokanker---++++++ / m │ baika[│+]bilosebam │ boioi │baite+
Name: TEXTO, dtype: object

In [78]:
tmp_text = tmp_text.str.replace('[│+]', ':', regex=False)

In [79]:
# Probably just hyphens.
tmp_text = tmp_text.str.replace('[…]', '[---]', regex=False)

In [80]:
get_matches_as_set(tmp_text, pattern_bracket)

{'[  //  ]', '[---]', '[--/]', '[‐‐‐]'}

In [81]:
# Remove the remaining brackets
tmp_text = tmp_text.str.replace(r'[\[\]]', '')

### Replace / with newlines.

In [82]:
tmp_text = tmp_text.str.replace("/", '\n')

In [83]:
get_matches_as_set(tmp_text, pattern_bracket)

set()

In [84]:
iberico['tmp_text'] = tmp_text

In [85]:
cs = count(tmp_text, as_list=True)

['\n', ' ', '*', '+', ',', '-', '0', '1', '2', '3', '4', '5', ':', '<', '>', '?', 'A', 'B', 'C', 'D', 'E', 'F', 'H', 'I', 'J', 'L', 'M', 'N', 'O', 'Q', 'R', 'S', 'T', 'V', 'W', 'X', 'Y', '\\', '_', 'a', 'b', 'c', 'd', 'e', 'f', 'g', 'i', 'k', 'l', 'm', 'n', 'o', 'r', 's', 't', 'u', 'v', '{', '}', '©', '·', 'á', 'é', 'í', 'ó', 'ú', 'ń', 'ō', 'ŕ', 'ś', '́', '̂', '̠', '̣', '̱', '̲', '̵', 'Α', 'Δ', 'Ε', 'Ι', 'Κ', 'Λ', 'Ν', 'Π', 'Ω', 'ḅ', 'ḇ', 'ḍ', 'ḳ', 'ḵ', 'ḷ', 'ḻ', 'ḿ', 'ṁ', 'ṃ', 'ṇ', 'ṉ', 'ṛ', 'ṟ', 'ṣ', 'ṭ', 'ṯ', 'ạ', 'ẹ', 'ị', 'ọ', 'ụ', '‐', '’', '‡', '∙', '│', '𐘃']


### Get rid of weird and annoying takens.

In [86]:
get_matches(tmp_text, '_')

1673    kítá+\n\n_______\n\n\nbaśons\n\nikasors\n\n_______\n\n       ++
Name: TEXTO, dtype: object

In [87]:
# They are probably just whitespaces.
tmp_text = tmp_text.str.replace('_', ' ', regex=False)

In [88]:
get_matches(tmp_text, '·')

72      ------\n\n---+++ke---\n\n---inti · tan+---\n\n---+ke · koŕnel+---\n\n---ite · ika+---\n\n--- · s---\n\n------
210     aŕe · take \n\natinbelauŕ · antalskar\n\nFVLVIA · LINTEARIA                                                  
211     ------+\n\nHEIC · EST · SITVS, -A ---\n\naŕe · +ki · ar++---\n\nsakaŕil+---                                  
1526    t̲o̲uko · ++                                                                                                 
1538    śunuke · +                                                                                                   
1739    okale · baḿiban · nelai                                                                                      
2027    eluŕaite · bas · uḿmiser                                                                                     
Name: TEXTO, dtype: object

In [89]:
# They are likely segmenters.
tmp_text = tmp_text.str.replace('·', ':')

In [90]:
# We ignore all the other diacritics. Acute accents are meaningful in Iberian, therefore not removed.
mapping = {
    'ḅ': 'b',
    'ḇ': 'b',
    'ḍ': 'd',
    'ḳ': 'k',
    'ḵ': 'k',
    'ḷ': 'l',
    'ḻ': 'l',
    'ṁ': 'm',
    'ṃ': 'm',
    'ṇ': 'n',
    'ṉ': 'n',
    'ṛ': 'r',
    'ṟ': 'r',
    'ṣ': 's',
    'ṭ': 't',
    'ṯ': 't',
    'ạ': 'a',
    'ẹ': 'e',
    'ị': 'i',
    'ọ': 'o',
    'ụ': 'u'
}

In [91]:
for old, new in mapping.items():
    tmp_text = tmp_text.str.replace(old, new)

In [92]:
cs = count(tmp_text, as_list=True)

['\n', ' ', '*', '+', ',', '-', '0', '1', '2', '3', '4', '5', ':', '<', '>', '?', 'A', 'B', 'C', 'D', 'E', 'F', 'H', 'I', 'J', 'L', 'M', 'N', 'O', 'Q', 'R', 'S', 'T', 'V', 'W', 'X', 'Y', '\\', 'a', 'b', 'c', 'd', 'e', 'f', 'g', 'i', 'k', 'l', 'm', 'n', 'o', 'r', 's', 't', 'u', 'v', '{', '}', '©', 'á', 'é', 'í', 'ó', 'ú', 'ń', 'ō', 'ŕ', 'ś', '́', '̂', '̠', '̣', '̱', '̲', '̵', 'Α', 'Δ', 'Ε', 'Ι', 'Κ', 'Λ', 'Ν', 'Π', 'Ω', 'ḿ', '‐', '’', '‡', '∙', '│', '𐘃']


In [93]:
# Some remaining isolated diacritics.
for c in ['̠', '̣', '̱', '̲']:
    tmp_text = tmp_text.str.replace(c, '')

In [94]:
# Some acute accents are not combining with characters.
tmp_text = tmp_text.str.replace('s' + '́', 'ś').str.replace('r' + '́', 'ŕ').str.replace('m' + '́', 'ḿ')

In [95]:
cs = count(tmp_text, as_list=True)

['\n', ' ', '*', '+', ',', '-', '0', '1', '2', '3', '4', '5', ':', '<', '>', '?', 'A', 'B', 'C', 'D', 'E', 'F', 'H', 'I', 'J', 'L', 'M', 'N', 'O', 'Q', 'R', 'S', 'T', 'V', 'W', 'X', 'Y', '\\', 'a', 'b', 'c', 'd', 'e', 'f', 'g', 'i', 'k', 'l', 'm', 'n', 'o', 'r', 's', 't', 'u', 'v', '{', '}', '©', 'á', 'é', 'í', 'ó', 'ú', 'ń', 'ō', 'ŕ', 'ś', '̂', '̵', 'Α', 'Δ', 'Ε', 'Ι', 'Κ', 'Λ', 'Ν', 'Π', 'Ω', 'ḿ', '‐', '’', '‡', '∙', '│', '𐘃']


In [96]:
iberico['tmp_text'] = tmp_text

### Continue dealing with some weird tokens.

In [97]:
get_matches(tmp_text, '│')

2050      s---ŕ │ baiteski │ bilosebam │ boioi: balesaika │\n\n  bebatiŕ │ itiŕokanker---++++++ \n m │ baika:bilosebam │ boioi │baite+           
2051      ---aiuki │ setibios │ baiteski │ sal---tiŕe │ te \n --- il---e+\n\n│baŕkabiosbaite---ilorse\n\n\n  ---teibalesaika │ uŕtieiuŕ \n uŕ +i+
Name: TEXTO, dtype: object

In [98]:
tmp_text = tmp_text.str.replace('│', ':')

In [99]:
get_matches(tmp_text, '̵')

310      \n\nśebiŕin:taubaśtetaś\n\ntintileis:bantubailkunḿi\n\narbais:neban:kuniŕ                                  \n\nbantaśkalir:belśtaukui:aurelen\n\n I̵̵I̵I̵ I kekeerirtiban:kuteŕnaniśil                        5\n\nbeŕenultiteś:kitei:autiriśa                           \n\nnbali:toroś:balaukiakiŕe\n\nabarbaśtanie:sonai\n\niumabeleś:talkuneu                                    \n\n\n  \n\nbaskiteierinuie
Name: TEXTO, dtype: object

In [100]:
# Remove the crossing-out symbol, since it's for numerical signs. 
tmp_text = tmp_text.str.replace('̵', '')

In [101]:
get_matches(tmp_text, '̂')

2119      kutukiŕbitatikoukebosekom̂ḿ\n\n\n  kutakituŕsborbiokou      
2178    kúkutútutítibabitátatétekókotóto+leś-skáka++a+mnirŕbekíkium̂ḿ+
2232    kutukiŕbitatikokabastokeaubooelm̂nḿite\nśrbe                  
Name: TEXTO, dtype: object

In [102]:
# This is a special sign used only three times.
tmp_text = tmp_text.str.replace('m' + '̂', '?')

In [103]:
cs = count(tmp_text, as_list=True)

['\n', ' ', '*', '+', ',', '-', '0', '1', '2', '3', '4', '5', ':', '<', '>', '?', 'A', 'B', 'C', 'D', 'E', 'F', 'H', 'I', 'J', 'L', 'M', 'N', 'O', 'Q', 'R', 'S', 'T', 'V', 'W', 'X', 'Y', '\\', 'a', 'b', 'c', 'd', 'e', 'f', 'g', 'i', 'k', 'l', 'm', 'n', 'o', 'r', 's', 't', 'u', 'v', '{', '}', '©', 'á', 'é', 'í', 'ó', 'ú', 'ń', 'ō', 'ŕ', 'ś', 'Α', 'Δ', 'Ε', 'Ι', 'Κ', 'Λ', 'Ν', 'Π', 'Ω', 'ḿ', '‐', '’', '‡', '∙', '𐘃']


In [104]:
get_matches(tmp_text, '’')

779    +o’os’sto’toaall’+
Name: TEXTO, dtype: object

In [105]:
# Note sure what it means. Discard this text.
tmp_text.loc[779] = ''

In [106]:
iberico['tmp_text'] = tmp_text

### There are still some numbers around. We treat them as starting new sections.

In [107]:
# These are line numbers.
tmp_text = tmp_text.str.replace('\d+', '\n')

In [108]:
iberico['tmp_text'] = tmp_text

In [109]:
cs = count(tmp_text)


 *+,-:<>?ABCDEFHIJLMNOQRSTVWXY\abcdefgiklmnorstuv{}©áéíóúńōŕśΑΔΕΙΚΛΝΠΩḿ‐‡∙𐘃


In [110]:
# New lines here.
cs['\\']

['be \\n I I  \\n I I I',
 '\\n leis \\n ś',
 ' \n\n \n\nśeliŕ:ututa:baśiŕ:tarakar\n\nnki\n\n\n  \n\notalauki ? ?f ŕ:siel :?rikan\n\netaDŕeŕ:sosintikeŕka:nanban\n\nbaneśarikan :etaŕ                                                                             urketiikeŕka:e ?f tiDŕ:laki \n\n\n  \n\nsaltulakokia kí \\n\n\nberśiŕka kí \\nartakerka kí \\n? l ? śtautinka kí \\nberśiŕka a o \\n kí \\n\n\nbiurtakerka kí \\n?a Dltirka kí \\nsaltulakokia kí \\nsaltulakokia o \\n                                        \n\n? beronka kí \\nberśiŕka o \\nsakarbaśka kí\\nberśiŕka ki \\naituar kí kia kí \\n\n\nkaniberonka kí \\nbiuriltirka kí \\ns ? kelka kí \\nbiurtakerka kí \\naituar kí ki a kí \\n',
 '\\n ban ka\n\n\\n ta',
 '\n +baserté:bonantíté:nḿbaŕte:bortébara:káŕesirteekiar:banité:kaŕ\n\n\n +irten\n\n\n +tín+  \n\n\n boi:ban\n\n\n   ekiar \\n kaŕestabikíŕ       \n\n\n ebiŕtéekíar  \n\n\n olekáŕkóeki\n\n\n   oŕotis:\\n káŕbi\n\n\n bébér  \n\n\n bélar:ban:iŕ          \n\n\n   ban   iŕ\n\n\n   e

In [111]:
tmp_text = tmp_text.str.replace('\\', '\n', regex=False)

### What are braces?

In [112]:
get_matches(tmp_text, '[{}]')

1729     etakartalbeteśu:bekebere{kíli}kéletaké+i:bekeberekílieśu
Name: TEXTO, dtype: object

In [113]:
# Get rid of braces since I think they were meant to be brackets or parentheses.
tmp_text = tmp_text.str.replace('[{}]', '')

### And another weird symbol.

In [114]:
get_matches(tmp_text, 'ō')

1801      \n\n eŕeti--areritaŕatis---ne- \n\n ŕer:bekinetaneŕs+ebiŕskaś--ar \n\n  bekinetaneŕs:ḿ seikeḿikeriti \n\n kokōḿ iakila:tiŕabakeśtaiŕtiśan \n\n irlaurtisa:eśkubaŕs:eleŕ:kais             \n\n\n  bora:bitekian:keitiŕeie--eŕ-- \n\n kutuŕaŕ--e
Name: TEXTO, dtype: object

In [115]:
# This is yet another special sign.
tmp_text = tmp_text.str.replace('ō', '?')

In [116]:
iberico['tmp_text'] = tmp_text

### Now Clean up the whitespaces.

In [117]:
cs = count(tmp_text)


 *+,-:<>?ABCDEFHIJLMNOQRSTVWXYabcdefgiklmnorstuv©áéíóúńŕśΑΔΕΙΚΛΝΠΩḿ‐‡∙𐘃


In [118]:
tmp_text = tmp_text.str.replace(' +', ' ')

In [119]:
multiline = re.compile('\n[\s]*\n', re.MULTILINE)

In [120]:
iberico['cleaned'] = tmp_text = tmp_text.str.replace(multiline, '\n').apply(lambda s: [ss.strip() for ss in s.split('\n')])

### Finally, get rid of the empty cells.

In [121]:
cleaned_df = iberico[tmp_text.apply(len) > 0][['REF. HESPERIA', 'cleaned']].rename({'cleaned': 'texts'})

In [122]:
cleaned_df

Unnamed: 0,REF. HESPERIA,cleaned
0,GI.11.01,[obi]
1,GI.08.05,[---takokoba---]
2,GI.08.11,[---eś]
3,GI.10.30,[---ban]
4,GI.10.21,[kans+]
...,...,...
2479,GI.00.01,"[, baitolo, baitolo]"
2481,L.25.01FALSA,[nltiŕbo]
2482,L.23.01FALSA,[śaiti]
2483,L.22.01FALSA,"[ikon-, ḿkeiḿi, iltubel-, eśeban]"


In [123]:
cleaned_df.to_csv('iberian.csv', index=None)