In [1]:
import pandas as pd
import re
from collections import defaultdict

In [2]:
df = pd.read_csv('../data/hesperia_epigraphy.csv', sep=',', escapechar='\\', encoding='latin1')

In [3]:
df.head()

Unnamed: 0,id,REF MLH,REF. HESPERIA,YACIMIENTO,MUNICIPIO,MATERIAL,OBJETO,TIPO SOPORTE,signario paleohisp√É¬°nico?,TEXTO,APARATO CR√É¬çTICO,LENGUA
0,3,C.51.01.S1,GI.11.01,La Ciutadella de Roses,Girona,CERAMICA,RECIPIENTE,INDET.,LEVANTINO,obi,,IBERICO
1,4,C.03.03.S1,GI.08.05,El Mas Castellar,Girona,CERAMICA,RECIPIENTE,INDET.,LEVANTINO,$[---]takokoba[---]&,"$[---]takokoba[---]&, $t√É¬°kok√É¬≥ki& Moncunill, ...",IBERICO
2,5,C.03.04.S1,GI.08.11,El Mas Castellar,Girona,CERAMICA,RECIPIENTE,INDET.,LEVANTINO,[---]e√Ö¬õ,,IBERICO
3,6,C.01.34.S1,GI.10.30,Emp√É¬∫ries,Girona,CERAMICA,RECIPIENTE,INDET.,LEVANTINO,[---]$ban&,"$ban&, $IN& Panosa $√°¬∏¬øn& ELM",IBERICO
4,7,C.01.25.S1,GI.10.21,Emp√É¬∫ries,Girona,CERAMICA,RECIPIENTE,PROPIEDAD,LEVANTINO,kans+,,IBERICO


In [4]:
df.columns

Index(['id', 'REF MLH', 'REF. HESPERIA', 'YACIMIENTO', 'MUNICIPIO', 'MATERIAL',
       'OBJETO', 'TIPO SOPORTE', 'signario paleohisp√É¬°nico?', 'TEXTO',
       'APARATO CR√É¬çTICO', 'LENGUA'],
      dtype='object')

In [5]:
def convert_encoding(series, old_encoding, new_encoding):
    """Convert the encoding of a given column `col_name`."""
    return series.str.encode(old_encoding).str.decode(new_encoding)

In [6]:
# Convert "LENGUA" encoding to utf8.
df['LENGUA'] = convert_encoding(df['LENGUA'], 'latin1', 'utf8')

In [7]:
# Check what languages there are.
df['LENGUA'].value_counts()

IBERICO               2095
INDET.                 160
CELTIBERICO            138
LATIN                   11
GRIEGO                  10
LUSITANO                 6
celtib√©rico?             4
no ha lugar              3
Celtib√©rico ?            3
Celtib√©rico?             2
Etrusco                  2
LATIN/IBERICO            2
celtib√©rico ?            2
P√öNICO                   2
P√∫nico                   2
N                        2
ninguna                  1
¬øPseudoescritura?        1
lat√≠n? o ib√©rico?        1
CELTA                    1
lat√≠n?                   1
IB√âRICO/LATIN?           1
paleovasco?              1
Celtib√©rico ??           1
ib√©rico/lat√≠n            1
Celtib√©rico/latino       1
Name: LENGUA, dtype: int64

In [8]:
# Convert the encoding of column "TEXTO" to utf8.
df['TEXTO'] = convert_encoding(df['TEXTO'], 'latin1', 'utf8')

In [9]:
# Get the subset of epigraphies that are Iberian.
iberico_mask = df['LENGUA'] == 'IBERICO'
non_na_mask = df['TEXTO'].notnull()

iberico = df[iberico_mask & non_na_mask].copy()

In [10]:
# Replace all whitespace tokens with ' ' except for '\n'.
iberico['TEXTO'] = iberico['TEXTO'].str.replace('\n', '<newline>').str.replace(r'\s', ' ').str.replace('<newline>', '\n')

# Clean stuff up. We go through the symbols and check if they should be kept.

In [11]:
def count(df_or_series, col_name=None, as_list=False):
    """Count all characters in the data and track which texts they are from."""
    if isinstance(df_or_series, pd.DataFrame):
        series = df_or_series[col_name]
    else:
        series = df_or_series
    cs = defaultdict(list) # "cs" stands for "Charset with Source".
    for text in series.values.reshape(-1):
        for c in set(text):
            cs[c].append(text)

    lst = sorted(cs.keys())
    if as_list:
        print(lst)
    else:
        print(''.join(lst))
    return cs

In [12]:
cs = count(iberico, 'TEXTO')


 #$%&()*+,-./0123456789:;?ABCDEFGHIJLMNOPQRSTUVWXY[]_abcdefghijklmnopqrstuvxz{}¬©¬∑√ç√°√©√≠√≥√∫≈Ñ≈ç≈ï≈õÃÅÃÇÃ†Ã£Ã±Ã≤ÃµŒëŒîŒïŒôŒöŒõŒùŒ†Œ©œ°·∏Ö·∏á·∏ç·∏≥·∏µ·∏∑·∏ª·∏ø·πÅ·πÉ·πá·πâ·πõ·πü·π£·π≠·πØ·∫°·∫π·ªã·ªç·ª•‚Äê‚Äô‚Ä°‚Ä¶‚àô‚îÇêòÉ


### Get rid of inline comments.

In [13]:
pattern = re.compile('%.*?#', re.DOTALL) # The comment might span multiple lines.

In [14]:
tmp_text = iberico['TEXTO'].str.replace(pattern, ' ', regex=True)
cs = count(tmp_text)
iberico['tmp_text'] = tmp_text


 $&()*+,-./0123456789:;?ABCDEFGHIJLMNOQRSTVWXY[]_abcdefgiklmnoprstuv{}¬©¬∑√°√©√≠√≥√∫≈Ñ≈ç≈ï≈õÃÅÃÇÃ†Ã£Ã±Ã≤ÃµŒëŒîŒïŒôŒöŒõŒùŒ†Œ©·∏Ö·∏á·∏ç·∏≥·∏µ·∏∑·∏ª·∏ø·πÅ·πÉ·πá·πâ·πõ·πü·π£·π≠·πØ·∫°·∫π·ªã·ªç·ª•‚Äê‚Äô‚Ä°‚Ä¶‚àô‚îÇêòÉ


### Replace "?" with "+" everywhere. Both indicates some missing character.

In [15]:
iberico['tmp_text'] = tmp_text = tmp_text.str.replace('?', '+', regex=False)

In [16]:
cs = count(tmp_text)


 $&()*+,-./0123456789:;ABCDEFGHIJLMNOQRSTVWXY[]_abcdefgiklmnoprstuv{}¬©¬∑√°√©√≠√≥√∫≈Ñ≈ç≈ï≈õÃÅÃÇÃ†Ã£Ã±Ã≤ÃµŒëŒîŒïŒôŒöŒõŒùŒ†Œ©·∏Ö·∏á·∏ç·∏≥·∏µ·∏∑·∏ª·∏ø·πÅ·πÉ·πá·πâ·πõ·πü·π£·π≠·πØ·∫°·∫π·ªã·ªç·ª•‚Äê‚Äô‚Ä°‚Ä¶‚àô‚îÇêòÉ


### Replace special symbols with "?". We use "?" to indicate some special graphemes that experts are uncertain about.

In [17]:
# Special graphemes are represented by "S" or "G" followed by some integers. For instance, "S47", "G27".
special_pat = r'(?P<sym>[S|G]\d+)'
print(tmp_text[tmp_text.str.contains(special_pat, regex=True)].values)

['A:\n\n$b√≠≈õk√≠b√≠S47rset√≠&:$S47ia&:$bineia& \n\n$bisb√≠turbiWt√≠n& \n\n\nB:\n\n$≈õntarlab√≠+nkos&ÃÅ:$b√≠S47rtuan&:$koikakisk√≠tur& \n\n$ebiS56koraS56& \n\n'
 '++sinS41le[-]binkor:iu≈õtiS56:tarbinbale≈õ:kooo:a≈ïakota≈ï[-]la≈ï:aianetinS41+talabin:ireba:a≈õS56:sinbitai:aia[ ]eia:boa:b√≠Dk√≠n:selk√≠:[-]e≈õe≈õanbiabiHka≈ïesS56en'
 '  \n\n$+bi≈õbaS56isS47kaS47&\n\n\n'
 '  koka≈ï\n\n  $karekar&:se .$k√≠&. .S47f. ker:luke≈õi≈ï:$akailtir .S47f.& kerai:$ir++riatuia≈õ&\n\n\n\n'
 '$]:S48 v rkibea[&'
 '$ati &$.S47f&. $uia+iskeunir&:$tinkan&:$ber≈õtano&+ .S47f. a\n\n$≈õ+sebatita≈ï&:.S58. $itan+a≈ï&'
 ' \n\n \n\n]≈õeli≈ï:ututa:ba≈õi≈ï:tarakar\n\n]nki\n\n\n  \n\n$otalauki S48& S47f ≈ï:$siel &:$S48rikan&\n\netaD≈ïe≈ï:sosintike≈ïka:nanban\n\n$bane≈õarikan &:eta≈ï                                                                             urketiike≈ïka:e S47f tiD≈ï:laki \n\n\n  \n\nsaltulakokia k√≠ (6)\n\n$ber≈õi≈ïka k√≠& (2)$artakerka k√≠& (6)$G27 l S48 ≈õtautinka k√≠& (7)ber≈õi≈ïka a o (3) k√≠ (1)\

  return func(self, *args, **kwargs)


In [18]:
# Get all special graphems.
special_signs = set(tmp_text.str.extractall(special_pat).reset_index()['sym'])

In [19]:
# Replace them with "?".
for sign in special_signs:
    tmp_text = tmp_text.str.replace(sign, '?')

In [20]:
iberico['TEXTO'] = tmp_text

In [21]:
cs = count(tmp_text)


 $&()*+,-./0123456789:;?ABCDEFHIJLMNOQRSTVWXY[]_abcdefgiklmnoprstuv{}¬©¬∑√°√©√≠√≥√∫≈Ñ≈ç≈ï≈õÃÅÃÇÃ†Ã£Ã±Ã≤ÃµŒëŒîŒïŒôŒöŒõŒùŒ†Œ©·∏Ö·∏á·∏ç·∏≥·∏µ·∏∑·∏ª·∏ø·πÅ·πÉ·πá·πâ·πõ·πü·π£·π≠·πØ·∫°·∫π·ªã·ªç·ª•‚Äê‚Äô‚Ä°‚Ä¶‚àô‚îÇêòÉ


### Unescape html stuff.

In [22]:
import html

In [23]:
iberico['TEXTO'] = tmp_text = tmp_text.apply(html.unescape)
cs = count(tmp_text)


 $&()*+,-./0123456789:<>?ABCDEFHIJLMNOQRSTVWXY[]_abcdefgiklmnoprstuv{}¬©¬∑√°√©√≠√≥√∫≈Ñ≈ç≈ï≈õÃÅÃÇÃ†Ã£Ã±Ã≤ÃµŒëŒîŒïŒôŒöŒõŒùŒ†Œ©·∏Ö·∏á·∏ç·∏≥·∏µ·∏∑·∏ª·∏ø·πÅ·πÉ·πá·πâ·πõ·πü·π£·π≠·πØ·∫°·∫π·ªã·ªç·ª•‚Äê‚Äô‚Ä°‚Ä¶‚àô‚îÇêòÉ


### Replace alternative reading indicators that use "$...&" patterns.

In [24]:
pattern = re.compile(r'\$(.+?)&', re.DOTALL)

In [25]:
# Some alternative readings seem to be nested. Therefore we run it twice.
tmp_text = tmp_text.str.replace(pattern, r'\1', regex=True)
tmp_text = tmp_text.str.replace(pattern, r'\1', regex=True)

In [26]:
cs = count(tmp_text)


 $&()*+,-./0123456789:<>?ABCDEFHIJLMNOQRSTVWXY[]_abcdefgiklmnoprstuv{}¬©¬∑√°√©√≠√≥√∫≈Ñ≈ç≈ï≈õÃÅÃÇÃ†Ã£Ã±Ã≤ÃµŒëŒîŒïŒôŒöŒõŒùŒ†Œ©·∏Ö·∏á·∏ç·∏≥·∏µ·∏∑·∏ª·∏ø·πÅ·πÉ·πá·πâ·πõ·πü·π£·π≠·πØ·∫°·∫π·ªã·ªç·ª•‚Äê‚Äô‚Ä°‚Ä¶‚àô‚îÇêòÉ


In [27]:
cs['$']

['    \n\n     kau(≈ï)ubast√≠ki\n\n     kaik√©(irkaiska)rta(u)t√≠nen(obatil)\n\n       otanar :e≈ïebau≈õitirt√©≈õierban(e)sitar\n\n       $kaisurarbitan:sak√°ri(s)k√©≈ïtaibata≈ïati\n\n       ait√©(rbaebase≈ï+er)ke+te+iba++t√≠l+\n\n\n  \n\n    aba≈ï≈õen:sorse:ertiket√≥r:bitauketit√≥re\n\n    iunst√≠rabat√©kait√≠uk√°it√©(ba)i(s·∏ø)ilt√≠≈ïbit√∫k√°≈ïinar\n\n      u≈õtalaibi:etais : ek√°tir\n\n    aba≈ïta≈ïike:iu[---]\n\n    i≈ï(ba)≈ïbekoek√∫atebakis\n\n      aba+(tu)+a+ak√∫+\n\n\n   \n\n     ]r≈ï:IIIIIIIIIIIIII:IIIIIII\n\n     ]++eba\n\n\n  \n\n]+ak√° :a IIIIII',
 '      ]tin:iunst√≠r:tau+kotek√° [\n\n       ]tie≈õnit√≠≈ïatan:biu≈ïtikise:[\n\n       be≈ïiset√≠tiatok√°:nikok√°toar[-]bai: \n\n       t√∫≈ïkosbebon:usk√°≈ïe:tiek√° & ult√≠tik√°n:\n\n 5 e≈ïteba≈õk√°:bintu≈ïkesk√° : abatut√≠kerk√°:uke\n\n  ++bo:t√≠≈ïatisukil:it√≠kotesun  kort√≠nte:\n\n  t√≠ek√°a:sit√≠≈ïak√°≈ïka+:nikokaiatai\n\n  is:be≈ïteike:itu≈ïutan:leb√≥sba[+]ibon[\n\n        bat√≠≈ï[e+]k√°≈ï[-]ite≈ïibon\n\n 10         tu\n\n\

In [28]:
cs['&']

['barta  &',
 '      ]tin:iunst√≠r:tau+kotek√° [\n\n       ]tie≈õnit√≠≈ïatan:biu≈ïtikise:[\n\n       be≈ïiset√≠tiatok√°:nikok√°toar[-]bai: \n\n       t√∫≈ïkosbebon:usk√°≈ïe:tiek√° & ult√≠tik√°n:\n\n 5 e≈ïteba≈õk√°:bintu≈ïkesk√° : abatut√≠kerk√°:uke\n\n  ++bo:t√≠≈ïatisukil:it√≠kotesun  kort√≠nte:\n\n  t√≠ek√°a:sit√≠≈ïak√°≈ïka+:nikokaiatai\n\n  is:be≈ïteike:itu≈ïutan:leb√≥sba[+]ibon[\n\n        bat√≠≈ï[e+]k√°≈ï[-]ite≈ïibon\n\n 10         tu\n\n\n      ≈õalir:i[-]ba[---]:bante≈ïba\n\n  n:t√≠nebeta[3\n\n4]n:it√≠te++ta[\n\n  salaker:iti≈ïoketebon : i≈ï$+[\n\n  k√°:iunst√≠rik√°:sikite:basir[\n\n\n  k√°t√∫lat√≠en  \n\n\n']

### There is still a couple errors with alternative readings. Just delete these symbols altogether.

In [29]:
iberico['tmp_text'] = tmp_text = tmp_text.str.replace(r'[\$&]', '', regex=True)

In [30]:
cs = count(tmp_text)


 ()*+,-./0123456789:<>?ABCDEFHIJLMNOQRSTVWXY[]_abcdefgiklmnoprstuv{}¬©¬∑√°√©√≠√≥√∫≈Ñ≈ç≈ï≈õÃÅÃÇÃ†Ã£Ã±Ã≤ÃµŒëŒîŒïŒôŒöŒõŒùŒ†Œ©·∏Ö·∏á·∏ç·∏≥·∏µ·∏∑·∏ª·∏ø·πÅ·πÉ·πá·πâ·πõ·πü·π£·π≠·πØ·∫°·∫π·ªã·ªç·ª•‚Äê‚Äô‚Ä°‚Ä¶‚àô‚îÇêòÉ


### Try to see what these brackets, parentheses and braces are enclosing.

In [31]:
def get_matches(series, pattern, regex=True):
    """Get all rows that match the pattern"""
    return series[series.str.contains(pattern, regex=regex)]

def get_matches_as_set(series, pattern):
    """Get all matches as set."""
    matches = series.apply(lambda s: re.findall(pattern, s))
    num_matches = matches.apply(len)
    return set(sum(matches[(num_matches > 0)].values, list()))

In [32]:
pattern_parent = re.compile(r'(\(.*?\))')

In [33]:
get_matches_as_set(tmp_text, pattern_parent)

{'(+)',
 '(-)',
 '(1)',
 '(10)',
 '(2)',
 '(3 bi)',
 '(3)',
 '(5)',
 '(6)',
 '(7)',
 '([)',
 '([--])',
 '([-])',
 '(a)',
 '(aspa)',
 '(b)',
 '(ba)',
 '(e)',
 '(irkaiska)',
 '(ko≈ï)',
 '(marca)',
 '(obatil)',
 '(rbaebase≈ï+er)',
 '(s)',
 '(s·∏ø)',
 '(tu)',
 '(u)',
 '(vac)',
 '(vacat)',
 '(≈ï)',
 '(·∏ø)'}

In [34]:
pd.options.display.max_colwidth = 0 # NOTE Print out every thing in the column.

In [35]:
get_matches(tmp_text, '(+)', regex=False)

108     a≈ïka(+)ibe[\n\nbiu≈ïau[\n\neteso\n\n- - -]+s[ \n\n- - -]tar[  \n\n+biu≈ïso\n\n++  \n\nbiu≈ïtiba≈õ \n\nlau≈ïto \n\nete≈õu≈ï \n\nie I I I I[-]o \n\n -]te+[- -]++                                                                                                                                                                                                                
288      \n\nbalkar \n\n\n \n\nabÃ£iÃ£lÃ£aÃ£kÃ£uÃ£s(+)isti≈ï(+)lakea(+)ban≈õako+oÃ≤lÃ≤aÃ≤\n\naiunilti≈ïte\n\n\n \n\nnÃ£iÃ£≈ï[.]≈ïÃ£ekone≈õan≈õÃ£aÃ£iÃ±tÃ≤eÃ≤aÃ≤≈õÃ≤aÃ≤kÃ≤aÃ≤iÃ±bÃ≤aÃ≤rÃ≤bÃ≤eÃ≤\n\n\n \n\nbÃ£aÃ£lÃ£+kÃ£aÃ£bÃ£aÃ£kÃ£aÃ£                                                                                                                                                                                        
878     si++++(+):++[ - - - ]+[\n\nbaidesir:bilosg+\n\n≈ïe:biosildun                                                                                                                                             

In [36]:
get_matches(tmp_text, '(-)', regex=False)

1129    ]te·∏øÃ≤bÃ≤aÃ≤+[--(-)]bantÃ≤eÃ≤:bantibatÃ≤eÃ±[
Name: TEXTO, dtype: object

### Remove the parentheses that do not enclose some alphanumeric string.

In [37]:
pattern = re.compile(r'\((\W+)\)')
get_matches_as_set(tmp_text, pattern)

{'+', '-', '[', '[--]', '[-]'}

In [38]:
tmp_text = tmp_text.str.replace(pattern, r'\1', regex=True)

In [39]:
get_matches_as_set(tmp_text, pattern_parent)

{'(1)',
 '(10)',
 '(2)',
 '(3 bi)',
 '(3)',
 '(5)',
 '(6)',
 '(7)',
 '(a)',
 '(aspa)',
 '(b)',
 '(ba)',
 '(e)',
 '(irkaiska)',
 '(ko≈ï)',
 '(marca)',
 '(obatil)',
 '(rbaebase≈ï+er)',
 '(s)',
 '(s·∏ø)',
 '(tu)',
 '(u)',
 '(vac)',
 '(vacat)',
 '(≈ï)',
 '(·∏ø)'}

### Parentheses with numbers in them are mostly likely from annotators that signify different places in the epigraphies. So are "(a)" and "(b)". We replace them with newlines instead.

In [40]:
patterns = ['1', '10', '2', '3 bi', '3', '5', '6', '7', "a", "b"]
for pattern in patterns:
    tmp_text = tmp_text.str.replace(f'({pattern})', r'\n', regex=False)

In [41]:
get_matches_as_set(tmp_text, pattern_parent)

{'(aspa)',
 '(ba)',
 '(e)',
 '(irkaiska)',
 '(ko≈ï)',
 '(marca)',
 '(obatil)',
 '(rbaebase≈ï+er)',
 '(s)',
 '(s·∏ø)',
 '(tu)',
 '(u)',
 '(vac)',
 '(vacat)',
 '(≈ï)',
 '(·∏ø)'}

### "(marca)", "(vac)" and "(vacat)" seem to be from the annotators.

In [42]:
# Replace (vac) and (vacate) with whitespace.
tmp_text = tmp_text.str.replace('(marca)', '', regex=False)
tmp_text = tmp_text.str.replace('(vac)', '', regex=False)
tmp_text = tmp_text.str.replace('(vacat)', '', regex=False)

In [43]:
# Replace (a) and (b) with newlines.
tmp_text = tmp_text.str.replace('(a)', '\n', regex=False)
tmp_text = tmp_text.str.replace('(b)', '\n', regex=False)

In [44]:
get_matches(tmp_text, pattern_parent)

  return func(self, *args, **kwargs)


44        lu\n\n\n  (aspa)                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                     
55      :i≈ïeke≈õta(·∏ø)nata≈ï≈õuekiarsinekun\n\nbaibaibar                                                                                                                                                                                                                                                                                                                                                                                                                              

In [45]:
# I'm not sure about the last text as it contains many characters in parentheses. I remove this text from the dataset.
tmp_text.loc[1021] = ''

In [46]:
# Remove (aspa). It's probably a weird sign.
tmp_text = tmp_text.str.replace('(aspa)', '', regex=False)

In [47]:
get_matches(tmp_text, pattern_parent)

55     :i≈ïeke≈õta(·∏ø)nata≈ï≈õuekiarsinekun\n\nbaibaibar                                         
240    tikirsiko≈ï\n\nsaka≈ïi(ko≈ï) e-\n\nban:erirtan\n\na≈ïora:an-\n\nkonaÃ£uÃ£[n]iÃ£nÃ£\n\n------+
Name: TEXTO, dtype: object

In [48]:
# These two texts gets discarded as well.
tmp_text.loc[55] = ''
tmp_text.loc[240] = ''

In [49]:
cs = count(tmp_text)


 )*+,-./0123456789:<>?ABCDEFHIJLMNOQRSTVWXY[\]_abcdefgiklmnorstuv{}¬©¬∑√°√©√≠√≥√∫≈Ñ≈ç≈ï≈õÃÅÃÇÃ†Ã£Ã±Ã≤ÃµŒëŒîŒïŒôŒöŒõŒùŒ†Œ©·∏Ö·∏á·∏ç·∏≥·∏µ·∏∑·∏ª·∏ø·πÅ·πÉ·πá·πâ·πõ·πü·π£·π≠·πØ·∫°·∫π·ªã·ªç·ª•‚Äê‚Äô‚Ä°‚Ä¶‚àô‚îÇêòÉ


### Deal with single ')' characters.

In [50]:
cs[')']

['a)\n\nkutui\n\nb)\n\nu',
 'a)\n\n\nui\n\n\nb)\n\nl',
 'a) o\n\nb) ka\n\nc) kika',
 'a) kebelku≈ïe:e≈ïkunbase \n\nb) ≈õa√∫≈õir[\n\nc) ]nÃ±eotinÃ±+\n\nd) kebe',
 'a) e\n\nb) ere',
 'a) ·∏ø++\n\nb) ·∏ø++t√°+',
 '1) onoisakebatiubiku\n\n\n2) ++++kutun',
 'a) baitolo\n\n\nb) baitolo']

In [51]:
# They seem to indicate different sections of texts. Replace them with newlines.
tmp_text = tmp_text.str.replace(r'[abcd12]\)', r'\n', regex=True)

In [52]:
cs = count(tmp_text)


 *+,-./0123456789:<>?ABCDEFHIJLMNOQRSTVWXY[\]_abcdefgiklmnorstuv{}¬©¬∑√°√©√≠√≥√∫≈Ñ≈ç≈ï≈õÃÅÃÇÃ†Ã£Ã±Ã≤ÃµŒëŒîŒïŒôŒöŒõŒùŒ†Œ©·∏Ö·∏á·∏ç·∏≥·∏µ·∏∑·∏ª·∏ø·πÅ·πÉ·πá·πâ·πõ·πü·π£·π≠·πØ·∫°·∫π·ªã·ªç·ª•‚Äê‚Äô‚Ä°‚Ä¶‚àô‚îÇêòÉ


### What are these angle brackets.

In [53]:
get_matches(tmp_text, r'[<>]', regex=True)

302    ]bÃ≤[   \n\n]o iÃ≤b   \n\n]X X<: ibe  [-]+oÃ≤+eÃ≤[                                                 \n\n]oÃ≤≈ïbiloske:[-]SSÃ≤XÃ≤X<Ã≤:[-][   \n\n]≈ïike:X<X<:kÃ≤ukebu≈ïke :<[                         5\n\n]XÃ≤XÃ≤ [:+] ≈õanekeÃ≤:X<X<Ã≤:le≈ïsge [                           \n\n[-]eosda≈ïke:><X<Ã≤+[   \n\n]+++tigi≈õÃ≤++[   \n\n[-]oÃ≤[                                                                           
877      \n\ni≈ïike:orti:ga≈ïokan:dadula:ba≈õk:\n\nbui≈õtiner:baga≈ïok:sssX<:tu≈ïlbai\n\nlu≈ïa:legu≈õegik:base≈ïokeiunbaida:\n\nu≈ïke:basbidi≈ïbartin:i≈ïike:base≈ï\n\nokar:tebind:belaga≈õikau≈ï:isbin                  5 \n\nai:asgandis:tagisga≈ïok:binike                         \n\nbin:≈õalir:kidei:gaibigait:\n\n \n\nsaka≈ïiske≈ï\n\narnai:                                                                  
998    isbata≈ïis\n\nsere+e+\n\n>IIIIeriri+                                                                                                                                           

In [54]:
# They are probably special signs. 

### Deal with dots.

In [55]:
get_matches(tmp_text, '.', regex=False)

124     selkit√≠[.]+                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                
132       tales\n\n\n  a[.]t√≥\n\n\n  ta\n\n                                                                                                                                                                                                                       

In [56]:
# Dots in brackets seem to suggest missing characters here.
dots_in_brackets = re.compile(r'(?<=\[)\.+(?=\])')
tmp_text = tmp_text.str.replace(dots_in_brackets, lambda m: '-' * len(m.group()))

In [57]:
get_matches(tmp_text, '.', regex=False)

264        [---]n·∏ø[---]\n\n\n  bele+[-c.3-]nar \n\n\n   inti                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                       
355     a.ko                                                                                                                                                                                                                                                      

In [58]:
# Some patterns are used for an unspecified number of missing characters.
c_pattern = re.compile(r'\[-*c[\. ][\d/\w\s]+\-*]')

In [59]:
tmp_text = tmp_text.str.replace(c_pattern, r'[---]', regex=True)

In [60]:
get_matches(tmp_text, '.', regex=False)

355     a.ko                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                       
431     ete≈õike.·∏øi                                                                                                                                                                                                                                               

In [61]:
# Replace 1. and etc. with newlines -- they indicate different sections.
tmp_text = tmp_text.str.replace(r'\d+[\. ]', r'\n')

In [62]:
get_matches(tmp_text, '.', regex=False)

355     a.ko                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                       
431     ete≈õike.·∏øi                                                                                                                                                                                                                                               

In [63]:
# Not sure what the remaining dots are for, but they are mostly isolated. I'm removing them.
tmp_text = tmp_text.str.replace(".", '', regex=False)

In [64]:
iberico['tmp_text'] = tmp_text
cs = count(tmp_text)


 *+,-/0123456789:<>?ABCDEFHIJLMNOQRSTVWXY[\]_abcdefgiklmnorstuv{}¬©¬∑√°√©√≠√≥√∫≈Ñ≈ç≈ï≈õÃÅÃÇÃ†Ã£Ã±Ã≤ÃµŒëŒîŒïŒôŒöŒõŒùŒ†Œ©·∏Ö·∏á·∏ç·∏≥·∏µ·∏∑·∏ª·∏ø·πÅ·πÉ·πá·πâ·πõ·πü·π£·π≠·πØ·∫°·∫π·ªã·ªç·ª•‚Äê‚Äô‚Ä°‚Ä¶‚àô‚îÇêòÉ


### Deal with brackets.

In [65]:
pattern_bracket = re.compile(r'\[.*?\]')

In [66]:
# If brackets contain only whitespaces and hyphens, convert all to hyphens.
tmp_text = tmp_text.str.replace(r'(?<=\[)[ \-+]+?(?=\])', lambda m: '-' * len(m.group()))
get_matches_as_set(tmp_text, pattern_bracket)

{'[  //  ]',
 '[  o  ]',
 '[ 5]',
 '[--- i]',
 '[--- s]',
 '[--- ≈õal]',
 '[--------------------------------------------------]',
 '[----------------------]',
 '[------------------]',
 '[---------------]',
 '[-----------]',
 '[--------]',
 '[-------]',
 '[------]',
 '[-----]',
 '[----]',
 '[---]',
 '[--/]',
 '[--]',
 '[-]',
 '[1-2]',
 '[1]',
 '[2-3]',
 '[2]',
 '[3+]',
 '[3/4]',
 '[3]',
 '[4]',
 '[5]',
 '[7]',
 '[:+]',
 '[IIII]',
 '[I]',
 '[VS, -A ---]',
 '[a]',
 '[ar]',
 '[ba]',
 '[baka≈õ]',
 '[be]',
 '[bo]',
 '[c0-2]',
 '[c0-4]',
 '[c2-3]',
 '[c2-4]',
 '[c3-4]',
 '[c3-5]',
 '[c6-8]',
 '[c7-9]',
 '[c7]',
 '[e+]',
 '[e]',
 '[i]',
 '[leita≈ïtin]',
 '[ltun]',
 '[n]',
 '[ne:]',
 '[ne]',
 '[r-]',
 '[s]',
 '[sa]',
 '[te]',
 '[ti]',
 '[tiri]',
 '[u]',
 '[≈ïte]',
 '[≈ïtin]',
 '[≈õ ---]',
 '[≈õ]',
 '[≈õale]',
 '[Ã£--]',
 '[Ã£Ã£2]',
 '[‚Äê‚Äê‚Äê]',
 '[‚Ä¶]',
 '[‚îÇ+]'}

In [67]:
get_matches_as_set(tmp_text, pattern_bracket)

{'[  //  ]',
 '[  o  ]',
 '[ 5]',
 '[--- i]',
 '[--- s]',
 '[--- ≈õal]',
 '[--------------------------------------------------]',
 '[----------------------]',
 '[------------------]',
 '[---------------]',
 '[-----------]',
 '[--------]',
 '[-------]',
 '[------]',
 '[-----]',
 '[----]',
 '[---]',
 '[--/]',
 '[--]',
 '[-]',
 '[1-2]',
 '[1]',
 '[2-3]',
 '[2]',
 '[3+]',
 '[3/4]',
 '[3]',
 '[4]',
 '[5]',
 '[7]',
 '[:+]',
 '[IIII]',
 '[I]',
 '[VS, -A ---]',
 '[a]',
 '[ar]',
 '[ba]',
 '[baka≈õ]',
 '[be]',
 '[bo]',
 '[c0-2]',
 '[c0-4]',
 '[c2-3]',
 '[c2-4]',
 '[c3-4]',
 '[c3-5]',
 '[c6-8]',
 '[c7-9]',
 '[c7]',
 '[e+]',
 '[e]',
 '[i]',
 '[leita≈ïtin]',
 '[ltun]',
 '[n]',
 '[ne:]',
 '[ne]',
 '[r-]',
 '[s]',
 '[sa]',
 '[te]',
 '[ti]',
 '[tiri]',
 '[u]',
 '[≈ïte]',
 '[≈ïtin]',
 '[≈õ ---]',
 '[≈õ]',
 '[≈õale]',
 '[Ã£--]',
 '[Ã£Ã£2]',
 '[‚Äê‚Äê‚Äê]',
 '[‚Ä¶]',
 '[‚îÇ+]'}

In [68]:
# If the brackets only contain normal characters, it means they are reconstructed. Restore reconstructed letters.
tmp_text = tmp_text.str.replace(r'\[([a-z≈ï≈õ·∏∑]+)\]', r'\1')

In [69]:
get_matches_as_set(tmp_text, pattern_bracket)

{'[  //  ]',
 '[  o  ]',
 '[ 5]',
 '[--- i]',
 '[--- s]',
 '[--- ≈õal]',
 '[--------------------------------------------------]',
 '[----------------------]',
 '[------------------]',
 '[---------------]',
 '[-----------]',
 '[--------]',
 '[-------]',
 '[------]',
 '[-----]',
 '[----]',
 '[---]',
 '[--/]',
 '[--]',
 '[-]',
 '[1-2]',
 '[1]',
 '[2-3]',
 '[2]',
 '[3+]',
 '[3/4]',
 '[3]',
 '[4]',
 '[5]',
 '[7]',
 '[:+]',
 '[IIII]',
 '[I]',
 '[VS, -A ---]',
 '[c0-2]',
 '[c0-4]',
 '[c2-3]',
 '[c2-4]',
 '[c3-4]',
 '[c3-5]',
 '[c6-8]',
 '[c7-9]',
 '[c7]',
 '[e+]',
 '[ne:]',
 '[r-]',
 '[≈õ ---]',
 '[Ã£--]',
 '[Ã£Ã£2]',
 '[‚Äê‚Äê‚Äê]',
 '[‚Ä¶]',
 '[‚îÇ+]'}

In [70]:
# This is one case of weird digit.
tmp_text = tmp_text.str.replace('[Ã£Ã£2]', '[2]', regex=False)

In [71]:
# Brackets with digits in them mean different number of missing characters.
pattern_digit = r'\[[\w\d\-\/\+ ]*\d+[\w\d\-\/\+ ]*\]'

In [72]:
def replace_as_hyphens(m):
    g = m.group()
    max_num = 0
    for c in g:
        try:
            c = int(c)
            max_num = max(max_num, c)
        except ValueError:
            pass
    return '-' * max_num

In [73]:
tmp_text = tmp_text.str.replace(pattern_digit, replace_as_hyphens)
get_matches_as_set(tmp_text, pattern_bracket)

{'[  //  ]',
 '[  o  ]',
 '[--- i]',
 '[--- s]',
 '[--- ≈õal]',
 '[--------------------------------------------------]',
 '[----------------------]',
 '[------------------]',
 '[---------------]',
 '[-----------]',
 '[--------]',
 '[-------]',
 '[------]',
 '[-----]',
 '[----]',
 '[---]',
 '[--/]',
 '[--]',
 '[-]',
 '[:+]',
 '[IIII]',
 '[I]',
 '[VS, -A ---]',
 '[e+]',
 '[ne:]',
 '[r-]',
 '[≈õ ---]',
 '[Ã£--]',
 '[‚Äê‚Äê‚Äê]',
 '[‚Ä¶]',
 '[‚îÇ+]'}

In [74]:
# If there are more than three hyphens, reduce them to three instead -- they significant an unspecified number of missing characters.
tmp_text = tmp_text.str.replace(r'\[----+\]', r'[---]')

In [75]:
# If the brackets contain some normal characters, it means they are probably reconstructed. Restore reconstructed letters.
tmp_text = tmp_text.str.replace(r'\[([:\+,\-a-zA-Z≈ï≈õ·∏∑Ã£ ]+)\]', r'\1')

In [76]:
get_matches_as_set(tmp_text, pattern_bracket)

{'[  //  ]', '[--/]', '[‚Äê‚Äê‚Äê]', '[‚Ä¶]', '[‚îÇ+]'}

In [77]:
# This looks like segmenters.
get_matches(tmp_text, '[‚îÇ+]', regex=False)

2050      s---≈ï ‚îÇ baiteski ‚îÇ bilosebam ‚îÇ boioi[‚îÇ+] balesaika ‚îÇ\n\n  bebati≈ï ‚îÇ iti≈ïokanker---++++++ / m ‚îÇ baika[‚îÇ+]bilosebam ‚îÇ boioi ‚îÇbaite+
Name: TEXTO, dtype: object

In [78]:
tmp_text = tmp_text.str.replace('[‚îÇ+]', ':', regex=False)

In [79]:
# Probably just hyphens.
tmp_text = tmp_text.str.replace('[‚Ä¶]', '[---]', regex=False)

In [80]:
get_matches_as_set(tmp_text, pattern_bracket)

{'[  //  ]', '[---]', '[--/]', '[‚Äê‚Äê‚Äê]'}

In [81]:
# Remove the remaining brackets
tmp_text = tmp_text.str.replace(r'[\[\]]', '')

### Replace / with newlines.

In [82]:
tmp_text = tmp_text.str.replace("/", '\n')

In [83]:
get_matches_as_set(tmp_text, pattern_bracket)

set()

In [84]:
iberico['tmp_text'] = tmp_text

In [85]:
cs = count(tmp_text, as_list=True)

['\n', ' ', '*', '+', ',', '-', '0', '1', '2', '3', '4', '5', ':', '<', '>', '?', 'A', 'B', 'C', 'D', 'E', 'F', 'H', 'I', 'J', 'L', 'M', 'N', 'O', 'Q', 'R', 'S', 'T', 'V', 'W', 'X', 'Y', '\\', '_', 'a', 'b', 'c', 'd', 'e', 'f', 'g', 'i', 'k', 'l', 'm', 'n', 'o', 'r', 's', 't', 'u', 'v', '{', '}', '¬©', '¬∑', '√°', '√©', '√≠', '√≥', '√∫', '≈Ñ', '≈ç', '≈ï', '≈õ', 'ÃÅ', 'ÃÇ', 'Ã†', 'Ã£', 'Ã±', 'Ã≤', 'Ãµ', 'Œë', 'Œî', 'Œï', 'Œô', 'Œö', 'Œõ', 'Œù', 'Œ†', 'Œ©', '·∏Ö', '·∏á', '·∏ç', '·∏≥', '·∏µ', '·∏∑', '·∏ª', '·∏ø', '·πÅ', '·πÉ', '·πá', '·πâ', '·πõ', '·πü', '·π£', '·π≠', '·πØ', '·∫°', '·∫π', '·ªã', '·ªç', '·ª•', '‚Äê', '‚Äô', '‚Ä°', '‚àô', '‚îÇ', 'êòÉ']


### Get rid of weird and annoying takens.

In [86]:
get_matches(tmp_text, '_')

1673    k√≠t√°+\n\n_______\n\n\nba≈õons\n\nikasors\n\n_______\n\n       ++
Name: TEXTO, dtype: object

In [87]:
# They are probably just whitespaces.
tmp_text = tmp_text.str.replace('_', ' ', regex=False)

In [88]:
get_matches(tmp_text, '¬∑')

72      ------\n\n---+++ke---\n\n---inti ¬∑ tan+---\n\n---+ke ¬∑ ko≈ïnel+---\n\n---ite ¬∑ ika+---\n\n--- ¬∑ s---\n\n------
210     a≈ïe ¬∑ take \n\natinbelau≈ï ¬∑ antalskar\n\nFVLVIA ¬∑ LINTEARIA                                                  
211     ------+\n\nHEIC ¬∑ EST ¬∑ SITVS, -A ---\n\na≈ïe ¬∑ +ki ¬∑ ar++---\n\nsaka≈ïil+---                                  
1526    tÃ≤oÃ≤uko ¬∑ ++                                                                                                 
1538    ≈õunuke ¬∑ +                                                                                                   
1739    okale ¬∑ ba·∏øiban ¬∑ nelai                                                                                      
2027    elu≈ïaite ¬∑ bas ¬∑ u·∏ømiser                                                                                     
Name: TEXTO, dtype: object

In [89]:
# They are likely segmenters.
tmp_text = tmp_text.str.replace('¬∑', ':')

In [90]:
# We ignore all the other diacritics. Acute accents are meaningful in Iberian, therefore not removed.
mapping = {
    '·∏Ö': 'b',
    '·∏á': 'b',
    '·∏ç': 'd',
    '·∏≥': 'k',
    '·∏µ': 'k',
    '·∏∑': 'l',
    '·∏ª': 'l',
    '·πÅ': 'm',
    '·πÉ': 'm',
    '·πá': 'n',
    '·πâ': 'n',
    '·πõ': 'r',
    '·πü': 'r',
    '·π£': 's',
    '·π≠': 't',
    '·πØ': 't',
    '·∫°': 'a',
    '·∫π': 'e',
    '·ªã': 'i',
    '·ªç': 'o',
    '·ª•': 'u'
}

In [91]:
for old, new in mapping.items():
    tmp_text = tmp_text.str.replace(old, new)

In [92]:
cs = count(tmp_text, as_list=True)

['\n', ' ', '*', '+', ',', '-', '0', '1', '2', '3', '4', '5', ':', '<', '>', '?', 'A', 'B', 'C', 'D', 'E', 'F', 'H', 'I', 'J', 'L', 'M', 'N', 'O', 'Q', 'R', 'S', 'T', 'V', 'W', 'X', 'Y', '\\', 'a', 'b', 'c', 'd', 'e', 'f', 'g', 'i', 'k', 'l', 'm', 'n', 'o', 'r', 's', 't', 'u', 'v', '{', '}', '¬©', '√°', '√©', '√≠', '√≥', '√∫', '≈Ñ', '≈ç', '≈ï', '≈õ', 'ÃÅ', 'ÃÇ', 'Ã†', 'Ã£', 'Ã±', 'Ã≤', 'Ãµ', 'Œë', 'Œî', 'Œï', 'Œô', 'Œö', 'Œõ', 'Œù', 'Œ†', 'Œ©', '·∏ø', '‚Äê', '‚Äô', '‚Ä°', '‚àô', '‚îÇ', 'êòÉ']


In [93]:
# Some remaining isolated diacritics.
for c in ['Ã†', 'Ã£', 'Ã±', 'Ã≤']:
    tmp_text = tmp_text.str.replace(c, '')

In [94]:
# Some acute accents are not combining with characters.
tmp_text = tmp_text.str.replace('s' + 'ÃÅ', '≈õ').str.replace('r' + 'ÃÅ', '≈ï').str.replace('m' + 'ÃÅ', '·∏ø')

In [95]:
cs = count(tmp_text, as_list=True)

['\n', ' ', '*', '+', ',', '-', '0', '1', '2', '3', '4', '5', ':', '<', '>', '?', 'A', 'B', 'C', 'D', 'E', 'F', 'H', 'I', 'J', 'L', 'M', 'N', 'O', 'Q', 'R', 'S', 'T', 'V', 'W', 'X', 'Y', '\\', 'a', 'b', 'c', 'd', 'e', 'f', 'g', 'i', 'k', 'l', 'm', 'n', 'o', 'r', 's', 't', 'u', 'v', '{', '}', '¬©', '√°', '√©', '√≠', '√≥', '√∫', '≈Ñ', '≈ç', '≈ï', '≈õ', 'ÃÇ', 'Ãµ', 'Œë', 'Œî', 'Œï', 'Œô', 'Œö', 'Œõ', 'Œù', 'Œ†', 'Œ©', '·∏ø', '‚Äê', '‚Äô', '‚Ä°', '‚àô', '‚îÇ', 'êòÉ']


In [96]:
iberico['tmp_text'] = tmp_text

### Continue dealing with some weird tokens.

In [97]:
get_matches(tmp_text, '‚îÇ')

2050      s---≈ï ‚îÇ baiteski ‚îÇ bilosebam ‚îÇ boioi: balesaika ‚îÇ\n\n  bebati≈ï ‚îÇ iti≈ïokanker---++++++ \n m ‚îÇ baika:bilosebam ‚îÇ boioi ‚îÇbaite+           
2051      ---aiuki ‚îÇ setibios ‚îÇ baiteski ‚îÇ sal---ti≈ïe ‚îÇ te \n --- il---e+\n\n‚îÇba≈ïkabiosbaite---ilorse\n\n\n  ---teibalesaika ‚îÇ u≈ïtieiu≈ï \n u≈ï +i+
Name: TEXTO, dtype: object

In [98]:
tmp_text = tmp_text.str.replace('‚îÇ', ':')

In [99]:
get_matches(tmp_text, 'Ãµ')

310      \n\n≈õebi≈ïin:tauba≈õteta≈õ\n\ntintileis:bantubailkun·∏øi\n\narbais:neban:kuni≈ï                                  \n\nbanta≈õkalir:bel≈õtaukui:aurelen\n\n IÃµÃµIÃµIÃµ I kekeerirtiban:kute≈ïnani≈õil                        5\n\nbe≈ïenultite≈õ:kitei:autiri≈õa                           \n\nnbali:toro≈õ:balaukiaki≈ïe\n\nabarba≈õtanie:sonai\n\niumabele≈õ:talkuneu                                    \n\n\n  \n\nbaskiteierinuie
Name: TEXTO, dtype: object

In [100]:
# Remove the crossing-out symbol, since it's for numerical signs. 
tmp_text = tmp_text.str.replace('Ãµ', '')

In [101]:
get_matches(tmp_text, 'ÃÇ')

2119      kutuki≈ïbitatikoukebosekomÃÇ·∏ø\n\n\n  kutakitu≈ïsborbiokou      
2178    k√∫kut√∫tut√≠tibabit√°tat√©tek√≥kot√≥to+le≈õ-sk√°ka++a+mnir≈ïbek√≠kiumÃÇ·∏ø+
2232    kutuki≈ïbitatikokabastokeaubooelmÃÇn·∏øite\n≈õrbe                  
Name: TEXTO, dtype: object

In [102]:
# This is a special sign used only three times.
tmp_text = tmp_text.str.replace('m' + 'ÃÇ', '?')

In [103]:
cs = count(tmp_text, as_list=True)

['\n', ' ', '*', '+', ',', '-', '0', '1', '2', '3', '4', '5', ':', '<', '>', '?', 'A', 'B', 'C', 'D', 'E', 'F', 'H', 'I', 'J', 'L', 'M', 'N', 'O', 'Q', 'R', 'S', 'T', 'V', 'W', 'X', 'Y', '\\', 'a', 'b', 'c', 'd', 'e', 'f', 'g', 'i', 'k', 'l', 'm', 'n', 'o', 'r', 's', 't', 'u', 'v', '{', '}', '¬©', '√°', '√©', '√≠', '√≥', '√∫', '≈Ñ', '≈ç', '≈ï', '≈õ', 'Œë', 'Œî', 'Œï', 'Œô', 'Œö', 'Œõ', 'Œù', 'Œ†', 'Œ©', '·∏ø', '‚Äê', '‚Äô', '‚Ä°', '‚àô', 'êòÉ']


In [104]:
get_matches(tmp_text, '‚Äô')

779    +o‚Äôos‚Äôsto‚Äôtoaall‚Äô+
Name: TEXTO, dtype: object

In [105]:
# Note sure what it means. Discard this text.
tmp_text.loc[779] = ''

In [106]:
iberico['tmp_text'] = tmp_text

### There are still some numbers around. We treat them as starting new sections.

In [107]:
# These are line numbers.
tmp_text = tmp_text.str.replace('\d+', '\n')

In [108]:
iberico['tmp_text'] = tmp_text

In [109]:
cs = count(tmp_text)


 *+,-:<>?ABCDEFHIJLMNOQRSTVWXY\abcdefgiklmnorstuv{}¬©√°√©√≠√≥√∫≈Ñ≈ç≈ï≈õŒëŒîŒïŒôŒöŒõŒùŒ†Œ©·∏ø‚Äê‚Ä°‚àôêòÉ


In [110]:
# New lines here.
cs['\\']

['be \\n I I  \\n I I I',
 '\\n leis \\n ≈õ',
 ' \n\n \n\n≈õeli≈ï:ututa:ba≈õi≈ï:tarakar\n\nnki\n\n\n  \n\notalauki ? ?f ≈ï:siel :?rikan\n\netaD≈ïe≈ï:sosintike≈ïka:nanban\n\nbane≈õarikan :eta≈ï                                                                             urketiike≈ïka:e ?f tiD≈ï:laki \n\n\n  \n\nsaltulakokia k√≠ \\n\n\nber≈õi≈ïka k√≠ \\nartakerka k√≠ \\n? l ? ≈õtautinka k√≠ \\nber≈õi≈ïka a o \\n k√≠ \\n\n\nbiurtakerka k√≠ \\n?a Dltirka k√≠ \\nsaltulakokia k√≠ \\nsaltulakokia o \\n                                        \n\n? beronka k√≠ \\nber≈õi≈ïka o \\nsakarba≈õka k√≠\\nber≈õi≈ïka ki \\naituar k√≠ kia k√≠ \\n\n\nkaniberonka k√≠ \\nbiuriltirka k√≠ \\ns ? kelka k√≠ \\nbiurtakerka k√≠ \\naituar k√≠ ki a k√≠ \\n',
 '\\n ban ka\n\n\\n ta',
 '\n +basert√©:bonant√≠t√©:n·∏øba≈ïte:bort√©bara:k√°≈ïesirteekiar:banit√©:ka≈ï\n\n\n +irten\n\n\n +t√≠n+  \n\n\n boi:ban\n\n\n   ekiar \\n ka≈ïestabik√≠≈ï       \n\n\n ebi≈ït√©ek√≠ar  \n\n\n olek√°≈ïk√≥eki\n\n\n   o≈ïotis:\\n k√°≈ïbi\n\n\

In [111]:
tmp_text = tmp_text.str.replace('\\', '\n', regex=False)

### What are braces?

In [112]:
get_matches(tmp_text, '[{}]')

1729     etakartalbete≈õu:bekebere{k√≠li}k√©letak√©+i:bekeberek√≠lie≈õu
Name: TEXTO, dtype: object

In [113]:
# Get rid of braces since I think they were meant to be brackets or parentheses.
tmp_text = tmp_text.str.replace('[{}]', '')

### And another weird symbol.

In [114]:
get_matches(tmp_text, '≈ç')

1801      \n\n e≈ïeti--arerita≈ïatis---ne- \n\n ≈ïer:bekinetane≈ïs+ebi≈ïska≈õ--ar \n\n  bekinetane≈ïs:·∏ø seike·∏øikeriti \n\n kok≈ç·∏ø iakila:ti≈ïabake≈õtai≈ïti≈õan \n\n irlaurtisa:e≈õkuba≈ïs:ele≈ï:kais             \n\n\n  bora:bitekian:keiti≈ïeie--e≈ï-- \n\n kutu≈ïa≈ï--e
Name: TEXTO, dtype: object

In [115]:
# This is yet another special sign.
tmp_text = tmp_text.str.replace('≈ç', '?')

In [116]:
iberico['tmp_text'] = tmp_text

### Now Clean up the whitespaces.

In [117]:
cs = count(tmp_text)


 *+,-:<>?ABCDEFHIJLMNOQRSTVWXYabcdefgiklmnorstuv¬©√°√©√≠√≥√∫≈Ñ≈ï≈õŒëŒîŒïŒôŒöŒõŒùŒ†Œ©·∏ø‚Äê‚Ä°‚àôêòÉ


In [118]:
tmp_text = tmp_text.str.replace(' +', ' ')

In [119]:
multiline = re.compile('\n[\s]*\n', re.MULTILINE)

In [120]:
iberico['cleaned'] = tmp_text = tmp_text.str.replace(multiline, '\n').apply(lambda s: [ss.strip() for ss in s.split('\n')])

### Finally, get rid of the empty cells.

In [121]:
cleaned_df = iberico[tmp_text.apply(len) > 0][['REF. HESPERIA', 'cleaned']].rename({'cleaned': 'texts'})

In [122]:
cleaned_df

Unnamed: 0,REF. HESPERIA,cleaned
0,GI.11.01,[obi]
1,GI.08.05,[---takokoba---]
2,GI.08.11,[---e≈õ]
3,GI.10.30,[---ban]
4,GI.10.21,[kans+]
...,...,...
2479,GI.00.01,"[, baitolo, baitolo]"
2481,L.25.01FALSA,[nlti≈ïbo]
2482,L.23.01FALSA,[≈õaiti]
2483,L.22.01FALSA,"[ikon-, ·∏økei·∏øi, iltubel-, e≈õeban]"


In [123]:
cleaned_df.to_csv('iberian.csv', index=None)