# Tsakonian dictionary compiler

# Preparation

## Imports

In [23]:
# Data wrangling
import pandas as pd
import numpy as np
import re

# Data scraping
import requests
from bs4 import BeautifulSoup

# Main

# 1) Scraping Λεξικό τσακώνικης διαλέκτου (Dictionary of the Tsakonian dialect) from Leonidio's Municipal Primary School

As explained [on its webpage](http://apps-dim-leonid.ark.sch.gr/):

> The Tsakonian Electronic Dictionary is an electronic dictionary of the Tsakonian dialect, created by students of the 6th grade of the Leonidio Municipal Primary School under the supervision of their teacher Panagiotis Tsagaroglou, during the school years 2004-2006. The dictionary was digitized, supplemented with audio files, and corrected by the same class during the 2015-2016 school year with the funding of the "Learning Together" Program of the Latsis Foundation. Valuable assistance was provided throughout the program by Ioannis Christodoulos in the development of applications and training of students in technical issues, as well as by Ioannis Pavlopoulos and Georgios Bakagiannis for the development of applications for the final entry and presentation. The project was implemented through Django.
> 
> The Tsakonian Electronic Dictionary is an important project for the preservation and dissemination of the Tsakonian dialect, which is threatened with extinction. The dictionary is available for free on the internet and can be used by everyone.

In [108]:
# Define an auxiliary function to extract the term, definition and notes from a dictionary entry
def extract_information(entry):
    """
    Extract the term and its definition from a dictionary entry
    scraped from http://apps-dim-leonid.ark.sch.gr.

    Parameters
    ----------
    entry : BeautifulSoup object
        Element holding a single dictionary entry. It must contain at
        least one 'th' cell (the term) and one 'tr' row whose first
        'td' cell is the definition.

    Returns
    -------
    term : str
        Dictionary term in Tsakonian.
    definition : str
        Word translation in Greek. When the entry has a second row with
        non-empty text, that text is appended after an em dash.

    Raises
    ------
    IndexError
        If the entry has no 'th' cell, no 'tr' row, or no 'td' cell in
        its first row.
    """

    # Extract the term: drop colons first, then trim, so that no
    # whitespace that preceded a colon is left at the end of the term
    term = entry.find_all('th')[0].text.replace(':', '').strip()

    # Look the rows up once and reuse them below
    rows = entry.find_all('tr')

    # From the first row, retain only the first cell
    # (the second cell contains audios)
    definition = rows[0].find_all('td')[0].text.replace('\n', ' ').strip()

    # Remove double spaces
    definition = re.sub(' +', ' ', definition)

    # If there is a second row, extract the notes
    if len(rows) > 1:
        notes = (rows[1]
                 .text
                 .strip()
                 .replace('\xa0', '')
                 .replace('\n', ' '))

        # Remove double spaces
        notes = re.sub(' +', ' ', notes).strip()

        # Add the notes to the definition; skip empty notes so the
        # definition does not end with a dangling separator
        if notes:
            definition = f"{definition} — {notes}"

    return term, definition

In [87]:
# Extract information for all pages
base_url = 'http://apps-dim-leonid.ark.sch.gr/?page='
n_pages = 93

# Seconds to wait for the server before giving up on a page;
# without a timeout a stalled connection would block the loop forever
request_timeout = 30

# Create a list of all pages (one list of 'table' elements per page)
pages = []

# Reuse a single HTTP session for every page request
with requests.Session() as session:
    for i in range(1, n_pages + 1):
        # Report progress every 10 pages
        if i % 10 == 0:
            print(f'Page {i} of {n_pages}')

        url = base_url + str(i)

        # Get the html content; fail loudly on an HTTP error instead of
        # parsing an error page and silently collecting no entries
        response = session.get(url, timeout=request_timeout)
        response.raise_for_status()
        html = response.content

        # Parse the html content
        soup = BeautifulSoup(html, 'html.parser')

        # Find 'table' elements
        table = soup.find_all('table')

        # Append the tables to the list of pages
        pages.append(table)

Page 1 of 93
Page 2 of 93
Page 3 of 93
Page 4 of 93
Page 5 of 93
Page 6 of 93
Page 7 of 93
Page 8 of 93
Page 9 of 93
Page 10 of 93
Page 11 of 93
Page 12 of 93
Page 13 of 93
Page 14 of 93
Page 15 of 93
Page 16 of 93
Page 17 of 93
Page 18 of 93
Page 19 of 93
Page 20 of 93
Page 21 of 93
Page 22 of 93
Page 23 of 93
Page 24 of 93
Page 25 of 93
Page 26 of 93
Page 27 of 93
Page 28 of 93
Page 29 of 93
Page 30 of 93
Page 31 of 93
Page 32 of 93
Page 33 of 93
Page 34 of 93
Page 35 of 93
Page 36 of 93
Page 37 of 93
Page 38 of 93
Page 39 of 93
Page 40 of 93
Page 41 of 93
Page 42 of 93
Page 43 of 93
Page 44 of 93
Page 45 of 93
Page 46 of 93
Page 47 of 93
Page 48 of 93
Page 49 of 93
Page 50 of 93
Page 51 of 93
Page 52 of 93
Page 53 of 93
Page 54 of 93
Page 55 of 93
Page 56 of 93
Page 57 of 93
Page 58 of 93
Page 59 of 93
Page 60 of 93
Page 61 of 93
Page 62 of 93
Page 63 of 93
Page 64 of 93
Page 65 of 93
Page 66 of 93
Page 67 of 93
Page 68 of 93
Page 69 of 93
Page 70 of 93
Page 71 of 93
Page 72 of 93
P

In [92]:
# Flatten the per-page tables into one list of entries
entries = []
for table in pages:
    # Each item in 'pages' is itself an iterable of entries
    entries.extend(table)

print(f'Number of entries: {len(entries)}')

Number of entries: 4635


In [98]:
# First version of the Tsakonian dictionary: one (term, definition)
# pair per scraped entry
dict_pairs = list(map(extract_information, entries))

# NOTE: the dictionary is keyed by term, so when the same term appears
# in several entries only the last definition is kept
tsakonian_dict = dict(dict_pairs)

# Turn the dictionary into a Pandas dataframe, moving the terms
# from the index into a regular column
tsakonian_df = (pd.DataFrame
                .from_dict(tsakonian_dict, orient='index')
                .reset_index())

# Name the two resulting columns
tsakonian_df.columns = ['tsakonian', 'greek']

tsakonian_df

Unnamed: 0,tsakonian,greek
0,α,αν — Σημειώσεις: πριν από σύμφωνο
1,άα,άλλαξα — Σημειώσεις: Αόριστος του ρήματος άσσο...
2,άβα (α),άλλη (η)
3,αβαίνου,λαμβάνω — Συνώνυμα: λαβαίνου
4,αβάκα,κρυφά — Σημειώσεις: Αβάκα επεράτσε α Τσουρακά
...,...,...
4592,ώμορφο (ο),ώμορφος (ο)
4593,ώς,έως μέχρι
4594,ωφέλεια (α),ωφέλεια (η)
4595,ωφέλιμο (ο),"ωφέλιμος,επωφελής (ο)"


In [100]:
# Keep each version of the dictionary dataframe in a dict keyed by
# version name; 'raw' holds an independent copy of the scraped data
lexiko = dict(raw=tsakonian_df.copy())

In [107]:
# Separate the article from the term in the 'tsakonian' column
# and store it in a new 'article' column
temp_df = lexiko['raw'].copy()

# Articles appear between parentheses, e.g. 'άβα (α)'.
# expand=False makes str.extract return a Series for the single group
temp_df['article'] = (temp_df['tsakonian']
                      .str.extract(r'\((.*?)\)', expand=False)
                      .str.strip()
                      .fillna(''))

# Remove the article from the 'tsakonian' column.
# regex=True is required: without it, pandas versions whose default is
# regex=False search for the pattern as literal text and the article
# is left in place
temp_df['tsakonian'] = (temp_df['tsakonian']
                        .str.replace(r'\(.*?\)', '', regex=True)
                        .str.strip())

# Reorder columns
temp_df = temp_df[['tsakonian', 'article', 'greek']]

# Save in the 'lexiko' dictionary of dataframes
version = 'v1'
lexiko[version] = temp_df

  .str.replace(r'\(.*?\)', '')


In [114]:
# Save the dictionary to a Markdown file
import os

dictionary_name = 'Tsakonian - Greek Dictionary.md'
dictionary_path = 'materials/' + dictionary_name

# Create the output folder when it is missing; writing the file
# would otherwise fail because the directory does not exist
os.makedirs(os.path.dirname(dictionary_path), exist_ok=True)

lexiko[version].to_markdown(dictionary_path, index=False)