# Dictionary compilation from Για να κ̇οντούμε γρούσσα νάμου

This notebook aims to create a dictionary from the words contained in Ioannis Kambysis' book, _Για να κχοντούμε γρούσσα νάμου_ (2020)

# Preparation

In [1]:
# Data wrangling
import pandas as pd
import numpy as np

# Image processing
import pytesseract
from PIL import Image

# Others
import re
import pyperclip as pc
import unicodedata
import copy
import os

# Path to the Tesseract executable (change this to your Tesseract installation path)
pytesseract.pytesseract.tesseract_cmd = r'C:\Program Files\Tesseract-OCR\tesseract.exe'

# Auxiliary functions

In [2]:
def move_raw_files(segment_name: str,
                   downloads_path: str = r'C:\Users\jgcha\Downloads',
                   ocr_images_path: str = r'C:\Users\jgcha\Desktop\Python\Códigos\Tsakonian tools\other_projects\OCR Gia na khontoume groussa namou\imgs'): 
    
    """
    Cuts the last to downloaded files from the Downloads folder and moves them to the raw folder.
    and moves them to the OCR imgs folder. 

    The first file is the one that was downloaded first and contains the Tsakonian text.
    The second file is Greek.

    Parameters
    ----------
    segment_name : str
        Name of the extracted book segment in format [page]-[segment number].

    downloads_path : str
        Absolute path to the Downloads folder.

    ocr_images_path : str
        Absolute path to the OCR images folder.
    """

    files = os.listdir(downloads_path)
    files = [rf'{downloads_path}\{file}' for file in files]
    files.sort(key=os.path.getctime, reverse=True)
    files = files[:2]

    # Assert that the files start with 'WhatsApp'
    for file in files:
        assert os.path.basename(file).startswith('WhatsApp'), 'Files must start with WhatsApp.'

    # Rename the files

    filenames = [f'{downloads_path}\{segment_name}-gr.jpeg',
                 f'{downloads_path}\{segment_name}-ts.jpeg',
                 ]
    for file, filename in zip(files, filenames):
        os.rename(file, filename)

    # Move the files to the OCR images folder
    for filename in filenames:
        os.replace(filename, f'{ocr_images_path}\{os.path.basename(filename)}')

    print('Files moved to the OCR images folder.')

In [3]:
def ocr(img):
    # Path to the image you want to process
    image_path = os.path.join(r'C:\Users\jgcha\Desktop\Python\Códigos\Tsakonian tools\other_projects\OCR Gia na khontoume groussa namou\imgs', img)

    # Load the image using PIL (Python Imaging Library)
    image = Image.open(image_path)

    # Set the languages parameter to a list of language codes
    languages = ['eng', 'ell', 'fra', 'deu']  # Language codes for English and Greek
    languages = ['ell']


    # Perform OCR on the image with the specified languages
    extracted_text = pytesseract.image_to_string(image, lang='+'.join(languages))

    # Copy the extracted text to the clipboard
    pc.copy(extracted_text)

    return extracted_text        

In [378]:
def postprocessing(text):
    """
    Cleans common OCR errors for easier processing
    """

    # Replace 0 with O
    text = re.sub(r'0', 'Ο', text)

    # Add a new line before the first article
    text = "\n\n" + text

    # Separate articles from the rest of the text
    articles = ['Α', 'Οι']
    for article in articles:
        text = re.sub(rf'\n{article}(\S)', rf'\n{article} \1', text)

    # Add a space before and after the hyphen
    text = re.sub(r'(\S)-', r'\1 -', text)
    text = re.sub(r'-(\S)', r'- \1', text)

    # Separate the articles after the hyphen from the rest of the text
    text = re.sub(r'- Οι(\S)', r'- Οι \1', text)

    # Remove empty lines
    text = re.sub(r'\n+', r'\n', text)

    # Replace initials 
    replaces = {
        "\nϱ" : "Ο",
        "\nΛ" : "Α",
    }
    for key, value in replaces.items():
        text = re.sub(key, value, text)

    # Capitalize the first letter of each line
    text = re.sub(r'^(\S)', lambda m: m.group(1).upper(), text, flags=re.MULTILINE)

    # Strip text
    text = text.strip()

    # Copy the processed text to the clipboard
    pc.copy(text)

    return text

In [304]:
# Keep only necessary information
def condense(postprocessed_text):
    """
    Removes unnecessary information from the postprocessed OCR output.
    """

    # Remove everything after the hyphen except the last two letters
    text = postprocessed_text.strip().split('\n')
    text = [f'{text.split("-")[0]} - {text.split("-")[1][-2:]}' if '-' in text else text
            for text in text]
    text = '\n'.join(text)

    # Remove double spaces
    text = re.sub(r' +', r' ', text)    

    # Copy the reduced text to the clipboard
    # pc.copy(text)

    return text

In [320]:
def extract_plural_suffixes(condensed_text):
    """
    Extracts the plural suffixes from the postprocessed OCR output.
    """

    # Convert to a list
    text_list = condensed_text.strip().split('\n')

    # For suffixes, remove consonantes except when:
    # 1) The word ends in -α and the suffix is -λε
    # 2) The word ends in -ε or /o and the suffix is -νε
    # 3) The plural is -δε and the second to last letter of the word is not δ
    # 4) The word is neuter (article is Το), the suffix is -τα and 
    #    the second to last letter of the word is not τ

    # Note: the last μ in the list of consonants is ASCII 181, not 956 (μ in Greek)
    consonants = 'βγδζθκλμνξπρσςτφχψµ'
    processed_text_list = []
    for text in text_list:
        # Exception 1
        if text.endswith('α - λε'):
            processed_text_list.append(text)

        # Exception 2
        elif text.endswith('ε - νε') or text.endswith('ο - νε'):
            processed_text_list.append(text)

        # Exception 3
        elif text.endswith('δε') and text.split()[1][-2] != 'δ':
            processed_text_list.append(text)

        # Exception 4
        elif text.startswith('Το') and text.endswith('τα') and text.split()[1][-2] != 'τ':
            processed_text_list.append(text)
        
        # General case
        else:
            processed_text_list.append(re.sub(rf' - [{consonants}]', ' - ', text))

    # Rejoin the list
    text = '\n'.join(processed_text_list)

    return text

In [322]:
def move_articles(clean_plurals_text):
    """
    Moves the articles to the end of the word.
    """

    # Convert to a list
    text_list = clean_plurals_text.strip().split('\n')

    # Move the articles to the end of the word
    processed_text_list = []
    for line in text_list:
        article = line.split(' ')[0]
        rest = ' '.join(line.split(' ')[1:])
        line = f'{rest} - {article}'
        processed_text_list.append(line)

    # Rejoin the list
    text = '\n'.join(processed_text_list)

    # Copy the text to the clipboard
    pc.copy(text)

    return text

In [323]:
# Convert to dataframe
def text_to_dataframe(manual_clean_text):
    """
    Converts the text to a dataframe.
    """

    # Convert to a list
    text_list = manual_clean_text.strip().split('\n')

    # Convert to a dataframe
    df = pd.DataFrame([text.split(' - ') for text in text_list], columns=['word', 'plural', 'article'])

    return df

In [None]:
def remove_accents(input_text):
    nfkd_form = unicodedata.normalize('NFKD', input_text)
    return ''.join([c for c in nfkd_form if not unicodedata.combining(c)])

In [329]:
def compute_masculine_paradigm(entry_: dict):
    """
    Computes the hypothesized paradigm for a given masculine entry.
    """

    # Remove accents from the word and plural
    for key in ['word', 'plural']:
        entry_[key] = remove_accents(entry_[key])

    ### Apply rules ###
    # A1: plural in -οι
    if entry_['plural'] == 'οι':
        return "Α1"
    
    # A2: plural in -ε
    elif entry_['plural'] == 'ε':
        return "Α2"
    
    # A3: plural in -ου
    elif entry_['plural'] == 'ου':
        return "Α3"
    
    # A4: plural in -νε 
    # NOTE: plurals are formed by adding -ούνε to the singular
    # but it is checked here as -νε for simplicity
    elif entry_['plural'] == 'νε':
        return "Α4"
    
    # A5: plural in -δε
    elif entry_['plural'] == 'δε':
        return "Α5"
    
    # NaN if no rule applies
    else:
        return "Α"

In [330]:
def compute_femenine_paradigm(entry_: dict):
    """
    Computes the hypothesized paradigm for a given feminine entry.
    """

    # Remove accents from the word
    entry_['word'] = remove_accents(entry_['word'])

    ### Apply rules ###
    # Θ3: plural in -άε
    # If it does not hold, remove the accent from the plural suffix
    if entry_['plural'] == 'άε':
        return "Θ3"
    else:
        entry_['plural'] = remove_accents(entry_['plural'])

    # Θ2: plural in -λε
    if entry_['plural'] == 'λε':
        return "Θ2"    
    
    # Θ4: plural in -δε and word does not end in -ου
    elif entry_['plural'] == 'δε' and not entry_['word'].endswith('ου'):
        return "Θ4"

    # Θ5: plural in -δε and word ends in -ου
    elif entry_['plural'] == 'δε' and entry_['word'].endswith('ου'):
        return "Θ5"
    
    # Θ1: rest of plurals in ending in -ε
    elif entry_['plural'].endswith('ε'):
        return "Θ1"
        
    # NaN if no rule applies
    else:
        return "Θ"

In [331]:
def compute_neuter_paradigm(entry_):
    """
    Computes the hypothesized paradigm for a given neuter entry.
    """

    # Remove accents from the word and plural
    for key in ['word', 'plural']:
        entry_[key] = remove_accents(entry_[key])    

    ### Apply rules ###
    # Υ2: word ends in -μα
    if entry_['word'].endswith('μα'):
        return "Υ2"
    
    # Υ3: plural ends in -ια
    elif entry_['plural'] == 'ια':
        return "Υ3"
    
    # Υ4: word ends in -ι and plural ends in -τα
    elif entry_['word'].endswith('ι') and entry_['plural'] == 'τα':
        return "Υ4"
    
    # Υ5: word ends in -ε and plural ends in -τα
    elif entry_['word'].endswith('ε') and entry_['plural'] == 'τα':
        return "Υ5"
    
    # Y1: rest of plurals in -α
    elif entry_['plural'] == 'α':
        return "Υ1"
    
    # Υ if no rule applies
    else:
        return 'Υ'

In [332]:
def compute_paradigms(basic_dataframe):
    """
    Extracts noun paradigms based on the plural suffixes.
    """

    # Convert to a dictionary
    df_dict = basic_dataframe.to_dict('records')

    # Create a modifyable copy of the df_dict
    # The values will lose the accents, which is not desired
    # for the final dataframe
    df_dict_copy = copy.deepcopy(df_dict)

    ### Paradigm rules ###
    paradigm_df_lines = []
    for entry, copy_entry in zip(df_dict, df_dict_copy):
        # Masculine nouns
        if entry['article'] == 'Ο':
            paradigm = compute_masculine_paradigm(copy_entry)
        
        # Femenine nouns
        elif entry['article'] == 'Α':
            paradigm = compute_femenine_paradigm(copy_entry)

        # Neuter nouns
        elif entry['article'] == 'Το':
            paradigm = compute_neuter_paradigm(copy_entry)

        # NaN if no rule applies
        else:
            paradigm = np.nan

        # Add the paradigm to the entry
        entry['paradigm'] = paradigm
        paradigm_df_lines.append(entry)

    # Convert to a dataframe
    paradigm_df = pd.DataFrame(paradigm_df_lines)

    return paradigm_df

In [285]:
def extract_word(postprocessed_text: str):
    """
    For Greek terms, extracts the word from the postprocessed OCR output.
    """

    # Convert to a list
    text_list = postprocessed_text.strip().split('\n')

    # Extract the part of the word before the hyphen
    processed_text_list = [line.split(' - ')[0] for line in text_list]

    # Remove the articles at the beginning of the word
    articles = ['Ο ', 'Η ', 'Το ', 'Οι ', 'Τα ', 'Ἡ']
    processed_text_list = [' '.join(line.split(' ')[1:]) if line.startswith(tuple(articles)) else line.lower() for line in processed_text_list]

    # Rejoin the list
    text = '\n'.join(processed_text_list)

    # Copy the text to the clipboard
    pc.copy(text)

    return text

# Complete flow

Steps:
* Move image from Downloads folder
* OCR image
* Copy to clipboard
* Fix format in VS Code
* Store in JSON with two keys: `raw` and `processed`
* Copy to Excel

In [342]:
# Move files
img = '37-2'
move_raw_files(img)

Files moved to the OCR images folder.


In [349]:
# Select image
language = '-ts'
extension = '.jpeg'
filename_ = img + language + extension

# OCR
text = {'raw' : ocr(filename_)}
print(text['raw'])

Λελία - Οι ελίε

Το σούκο - Τα σούκα
0 βότόε - Οι βότόου
Το µάλι - Τα µάβα

Α αχρά -Οι αχράε

0 άντε -Οι άντου

Το άρτουµα - Τα αρτούµατα
Το άτσι

Το ύο - Τα ύβατα

Τα Πίτια

0Οφαέ

Αφακά -Οιφαίσέ



In [362]:
text['postprocessed'] = postprocessing(text['raw'])
print(text['postprocessed'])

Αελία - Οι ελίε
Το σούκο - Τα σούκα
Ο βότόε - Οι βότόου
Το µάλι - Τα µάβα
Α αχρά - Οι αχράε
Ο άντε - Οι άντου
Το άρτουµα - Τα αρτούµατα
Το άτσι
Το ύο - Τα ύβατα
Τα Πίτια
ΟΟφαέ
Α φακά - Οι φαίσέ


In [363]:
text['condensed'] = condense(text['postprocessed'])
print(text['condensed'])

Αελία - ίε
Το σούκο - κα
Ο βότόε - ου
Το µάλι - βα
Α αχρά - άε
Ο άντε - ου
Το άρτουµα - τα
Το άτσι
Το ύο - τα
Τα Πίτια
ΟΟφαέ
Α φακά - σέ


In [367]:
text['clean_plurals'] = extract_plural_suffixes(text['condensed'])
print(text['clean_plurals'])

Αελία - ίε
Το σούκο - α
Ο βότόε - ου
Το µάλι - α
Α αχρά - άε
Ο άντε - ου
Το άρτουµα - τα
Το άτσι
Το ύο - τα
Τα Πίτια
ΟΟφαέ
Α φακά - έ


In [368]:
text['articles_moved'] = move_articles(text['clean_plurals'])
print(text['articles_moved'])

- ίε - Αελία
σούκο - α - Το
βότόε - ου - Ο
µάλι - α - Το
αχρά - άε - Α
άντε - ου - Ο
άρτουµα - τα - Το
άτσι - Το
ύο - τα - Το
Πίτια - Τα
 - ΟΟφαέ
φακά - έ - Α


In [371]:
# Perform manual cleaning
manual_clean = """
ελία - ίε - Α
σούκο - α - Το
βότσ̌ε - ου - Ο
µάλ̣ι - α - Το
αχρά - άε - Α
άντε - ου - Ο
άρτουµα - τα - Το
άτσι - X - Το
ύο - τα - Το
π̇ίτια - X - Τα
φαέ - X - Ο
φακά - έ - Α
"""

text['manual_clean'] = manual_clean.strip()
print(text['manual_clean'])

ελία - ίε - Α
σούκο - α - Το
βότσ̌ε - ου - Ο
µάλ̣ι - α - Το
αχρά - άε - Α
άντε - ου - Ο
άρτουµα - τα - Το
άτσι - X - Το
ύο - τα - Το
π̇ίτια - X - Τα
φαέ - X - Ο
φακά - έ - Α


In [372]:
text['dataframes'] = {'raw' : text_to_dataframe(text['manual_clean'])}
text['dataframes']['raw']

Unnamed: 0,word,plural,article
0,ελία,ίε,Α
1,σούκο,α,Το
2,βότσ̌ε,ου,Ο
3,µάλ̣ι,α,Το
4,αχρά,άε,Α
5,άντε,ου,Ο
6,άρτουµα,τα,Το
7,άτσι,X,Το
8,ύο,τα,Το
9,π̇ίτια,X,Τα


In [373]:
text['dataframes']['paradigms'] = compute_paradigms(text['dataframes']['raw'])
text['dataframes']['paradigms']

Unnamed: 0,word,plural,article,paradigm
0,ελία,ίε,Α,Θ1
1,σούκο,α,Το,Υ1
2,βότσ̌ε,ου,Ο,Α3
3,µάλ̣ι,α,Το,Υ1
4,αχρά,άε,Α,Θ3
5,άντε,ου,Ο,Α3
6,άρτουµα,τα,Το,Υ2
7,άτσι,X,Το,Υ
8,ύο,τα,Το,Υ
9,π̇ίτια,X,Τα,


# Extract information from Greek terms

In [374]:
# Select image
language = '-gr'
extension = '.jpeg'
filename_ = img + language + extension

# OCR
text['greek_raw'] = ocr(filename_)
print(text['greek_raw'])

Η ελιά - Οι ελιές

Το σύκο - Τα σύκα

Το σταφύλι - Τα σταφύλια
Το μήλο - Τα μήλα

Το αχλάδι - Τα αχλάδια
Το ψωμί - Τα ψωμιά
Το τυρί - Τα τυριά

Το αλάτι

Το νερό - Τα νερά
Οιχυλοπίτες

Το στάρι

Η φακή - Οι φακές



In [379]:
text['greek_postprocessed'] = postprocessing(text['greek_raw'])
print(text['greek_postprocessed'])

Η ελιά - Οι ελιές
Το σύκο - Τα σύκα
Το σταφύλι - Τα σταφύλια
Το μήλο - Τα μήλα
Το αχλάδι - Τα αχλάδια
Το ψωμί - Τα ψωμιά
Το τυρί - Τα τυριά
Το αλάτι
Το νερό - Τα νερά
Οι χυλοπίτες
Το στάρι
Η φακή - Οι φακές


In [380]:
print(extract_word(text['greek_postprocessed']))

ελιά
σύκο
σταφύλι
μήλο
αχλάδι
ψωμί
τυρί
αλάτι
νερό
χυλοπίτες
στάρι
φακή


In [381]:
# Manual clean if necessary
manual_clean = """
ελιά
σύκο
σταφύλι
μήλο
αχλάδι
ψωμί
τυρί
αλάτι
νερό
χυλοπίτες
στάρι
φακή
"""

text['greek_manual_clean'] = manual_clean.strip()
print(text['greek_manual_clean'])

ελιά
σύκο
σταφύλι
μήλο
αχλάδι
ψωμί
τυρί
αλάτι
νερό
χυλοπίτες
στάρι
φακή


In [382]:
# Covert to Pandas Series
greek_series = pd.Series(text['greek_manual_clean'].strip().split('\n'))
greek_series

0          ελιά
1          σύκο
2       σταφύλι
3          μήλο
4        αχλάδι
5          ψωμί
6          τυρί
7         αλάτι
8          νερό
9     χυλοπίτες
10        στάρι
11         φακή
dtype: object

In [383]:
# Append it to the dataframe
temp_df = text['dataframes']['paradigms'].copy()
temp_df = text['dataframes']['paradigms'].copy()
temp_df['greek'] = greek_series

# Add source = 1
temp_df['source'] = 1

# Add an empty column for easy copy-pasting
temp_df['buffer'] = ''

# Reorder columns
order = ['word', 'greek', 'paradigm', 'source', 'buffer', 'article', 'plural']
temp_df = temp_df[order]

# Save the dataframe
text['dataframes']['complete'] = temp_df 
temp_df

Unnamed: 0,word,greek,paradigm,source,buffer,article,plural
0,ελία,ελιά,Θ1,1,,Α,ίε
1,σούκο,σύκο,Υ1,1,,Το,α
2,βότσ̌ε,σταφύλι,Α3,1,,Ο,ου
3,µάλ̣ι,μήλο,Υ1,1,,Το,α
4,αχρά,αχλάδι,Θ3,1,,Α,άε
5,άντε,ψωμί,Α3,1,,Ο,ου
6,άρτουµα,τυρί,Υ2,1,,Το,τα
7,άτσι,αλάτι,Υ,1,,Το,X
8,ύο,νερό,Υ,1,,Το,τα
9,π̇ίτια,χυλοπίτες,,1,,Τα,X


In [384]:
# Save the dataframe to an excel
path_ = f'./results/{img}.xlsx'
text['dataframes']['complete'].to_excel(path_, index=False)
print(f'{img} saved.')

# Open the excel
complete_path = os.path.abspath(path_)
os.startfile(complete_path)

37-2 saved.
