# Database connection test

# Preparation

In [1]:
# Imports
import sqlite3
import pandas as pd

## Main connection

In [2]:
# Open (or create, if missing) the SQLite file that holds the main database.
# `conn` is the shared connection used by the rest of the notebook.
db_path = '../TsakonianDB.sqlite3'
conn = sqlite3.connect(database=db_path)

## Auxiliary functions

In [3]:
def query(query: str,
          cursor = None):
    """
    Execute a single SQL statement and return its rows as a DataFrame.

    Parameters:
    - query (str): The SQL statement to execute.
    - cursor: DB-API cursor to run the statement on. When omitted, a new
      cursor is opened on the notebook-level `conn` at call time and closed
      again before returning.

    Returns:
    - pd.DataFrame with one column per result column when the statement
      produced at least one row; otherwise None (a notice is printed).
    """
    # Resolve the cursor at call time. A `conn.cursor()` default is evaluated
    # only once, when the function is defined, which silently ties the helper
    # to the old connection if the connection cell is re-run.
    owns_cursor = cursor is None
    if owns_cursor:
        cursor = conn.cursor()

    try:
        cursor.execute(query)
        fetched = cursor.fetchall()

        if len(fetched) > 0:
            # cursor.description holds one tuple per result column;
            # its first item is the column name.
            column_names = [column[0] for column in cursor.description]
            return pd.DataFrame(fetched, columns=column_names)

        print('Query executed successfully. No results to show.')
        return None
    finally:
        # Only close cursors this function created; a caller-supplied
        # cursor stays under the caller's control.
        if owns_cursor:
            cursor.close()

# Add words from main dictionary to the database

## Load dictionary

In [6]:
# Read the main dictionary spreadsheet and clean it in one chain:
#   1. keep only the columns whose name does not contain 'duplicate'
#   2. discard rows where both `nowakowski` and `kostakis` are NaN
main_df_path = '../data/tables/Main.xlsx'
main_df = (
    pd.read_excel(main_df_path)
    .loc[:, lambda frame: ~frame.columns.str.contains('duplicate')]
    .dropna(subset=['nowakowski', 'kostakis'], how='all')
)
main_df

Unnamed: 0,nowakowski,kostakis,greek,english,paradigm,source_id,lemma,ipa,leonidio,voskina,...,sampatiki,livadi,tyros,melana,sapounakaiika,palaiochora,agios_andreas,kastanitsa,sitaina,prastos
0,άβατθε,άβατ̇ε,άκλαυτος,,Ε,1.0,,,,,...,,,,,,,,,,
1,άγιε,άγιε,εκκλησία,,Α4,3.0,,,,,...,,,,,,,,,,
2,άγο,άγο,άλογο,,Α0,1.0,,,,,...,,,,,,,,,,
3,άγουστε,άγουστε,αύγουστος,,,1.0,,,,,...,,,,,,,,,,
4,άρζα,άζ̌α,"αραία, όχι συχνά",,,1.0,,,,,...,,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1066,κρόπο,κρόπο,κοπριά,,Θ0,3.0,,,,,...,,,,,,,,,,
1067,καμάρα,καμάρα,καμάρα,,Θ,3.0,,,,,...,,,,,,,,,,
1068,κοπέα,κοπέα,κοπέλα,,Θ,3.0,,,,,...,,,,,,,,,,
1069,προβάτα,προβάτα,προβατίνα,,Θ,3.0,,,,,...,,,,,,,,,,


In [10]:
from Levenshtein import distance as lev_distance
import pandas as pd

def obtain_close_suggestions(input_string: str, 
                             list_of_strings: list, 
                             threshold: int = 2,
                             orthography = 'nowakowski') -> list:
    """
    Generates search suggestions based on Levenshtein distance.
    Entries with a distance less than or equal to the threshold are returned,
    closest first. Candidates at the same distance keep the order in which
    they first appear in `list_of_strings`.

    Parameters:
    - input_string (str): The input string to compare against.
    - list_of_strings (list): Candidate strings. Duplicates are collapsed and
      non-string entries (e.g. NaN from a DataFrame column) are skipped.
    - threshold (int): The maximum distance allowed for a string to be considered a match.
    - orthography (str): Accepted for interface compatibility; it is not
      read by this function, so it has no effect on the result.

    Returns:
    - list: The matching strings sorted by ascending distance.
    """
    # Compute each distinct candidate's distance once. Non-strings are skipped
    # because the distance function only accepts strings; a NaN coming from a
    # pandas column would otherwise abort the whole lookup.
    distances = {}
    for target_string in list_of_strings:
        if not isinstance(target_string, str) or target_string in distances:
            continue
        distances[target_string] = lev_distance(input_string, target_string)

    # Filter based on threshold
    close_suggestions = [candidate for candidate, dist in distances.items()
                         if dist <= threshold]

    # list.sort is stable, so ties come out in a deterministic order
    close_suggestions.sort(key=distances.__getitem__)

    return close_suggestions

# Obtain list of all words in the dictionary.
# Rows are kept when at least one of `nowakowski` / `kostakis` is present,
# so `nowakowski` can still be NaN for some rows; drop those so that only
# real strings reach the distance calculation.
all_words = main_df['nowakowski'].dropna().tolist()

# Test the function
input_string = 'αβατθε'
obtain_close_suggestions(input_string, all_words, threshold=3)

['άβατθε', 'έατε', 'έξατε', 'αβανία', 'ακόρβατθε', 'απατζά', 'τότθε', 'γρότθε']

# Upload main_df to the database

In [5]:
# Show the dataframe that is about to be uploaded (last expression of the
# cell, so the notebook renders it).
# NOTE(review): the saved output of this cell still lists `*_duplicate`
# columns and rows that are NaN in both orthography columns, which the
# loading cell removes — it was presumably produced before that filtering
# existed. Re-run the notebook top to bottom to refresh it.
main_df

Unnamed: 0,nowakowski,kostakis,greek,english,paradigm,source_id,lemma,ipa,tsakonian_duplicate,tsakonian_greek_duplicate,...,sampatiki,livadi,tyros,melana,sapounakaiika,palaiochora,agios_andreas,kastanitsa,sitaina,prastos
0,άβατθε,άβατ̇ε,άκλαυτος,,Ε,1.0,,,άβατ̇ε,άβατ̇ε-άκλαυτος,...,,,,,,,,,,
1,άγιε,άγιε,εκκλησία,,Α4,3.0,,,άγιε,άγιε-εκκλησία,...,,,,,,,,,,
2,άγο,άγο,άλογο,,Α0,1.0,,,άγο,άγο-άλογο,...,,,,,,,,,,
3,άγουστε,άγουστε,αύγουστος,,,1.0,,,άγουστε,άγουστε-αύγουστος,...,,,,,,,,,,
4,άρζα,άζ̌α,"αραία, όχι συχνά",,,1.0,,,άζ̌α,"άζ̌α-αραία, όχι συχνά",...,,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1490,,,,,,,-,--,---,,...,,,,,,,,,,
1491,,,,,,,-,--,---,,...,,,,,,,,,,
1492,,,,,,,-,--,---,,...,,,,,,,,,,
1493,,,,,,,-,--,---,,...,,,,,,,,,,


In [6]:
# Write the dictionary to the `dictionary_entry` table. An existing table of
# that name is replaced, and the DataFrame index is not stored as a column.
main_df.to_sql(name='dictionary_entry', con=conn, index=False, if_exists='replace')

1495

# Update sources table

In [None]:
# Load the sources spreadsheet and preview its first five rows
sources_df_path = '../data/tables/sources.xlsx'
sources_df = pd.read_excel(io=sources_df_path)
sources_df.head(5)

In [None]:
# Write the sources to the `dictionary_source` table, replacing any existing
# table of that name and leaving the DataFrame index out.
# NOTE(review): the next cell starts with this same upload, and the dtype keys
# here (id/source/author/year/notes) differ from the column names the rebuild
# SQL selects (source_id/title/url) — confirm which names are current.
sources_df.to_sql(
    name='dictionary_source',
    con=conn,
    index=False,
    if_exists='replace',
    dtype={
        'id': 'bigint',
        'source': 'varchar(100)',
        'author': 'varchar(100)',
        'year': 'varchar(10)',
        'notes': 'varchar(100)',
    },
)

In [None]:
# Copy the full sources table into the database
# NOTE(review): the dtype keys below (id/source/author/year/notes) differ from
# the columns the rebuild script selects (source_id/title/url) — confirm
# which names are current.
sources_df.to_sql('dictionary_source', 
                  conn, 
                  if_exists='replace', 
                  index=False,
                  dtype = {'id': 'bigint',
                           'source': 'varchar(100)',
                           'author': 'varchar(100)',
                           'year': 'varchar(10)',
                           'notes': 'varchar(100)'
                  })

# Recreate `dictionary_source` with an explicit schema so that `source_id`
# becomes an autoincrementing primary key (script adapted from SQLiteStudio).
# The whole rebuild runs inside one explicit transaction, so a failure part
# way through cannot leave the database without a `dictionary_source` table.
# `DROP TABLE IF EXISTS` clears a temporary table left by an earlier aborted
# run without hiding unrelated errors.
recreating_query = """BEGIN;

DROP TABLE IF EXISTS sqlitestudio_temp_table;

CREATE TABLE sqlitestudio_temp_table AS SELECT *
                                            FROM dictionary_source;

DROP TABLE dictionary_source;

CREATE TABLE dictionary_source (
    source_id   INTEGER       PRIMARY KEY AUTOINCREMENT,
    title       VARCHAR (300),
    url         VARCHAR (200)
);

INSERT INTO dictionary_source (
                                    source_id,
                                    title,
                                    url
                                )
                                SELECT source_id,
                                         title,
                                         url
                                    FROM sqlitestudio_temp_table;

DROP TABLE sqlitestudio_temp_table;

COMMIT;"""

# `PRAGMA foreign_keys` is ignored by SQLite while a transaction is pending,
# so it has to be toggled outside the rebuild transaction. Commit anything
# still open first, then switch enforcement off for the duration.
conn.commit()
conn.execute("PRAGMA foreign_keys = 0;")
try:
    # executescript runs every statement of the script in order, so there is
    # no need to split the text on ';' and feed the pieces in one by one.
    conn.executescript(recreating_query)
except sqlite3.Error:
    # Undo the partial rebuild before surfacing the error
    conn.rollback()
    raise
finally:
    # No transaction is pending here, so enforcement really is switched back on
    conn.execute("PRAGMA foreign_keys = 1;")

# Save changes
conn.commit()