Skip to content

Commit

Permalink
Improve logging
Browse files Browse the repository at this point in the history
  • Loading branch information
p-goulart committed Jan 15, 2024
1 parent 117b47b commit 46971e5
Show file tree
Hide file tree
Showing 4 changed files with 32 additions and 5 deletions.
2 changes: 2 additions & 0 deletions lib/dic_chunk.py
Original file line number Diff line number Diff line change
Expand Up @@ -44,6 +44,7 @@ def from_hunspell_dic(cls, dic_path: str, chunk_size: int, target_dir: str, samp
Returns:
A list of DicChunk objects, each representing a chunk of the dictionary file
"""
LOGGER.debug(f"Splitting dictionary file \"{dic_path}\" into chunks...")
compounds = (True if 'compounds' in dic_path else False)
with open(dic_path, 'r', encoding=LATIN_1_ENCODING) as dic_file:
lines = dic_file.readlines()[1:] # Skip the first line
Expand All @@ -65,6 +66,7 @@ def from_hunspell_dic(cls, dic_path: str, chunk_size: int, target_dir: str, samp
chunk_file.write(f"{len(chunk)}\n")
chunk_file.writelines(chunk)
chunks.append(cls(chunk_path, compounds))
LOGGER.debug(f"Split into {len(chunks)} chunks.")
return chunks

def unmunch(self, aff_path: str, delete_tmp: bool = False) -> NamedTemporaryFile:
Expand Down
9 changes: 7 additions & 2 deletions lib/languagetool_utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -46,6 +46,7 @@ def tokenise(self, unmunched_file: NamedTemporaryFile) -> NamedTemporaryFile:
tokenisation_result = ShellCommand(tokenise_cmd).run_with_input(unmunched_str)
tokenised_tmp.write(tokenisation_result)
tokenised_tmp.flush()
LOGGER.debug(f"Done tokenising {unmunched_file.name}!")
return tokenised_tmp

def build_spelling_binary(self, tokenised_temps: List[NamedTemporaryFile]) -> None:
Expand All @@ -62,7 +63,7 @@ def build_spelling_binary(self, tokenised_temps: List[NamedTemporaryFile]) -> No
Returns:
None
"""
LOGGER.info(f"Building binary for {self.variant}...")
LOGGER.info(f"Building spelling binary for {self.variant}...")
megatemp = NamedTemporaryFile(delete=self.delete_tmp, mode='w',
encoding='utf-8') # Open the file with UTF-8 encoding
lines = set()
Expand All @@ -80,11 +81,12 @@ def build_spelling_binary(self, tokenised_temps: List[NamedTemporaryFile]) -> No
f"-o {self.variant.dict()}"
)
ShellCommand(cmd_build).run()
LOGGER.info(f"Done compiling {self.variant} dictionary!")
LOGGER.info(f"Done compiling {self.variant} spelling dictionary!")
self.variant.copy_spell_info()
megatemp.close()

def build_pos_binary(self) -> None:
LOGGER.info(f"Building part-of-speech binary for {self.variant}...")
cmd_build = (
f"java -cp {LT_JAR_PATH} "
f"org.languagetool.tools.POSDictionaryBuilder "
Expand All @@ -93,9 +95,11 @@ def build_pos_binary(self) -> None:
f"-o {self.variant.pos_dict_java_output_path()}"
)
ShellCommand(cmd_build).run()
LOGGER.info(f"Done compiling {self.variant} part-of-speech dictionary!")
self.variant.copy_pos_info()

def build_synth_binary(self) -> None:
LOGGER.info(f"Building synthesiser binary for {self.variant}...")
cmd_build = (
f"java -cp {LT_JAR_PATH} "
f"org.languagetool.tools.SynthDictionaryBuilder "
Expand All @@ -104,5 +108,6 @@ def build_synth_binary(self) -> None:
f"-o {self.variant.synth_dict_java_output_path()}"
)
ShellCommand(cmd_build).run()
LOGGER.info(f"Done compiling {self.variant} synthesiser dictionary!")
self.variant.copy_synth_info()
self.variant.rename_synth_tag_files()
17 changes: 17 additions & 0 deletions lib/utils.py
Original file line number Diff line number Diff line change
@@ -1,5 +1,6 @@
import codecs
import shutil
from datetime import timedelta
from os import chdir, path
from tempfile import NamedTemporaryFile
from typing import Optional
Expand Down Expand Up @@ -39,3 +40,19 @@ def convert_to_utf8(tmp_file: NamedTemporaryFile, delete_tmp: bool = False) -> N
shutil.copyfileobj(file, utf8_tmp)
utf8_tmp.seek(0)
return utf8_tmp


def pretty_time_delta(time_delta: timedelta) -> str:
    """Format a duration as a compact human-readable string, e.g. ``1d2h3min4s``.

    Adapted from https://gist.github.com/thatalextaylor/7408395.

    Fractional seconds are truncated toward zero. Leading units whose value is
    zero are omitted, but once a unit is shown every smaller unit is shown too
    (``1h0min0s``, not ``1h``).

    Args:
        time_delta: The duration to format. Negative durations are rendered as
            their magnitude prefixed with ``-``.

    Returns:
        The formatted duration, one of ``<d>d<h>h<m>min<s>s``, ``<h>h<m>min<s>s``,
        ``<m>min<s>s`` or ``<s>s``.
    """
    total_seconds = int(time_delta.total_seconds())
    # divmod() floors toward negative infinity, so splitting a negative value
    # directly would yield e.g. days == -1 and hours == 23. Format the
    # magnitude instead and re-attach the sign.
    sign = '-' if total_seconds < 0 else ''
    days, remainder = divmod(abs(total_seconds), 86400)
    hours, remainder = divmod(remainder, 3600)
    minutes, seconds = divmod(remainder, 60)
    if days > 0:
        return '%s%dd%dh%dmin%ds' % (sign, days, hours, minutes, seconds)
    if hours > 0:
        return '%s%dh%dmin%ds' % (sign, hours, minutes, seconds)
    if minutes > 0:
        return '%s%dmin%ds' % (sign, minutes, seconds)
    return '%s%ds' % (sign, seconds)
9 changes: 6 additions & 3 deletions scripts/build_spelling_dicts.py
Original file line number Diff line number Diff line change
Expand Up @@ -9,7 +9,7 @@
from lib.dic_chunk import DicChunk
from lib.logger import LOGGER
from lib.constants import SPELLING_DICT_DIR
from lib.utils import compile_lt_dev, install_dictionaries, convert_to_utf8
from lib.utils import compile_lt_dev, install_dictionaries, convert_to_utf8, pretty_time_delta
from lib.variant import Variant, VARIANT_MAPPING
from lib.languagetool_utils import LanguageToolUtils as LtUtils

Expand Down Expand Up @@ -69,7 +69,8 @@ def process_variant(variant: Variant, dic_chunk: DicChunk) -> tuple[Variant, Nam


def main():
LOGGER.debug(f"started at {datetime.now().strftime('%r')}")
start_time = datetime.now()
LOGGER.debug(f"Started at {start_time.strftime('%r')}")
LOGGER.debug(
f"Options used:\n"
f"TMP_DIR: {TMP_DIR}\n"
Expand Down Expand Up @@ -107,7 +108,9 @@ def main():
file.close()
if FORCE_INSTALL:
install_dictionaries(CUSTOM_INSTALL_VERSION)
LOGGER.debug(f"finished at {datetime.now().strftime('%r')}")
end_time = datetime.now()
LOGGER.debug(f"Finished at {end_time.strftime('%r')}. "
f"Total time elapsed: {pretty_time_delta(end_time - start_time)}.")


if __name__ == "__main__":
Expand Down

0 comments on commit 46971e5

Please sign in to comment.