Refactor for more efficient PyCharm work
 - I mean, not just PyCharm, but yeah, I've renamed the temp files so
   it's easier to debug them by grepping or whatever;

 - moved some stuff around the modules;

 - added the 'validate_aff.py' script just to make sure the .aff files
   are okay (GPT did most of it, so don't trust it).
p-goulart committed Jan 18, 2024
1 parent 3dbf8cd commit 1f2b122
Showing 8 changed files with 108 additions and 27 deletions.
30 changes: 19 additions & 11 deletions lib/dic_chunk.py
@@ -6,18 +6,21 @@
from lib.constants import LATIN_1_ENCODING
from lib.logger import LOGGER
from lib.shell_command import ShellCommand
from lib.variant import Variant


class DicChunk:
"""This class represents a single chunk of a Hunspell dictionary file.
Attributes:
filepath (str): the path to the chunk
name (str): the name of the chunk (e.g. chunk0)
compounds (bool): whether this is a file containing compounds or not; if True, this chunk will *not* be
tokenised;
"""
def __init__(self, filepath: str, compounds: bool = False):
def __init__(self, filepath: str, name: str, compounds: bool = False):
self.filepath = filepath
self.name = name
self.compounds = compounds

def __str__(self) -> str:
@@ -32,20 +35,27 @@ def rm(self) -> None:
shutil.rmtree(self.filepath)

@classmethod
def from_hunspell_dic(cls, dic_path: str, chunk_size: int, target_dir: str, sample_size: int) -> List:
def from_hunspell_dic(cls, variant: Variant, chunk_size: int, target_dir: str, sample_size: int,
compounds: bool = False) -> List:
"""Splits a dictionary file into smaller files (chunks) of a given number of lines.
Args:
dic_path (str): the path to the Hunspell .dic file
variant (Variant): the variant for which we want to unmunch the .dic file
chunk_size (int): the number of lines per chunk
target_dir (str): the directory where the chunks will be saved
sample_size (int): the number of lines to read from the dictionary file; if 0 or negative, read all lines
compounds (bool): whether this is a file containing compounds or not
Returns:
A list of DicChunk objects, each representing a chunk of the dictionary file
"""
if compounds:
tmp_dir = path.join(target_dir, 'compounds')
dic_path = variant.compounds()
else:
tmp_dir = target_dir
dic_path = variant.dic()
LOGGER.debug(f"Splitting dictionary file \"{dic_path}\" into chunks...")
compounds = (True if 'compounds' in dic_path else False)
with open(dic_path, 'r', encoding=LATIN_1_ENCODING) as dic_file:
lines = dic_file.readlines()[1:] # Skip the first line
lines = [line for line in lines if not line.startswith("#")] # Filter out comment lines
@@ -55,17 +65,14 @@ def from_hunspell_dic(cls, dic_path: str, chunk_size: int, target_dir: str, samp
str_chunks: List[List[str]] = [lines[i:i + chunk_size] for i in range(0, total_lines, chunk_size)]
chunks: List[cls] = []
for index, chunk in enumerate(str_chunks):
if compounds:
tmp_dir = path.join(target_dir, 'compounds')
else:
tmp_dir = target_dir
filename = path.basename(dic_path).replace('.dic', f'_chunk{index}.dic')
chunk_name = f"{variant.underscored}_chunk{index}"
filename = chunk_name + ".dic"
chunk_path = path.join(tmp_dir, filename)
with open(chunk_path, 'w', encoding=LATIN_1_ENCODING) as chunk_file:
# Prepend the count of lines in this chunk and then write all lines
chunk_file.write(f"{len(chunk)}\n")
chunk_file.writelines(chunk)
chunks.append(cls(chunk_path, compounds))
chunks.append(cls(chunk_path, chunk_name, compounds))
LOGGER.debug(f"Split into {len(chunks)} chunks.")
return chunks

@@ -79,7 +86,8 @@ def unmunch(self, aff_path: str, delete_tmp: bool = False) -> NamedTemporaryFile
Returns:
the temp file containing the unmunched dictionary
"""
unmunched_tmp = NamedTemporaryFile(delete=delete_tmp, mode='wb')
unmunched_tmp = NamedTemporaryFile(delete=delete_tmp, mode='wb',
prefix=f"{self.name}_unmunched_")
LOGGER.debug(f"Unmunching {self} into {unmunched_tmp.name} ...")
cmd_unmunch = f"unmunch {self.filepath} {aff_path}"
unmunch_result = ShellCommand(cmd_unmunch).run()
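The net effect of the renaming above, as a sketch under assumed values (the variant code, paths, and random temp suffix are illustrative, not from the commit):

    from lib.dic_chunk import DicChunk
    from lib.variant import Variant

    # Hypothetical call: split the pt-BR dictionary into chunks of 20000 lines.
    chunks = DicChunk.from_hunspell_dic(Variant("pt-BR"), chunk_size=20000,
                                        target_dir="/tmp/chunks", sample_size=-1)
    print(chunks[0].name)  # pt_BR_chunk0 (assuming Variant.underscored is "pt_BR")
    tmp = chunks[0].unmunch(aff_path="/tmp/hunspell/pt_BR.aff")
    print(tmp.name)        # e.g. /tmp/pt_BR_chunk0_unmunched_w8zq1 — greppable by chunk name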
5 changes: 4 additions & 1 deletion lib/languagetool_utils.py
@@ -1,3 +1,4 @@
import re
from tempfile import NamedTemporaryFile
from typing import List

@@ -33,7 +34,9 @@ def tokenise(self, unmunched_file: NamedTemporaryFile) -> NamedTemporaryFile:
a NamedTemporaryFile with the result of tokenisation written to it; note this is still a latin-1-encoded file; it is
not at this stage that we move from latin-1 encoding to UTF-8.
"""
tokenised_tmp = NamedTemporaryFile(delete=self.delete_tmp, mode='w')
chunk_pattern = re.compile("[a-z]{2}_[A-Z]{2}(?:_[a-zA-Z0-9]+)?_chunk\\d+")
prefix = chunk_pattern.findall(unmunched_file.name.split('/')[-1])[0] + "_tokenised_"
tokenised_tmp = NamedTemporaryFile(delete=self.delete_tmp, mode='w', prefix=prefix)
LOGGER.debug(f"Tokenising {unmunched_file.name} into {tokenised_tmp.name} ...")
tokenise_cmd = (
f"java -cp {LT_JAR_PATH}:"
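A quick sketch of what the new chunk_pattern recovers from those temp-file names (the file names here are made up to match the naming scheme from dic_chunk.py):

    import re

    chunk_pattern = re.compile("[a-z]{2}_[A-Z]{2}(?:_[a-zA-Z0-9]+)?_chunk\\d+")
    chunk_pattern.findall("pt_BR_chunk0_unmunched_w8zq1")     # ['pt_BR_chunk0']
    chunk_pattern.findall("pt_AO_90_chunk12_unmunched_h7b1")  # ['pt_AO_90_chunk12']

The first match becomes the prefix of the tokenised temp file, so the output for pt_BR_chunk0 ends up as pt_BR_chunk0_tokenised_<random> and a chunk's whole pipeline can be traced by grepping its name.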
7 changes: 4 additions & 3 deletions lib/shell_command.py
@@ -16,10 +16,11 @@ def __init__(self, return_code: int, stderr: AnyStr = None):

class ShellCommand:
"""A class for executing Java commands."""
def __init__(self, command_str: str, env: dict = None):
def __init__(self, command_str: str, env: dict = None, cwd: str = '.'):
self.command_str = command_str
self.split_cmd = shlex.split(self.command_str)
self.env: dict = {**os.environ}
self.cwd = cwd
if env is not None:
self.env.update(env)

@@ -33,13 +34,13 @@ def check_status(return_code: int, stderr: AnyStr) -> None:
def _popen(self, text: bool = False) -> subprocess.Popen:
try:
return subprocess.Popen(self.split_cmd, stdin=subprocess.PIPE, stdout=subprocess.PIPE,
stderr=subprocess.PIPE, text=text, env=self.env)
stderr=subprocess.PIPE, text=text, env=self.env, cwd=self.cwd)
except FileNotFoundError:
raise ShellCommandException(255, "Command or file not found.")

def _run(self) -> subprocess.CompletedProcess:
try:
return subprocess.run(self.split_cmd, capture_output=True, env=self.env)
return subprocess.run(self.split_cmd, capture_output=True, env=self.env, cwd=self.cwd)
except FileNotFoundError:
raise ShellCommandException(255, "Command or file not found.")

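The new cwd parameter is what lets utils.py (below) drop its chdir() calls: the working directory becomes per-command state passed straight to subprocess, rather than process-global state that has to be restored afterwards. Assumed usage:

    ShellCommand("mvn clean install -DskipTests", cwd=LT_DIR).run()  # runs inside LT_DIR
    ShellCommand("ls").run()  # unaffected; still runs in the default '.'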
19 changes: 11 additions & 8 deletions lib/utils.py
@@ -1,35 +1,38 @@
import codecs
import shutil
from datetime import timedelta
from os import chdir, path
from os import environ, path
from tempfile import NamedTemporaryFile
from typing import Optional

from lib.constants import REPO_DIR, LT_DIR, JAVA_RESULTS_DIR, LATIN_1_ENCODING
from lib.constants import LT_DIR, JAVA_RESULTS_DIR, LATIN_1_ENCODING
from lib.shell_command import ShellCommand
from lib.logger import LOGGER


def compile_lt_dev():
"""Build with maven in the languagetool-dev directory."""
LOGGER.info("Compiling LT dev...")
chdir(path.join(LT_DIR, "languagetool-dev"))
ShellCommand("mvn clean compile assembly:single").run()
chdir(REPO_DIR) # Go back to the repo directory
wd = path.join(LT_DIR, "languagetool-dev")
ShellCommand("mvn clean compile assembly:single", cwd=wd).run()


def compile_lt():
"""Build with maven in the languagetool-dev directory."""
LOGGER.info("Compiling LT...")
ShellCommand("mvn clean install -DskipTests", cwd=LT_DIR).run()


def install_dictionaries(custom_version: Optional[str]):
"""Install our dictionaries to the local ~/.m2."""
LOGGER.info("Installing dictionaries...")
chdir(JAVA_RESULTS_DIR)
env: dict = {}
if custom_version is not None:
LOGGER.info(f"Installing custom version \"{custom_version}\"")
env['PT_DICT_VERSION'] = custom_version
else:
LOGGER.info(f"Installing environment-defined version \"{env['PT_DICT_VERSION']}\"")
ShellCommand("mvn clean install", env=env).run()
chdir(REPO_DIR) # Go back to the repo directory
ShellCommand("mvn clean install", env=env, cwd=JAVA_RESULTS_DIR).run()


def convert_to_utf8(tmp_file: NamedTemporaryFile, delete_tmp: bool = False) -> NamedTemporaryFile:
3 changes: 3 additions & 0 deletions lib/variant.py
@@ -31,6 +31,9 @@ def __init__(self, locale_code: str):
def __str__(self) -> str:
return self.hyphenated

def __repr__(self) -> str:
return self.hyphenated

def aff(self) -> str:
return path.join(HUNSPELL_DIR, f"{self.underscored}.aff")

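A plausible motivation for the added __repr__ (assumed from the logging in build_spelling_dicts.py below, which interpolates a whole collection of variants): containers render their elements with repr(), not str(), so without it a list of Variant objects logs as opaque object addresses. Sketch:

    variants = [Variant("pt-BR"), Variant("pt-PT")]
    print(f"DIC_VARIANTS: {variants}")
    # with __repr__:    DIC_VARIANTS: [pt-BR, pt-PT]
    # without __repr__: DIC_VARIANTS: [<lib.variant.Variant object at 0x...>, ...]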
10 changes: 7 additions & 3 deletions scripts/build_spelling_dicts.py
@@ -9,7 +9,7 @@
from lib.dic_chunk import DicChunk
from lib.logger import LOGGER
from lib.constants import SPELLING_DICT_DIR
from lib.utils import compile_lt_dev, install_dictionaries, convert_to_utf8, pretty_time_delta
from lib.utils import compile_lt_dev, install_dictionaries, convert_to_utf8, pretty_time_delta, compile_lt
from lib.variant import Variant, VARIANT_MAPPING
from lib.languagetool_utils import LanguageToolUtils as LtUtils

@@ -83,7 +83,11 @@ def main():
f"CUSTOM_INSTALL_VERSION: {CUSTOM_INSTALL_VERSION}\n"
f"DIC_VARIANTS: {DIC_VARIANTS}\n"
)
# We might consider *always* compiling, since the spelling dicts depend on the tagger dicts having been *installed*
# and compiled with LT. The reason we also need to re-build LT is that we need to make sure that OUR tagger dicts
# are used by the WordTokenizer.
if FORCE_COMPILE:
compile_lt()
compile_lt_dev()
tasks = []
processed_files: dict[str, List[NamedTemporaryFile]] = {}
@@ -92,8 +96,8 @@
# and then split them based on the dialectal and pre/post agreement alternation files
for variant in DIC_VARIANTS:
processed_files[variant] = []
dic_chunks: List[DicChunk] = DicChunk.from_hunspell_dic(variant.dic(), CHUNK_SIZE, TMP_DIR, SAMPLE_SIZE)
dic_chunks.extend(DicChunk.from_hunspell_dic(variant.compounds(), CHUNK_SIZE, TMP_DIR, SAMPLE_SIZE))
dic_chunks: List[DicChunk] = DicChunk.from_hunspell_dic(variant, CHUNK_SIZE, TMP_DIR, SAMPLE_SIZE)
dic_chunks.extend(DicChunk.from_hunspell_dic(variant, CHUNK_SIZE, TMP_DIR, SAMPLE_SIZE, compounds=True))
for chunk in dic_chunks:
tasks.append((variant, chunk))
LOGGER.info("Starting unmunching and tokenisation process...")
8 changes: 7 additions & 1 deletion scripts/build_tagger_dicts.py
@@ -3,14 +3,15 @@
"""
import argparse
import os
from datetime import datetime

from lib.languagetool_utils import LanguageToolUtils
from lib.logger import LOGGER
from lib.constants import (TAGGER_BUILD_SCRIPT_PATH, FDIC_DIR, RESULT_POS_DICT_FILEPATH,
SORTED_POS_DICT_FILEPATH, POS_DICT_DIFF_FILEPATH, OLD_POS_DICT_FILEPATH, REPO_DIR,
TAGGER_DICT_DIR, LT_RESULTS_DIR)
from lib.shell_command import ShellCommand
from lib.utils import compile_lt_dev, install_dictionaries
from lib.utils import compile_lt_dev, install_dictionaries, pretty_time_delta
from lib.variant import Variant


@@ -59,6 +60,8 @@ def run_shell_script() -> None:


def main():
start_time = datetime.now()
LOGGER.debug(f"Started at {start_time.strftime('%r')}")
if FORCE_COMPILE:
compile_lt_dev()
run_shell_script()
@@ -67,6 +70,9 @@ def main():
lt.build_synth_binary()
if FORCE_INSTALL:
install_dictionaries(custom_version=CUSTOM_INSTALL_VERSION)
end_time = datetime.now()
LOGGER.debug(f"Finished at {end_time.strftime('%r')}. "
f"Total time elapsed: {pretty_time_delta(end_time - start_time)}.")


if __name__ == "__main__":
53 changes: 53 additions & 0 deletions scripts/validate_aff.py
@@ -0,0 +1,53 @@
"""This was mostly made by chatgpt but of course i had to fix it because AI is stoopid."""
import sys
import re


def validate_hunspell_aff(file_content):
lines = file_content.split('\n')
valid = True
errors = []

i = 0
while i < len(lines):
line = lines[i].strip()
if line.startswith("SFX") or line.startswith("PFX"):
parts = line.split()
if len(parts) >= 4 and parts[2] == 'Y':
rule_count = int(parts[3])
rule_type = parts[0]
rule_name = parts[1]
rule_lines = 0
rule_start_line = i
i += 1
same_block_pattern = re.compile(f"{rule_type}\\s+{rule_name}")
while i < len(lines) and same_block_pattern.search(lines[i]):
if not lines[i].strip().startswith("#"):
rule_lines += 1
i += 1

if rule_lines != rule_count:
valid = False
errors.append(f"Rule {rule_type} {rule_name} at line {rule_start_line + 1}: "
f"Expected {rule_count} rules, found {rule_lines}")
continue
i += 1

return valid, errors


def validate_hunspell_aff_file(filepath):
try:
with open(filepath, 'r', encoding='latin-1') as file:
file_content = file.read()
except FileNotFoundError:
return False, ["File not found."]
except UnicodeDecodeError:
return False, ["File encoding issue. Ensure the file is in LATIN-1 encoding."]
except Exception as e:
return False, [str(e)]
return validate_hunspell_aff(file_content)


if __name__ == '__main__':
print(validate_hunspell_aff_file(sys.argv[1]))
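
For context on what the script checks: in a Hunspell .aff file, every SFX/PFX header line announces how many rule lines follow for that flag, and the validator compares the announced count against the lines actually found. A hand-made illustrative block:

    SFX A Y 2            # header: suffix flag A, cross-product Y, 2 rule lines announced
    SFX A 0 s .          # rule line 1
    SFX A 0 es [sxz]     # rule line 2

Assumed invocation (the .aff path is illustrative), printing the (valid, errors) tuple:

    python scripts/validate_aff.py hunspell/pt_BR.aff
    # (True, [])
    # or, on a mismatch: (False, ['Rule SFX A at line 42: Expected 2 rules, found 3'])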
