From 2edf1d8bee4e1f592a79609d12bdfd3be326d099 Mon Sep 17 00:00:00 2001 From: sainekk Date: Thu, 14 May 2026 10:59:14 +0300 Subject: [PATCH 1/2] add ruff config and clean unused imports --- pyproject.toml | 22 +++++++++++++++++++++- src/harmony/matching/cluster.py | 2 -- src/harmony/matching/kmeans_clustering.py | 6 ------ src/harmony/matching/wmd_matcher.py | 1 - src/harmony/parsing/html_parser.py | 2 +- src/harmony/schemas/requests/text.py | 3 +-- src/harmony/services/export_pdf_report.py | 3 +-- src/harmony/util/url_loader.py | 2 +- 8 files changed, 25 insertions(+), 16 deletions(-) diff --git a/pyproject.toml b/pyproject.toml index 6c0a6dd8..727be93f 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -65,7 +65,27 @@ dependencies = [ [project.optional-dependencies] # dev - the developer dependency set, for contributors to harmony -dev = ["check-manifest", "pytest", "matplotlib"] +dev = ["check-manifest", "pytest", "matplotlib", "ruff"] + +[tool.ruff] +target-version = "py310" +line-length = 120 +extend-exclude = ["update.ipynb", "Harmony_example_walkthrough.ipynb"] + +[tool.ruff.lint] +# Pragmatic baseline for a legacy research codebase: +# enable pyflakes (real bugs), ignore stylistic noise that would require +# touching many files for no functional gain. +select = ["F", "E9"] +ignore = [ + "F403", # `from module import *` — used intentionally in __init__.py + "F405", # `*`-import undefined names — paired with F403 + "F841", # unused local — common in legacy code, low value to fix now +] + +[tool.ruff.lint.per-file-ignores] +"__init__.py" = ["F401"] # re-exports +"tests/*" = ["F401"] [project.urls] "Documentation" = "https://harmonydata.ac.uk/frequently-asked-questions/" diff --git a/src/harmony/matching/cluster.py b/src/harmony/matching/cluster.py index 09dc63cf..018cda39 100644 --- a/src/harmony/matching/cluster.py +++ b/src/harmony/matching/cluster.py @@ -8,9 +8,7 @@ from harmony.matching.default_matcher import convert_texts_to_vector from harmony.schemas.requests.text import Question -from harmony.schemas.responses.text import HarmonyCluster -import numpy as np from sklearn.metrics.pairwise import cosine_similarity from harmony.matching.deterministic_clustering import find_clusters_deterministic diff --git a/src/harmony/matching/kmeans_clustering.py b/src/harmony/matching/kmeans_clustering.py index de336b5f..20b18d0a 100644 --- a/src/harmony/matching/kmeans_clustering.py +++ b/src/harmony/matching/kmeans_clustering.py @@ -1,17 +1,11 @@ -import sys from typing import List -import pandas as pd from sklearn.cluster import KMeans -from sklearn.decomposition import PCA -from sklearn.metrics import silhouette_score from harmony.matching.generate_cluster_topics import generate_cluster_topics from harmony.schemas.requests.text import Question from harmony.schemas.responses.text import HarmonyCluster -import numpy as np -from sklearn.metrics.pairwise import cosine_similarity def perform_kmeans(embeddings_in, num_clusters=5): diff --git a/src/harmony/matching/wmd_matcher.py b/src/harmony/matching/wmd_matcher.py index 4fa829f9..09491844 100644 --- a/src/harmony/matching/wmd_matcher.py +++ b/src/harmony/matching/wmd_matcher.py @@ -1,4 +1,3 @@ -from wmd import WMD import numpy as np import math import libwmdrelax diff --git a/src/harmony/parsing/html_parser.py b/src/harmony/parsing/html_parser.py index 32157ea0..decbfe38 100644 --- a/src/harmony/parsing/html_parser.py +++ b/src/harmony/parsing/html_parser.py @@ -33,7 +33,7 @@ # Try to import lxml for better performance, fall back to html.parser try: - import lxml + import lxml # noqa: F401 # availability probe DEFAULT_PARSER = 'lxml' except ImportError: DEFAULT_PARSER = 'html.parser' diff --git a/src/harmony/schemas/requests/text.py b/src/harmony/schemas/requests/text.py index 045d3f09..c1becd68 100644 --- a/src/harmony/schemas/requests/text.py +++ b/src/harmony/schemas/requests/text.py @@ -32,8 +32,7 @@ from harmony.schemas.catalogue_question import CatalogueQuestion from harmony.schemas.enums.file_types import FileType from harmony.schemas.enums.languages import Language -from pydantic import ConfigDict, BaseModel, Field -from typing import Any, Dict, List, Optional +from typing import Any, Dict DEFAULT_FRAMEWORK = "huggingface" DEFAULT_MODEL = 'sentence-transformers/paraphrase-multilingual-MiniLM-L12-v2' diff --git a/src/harmony/services/export_pdf_report.py b/src/harmony/services/export_pdf_report.py index 448ce09e..f4a4d456 100644 --- a/src/harmony/services/export_pdf_report.py +++ b/src/harmony/services/export_pdf_report.py @@ -1,7 +1,6 @@ import os -import io from datetime import datetime -from typing import List, Optional, Tuple +from typing import List, Tuple import tempfile from fpdf import FPDF diff --git a/src/harmony/util/url_loader.py b/src/harmony/util/url_loader.py index 0f3adc07..ccb47805 100644 --- a/src/harmony/util/url_loader.py +++ b/src/harmony/util/url_loader.py @@ -90,7 +90,7 @@ def _validate_url(self, url: str) -> None: parsed = urllib.parse.urlparse(url) if parsed.scheme not in ALLOWED_SCHEMES: - raise BadRequestError(f"URL must use HTTPS") + raise BadRequestError("URL must use HTTPS") if not parsed.netloc or '.' not in parsed.netloc: raise BadRequestError("Invalid domain") From 1e98f07a3b15553f1b00db47150b8adcc11ad456 Mon Sep 17 00:00:00 2001 From: sainekk Date: Fri, 22 May 2026 18:55:00 +0300 Subject: [PATCH 2/2] consolidate duplicate typing imports --- src/harmony/schemas/requests/text.py | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/src/harmony/schemas/requests/text.py b/src/harmony/schemas/requests/text.py index c1becd68..5106a540 100644 --- a/src/harmony/schemas/requests/text.py +++ b/src/harmony/schemas/requests/text.py @@ -26,13 +26,12 @@ ''' import uuid -from typing import List, Optional +from typing import Any, Dict, List, Optional from pydantic import ConfigDict, BaseModel, Field from harmony.schemas.catalogue_instrument import CatalogueInstrument from harmony.schemas.catalogue_question import CatalogueQuestion from harmony.schemas.enums.file_types import FileType from harmony.schemas.enums.languages import Language -from typing import Any, Dict DEFAULT_FRAMEWORK = "huggingface" DEFAULT_MODEL = 'sentence-transformers/paraphrase-multilingual-MiniLM-L12-v2'