# In [4]:

"""
A Python-based n-gram target finder for a Rust-based classifier engine.

This revised version:
1. Properly handles frequency and uniqueness calculations
2. Saves results to a JSON file instead of printing
3. Includes input validation
4. Adds error handling and logging
5. Converts ngrams to UTF-8 byte patterns
6. Includes metadata in the output

The output JSON file will look something like:
```json
{
  "metadata": {
    "min_frequency": 2,
    "min_uniqueness": 0.7,
    "ngram_range": [1, 2]
  },
  "targets": {
    "class1": {
      "label": "class1",
      "targets": [
        {
          "text": "sample",
          "weight": 2.8,
          "frequency": 4,
          "uniqueness": 0.7,
          "bytes_pattern": "73616d706c65"
        },
        ...
      ]
    },
    "class2": {
      ...
    }
  }
}
```
"""
import json
import logging
from dataclasses import dataclass, asdict
from pathlib import Path
from typing import Dict, List, Optional, Tuple

import numpy as np
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.model_selection import KFold

@dataclass
class NGramTarget:
    """A single n-gram selected as a target, together with its scores."""

    text: str  # the n-gram text
    weight: float  # ranking score for this n-gram
    frequency: int  # occurrence count for this n-gram
    uniqueness: float  # uniqueness score for this n-gram
    bytes_pattern: Optional[bytes] = None  # UTF-8 bytes representation

    def to_dict(self):
        """Return a JSON-serialisable dict describing this target.

        Numeric fields are coerced to built-in ``float``/``int`` so that
        numpy scalars do not break ``json.dump``. ``bytes_pattern`` is
        emitted as a lowercase hex string, or ``None`` when no pattern
        was supplied.
        """
        # Test against None explicitly: an empty pattern (b"") is still a
        # pattern and must serialise to "" rather than be reported as missing.
        if self.bytes_pattern is None:
            pattern_hex = None
        else:
            pattern_hex = self.bytes_pattern.hex()

        return {
            'text': self.text,
            'weight': float(self.weight),  # Convert numpy types to native
            'frequency': int(self.frequency),
            'uniqueness': float(self.uniqueness),
            'bytes_pattern': pattern_hex,
        }

@dataclass
class LabelTargets:
    """The collection of n-gram targets attached to one label."""

    label: str  # label these targets belong to
    targets: List[NGramTarget]  # targets in the order they should be emitted

    def to_dict(self):
        """Return a dict with the label and each target's ``to_dict()`` output.

        The targets are serialised in list order.
        """
        serialised = []
        for target in self.targets:
            serialised.append(target.to_dict())
        return {'label': self.label, 'targets': serialised}

class NGramTargetFinder:
    """Find n-grams that are frequent in one label and rare in the others.

    Documents are vectorised with ``CountVectorizer``; for every label each
    n-gram gets a frequency (occurrences in that label's documents) and a
    uniqueness score (its share of occurrences relative to the most
    frequent competing label). N-grams passing both thresholds are written
    to a JSON file under ``output_path``.
    """

    def __init__(
        self,
        min_frequency: int = 3,
        min_uniqueness: float = 0.8,
        ngram_range: Tuple[int, int] = (1, 3),
        n_folds: int = 5,
        output_path: Path = Path('ngram_targets')
    ):
        """Store the thresholds and make sure the output directory exists.

        Args:
            min_frequency: Minimum occurrences of an n-gram within a label.
            min_uniqueness: Minimum uniqueness score (0..1) for a target.
            ngram_range: ``(min_n, max_n)`` passed to ``CountVectorizer``.
            n_folds: Stored on the instance; no method of this class reads it.
            output_path: Directory for result files; a ``str`` is accepted
                as well as a ``Path``.
        """
        self.min_frequency = min_frequency
        self.min_uniqueness = min_uniqueness
        self.ngram_range = ngram_range
        self.n_folds = n_folds
        # Normalise so plain strings work too.
        self.output_path = Path(output_path)
        # parents=True lets nested output directories be created in one call.
        self.output_path.mkdir(parents=True, exist_ok=True)

        logging.basicConfig(level=logging.INFO)
        self.logger = logging.getLogger(__name__)

    def find_and_save_targets(
        self,
        documents: List[str],
        labels: List[str],
        output_name: str
    ) -> bool:
        """Validate the inputs, find targets and write them to JSON.

        Args:
            documents: Raw text documents.
            labels: One label per document, same order as ``documents``.
            output_name: File name (without ``.json``) inside ``output_path``.

        Returns:
            ``True`` when a file was written; ``False`` on invalid input,
            when no targets were found, or when any exception occurred
            (the exception is logged, not re-raised).
        """
        try:
            if not self._validate_inputs(documents, labels):
                return False

            targets = self.find_targets(documents, labels)
            # Covers both an empty result and the None returned on error.
            if not targets:
                self.logger.error("No targets found")
                return False

            self._save_targets(targets, output_name)
            return True

        except Exception as e:
            self.logger.error(f"Error in find_and_save_targets: {str(e)}")
            return False

    def _validate_inputs(self, documents: List[str], labels: List[str]) -> bool:
        """Return True when the inputs are usable, logging the reason if not.

        Inputs are rejected when either list is empty, when their lengths
        differ, or when fewer than two distinct labels are present.
        """
        if not documents or not labels:
            self.logger.error("Empty documents or labels")
            return False
        if len(documents) != len(labels):
            self.logger.error("Mismatched documents and labels lengths")
            return False
        if len(set(labels)) < 2:
            self.logger.error("Need at least two different labels")
            return False
        return True

    def find_targets(
        self,
        documents: List[str],
        labels: List[str]
    ) -> "Optional[Dict[str, LabelTargets]]":
        """Find n-gram targets for each label.

        Args:
            documents: Raw text documents.
            labels: One label per document, same order as ``documents``.

        Returns:
            A mapping of label to its targets; labels with no qualifying
            n-gram are omitted. ``None`` is returned if any exception is
            raised while vectorising or scoring (the error is logged).
        """
        try:
            vectorizer = CountVectorizer(ngram_range=self.ngram_range)
            X = vectorizer.fit_transform(documents)
            ngrams = vectorizer.get_feature_names_out()

            # Calculate base frequencies for all labels
            label_frequencies = self._calculate_base_frequencies(X, labels, ngrams)

            # Calculate uniqueness scores
            return self._calculate_targets(label_frequencies, ngrams)

        except Exception as e:
            self.logger.error(f"Error in find_targets: {str(e)}")
            return None

    def _calculate_base_frequencies(
        self,
        X,
        labels: List[str],
        ngrams: List[str]
    ) -> Dict[str, Dict[str, int]]:
        """Calculate raw n-gram frequencies for each label.

        Args:
            X: Document-term count matrix with one row per document and one
                column per entry of ``ngrams``; must support boolean row
                indexing and ``sum(axis=0)``.
            labels: One label per row of ``X``.
            ngrams: Column names of ``X``.

        Returns:
            ``{label: {ngram: count}}`` with labels in order of first
            appearance in ``labels``.
        """
        # Build the comparison array once instead of once per label.
        label_array = np.array(labels)

        frequencies = {}
        # dict.fromkeys de-duplicates while keeping first-appearance order,
        # so the output does not depend on set iteration order.
        for label in dict.fromkeys(labels):
            label_docs = X[label_array == label]
            # np.asarray(...).ravel() flattens both the np.matrix returned by
            # scipy sparse matrices and the ndarray returned by dense input.
            column_totals = np.asarray(label_docs.sum(axis=0)).ravel()
            frequencies[label] = {
                ngram: int(freq)
                for ngram, freq in zip(ngrams, column_totals)
            }
        return frequencies

    def _calculate_targets(
        self,
        label_frequencies: Dict[str, Dict[str, int]],
        ngrams: List[str]
    ) -> "Dict[str, LabelTargets]":
        """Score every n-gram per label and keep those passing both thresholds.

        For an n-gram with frequency ``f`` in the label and ``o`` in the
        competing label where it is most frequent, uniqueness is
        ``f / (f + o)`` and weight is ``f * uniqueness``.

        Args:
            label_frequencies: ``{label: {ngram: count}}``.
            ngrams: All candidate n-grams.

        Returns:
            ``{label: LabelTargets}`` with targets sorted by descending
            weight; labels without any qualifying n-gram are omitted.
        """
        targets = {}

        for label, own_freqs in label_frequencies.items():
            rival_freqs = [
                freqs
                for other_label, freqs in label_frequencies.items()
                if other_label != label
            ]
            ngram_targets = []

            for ngram in ngrams:
                label_freq = own_freqs.get(ngram, 0)
                # Cheap test first: most n-grams fail it, which avoids
                # scanning the other labels for them.
                if label_freq < self.min_frequency:
                    continue

                # default=0 keeps a single-label input from raising
                # ValueError on an empty sequence.
                other_freq = max(
                    (freqs.get(ngram, 0) for freqs in rival_freqs),
                    default=0,
                )

                total = label_freq + other_freq
                uniqueness = label_freq / total if total > 0 else 0
                if uniqueness >= self.min_uniqueness:
                    ngram_targets.append(
                        NGramTarget(
                            text=ngram,
                            weight=label_freq * uniqueness,
                            frequency=label_freq,
                            uniqueness=uniqueness,
                            bytes_pattern=ngram.encode('utf-8')
                        )
                    )

            if ngram_targets:
                ngram_targets.sort(key=lambda x: x.weight, reverse=True)
                targets[label] = LabelTargets(label=label, targets=ngram_targets)

        return targets

    def _save_targets(self, targets: "Dict[str, LabelTargets]", output_name: str):
        """Write the targets plus run metadata to ``<output_path>/<output_name>.json``.

        Args:
            targets: Mapping of label to an object exposing ``to_dict()``.
            output_name: File name without the ``.json`` suffix.
        """
        output_file = self.output_path / f"{output_name}.json"

        # Convert to serializable format
        output_data = {
            'metadata': {
                'min_frequency': self.min_frequency,
                'min_uniqueness': self.min_uniqueness,
                'ngram_range': self.ngram_range
            },
            'targets': {
                label: label_targets.to_dict()
                for label, label_targets in targets.items()
            }
        }

        with open(output_file, 'w', encoding='utf-8') as f:
            json.dump(output_data, f, indent=2)

        self.logger.info(f"Saved targets to {output_file}")

# Example run on a small toy corpus, repeated so the frequency threshold is met.
if __name__ == "__main__":
    repeats = 3

    base_documents = [
        "this is a sample document",
        "another example document",
        "yet another document example",
        "this is different",
    ]
    base_labels = ["class1", "class2", "class2", "class1"]

    documents = base_documents * repeats
    labels = base_labels * repeats

    finder = NGramTargetFinder(min_frequency=2, min_uniqueness=0.7, ngram_range=(1, 2))

    # Results are written to "targets.json" inside the finder's output directory.
    saved = finder.find_and_save_targets(documents, labels, "targets")
    if saved:
        print("Target finding completed successfully")

# Output: Target finding completed successfully
