In [1]:
%reload_ext autoreload
%autoreload 2

In [2]:
import copy
from pprint import pprint
from pathlib import Path
from typing import Any, Dict, List, Set, Tuple
import spacy
import srsly
# import recon
from recon.corpus import Corpus
from recon.constants import NONE
from recon.corrections import fix_annotations
from recon.dataset import Dataset
from recon.loaders import read_jsonl
from recon.types import Correction, Example, PredictionError, HardestExample, NERStats, EntityCoverageStats, EntityCoverage, Transformation, TransformationType, OperationState
from recon.stats import (
    get_ner_stats, get_entity_coverage, get_sorted_type_counts, get_probs_from_counts, entropy,
    calculate_entity_coverage_entropy, calculate_label_balance_entropy, calculate_label_distribution_similarity,
    detect_outliers
)
import recon.tokenization as tokenization
from recon.insights import get_ents_by_label, get_label_disparities, top_prediction_errors, top_label_disparities, get_hardest_examples
from recon.recognizer import SpacyEntityRecognizer
from recon.operations import registry
from recon.store import ExampleStore

{'version': 1, 'disable_existing_loggers': False, 'formatters': {'default': {'()': 'uvicorn.logging.DefaultFormatter', 'fmt': '%(levelprefix)s %(message)s', 'use_colors': None}, 'access': {'()': 'uvicorn.logging.AccessFormatter', 'fmt': '%(levelprefix)s %(client_addr)s - "%(request_line)s" %(status_code)s'}}, 'handlers': {'default': {'formatter': 'default', 'class': 'logging.StreamHandler', 'stream': 'ext://sys.stderr'}, 'access': {'formatter': 'access', 'class': 'logging.StreamHandler', 'stream': 'ext://sys.stdout'}}, 'loggers': {'uvicorn': {'handlers': ['default'], 'level': 'INFO'}, 'uvicorn.error': {'level': 'INFO'}, 'uvicorn.access': {'handlers': ['access'], 'level': 'INFO', 'propagate': False}}}


In [4]:
# TODO: Fix Dataset loading with different file names
# train = Dataset("train").from_disk("./data/fashion_brands/fashion_brands_training.jsonl")
# dev = Dataset("dev").from_disk("./data/fashion_brands/fashion_brands_eval.jsonl")

corpus = Corpus.from_disk("./data/fashion_brands/", "fashion_brands")

In [23]:
ec = corpus.apply(get_entity_coverage, case_sensitive=True)
ec.all[:5]

[EntityCoverage(text='Nike', label='FASHION_BRAND', count=18, examples=[]),
 EntityCoverage(text='Uniqlo', label='FASHION_BRAND', count=18, examples=[]),
 EntityCoverage(text='Bonobos', label='FASHION_BRAND', count=12, examples=[]),
 EntityCoverage(text='Madewell', label='FASHION_BRAND', count=10, examples=[]),
 EntityCoverage(text='Allen Edmonds', label='FASHION_BRAND', count=9, examples=[])]

In [37]:
unique_ec = {e.text for e in ec.train}
len(unique_ec)

320

In [38]:
# source: https://www.apparelsearch.com/wholesale_clothing/popular_brand_names_clothes.htm
extra_brands = ["Adidas", "Aeffe S.P.A", "Agatha", "Agnes B", "", "Anna Osmushkina", "Anna Sui", "Aquascutum", "Armani Exchange", "Austin Reed", "Avirex", "BCBG", "Benetton", "Bisou-Bisou", "Body Glove", "Bogner", "Burton", "Brioni", "Calvin Klein", "Cesarani", "Champion", "Chanel", "Christian Dior", "", "Christian Lacoix", "Claiborne", "Club Monaco", "Columbia", "Converse", "Courrages", "Cutter & ", "Buck", "Diesel", "Dockers", "", "Dolce & Gabbana", "Donna Karan", "Ecco", "Ecko", "Eddie Bauer", "Ellesse", "Eliott & ", "Lucca", "Energie", "Esprit", "Everlast", "Fia Miami", "Fila", "Fiorelli", "", "Fratelli Corneliani", "Fred Perry", "Fruit of the ", "Loom", "Fubu", "", "Gianfranco Ferre", "Gianni Versace", "Giorgio Armani", "Gucci", "Guess", "Helly Hansen", "Hugo Boss", "J. Crew", "Izod", "Jitrois", "Jennifer Lopez", "", "Jenny Yoo", "Jhane Barnes", "Joe Boxer", "", "John Smedley", "Jordache", "Kenneth Cole ", "/ Reaction", "Lacoste", "Land's End", "", "La Perla", "Laura Ashley", "Lee", "Le Tigre", "Levi's", "Liz Claiborne", "L.L Bean", "", "Louis Feraud", "Lucky Brand ", "Jeans", "", "Madeleine Vionnet", "Mango", "Marc Jacobs", "", "Marcia Grachvogel", "", "Marianne Alvoni", "", "Michael Kors", "Moschino", "Mudd", "Munsingwear", "Nancy LordNew Balance", "Nicole Miller", "Nike", "", "Norma Kamali", "Oky-coky", "Oilily", "", "Olivier Strelli", "Oneill", "OP", "", "OshKosh B'Gosh", "Paul Fredrick", "Paul Shark", "Paul Smith", "", "Pelle Pelle", "Pepe Jeans", "Perry Ellis", "", "Perry Landhaus", "Pierre Cardin", "", "Pierre Garroudi", "Prada", "Puma", "Quiksilver", "Ralph Lauren", "Rampage", "Red Monkey", "Red or Dead", "Roberto Angelico", "Rocawear", "Russell", "Savane", "Salvatore J. ", "Cesarani", "Sean John", "Sinequanone", "Sisley", "Southpole", "Speedo", "Steven Alan", "Swatch", "Timberland", "Todd Oldham", "Tommy Hilfiger", "Van Heusen", "Vans", "Versace", "Vokal", "Wrangler", "Yves Saint ", "Laurent", "", "Z. Cavaricci", "Zanetti", "Zero"]
extra_brands = {eb for eb in extra_brands if eb != ""}
len(extra_brands)

141

In [39]:
unique_ec.update(extra_brands)

In [40]:
len(unique_ec)

444

In [26]:
from typing import Any, Callable, Dict, List, Optional

from recon.dataset import Dataset
from pydantic import root_validator
from recon.types import Example, Span, Token
import numpy as np
from recon.augmentation import augment_example
from recon.operations import operation, registry
from recon.preprocess import SpacyPreProcessor

import names
from snorkel.augmentation import transformation_function
from snorkel.preprocess.nlp import SpacyPreprocessor
from recon.preprocess import SpacyPreProcessor
import spacy

In [27]:
def substitute_spans(example: Example, span_subs: Dict[Span, str]) -> Example:
    """Substitute spans in an example. Replaces span text and alters the example text
    and span offsets to create a valid example.

    Args:
        example (Example): Input example
        span_subs (Dict[int, str]): Mapping of span hash to a str replacement text

    Returns:
        Example: Output example with substituted spans
    """
    span_sub_start_counter = 0

    new_example_text = example.text
    new_example_spans = []
    
    prev_example_spans = {hash(span) for span in example.spans}
    spans = sorted(set(list(span_subs.keys()) + example.spans), key=lambda s: s.start)
        
    for span in spans:
        should_add_span = hash(span) in prev_example_spans
        
        prev_end = span.end
        new_text = span.text

        if span in span_subs:
            new_text = span_subs[span]
            new_start = span.start + span_sub_start_counter
            new_end = new_start + len(new_text)

            new_example_text = (
                new_example_text[: span.start + span_sub_start_counter]
                + new_text
                + new_example_text[span.end + span_sub_start_counter :]
            )

            span.text = new_text
            span.start = new_start
            span.end = new_end
            
            span_sub_start_counter += new_end - prev_end
        else:
            span.start += span_sub_start_counter
            span.end = span.start + len(new_text)
            span_sub_start_counter = span.end - prev_end

        span.text = new_text
        
        if should_add_span:
            new_example_spans.append(span)
        
    example.text = new_example_text
    example.spans = new_example_spans

    return example

In [28]:
np.random.seed(0)

def augment_example(
    example: Example,
    span_f: Callable[[Span, Any], Optional[str]],
    spans: List[Span] = None,
    span_label: str = None,
    **kwargs: Any,
) -> List[Example]:

    if spans is None:
        spans = example.spans

    prev_example_hash = hash(example)
    example_t = None

    if span_label:
        spans = [s for s in spans if s.label == span_label]

    if spans:
        spans_to_sub = [np.random.choice(spans)]

        span_subs = {}
        for span in spans_to_sub:
            res = span_f(span, **kwargs)  #  type: ignore
            if res:
                span_subs[span] = res

        if any(span_subs.values()):
            res = substitute_spans(example, span_subs)
            if hash(res) != prev_example_hash:
                example_t = res

    return example_t

In [53]:
np.random.seed(0)


def ent_label_sub(
    example: Example, label: str, subs: List[str]
) -> List[Example]:
    
    def augmentation_f(span: Span, subs: List[str]) -> Optional[str]:
        subs = [s for s in subs if s != span.text]
        sub = None
        if len(subs) > 0:
            sub = np.random.choice(subs)
        return sub

    return augment_example(example, span_f=augmentation_f, span_label=label, subs=subs)


# replacement_names = [names.get_full_name() for _ in range(50)]


@transformation_function()
def brand_sub(example: Example):
    return ent_label_sub(example.copy(deep=True), label="FASHION_BRAND", subs=list(unique_ec))


# @transformation_function()
# def person_sub(example: Example):
#     return ent_label_sub(example.copy(deep=True), label="PERSON", subs=replacement_names)

@transformation_function()
def gpe_sub(example: Example):
    return ent_label_sub(example.copy(deep=True), label="GPE", subs=["Russia", "USA", "China"])

In [54]:
tfs = [
    brand_sub,
#     person_sub,
    gpe_sub
]

# np.random.seed(0)

# from snorkel.augmentation import ApplyOnePolicy, RandomPolicy

# random_policy = RandomPolicy(
#     len(tfs), sequence_length=2, n_per_original=2, keep_original=True
# )

# random_policy.generate_for_example()


policy = ApplyOnePolicy()
policy.generate_for_example()

[[], [0]]

In [55]:
from tqdm import tqdm

from snorkel.augmentation.apply.core import BaseTFApplier


class ReconDatasetTFApplier(BaseTFApplier):
    
    def __init__(self, tfs, policy, span_label: str = None, sub_prob: float = 0.5):
        super().__init__(tfs, policy)
        self.span_label = span_label
        self.sub_prob = sub_prob
    
    def _apply_policy_to_data_point(self, x: Example) -> List[Example]:
        
        x_transformed = set()
        for seq in self._policy.generate_for_example():
            x_t = x.copy(deep=True)
            # Handle empty sequence for `keep_original`
            transform_applied = len(seq) == 0
            # Apply TFs
            for tf_idx in seq:
                tf = self._tfs[tf_idx]                
                x_t_or_none = tf(x_t)
                # Update if transformation was applied
                if x_t_or_none is not None:
                    transform_applied = True
                    x_t = x_t_or_none.copy(deep=True)
            # Add example if original or transformations applied
            if transform_applied:
                x_transformed.add(x_t)
        return list(x_transformed)


    def apply(self, ds: Dataset, progress_bar: bool = True) -> Dataset:
        
        @operation("recon.v1.augment")
        def augment(example: Example):
            transformed_examples = self._apply_policy_to_data_point(example)
            return transformed_examples
            
        ds.apply_("recon.v1.augment")
        
        return ds

In [56]:
len(corpus.train_ds)

1235

In [58]:
np.random.seed(0)

applier = ReconDatasetTFApplier(tfs, policy)
applier.apply(corpus.train_ds)

  1%|          | 8/1235 [00:00<00:17, 72.00it/s]

=> Applying operation 'recon.v1.augment' to dataset 'train'


100%|██████████| 1235/1235 [00:15<00:00, 81.63it/s]

[38;5;2m✔ Completed operation 'recon.v1.augment'[0m





<recon.dataset.Dataset at 0x7f4874422b38>

In [59]:
len(corpus.train_ds)

1540