In [3]:
%reload_ext autoreload
%autoreload 2

In [4]:
import copy
from pprint import pprint
from pathlib import Path
from typing import Any, Dict, List, Set, Tuple
import spacy
import srsly
# import recon
from recon.corpus import Corpus
from recon.constants import NONE
from recon.corrections import fix_annotations
from recon.dataset import Dataset
from recon.loaders import read_jsonl
from recon.types import Correction, Example, PredictionError, HardestExample, NERStats, EntityCoverageStats, EntityCoverage, Transformation, TransformationType, OperationState
from recon.stats import (
    get_ner_stats, get_entity_coverage, get_sorted_type_counts, get_probs_from_counts, entropy,
    calculate_entity_coverage_entropy, calculate_label_balance_entropy, calculate_label_distribution_similarity,
    detect_outliers
)
import recon.tokenization as tokenization
from recon.insights import get_ents_by_label, get_label_disparities, top_prediction_errors, top_label_disparities, get_hardest_examples
from recon.recognizer import SpacyEntityRecognizer
from recon.operations import registry
from recon.store import ExampleStore

In [42]:
# TODO: Fix Dataset loading with different file names
# train = Dataset("train").from_disk("./data/fashion_brands/fashion_brands_training.jsonl")
# dev = Dataset("dev").from_disk("./data/fashion_brands/fashion_brands_eval.jsonl")

corpus = Corpus.from_disk("./data/fashion_brands/", "fashion_brands")

In [43]:
print(corpus.apply(get_ner_stats, serialize=True).train)
print(corpus.apply(get_ner_stats, serialize=True).dev)
print(corpus.apply(get_ner_stats, serialize=True).all)

{
    "n_examples":1235,
    "n_examples_no_entities":930,
    "n_annotations":527,
    "n_annotations_per_type":{
        "FASHION_BRAND":527
    },
    "examples_with_type":null
}
{
    "n_examples":500,
    "n_examples_no_entities":371,
    "n_annotations":238,
    "n_annotations_per_type":{
        "FASHION_BRAND":238
    },
    "examples_with_type":null
}
{
    "n_examples":1735,
    "n_examples_no_entities":1301,
    "n_annotations":765,
    "n_annotations_per_type":{
        "FASHION_BRAND":765
    },
    "examples_with_type":null
}


In [9]:
ec = corpus.apply(get_entity_coverage, case_sensitive=True)
ec.train[:5]

[EntityCoverage(text='Nike', label='FASHION_BRAND', count=11, examples=[]),
 EntityCoverage(text='Uniqlo', label='FASHION_BRAND', count=11, examples=[]),
 EntityCoverage(text='Madewell', label='FASHION_BRAND', count=8, examples=[]),
 EntityCoverage(text='Bonobos', label='FASHION_BRAND', count=7, examples=[]),
 EntityCoverage(text='Gucci', label='FASHION_BRAND', count=7, examples=[])]

In [10]:
unique_ec = {e.text for e in ec.train}
len(unique_ec)

320

In [11]:
# source: https://www.apparelsearch.com/wholesale_clothing/popular_brand_names_clothes.htm
extra_brands = ["Adidas", "Aeffe S.P.A", "Agatha", "Agnes B", "", "Anna Osmushkina", "Anna Sui", "Aquascutum", "Armani Exchange", "Austin Reed", "Avirex", "BCBG", "Benetton", "Bisou-Bisou", "Body Glove", "Bogner", "Burton", "Brioni", "Calvin Klein", "Cesarani", "Champion", "Chanel", "Christian Dior", "", "Christian Lacoix", "Claiborne", "Club Monaco", "Columbia", "Converse", "Courrages", "Cutter & ", "Buck", "Diesel", "Dockers", "", "Dolce & Gabbana", "Donna Karan", "Ecco", "Ecko", "Eddie Bauer", "Ellesse", "Eliott & ", "Lucca", "Energie", "Esprit", "Everlast", "Fia Miami", "Fila", "Fiorelli", "", "Fratelli Corneliani", "Fred Perry", "Fruit of the ", "Loom", "Fubu", "", "Gianfranco Ferre", "Gianni Versace", "Giorgio Armani", "Gucci", "Guess", "Helly Hansen", "Hugo Boss", "J. Crew", "Izod", "Jitrois", "Jennifer Lopez", "", "Jenny Yoo", "Jhane Barnes", "Joe Boxer", "", "John Smedley", "Jordache", "Kenneth Cole ", "/ Reaction", "Lacoste", "Land's End", "", "La Perla", "Laura Ashley", "Lee", "Le Tigre", "Levi's", "Liz Claiborne", "L.L Bean", "", "Louis Feraud", "Lucky Brand ", "Jeans", "", "Madeleine Vionnet", "Mango", "Marc Jacobs", "", "Marcia Grachvogel", "", "Marianne Alvoni", "", "Michael Kors", "Moschino", "Mudd", "Munsingwear", "Nancy LordNew Balance", "Nicole Miller", "Nike", "", "Norma Kamali", "Oky-coky", "Oilily", "", "Olivier Strelli", "Oneill", "OP", "", "OshKosh B'Gosh", "Paul Fredrick", "Paul Shark", "Paul Smith", "", "Pelle Pelle", "Pepe Jeans", "Perry Ellis", "", "Perry Landhaus", "Pierre Cardin", "", "Pierre Garroudi", "Prada", "Puma", "Quiksilver", "Ralph Lauren", "Rampage", "Red Monkey", "Red or Dead", "Roberto Angelico", "Rocawear", "Russell", "Savane", "Salvatore J. ", "Cesarani", "Sean John", "Sinequanone", "Sisley", "Southpole", "Speedo", "Steven Alan", "Swatch", "Timberland", "Todd Oldham", "Tommy Hilfiger", "Van Heusen", "Vans", "Versace", "Vokal", "Wrangler", "Yves Saint ", "Laurent", "", "Z. Cavaricci", "Zanetti", "Zero"]
extra_brands = {eb for eb in extra_brands if eb != ""}
len(extra_brands)

141

In [12]:
unique_ec.update(extra_brands)

In [13]:
len(unique_ec)

444

In [46]:
from typing import Any, Callable, Dict, List, Optional

from recon.dataset import Dataset
from pydantic import root_validator
from recon.types import Example, Span, Token
import numpy as np
from recon.augmentation import substitute_spans
from recon.operations import operation, registry
from recon.preprocess import SpacyPreProcessor

import names
from snorkel.augmentation import transformation_function, ApplyAllPolicy
from snorkel.preprocess.nlp import SpacyPreprocessor
from recon.preprocess import SpacyPreProcessor
import spacy

In [53]:
nlp = spacy.load("en_core_web_sm")
spacy_pre = SpacyPreProcessor(nlp)

In [45]:
np.random.seed(0)

def augment_example(
    example: Example,
    span_f: Callable[[Span, Any], Optional[str]],
    spans: List[Span] = None,
    span_label: str = None,
    **kwargs: Any,
) -> List[Example]:

    if spans is None:
        spans = example.spans

    prev_example_hash = hash(example)
    example_t = None

    if span_label:
        spans = [s for s in spans if s.label == span_label]

    if spans:
        spans_to_sub = [np.random.choice(spans)]

        span_subs = {}
        for span in spans_to_sub:
            res = span_f(span, **kwargs)  #  type: ignore
            if res:
                span_subs[span] = res

        if any(span_subs.values()):
            res = substitute_spans(example, span_subs)
            if hash(res) != prev_example_hash:
                example_t = res

    return example_t

In [23]:
np.random.seed(0)


def ent_label_sub(
    example: Example, label: str, subs: List[str]
) -> Optional[Example]:
    
    def augmentation_f(span: Span, subs: List[str]) -> Optional[str]:
        subs = [s for s in subs if s != span.text]
        sub = None
        if len(subs) > 0:
            sub = np.random.choice(subs)
        return sub

    return augment_example(example, span_f=augmentation_f, span_label=label, subs=subs)


# replacement_names = [names.get_full_name() for _ in range(50)]


@transformation_function()
def brand_sub(example: Example):
    return ent_label_sub(example.copy(deep=True), label="FASHION_BRAND", subs=list(unique_ec))


# @transformation_function()
# def person_sub(example: Example):
#     return ent_label_sub(example.copy(deep=True), label="PERSON", subs=replacement_names)

@transformation_function()
def gpe_sub(example: Example):
    return ent_label_sub(example.copy(deep=True), label="GPE", subs=["Russia", "USA", "China"])

In [None]:
def kb_sub(
    example: Example, spans_to_aliases_map: Dict[Span, str]
) -> Optional[Example]:
    
    def augmentation_f(span: Span, spans_to_aliases_map: Dict[Span, List[str]]) -> Optional[str]:
        sub = None
        if span in spans_to_aliases_map:
            aliases = spans_to_aliases_map[span]

            if len(aliases) > 0:
                rand_alias = np.random.choice(aliases)
                index = aliases.index(rand_alias)
                del spans_to_aliases_map[span][index]
                sub = rand_alias

        return sub

    return augment_example(example, span_f=augmentation_f, span_label=label, subs=subs)


# @transformation_function()
# def skills_sub(example: Example):
    

In [90]:
from nltk.corpus import wordnet as wn

def get_synonym(word, pos=None):
    """Get synonym for word given its part-of-speech (pos)."""
    synsets = wn.synsets(word, pos=pos)
    # Return None if wordnet has no synsets (synonym sets) for this word and pos.
    if synsets:
        words = [lemma.name() for lemma in synsets[0].lemmas()]
        if words[0].lower() != word.lower():  # Skip if synonym is same as word.
            # Multi word synonyms in wordnet use '_' as a separator e.g. reckon_with. Replace it with space.
            return words[0].replace("_", " ")


# @operation("recon.v1.augment.replace_pos_with_synonym", pre=[spacy_pre])
def replace_pos_with_synonym(
    example: Example, 
    pos: str, 
    synonym_f: Callable[[str], str] = get_synonym
):

    pos_map = {
        "VERB": "v",
        "NOUN": "n",
        "ADJ": "a"
    }
    
    if pos not in pos_map:
        raise ValueError(f"Argument `pos` of {pos} not in {''.join(pos_map.keys())}")

    doc = example.doc
    span_starts = [s.start for s in example.spans]

    # Get indices of verb tokens in sentence.
    pos_idxs = [i for i, token in enumerate(doc) if token.pos_ == pos and token.idx not in span_starts]
    tokens = [doc[idx] for idx in pos_idxs]
    spans = [Span(text=token.text, start=token.idx, end=token.idx + len(token.text), label="") for token in tokens]

    def augmentation_f(span: Span, synonym_f: Callable[[str], str] = synonym_f) -> Optional[str]:
        return synonym_f(span.text)

    return augment_example(
        example,
        augmentation_f,
        spans=spans,
    )
    

spacy_pre = SpacyPreprocessor(text_field="text", doc_field="doc")    

@transformation_function(pre=[spacy_pre])
def replace_verb_with_synonym(example: Example):
    return replace_pos_with_synonym(example, "VERB")

@transformation_function(pre=[spacy_pre])
def replace_noun_with_synonym(example: Example):
    return replace_pos_with_synonym(example, "NOUN")

@transformation_function(pre=[spacy_pre])
def replace_adj_with_synonym(example: Example):
    return replace_pos_with_synonym(example, "ADJ")



In [92]:
tfs = [
    brand_sub,
    replace_verb_with_synonym,
    replace_noun_with_synonym,
    replace_adj_with_synonym,
#     person_sub,
#     gpe_sub
]

np.random.seed(0)

from snorkel.augmentation import ApplyOnePolicy, RandomPolicy

random_policy = RandomPolicy(
    len(tfs), sequence_length=3, n_per_original=2, keep_original=True
)

random_policy.generate_for_example()


# policy = ApplyAllPolicy(len(tfs))
# policy.generate_for_example()

[[], [0, 3, 1], [0, 3, 3]]

In [93]:
from tqdm import tqdm

from snorkel.augmentation.apply.core import BaseTFApplier


class ReconDatasetTFApplier(BaseTFApplier):
    
    def __init__(self, tfs, policy, span_label: str = None, sub_prob: float = 0.5):
        super().__init__(tfs, policy)
        self.span_label = span_label
        self.sub_prob = sub_prob
    
    def _apply_policy_to_data_point(self, x: Example) -> List[Example]:
        
        x_transformed = set()
        for seq in self._policy.generate_for_example():
            x_t = x.copy(deep=True)
            # Handle empty sequence for `keep_original`
            transform_applied = len(seq) == 0
            # Apply TFs
            for tf_idx in seq:
                tf = self._tfs[tf_idx]                
                x_t_or_none = tf(x_t)
                # Update if transformation was applied
                if x_t_or_none is not None:
                    transform_applied = True
                    x_t = x_t_or_none.copy(deep=True)
            # Add example if original or transformations applied
            if transform_applied:
                x_transformed.add(x_t)
        return list(x_transformed)


    def apply(self, ds: Dataset, progress_bar: bool = True) -> Dataset:
        
        @operation("recon.v1.augment")
        def augment(example: Example):
            transformed_examples = self._apply_policy_to_data_point(example)
            return transformed_examples
            
        ds.apply_("recon.v1.augment")
        
        return ds

In [94]:
len(corpus.train_ds)

1235

In [95]:
np.random.seed(0)

applier = ReconDatasetTFApplier(tfs, random_policy)
applier.apply(corpus.train_ds)

=> Applying operation 'recon.v1.augment' inplace
[38;5;2m✔ Completed operation 'recon.v1.augment'[0m


<recon.dataset.Dataset at 0x7f092f7fdb38>

In [96]:
len(corpus.train_ds)

2793

In [97]:
print(corpus.apply(get_ner_stats, serialize=True).train)
print(corpus.apply(get_ner_stats, serialize=True).dev)
print(corpus.apply(get_ner_stats, serialize=True).all)

{
    "n_examples":2793,
    "n_examples_no_entities":1960,
    "n_annotations":1443,
    "n_annotations_per_type":{
        "FASHION_BRAND":1443
    },
    "examples_with_type":null
}
{
    "n_examples":500,
    "n_examples_no_entities":371,
    "n_annotations":238,
    "n_annotations_per_type":{
        "FASHION_BRAND":238
    },
    "examples_with_type":null
}
{
    "n_examples":3293,
    "n_examples_no_entities":2331,
    "n_annotations":1681,
    "n_annotations_per_type":{
        "FASHION_BRAND":1681
    },
    "examples_with_type":null
}


In [101]:
corpus.train[:5]

[Example(text="It's all preference for which looks better, personally I feel that the more natural the hair looks the better the style, which for me means going with a matte finish which leaves the hair looking as natural as possible while still retention it in place", spans=[], tokens=[Token(text='It', start=0, end=2, id=0), Token(text="'s", start=2, end=4, id=1), Token(text='all', start=5, end=8, id=2), Token(text='preference', start=9, end=19, id=3), Token(text='for', start=20, end=23, id=4), Token(text='which', start=24, end=29, id=5), Token(text='looks', start=30, end=35, id=6), Token(text='better', start=36, end=42, id=7), Token(text=',', start=42, end=43, id=8), Token(text='personally', start=44, end=54, id=9), Token(text='I', start=55, end=56, id=10), Token(text='feel', start=57, end=61, id=11), Token(text='that', start=62, end=66, id=12), Token(text='the', start=67, end=70, id=13), Token(text='more', start=71, end=75, id=14), Token(text='natural', start=76, end=83, id=15), Tok

In [114]:
# corpus.train_ds.to_disk("./fixed_data/fashion_brands_4_augmentations/train", force=True)

raw_data = [e.dict() for e in corpus.train_ds.data]
len(raw_data)
for e in raw_data:
    if 'doc' in e:
        del e['doc']

srsly.write_jsonl("./fixed_data/fashion_brands_4_augmentations/train.jsonl", raw_data)

In [33]:
corpus.example_store[corpus.train_ds.operations[0].transformations[2].example]

Example(text="Ooh, that was my shirt! It's Revolve.", spans=[Span(text='Revolve', start=29, end=36, label='FASHION_BRAND', token_start=9, token_end=9, kb_id=None)], tokens=[Token(text='Ooh', start=0, end=3, id=0), Token(text=',', start=3, end=4, id=1), Token(text='that', start=5, end=9, id=2), Token(text='was', start=10, end=13, id=3), Token(text='my', start=14, end=16, id=4), Token(text='shirt', start=17, end=22, id=5), Token(text='!', start=22, end=23, id=6), Token(text='It', start=24, end=26, id=7), Token(text="'s", start=26, end=28, id=8), Token(text='anachronorm', start=29, end=40, id=9), Token(text='.', start=40, end=41, id=10)], meta={'section': 'malefashionadvice'}, formatted=True, answer='accept', _view_id='ner_manual', _input_hash=-187997482, _task_hash=-284841919, _session_id=None)

In [34]:
len(corpus.train_ds)

1540

In [41]:
print(corpus.apply(get_ner_stats, serialize=True).train)
print(corpus.apply(get_ner_stats, serialize=True).dev)
print(corpus.apply(get_ner_stats, serialize=True).all)

{
    "n_examples":1540,
    "n_examples_no_entities":930,
    "n_annotations":1054,
    "n_annotations_per_type":{
        "FASHION_BRAND":1054
    },
    "examples_with_type":null
}
{
    "n_examples":500,
    "n_examples_no_entities":371,
    "n_annotations":238,
    "n_annotations_per_type":{
        "FASHION_BRAND":238
    },
    "examples_with_type":null
}
{
    "n_examples":2040,
    "n_examples_no_entities":1301,
    "n_annotations":1292,
    "n_annotations_per_type":{
        "FASHION_BRAND":1292
    },
    "examples_with_type":null
}


In [37]:
corpus.to_disk("./fixed_data/fashion_brands_ent_label_augment", force=True)