In [1]:
import mmh3
import numpy as np
import pandas as pd
from itertools import islice
from sklearn.base import BaseEstimator, TransformerMixin

import time

from sklearn.pipeline import make_pipeline, make_union
from sklearn.linear_model import LogisticRegression, SGDClassifier
from sklearn.feature_extraction.text import CountVectorizer, HashingVectorizer
from sklearn.model_selection import train_test_split

from icepickle.pipeline import make_partial_pipeline, make_partial_union

In [3]:
import augmenty
from spacy.lang.en import English

nlp = English()

  from .autonotebook import tqdm as notebook_tqdm
2022-05-20 21:03:49.225826: W tensorflow/stream_executor/platform/default/dso_loader.cc:60] Could not load dynamic library 'libcudart.so.11.0'; dlerror: libcudart.so.11.0: cannot open shared object file: No such file or directory
2022-05-20 21:03:49.225884: I tensorflow/stream_executor/cuda/cudart_stub.cc:29] Ignore above cudart dlerror if you do not have a GPU set up on your machine.


In [4]:
models = {
    "base": make_pipeline(
        CountVectorizer(), 
        LogisticRegression()
    ),
    "base_char": make_pipeline(
        make_union(
            CountVectorizer(), 
            CountVectorizer(analyzer="char", ngram_range=(2, 3)), 
        ),
        LogisticRegression()
    ), 
    "hash_char": make_pipeline(
        make_union(
            HashingVectorizer(n_features=5000, analyzer="char", ngram_range=(2, 3)), 
            HashingVectorizer(n_features=2000),
            HashingVectorizer(n_features=5000),
        ),
        LogisticRegression(max_iter=200)
    ), 
    "hash_partial": make_partial_pipeline(
        make_partial_union(
            HashingVectorizer(n_features=5000, analyzer="char", ngram_range=(2, 3)), 
            HashingVectorizer(n_features=2000),
            HashingVectorizer(n_features=5000),
        ),
        SGDClassifier()
    ),
}

datasets = {
    "clinc": "oos-intent.csv"
}

In [15]:
def add_typos(texts, level=0.15):
    char_swap_augmenter = augmenty.load("char_swap.v1", level=level)
    return list(augmenty.texts(texts, char_swap_augmenter, nlp))

def generate_datasets(dataset):
    df = pd.read_csv(datasets[dataset])
    return train_test_split(list(df['text']), df['label'], train_size=2000)

def experiment(dataset, model, add_aug_train=False):
    # First, set up the dataset
    X_train, X_test, y_train, y_test = generate_datasets(dataset)
    if add_aug_train:
        X_train = X_train + add_typos(X_train)
        y_train = np.concatenate([y_train, y_train])
    X_train_aug = add_typos(X_train)
    X_test_aug = add_typos(X_test)
    
    # Train the model
    mod = models[model]
    t0 = time.time()
    mod.fit(X_train, y_train)
    t1 = time.time()
    
    # Collect relevant metrics
    train_pred = mod.predict(X_train)
    train_aug_pred = mod.predict(X_train_aug)
    test_pred = mod.predict(X_test)
    test_aug_pred = mod.predict(X_test_aug)
    t2 = time.time()
    return {
        'train_acc': np.mean(train_pred == y_train),
        'train_aug_acc': np.mean(train_aug_pred == y_train),
        'valid_acc': np.mean(test_pred == y_test),
        'valid_aug_acc': np.mean(test_aug_pred == y_test),
        'train_time': t1 - t0,
        'pred_time': t2 - t1
    }

In [16]:
from memo import grid

data = []
for setting in grid(model=models.keys(), dataset=["clinc"], add_aug_train=[True, False]):
    print(setting)
    data.append({**setting, **experiment(**setting)})

{'model': 'base_char', 'dataset': 'clinc', 'add_aug_train': False}


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


{'model': 'hash_partial', 'dataset': 'clinc', 'add_aug_train': False}
{'model': 'base_char', 'dataset': 'clinc', 'add_aug_train': True}


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


{'model': 'base', 'dataset': 'clinc', 'add_aug_train': True}
{'model': 'hash_char', 'dataset': 'clinc', 'add_aug_train': False}
{'model': 'base', 'dataset': 'clinc', 'add_aug_train': False}
{'model': 'hash_partial', 'dataset': 'clinc', 'add_aug_train': True}
{'model': 'hash_char', 'dataset': 'clinc', 'add_aug_train': True}


In [18]:
df_results = pd.DataFrame(data)

In [21]:
df_results.sort_values("valid_aug_acc")

Unnamed: 0,model,dataset,add_aug_train,train_acc,train_aug_acc,valid_acc,valid_aug_acc,train_time,pred_time
4,hash_char,clinc,False,0.963,0.6805,0.702028,0.429355,15.451088,1.906864
5,base,clinc,False,0.991,0.67,0.723871,0.432488,4.240996,0.400041
3,base,clinc,True,0.9965,0.807,0.734516,0.516544,9.858925,0.380658
1,hash_partial,clinc,False,0.9995,0.867,0.789309,0.575945,0.695704,1.972516
7,hash_char,clinc,True,0.9885,0.85975,0.753041,0.58894,20.169212,2.097261
0,base_char,clinc,False,1.0,0.935,0.785991,0.629401,24.756571,1.936239
6,hash_partial,clinc,True,1.0,0.96125,0.7953,0.683825,1.13482,2.136568
2,base_char,clinc,True,1.0,0.9745,0.775576,0.686267,38.149121,2.092481


Let's now do a similar experiment, but now on a streaming set.

In [92]:
def partial_experiment(dataset, model, add_aug_train=False):
    # First, set up the dataset
    X_train, X_test, y_train, y_test = generate_datasets(dataset)
    X_train_aug = add_typos(X_train)
    X_test_aug = add_typos(X_test)
    
    # Train the model
    metrics = []
    model = models[model]
    for i in range(400):
        t0 = time.time()
        if add_aug_train:
            X_train_aug_again = add_typos(X_train) + X_train
            y_vals = np.concatenate([y_train, y_train])
        else:
            X_train_aug_again = add_typos(X_train)
            y_vals = y_train
        model.partial_fit(X_train_aug_again, y_vals, classes=list(set(y_train)))
        t1 = time.time()    
        
        # Collect relevant metrics
        train_pred = model.predict(X_train)
        train_aug_pred = model.predict(X_train_aug)
        test_pred = model.predict(X_test)
        test_aug_pred = model.predict(X_test_aug)
        t2 = time.time()
        yield {
            'epoch': i,
            'train_acc': np.mean(train_pred == y_train),
            'train_aug_acc': np.mean(train_aug_pred == y_train),
            'valid_acc': np.mean(test_pred == y_test),
            'valid_aug_acc': np.mean(test_aug_pred == y_test),
            'train_time': t1 - t0,
            'pred_time': t2 - t1
        }

In [93]:
data_stream = []

models = {
    "hash_partial": make_partial_pipeline(
        make_partial_union(
            HashingVectorizer(n_features=5000, analyzer="char", ngram_range=(2, 3)), 
            HashingVectorizer(n_features=2000),
            HashingVectorizer(n_features=5000),
        ),
        SGDClassifier()
    ),
}

for i in partial_experiment("clinc", "hash_partial", add_aug_train=False):
    data_stream.append({**i, "add_aug_train": False})
    print(i)

{'epoch': 0, 'train_acc': 0.758, 'train_aug_acc': 0.635, 'valid_acc': 0.5264976958525346, 'valid_aug_acc': 0.42806451612903224, 'train_time': 0.5760188102722168, 'pred_time': 1.9368395805358887}
{'epoch': 1, 'train_acc': 0.893, 'train_aug_acc': 0.805, 'valid_acc': 0.6210599078341014, 'valid_aug_acc': 0.5361751152073733, 'train_time': 0.5807087421417236, 'pred_time': 1.9845702648162842}
{'epoch': 2, 'train_acc': 0.9515, 'train_aug_acc': 0.8795, 'valid_acc': 0.6807834101382488, 'valid_aug_acc': 0.6009677419354839, 'train_time': 0.5815978050231934, 'pred_time': 1.9326162338256836}
{'epoch': 3, 'train_acc': 0.971, 'train_aug_acc': 0.9195, 'valid_acc': 0.721152073732719, 'valid_aug_acc': 0.6465437788018433, 'train_time': 0.57283616065979, 'pred_time': 1.9591758251190186}
{'epoch': 4, 'train_acc': 0.9825, 'train_aug_acc': 0.953, 'valid_acc': 0.7414285714285714, 'valid_aug_acc': 0.6809677419354838, 'train_time': 0.579693078994751, 'pred_time': 1.93621826171875}
{'epoch': 5, 'train_acc': 0.992

In [94]:
df_stream = pd.DataFrame(data_stream)
df_stream.loc[lambda d: d['epoch'] == 399]

Unnamed: 0,epoch,train_acc,train_aug_acc,valid_acc,valid_aug_acc,train_time,pred_time,add_aug_train
399,399,0.9995,0.9985,0.81871,0.80765,0.576202,1.958626,False


In [95]:
import altair as alt 

(alt.Chart(df_stream)
  .mark_line()
  .encode(x='epoch', y='valid_acc', color='add_aug_train')
  .properties(width=600, height=250)
  .interactive())

In [91]:
all_data = data + [{**data_stream[-1], **{'model': 'stream'}}]

pd.DataFrame([{k: v for k, v in d.items() if k in ['model', 'add_aug_train', 'valid_acc', 'valid_aug_acc']} 
              for d in all_data]).sort_values("valid_aug_acc")

Unnamed: 0,model,add_aug_train,valid_acc,valid_aug_acc
4,hash_char,False,0.702028,0.429355
5,base,False,0.723871,0.432488
3,base,True,0.734516,0.516544
1,hash_partial,False,0.789309,0.575945
7,hash_char,True,0.753041,0.58894
0,base_char,False,0.785991,0.629401
6,hash_partial,True,0.7953,0.683825
2,base_char,True,0.775576,0.686267
8,stream,False,0.823963,0.80977


This is pretty interesting. By sampling lots and lots of bad spellings ... we actually score better on data that does not have spelling errors!