# Setup

In [None]:
# Fetch the spaCy "en_core_web_md" model; the config cells below name it under "nlp_path".
# The recorded run installed en_core_web_md 2.2.5.
!python -m spacy download en_core_web_md

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting en_core_web_md==2.2.5
  Downloading https://github.com/explosion/spacy-models/releases/download/en_core_web_md-2.2.5/en_core_web_md-2.2.5.tar.gz (96.4 MB)
[K     |████████████████████████████████| 96.4 MB 82.4 MB/s 
[38;5;2m✔ Download and installation successful[0m
You can now load the model via spacy.load('en_core_web_md')


# Run

## Config

In [None]:
import spacy
import gensim.downloader

from altboolq import *

In [None]:
# Load the source dataframes once; `dfs` is handed to every run_augment call below.
# get_dfs presumably comes from the `altboolq` star import, and `dfs` is presumably
# indexed by the split names used in config["df"] (e.g. "boolq_train") — TODO confirm.
dfs = get_dfs()

## Mask

In [None]:
# Settings for the "mask" resampling run over the "boolq_train" split.
# Built with keyword arguments; the resulting dict has the same keys, values and order.
config = dict(
    name="maskboolq_train_v5",
    df="boolq_train",
    alpha=0.01,
    grouping_cutoff=0.66,
    tfidf_cutoff=5,
    freq_cutoff=0.2,
    min_term_count=2,
    max_doc_freq=0.1,
    redact_oov=True,
    group_size=8,
    filter="group_accuracy",
    resample="mask",
    glove_path="fasttext-wiki-news-subwords-300",
    nlp_path="en_core_web_md",
)

In [None]:
# Run the augmentation described by `config` (resample="mask", df="boolq_train").
# The gensim downloader and spaCy modules are passed in as objects — presumably so that
# run_augment can load config["glove_path"] / config["nlp_path"] itself (the log starts
# with "Loading glove and nlp..."); TODO confirm against altboolq.
# Slow cell: the recorded run's three progress bars took about 11, 5 and 5 minutes.
aug_df = run_augment(dfs, config, gensim.downloader, spacy)
# write_df receives the same config; presumably it names the output after config["name"] — verify.
write_df(config, aug_df)

Loading glove and nlp...
Selecting word groups...


100%|██████████| 9427/9427 [11:12<00:00, 14.03it/s]


Evaluating selections...


9427it [04:57, 31.65it/s]


{'non_empty': 0.9994696085711255, 'avg_length': 5.560199427177257, 'max_length': 25, 'avg_freq': 0.2127573669509582, 'group_accuracy': 0.9843800497882866, 'exact_group_accuracy': 0.8655988119231993}
Making word maps...


9427it [04:37, 33.93it/s]


Augmenting dataset...


In [None]:
# Validation split of the mask augmentation.
# Work on a copy of `config` rather than calling config.update(...): mutating the shared
# dict in place meant that re-running the training cell afterwards would silently process
# "boolq_valid" under the validation name. The copy only overrides "name" and "df".
mask_valid_config = {**config, "name": "maskboolq_valid_v5", "df": "boolq_valid"}
# Slow cell: the recorded run's progress bars took about 4, 2 and 2 minutes.
aug_df = run_augment(dfs, mask_valid_config, gensim.downloader, spacy)
write_df(mask_valid_config, aug_df)

Loading glove and nlp...
Selecting word groups...


100%|██████████| 3270/3270 [03:42<00:00, 14.73it/s]


Evaluating selections...


3270it [01:43, 31.45it/s]


{'non_empty': 0.999388379204893, 'avg_length': 5.412538226299694, 'max_length': 19, 'avg_freq': 0.21277704765535116, 'group_accuracy': 0.9868536343042836, 'exact_group_accuracy': 0.8880733944954129}
Making word maps...


3270it [01:40, 32.62it/s]


Augmenting dataset...


In [None]:
# Test split of the mask augmentation.
# Work on a copy of `config` rather than calling config.update(...): mutating the shared
# dict in place meant that re-running an earlier cell afterwards would silently process
# "boolq_test" under the test name. The copy only overrides "name" and "df".
mask_test_config = {**config, "name": "maskboolq_test_v5", "df": "boolq_test"}
# Slow cell: the recorded run's progress bars took about 4, 2 and 2 minutes.
aug_df = run_augment(dfs, mask_test_config, gensim.downloader, spacy)
write_df(mask_test_config, aug_df)

Loading glove and nlp...
Selecting word groups...


100%|██████████| 3245/3245 [03:37<00:00, 14.93it/s]


Evaluating selections...


3245it [01:42, 31.58it/s]


{'non_empty': 0.9996918335901387, 'avg_length': 5.4157164869029275, 'max_length': 21, 'avg_freq': 0.21281435302727558, 'group_accuracy': 0.9857061075544958, 'exact_group_accuracy': 0.8798151001540832}
Making word maps...


3245it [01:36, 33.76it/s]


Augmenting dataset...


## GMM

In [None]:
# Settings for the "gmm" resampling run over the "boolq_train" split.
# Same selection parameters as the mask run, plus a seed and the gmm_* options.
# Built with keyword arguments; the resulting dict has the same keys, values and order.
config = dict(
    name="altboolq_train_v5",
    df="boolq_train",
    alpha=0.01,
    grouping_cutoff=0.66,
    tfidf_cutoff=5,
    freq_cutoff=0.2,
    min_term_count=2,
    max_doc_freq=0.1,
    redact_oov=True,
    group_size=8,
    filter="group_accuracy",
    resample="gmm",
    glove_path="fasttext-wiki-news-subwords-300",
    nlp_path="en_core_web_md",
    seed=42,
    gmm_n_components=41,
    gmm_covariance_type="spherical",
    gmm_n_init=5,
    gmm_random_state=0,
)

In [None]:
# Run the augmentation described by `config` (resample="gmm", df="boolq_train").
# The gensim downloader and spaCy modules are passed in as objects — presumably so that
# run_augment can load config["glove_path"] / config["nlp_path"] itself; TODO confirm.
# Very slow cell: in the recorded run the "Making word maps..." stage alone took 2:31:12,
# after five GMM initializations (matching gmm_n_init=5) had converged.
aug_df = run_augment(dfs, config, gensim.downloader, spacy)
# write_df receives the same config; presumably it names the output after config["name"] — verify.
write_df(config, aug_df)

Loading glove and nlp...
Selecting word groups...


100%|██████████| 9427/9427 [11:57<00:00, 13.14it/s]


Evaluating selections...


9427it [04:52, 32.19it/s]


{'non_empty': 0.9994696085711255, 'avg_length': 5.560199427177257, 'max_length': 25, 'avg_freq': 0.2127573669509582, 'group_accuracy': 0.9843800497882866, 'exact_group_accuracy': 0.8655988119231993}
Making word maps...
Initialization 0
  Iteration 10
  Iteration 20
Initialization converged: True
Initialization 1
  Iteration 10
  Iteration 20
Initialization converged: True
Initialization 2
  Iteration 10
  Iteration 20
  Iteration 30
Initialization converged: True
Initialization 3
  Iteration 10
  Iteration 20
  Iteration 30
Initialization converged: True
Initialization 4
  Iteration 10
  Iteration 20
  Iteration 30
Initialization converged: True


  out=out, **kwargs)
  ret = ret.dtype.type(ret / rcount)
  out=out, **kwargs)
  ret = ret.dtype.type(ret / rcount)
  out=out, **kwargs)
  ret = ret.dtype.type(ret / rcount)
  out=out, **kwargs)
  ret = ret.dtype.type(ret / rcount)
  out=out, **kwargs)
  ret = ret.dtype.type(ret / rcount)
9427it [2:31:12,  1.04it/s]


{'exact_cluster_match': 0.5072303921568627, 'avg_cluster_match': 0.9333594954019483, 'avg_sim_scores': 0.9550688314599902}
Augmenting dataset...


In [None]:
# Validation split of the GMM augmentation.
# Work on a copy of `config` rather than calling config.update(...): mutating the shared
# dict in place meant that re-running the training cell afterwards would silently process
# "boolq_valid" under the validation name. The copy only overrides "name" and "df".
gmm_valid_config = {**config, "name": "altboolq_valid_v5", "df": "boolq_valid"}
# Slow cell: in the recorded run the "Making word maps..." stage took about 53 minutes.
aug_df = run_augment(dfs, gmm_valid_config, gensim.downloader, spacy)
write_df(gmm_valid_config, aug_df)

Loading glove and nlp...
Selecting word groups...


100%|██████████| 3270/3270 [04:04<00:00, 13.39it/s]


Evaluating selections...


3270it [01:47, 30.46it/s]


{'non_empty': 0.999388379204893, 'avg_length': 5.412538226299694, 'max_length': 19, 'avg_freq': 0.21277704765535116, 'group_accuracy': 0.9868536343042836, 'exact_group_accuracy': 0.8880733944954129}
Making word maps...
Initialization 0
  Iteration 10
Initialization converged: True
Initialization 1
  Iteration 10
  Iteration 20
  Iteration 30
  Iteration 40
  Iteration 50
  Iteration 60
  Iteration 70
  Iteration 80
  Iteration 90
  Iteration 100
Initialization converged: True
Initialization 2
  Iteration 10
  Iteration 20
  Iteration 30
Initialization converged: True
Initialization 3
  Iteration 10
  Iteration 20
  Iteration 30
Initialization converged: True
Initialization 4
  Iteration 10
  Iteration 20
  Iteration 30
Initialization converged: True


  out=out, **kwargs)
  ret = ret.dtype.type(ret / rcount)
  out=out, **kwargs)
  ret = ret.dtype.type(ret / rcount)
3270it [53:18,  1.02it/s]


{'exact_cluster_match': 0.518595041322314, 'avg_cluster_match': 0.9358587159644131, 'avg_sim_scores': 0.9547176987639805}
Augmenting dataset...


In [None]:
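# TODO: the GMM augmentation of the test split ("altboolq_test_v5") is disabled here;
# uncomment the three lines below to generate it.
# config.update({"name": "altboolq_test_v5", "df": "boolq_test"})
# aug_df = run_augment(dfs, config, gensim.downloader, spacy)
# write_df(config, aug_df)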