In [1]:
from keras.models import Sequential
from keras.layers import Dense, Activation
from keras.layers.recurrent import LSTM, GRU, SimpleRNN
from keras.callbacks import EarlyStopping
from sklearn.model_selection import train_test_split
import re
from collections import defaultdict
from sklearn.feature_extraction import DictVectorizer
from scipy.sparse import issparse
import pandas as pd
import os
from datetime import datetime

import matplotlib.pyplot as plt
# Notebook-wide matplotlib defaults: ggplot style, large figures and fonts.
plt.style.use('ggplot')
plt.rcParams['figure.figsize'] = (25, 13)
plt.rcParams['font.family'] = 'sans-serif'
plt.rcParams['font.size'] = 20
%matplotlib inline

Using TensorFlow backend.


In [65]:
class Featurizer:
    """Turn a tab-separated tagged word list into feature/label matrices.

    Each input line is split on tabs; the first field is the word and the
    last field carries the tag (see `extract_word_and_tag`).  Words are
    converted to character n-gram dictionaries (or a single ``word``
    feature) and vectorized with `DictVectorizer`.

    Parameters
    ----------
    last_char : int
        If positive, only the last `last_char` characters of each word are
        featurized.
    N : int
        N-gram order.
    data_path : str
        Path of the tagged input file.
    full_tag : bool
        If False (default) the class label is the leading upper-case prefix
        of the tag; otherwise the whole tag is used.
    use_padding : bool
        Pad words with ``N-1`` spaces on both sides before n-gram extraction.
    use_word_as_feature : bool
        Use the whole word as the single feature instead of n-grams.
    tag_filter : iterable or None
        If given, only samples whose POS is in this collection are kept.
    include_smaller_ngrams : bool
        Also extract every n-gram order from 1 up to `N`.
    encoding : str
        Encoding used to open `data_path`.
    sample_per_class : int
        Maximum number of samples stored per class; ``<= 0`` disables it.
    max_lines : int
        Maximum number of lines read; ``<= 0`` disables the limit.
    grep_filter : iterable of str or None
        If given, the class label is the first of these substrings found in
        the tag, or ``"NONE"`` when none matches.
    uniq_lines : bool
        If True, each (word, tag) pair is used at most once.
    **kwargs
        Accepted and ignored, so a config section may carry extra keys.
    """

    class InvalidTag(Exception):
        """Raised when a tag does not start with an upper-case POS prefix."""
        pass

    # Leading run of upper-case ASCII letters of a tag, used as the POS.
    tag_re = re.compile(r'^[A-Z]+')

    def __init__(self, last_char, N, data_path, full_tag=False,
                 use_padding=True,
                 use_word_as_feature=False, tag_filter=None,
                 include_smaller_ngrams=False,
                 encoding='utf-8',
                 sample_per_class=100,
                 max_lines=2000000,
                 grep_filter=None,
                 uniq_lines=False,
                 **kwargs):
        self.last_char = last_char
        self.N = N
        self.full_tag = full_tag
        self.use_padding = use_padding
        self.use_word_as_feature = use_word_as_feature
        self.include_smaller_ngrams = include_smaller_ngrams
        self.data_path = data_path
        self.sample_per_class = sample_per_class
        self.max_lines = max_lines
        self.encoding = encoding
        # A set of already seen (word, tag) pairs, or None when
        # de-duplication is switched off.
        if uniq_lines:
            self.uniq_lines = set()
        else:
            self.uniq_lines = None
        if tag_filter is not None:
            self.tag_filter = set(tag_filter)
        else:
            self.tag_filter = None
        self.grep_filter = grep_filter

    def featurize(self):
        """Read the data file and build the feature and label matrices.

        Returns
        -------
        tuple
            ``(X_mtx, y_vec)`` as returned by `create_feature_matrix`.
        """
        X = []
        y = []
        sample_cnt = defaultdict(int)
        line_cnt = 0
        # Number of distinct classes that must have been seen before the
        # "every class is full" early exit below may trigger.
        if self.grep_filter is not None:
            categ_cnt = len(self.grep_filter) + 1  # +1 for the "NONE" class
        elif self.tag_filter:
            categ_cnt = len(self.tag_filter)
        else:
            categ_cnt = 1
        with open(self.data_path, encoding=self.encoding) as f:
            for line in f:
                line_cnt += 1
                if self.max_lines > 0 and line_cnt > self.max_lines:
                    break
                try:
                    word, tag, pos = self.extract_word_and_tag(line)
                except Featurizer.InvalidTag:
                    continue
                if not word.strip() or not tag.strip():
                    continue
                if self.tag_filter is not None and pos not in self.tag_filter:
                    continue
                # Stop reading once enough classes were seen and each of them
                # has reached its quota.
                if self.sample_per_class > 0 and \
                    (all(v >= self.sample_per_class
                         for v in sample_cnt.values()) and
                     len(sample_cnt) >= categ_cnt):
                    break
                if self.tag_filter is None:
                    pos = tag
                if self.uniq_lines is not None:
                    if (word, tag) in self.uniq_lines:
                        continue
                    self.uniq_lines.add((word, tag))
                # The counter is incremented even for samples that are then
                # dropped for exceeding the per-class quota.
                sample_cnt[(pos, tag)] += 1
                if self.sample_per_class > 0 and \
                        sample_cnt[(pos, tag)] > self.sample_per_class:
                    continue
                self.__featurize_and_store_sample(word, tag, X, y)
        print("READ {} lines".format(line_cnt))
        return self.create_feature_matrix(X, y)

    def create_feature_matrix(self, X, y):
        """Vectorize the feature dicts `X` and label dicts `y`.

        Stores the raw lists as ``self.X`` / ``self.y`` and the vectorized
        forms as ``self.X_mtx`` / ``self.y_vec``, then returns the latter two.
        """
        self.X = X
        self.y = y
        self.X_mtx = self.__get_or_create_vectorizer('X_vectorizer', X)
        self.y_vec = self.__get_or_create_vectorizer('y_vectorizer', y)
        return self.X_mtx, self.y_vec

    def create_train_test(self, ratio):
        """Split the vectorized data; `ratio` is passed as `train_size`."""
        Xtr, Xte, ytr, yte = train_test_split(self.X_mtx, self.y_vec, train_size=ratio)
        self.X_train = Xtr
        self.X_test = Xte
        self.y_train = ytr
        self.y_test = yte

    def __get_or_create_vectorizer(self, name, data):
        """Vectorize `data` with the vectorizer stored under attribute `name`.

        On first use a `DictVectorizer` is fitted on `data` and stored on the
        instance.  On later calls the stored vectorizer is reused with
        `transform` (previously this path returned an unbound local and
        raised `UnboundLocalError`).
        """
        if not hasattr(self, name):
            dv = DictVectorizer()
            v = dv.fit_transform(data)
            setattr(self, name, dv)
            return v
        return getattr(self, name).transform(data)

    def extract_word_and_tag(self, line):
        """Parse one input line into a ``(word, label, pos)`` triple.

        Raises
        ------
        Featurizer.InvalidTag
            If the tag does not start with an upper-case letter.
        """
        fd = line.strip().split('\t')
        # A second field without '/' is taken verbatim as both label and POS.
        if len(fd) > 1 and '/' not in fd[1]:
            return (fd[0], fd[1], fd[1])
        word = fd[0]
        # Otherwise the tag is whatever follows the last '/' of the last field.
        tag = fd[-1].split('/')[-1]
        try:
            pos = Featurizer.tag_re.match(tag).group(0)
        except AttributeError:
            # match() returned None: no upper-case prefix
            raise Featurizer.InvalidTag()
        if self.grep_filter is not None:
            for cl in self.grep_filter:
                if cl in tag:
                    return word, cl, pos
            return word, "NONE", pos
        if self.full_tag is False:
            return word, pos, pos
        return word, tag, tag

    def __featurize_and_store_sample(self, word, tag, X, y):
        """Append the feature dict of `word` to `X` and its label to `y`."""
        if self.use_word_as_feature:
            f = {'word': word}
        else:
            f = self.__featurize_ngram(word)
        X.append(f)
        y.append({'class': tag})

    def __featurize_ngram(self, word):
        """Return the positional n-gram feature dict of `word`."""
        feats = {}
        if self.last_char > 0:
            word = word[-self.last_char:]
        if self.include_smaller_ngrams:
            for n in range(1, self.N+1):
                feats.update(Featurizer.extract_ngrams(
                    word, n, self.use_padding))
        else:
            feats.update(Featurizer.extract_ngrams(
                word, self.N, self.use_padding))
        return feats

    @staticmethod
    def extract_ngrams(text, N, padding=False):
        """Return ``{'<N>_<start index>': ngram}`` for every N-gram of `text`.

        With ``padding is True`` the text is first surrounded by ``N-1``
        spaces on both sides.
        """
        if padding is True:
            text = '{0}{1}{0}'.format(" " * (N-1), text)
        feats = {}
        for i in range(len(text)-N+1):
            feats['{0}_{1}'.format(N, i)] = text[i:i+N]
        return feats

    def get_theoretical_max(self):
        """Return the accuracy upper bound for the stored samples.

        Samples with identical feature dicts are grouped; within a group at
        most the majority label can be predicted correctly.  Raises
        `ZeroDivisionError` when no samples are stored.
        """
        samples = defaultdict(lambda: defaultdict(int))
        for i in range(len(self.X)):
            xi = self.X[i]
            yi = self.y[i]
            f_str = ','.join('{}:{}'.format(feat, val)
                             for feat, val in sorted(xi.items()))
            samples[f_str][yi['class']] += 1
        return sum(max(v.values()) for v in samples.values()) / len(self.X)

In [66]:
class FFNN:
    """Thin wrapper around a Keras `Sequential` feed-forward network.

    Parameters
    ----------
    layers : sequence of int
        Sizes of the hidden layers; must contain at least one element.
    activations : str or sequence of str
        One activation per hidden layer plus one for the output layer.  A
        single string is repeated for every layer.
    input_dim, output_dim : int
        Sizes of the input and output layers.
    loss, optimizer, metrics
        Forwarded to `model.compile`.  `metrics` defaults to
        ``['accuracy']``.
    nb_epoch, batch_size
        Forwarded to `model.fit`.
        NOTE(review): `nb_epoch` is the Keras 1 spelling; newer Keras
        releases call it `epochs` - confirm against the installed version.
    early_stopping : bool
        If True, training holds out 20% of the data for validation and stops
        when `val_loss` has not improved for 2 epochs.
    **kwargs
        Accepted and ignored, so a config section may carry extra keys.
    """

    @staticmethod
    def create_list_if_str(param, length):
        """Repeat a string `length` times; return anything else unchanged."""
        if isinstance(param, str):
            return [param] * length
        return param

    def __init__(self, layers, activations, input_dim, output_dim, *,
                 loss='binary_crossentropy', optimizer='rmsprop', metrics=None,
                 nb_epoch=50, batch_size=50, early_stopping=True, **kwargs):
        # None instead of a list literal avoids a default object shared by
        # every instance.
        if metrics is None:
            metrics = ['accuracy']
        self.layers = layers
        self.activations = FFNN.create_list_if_str(activations, len(self.layers) + 1)
        self.input_dim = input_dim
        self.output_dim = output_dim
        self.early_stopping = early_stopping
        self.model_fit_args = {
            'nb_epoch': nb_epoch,
            'batch_size': batch_size,
        }
        self.model_compile_args = {
            'optimizer': optimizer,
            'metrics': list(metrics),
            'loss': loss,
        }
        self.create_network()

    def create_network(self):
        """Build and compile the `Sequential` model as `self.model`."""
        self.model = Sequential()
        # input layer
        self.model.add(Dense(self.layers[0], input_dim=self.input_dim, activation=self.activations[0]))
        for i in range(1, len(self.layers)):
            self.model.add(Dense(self.layers[i], activation=self.activations[i]))
        # output layer
        self.model.add(Dense(self.output_dim, activation=self.activations[-1]))
        self.model.compile(**self.model_compile_args)

    def fit(self, X, y):
        """Train the model on `X`, `y` and return the elapsed wall time.

        Returns
        -------
        datetime.timedelta
        """
        X = FFNN.densify(X)
        y = FFNN.densify(y)
        start = datetime.now()
        if self.early_stopping:
            early_stopping = EarlyStopping(monitor='val_loss', patience=2)
            self.model.fit(X, y, verbose=0, validation_split=0.2, callbacks=[early_stopping], **self.model_fit_args)
        else:
            self.model.fit(X, y, verbose=0, **self.model_fit_args)
        return datetime.now() - start

    def evaluate(self, X, y, **kwargs):
        """Return `model.evaluate` on the densified `X`, `y`."""
        X = FFNN.densify(X)
        y = FFNN.densify(y)
        return self.model.evaluate(X, y, **kwargs)

    @staticmethod
    def densify(mtx):
        """Convert a scipy sparse matrix to a dense ndarray.

        `toarray()` is used instead of `todense()` so the result is a plain
        `numpy.ndarray` rather than `numpy.matrix`.  Non-sparse input is
        returned unchanged.
        """
        if issparse(mtx):
            return mtx.toarray()
        return mtx

In [67]:
class Config:
    """Experiment configuration: class-level defaults merged with overrides.

    The configuration has three sections (``global``, ``featurizer`` and
    ``model``), exposed as the dicts `glob_conf`, `feat_conf` and
    `model_conf`.
    """

    def __init__(self, conf_d):
        """Merge `conf_d` (a dict of section dicts, or None) over the defaults.

        Every section starts from a *copy* of `Config.defaults`.  Updating
        the default dicts in place made the settings of one experiment leak
        into every `Config` created afterwards.  The copies are shallow:
        values such as the `metrics` list are still shared.
        """
        conf_d = conf_d or {}
        self.glob_conf = dict(Config.defaults.get('global', {}))
        self.feat_conf = dict(Config.defaults.get('featurizer', {}))
        self.model_conf = dict(Config.defaults.get('model', {}))

        self.glob_conf.update(conf_d.get('global', {}))
        self.feat_conf.update(conf_d.get('featurizer', {}))
        self.model_conf.update(conf_d.get('model', {}))

    def to_dict(self):
        """Return all settings as one flat dict with prefixed keys.

        Keys are prefixed with ``global.``, ``feat.`` and ``model.``.
        """
        d = {}
        d.update(self.serialize_config(self.glob_conf, 'global'))
        d.update(self.serialize_config(self.feat_conf, 'feat'))
        d.update(self.serialize_config(self.model_conf, 'model'))
        return d

    def serialize_config(self, section, pre):
        """Return a copy of `section` with every key renamed ``<pre>.<key>``."""
        d = {}
        for k, v in section.items():
            d['{0}.{1}'.format(pre, k)] = v
        return d

    # Baseline settings; never modified by instances.
    defaults = {
        'global': {
            'train_test_split': .9,
            'nolog': False,
        },
        'model': {
            'loss': 'binary_crossentropy',
            'optimizer': 'rmsprop',
            'metrics': ['accuracy'],
            'nb_epoch': 50,
            'batch_size': 64,
            'early_stopping': True,
        },
        'featurizer': {
            'data_path': '/mnt/store/hlt/Language/Hungarian/Crawl/Web2/ana/xaa.tagged',
            'encoding': 'latin2',
            'include_smaller_ngrams': False,
            'use_padding': True,
            'tag_filter': ("NOUN", "VERB"),
            'uniq_lines': False,
        },
    }

In [68]:
class Result:
    """Container for the measurements of one experiment run.

    Only the attributes listed in `__slots__` may be set.  Attributes that
    were never assigned are reported as None.
    """
    __slots__ = ('train_sample_count', 'test_sample_count', 'feature_count', 'class_no',
                 'success', 'exception',
                 'running_time', 'timestamp',
                 'train_acc', 'test_acc', 'train_loss', 'test_loss')

    def to_dict(self):
        """Return every slot as ``{'result.<name>': value}``.

        Slots that were never assigned map to None.
        """
        return {'result.{}'.format(k): getattr(self, k, None)
                for k in Result.__slots__}

    def __str__(self):
        # Unset slots raise AttributeError on plain access, which made
        # printing a not-yet-run result crash; fall back to None instead.
        if getattr(self, 'success', None):
            return "SUCCESS: Train accuracy: {}\nTest accuracy: {}".format(
                getattr(self, 'train_acc', None), getattr(self, 'test_acc', None))
        else:
            return "FAIL: {}".format(getattr(self, 'exception', None))

In [69]:
class Experiment:
    """One featurize / train / evaluate / log cycle driven by a config dict."""

    # TSV file that accumulates one row per logged experiment.
    df_path = 'new_results.tsv'

    def __init__(self, conf_d):
        """Featurize the data and build the model described by `conf_d`.

        The input and output dimensions of the model are taken from the
        vectorized data and written into the model configuration.
        """
        self.config = Config(conf_d)
        self.featurizer = Featurizer(**self.config.feat_conf)
        self.featurizer.featurize()
        input_dim = self.featurizer.X_mtx.shape[1]
        output_dim = self.featurizer.y_vec.shape[1]
        self.config.model_conf['input_dim'] = input_dim
        self.config.model_conf['output_dim'] = output_dim
        self.initialize_model()
        self.result = Result()
        self.result.feature_count = input_dim

    def initialize_model(self):
        """Instantiate `self.model` from ``model_conf['architecture']``.

        Raises
        ------
        ValueError
            If the architecture name is not one of the supported ones.
        """
        architecture = self.config.model_conf['architecture']
        kind = architecture.lower()
        if kind == 'ffnn':
            self.model = FFNN(**self.config.model_conf)
        elif kind == 'recurrent':
            # NOTE(review): RecurrentNN is not defined in the visible part of
            # this notebook - confirm it exists before using this branch.
            self.model = RecurrentNN(**self.config.model_conf)
        elif kind == 'autoencoder':
            # NOTE(review): AutoEncoder is not defined in the visible part of
            # this notebook either.
            self.model = AutoEncoder(**self.config.model_conf)
        else:
            raise ValueError("Model architecture [{}] not supported".format(
                    architecture))

    def fit_train(self):
        """Split the data, train the model and record the running time."""
        self.result.timestamp = datetime.now()
        self.featurizer.create_train_test(self.config.glob_conf['train_test_split'])
        rt = self.model.fit(self.featurizer.X_train.todense(), self.featurizer.y_train.todense())
        self.result.running_time = rt

    def evaluate_train(self):
        """Record loss, accuracy and sample/class counts on the training set."""
        l = self.model.evaluate(self.featurizer.X_train, self.featurizer.y_train, batch_size=16)
        self.result.train_loss = l[0]
        self.result.train_acc = l[1]
        self.result.train_sample_count = self.featurizer.X_train.shape[0]
        self.result.class_no = self.featurizer.y_vec.shape[1]

    def evaluate_test(self):
        """Record loss, accuracy and sample count on the test set."""
        l = self.model.evaluate(self.featurizer.X_test, self.featurizer.y_test, batch_size=16)
        self.result.test_loss = l[0]
        self.result.test_acc = l[1]
        self.result.test_sample_count = self.featurizer.X_test.shape[0]

    def run_and_save(self):
        """Train and evaluate, then log the result unless `nolog` is set.

        Exceptions still propagate to the caller, as before, but the failure
        is now recorded (and logged) first.  Previously a bare `raise` at the
        top of the handler made that bookkeeping unreachable.
        """
        try:
            self.fit_train()
            self.evaluate_train()
            self.evaluate_test()
        except Exception as e:
            self.result.success = False
            self.result.exception = type(e).__name__
            if self.config.glob_conf['nolog'] is False:
                self.save_results()
            raise
        else:
            self.result.success = True
        if self.config.glob_conf['nolog'] is False:
            self.save_results()

    def save_results(self):
        """Append the flattened config and result to `Experiment.df_path`."""
        d = {}
        d.update(self.config.to_dict())
        d.update(self.result.to_dict())
        Experiment.save_to_dataframe(d, Experiment.df_path)

    @staticmethod
    def save_to_dataframe(data, fn):
        """Append the dict `data` as one row to the TSV file `fn`.

        The file is created if it does not exist.  Columns missing from
        either side are added and left empty.  `pd.concat` replaces
        `DataFrame.append`, which no longer exists in pandas 2.x.
        """
        row = pd.DataFrame([data])
        if os.path.exists(fn):
            df = pd.concat([pd.read_table(fn), row], ignore_index=True, sort=False)
        else:
            df = row
        df.to_csv(fn, sep='\t', index=False)

In [76]:
%%time

# Single NOUN-vs-VERB run: uni- and bigram features over the last 6
# characters, two hidden layers (20, 10).  'nolog' is True, so the result is
# printed but not appended to the results file.
# NOTE(review): the name suggests this is the best setup found so far - confirm.
best_cfg = {
    'global': {
        'nolog': True,
    },
    'featurizer': {
        'last_char': 6,
        'N': 2,
        'use_padding': True,
        'sample_per_class': 30000,
        'include_smaller_ngrams': True,
        'tag_filter': ("NOUN", "VERB"),
        'uniq_lines': True,
        'data_path': '/mnt/store/hlt/Language/Hungarian/Crawl/Web2/ana/xaa.tagged',
        'max_lines': 10000000,
    },
    'model': {
        'architecture': 'FFNN',
        'layers': (20, 10),
        'activations': ('sigmoid', 'sigmoid', 'sigmoid'),
        'optimizer': 'rmsprop',
        'loss': 'binary_crossentropy',
        'metrics': ['accuracy'],
        'nb_epoch': 300,
        'batch_size': 500,
        'early_stopping': True,
    },
}

# Later cells read `e` (the most recent experiment) from the kernel state.
e = Experiment(best_cfg)
e.run_and_save()
print(e.result)

READ 1481402 lines
Test accuracy: 0.9449166666666666
CPU times: user 1min 50s, sys: 22.9 s, total: 2min 13s
Wall time: 57.5 s


In [44]:
# Dump the de-duplicated samples seen by the last experiment's featurizer,
# one "word<TAB>tag" pair per line.  `uniq_lines` is a set of (word, tag)
# tuples (or None when de-duplication is off), so each pair is joined
# explicitly: passing the tuples straight to str.join raised TypeError.
with open('/tmp/judit/egyik', 'w', encoding='utf-8') as f:
    f.write('\n'.join('\t'.join(pair)
                      for pair in sorted(e.featurizer.uniq_lines or ())))

In [33]:
# Shape of the vectorized feature matrix of the most recent experiment `e`.
print(e.featurizer.X_mtx.shape)

(60000, 17701)


In [None]:
%%time

# Grid search over the sizes of the two hidden layers (10..100 in steps of
# 10 each) for the NOUN-vs-VERB task.  The values that used to be patched in
# after the literal are written directly into it.
cfg = {
    'global': {
        'nolog': False,
        #'comment': "",
    },
    'featurizer': {
        'last_char': 6,
        'N': 2,
        'use_padding': True,
        'sample_per_class': 30000,
        'include_smaller_ngrams': True,
        'tag_filter': ("NOUN", "VERB"),
        'uniq_lines': True,
        'data_path': '/mnt/store/hlt/Language/Hungarian/Crawl/Web2/ana/xaa.tagged',
    },
    'model': {
        'architecture': 'FFNN',
        'layers': (40, 40),
        'activations': ('sigmoid', 'sigmoid', 'sigmoid'),
        'optimizer': 'rmsprop',
        'loss': 'binary_crossentropy',
        'metrics': ['accuracy'],
        'nb_epoch': 300,
        'batch_size': 500,
        'early_stopping': True,
    },
}

# One experiment per (first layer, second layer) size pair.
for l1, l2 in ((a, b)
               for a in range(10, 101, 10)
               for b in range(10, 101, 10)):
    print(l1, l2)
    cfg['model']['layers'] = (l1, l2)
    e = Experiment(cfg)
    print(e.featurizer.X_mtx.shape)
    e.run_and_save()
    print(e.result)

In [None]:
%%time

# Same hidden-layer grid search as the previous cell, without printing the
# layer sizes.  The effective settings are written directly in the literal.
cfg = {
    'global': {
        'nolog': False,
        #'comment': "",
    },
    'featurizer': {
        'last_char': 6,
        'N': 2,
        'use_padding': True,
        'sample_per_class': 30000,
        'include_smaller_ngrams': True,
        'tag_filter': ("NOUN", "VERB"),
        'uniq_lines': True,
        'data_path': '/mnt/store/hlt/Language/Hungarian/Crawl/Web2/ana/xaa.tagged',
    },
    'model': {
        'architecture': 'FFNN',
        'layers': (40, 40),
        'activations': ('sigmoid', 'sigmoid', 'sigmoid'),
        'optimizer': 'rmsprop',
        'loss': 'binary_crossentropy',
        'metrics': ['accuracy'],
        'nb_epoch': 300,
        'batch_size': 500,
        'early_stopping': True,
    },
}

# Every combination of the two hidden layer sizes, 10..100 in steps of 10.
for l1, l2 in ((first, second)
               for first in range(10, 101, 10)
               for second in range(10, 101, 10)):
    cfg['model']['layers'] = (l1, l2)
    e = Experiment(cfg)
    print(e.featurizer.X_mtx.shape)
    e.run_and_save()
    print(e.result)

In [None]:
%%time

# Configuration using the last 8 characters; this cell only defines `cfg`
# and does not run an experiment.
cfg = {
    'global': {
        'nolog': False,
        #'comment': "",
    },
    'featurizer': {
        'last_char': 8,
        'N': 2,
        'use_padding': True,
        'sample_per_class': 50000,
        'include_smaller_ngrams': True,
        'tag_filter': ("NOUN", "VERB"),
        'uniq_lines': True,
        'data_path': '/mnt/store/hlt/Language/Hungarian/Crawl/Web2/ana/xaa.tagged',
        # NOTE(review): Featurizer has no early_stopping parameter; this key
        # is swallowed by its **kwargs and only shows up in the logged config.
        'early_stopping': True,
    },
    'model': {
        'architecture': 'FFNN',
        'layers': (40, 40),
        'activations': ('sigmoid', 'sigmoid', 'sigmoid'),
        'optimizer': 'rmsprop',
        'loss': 'binary_crossentropy',
        'metrics': ['accuracy'],
        'nb_epoch': 300,
        'batch_size': 500,
        'early_stopping': True,
    },
}
# NOTE(review): garbled leftover scratch line, presumably `e = Experiment(cfg)` followed by `e.featurizer.X_mtx.shape`.

In [None]:
# Progress marker: shows that the cells above have finished.
print("DONEEE")

In [None]:
%%time

# Base configuration for the filter experiments in the following cells, which
# mutate this `cfg` in place.  No 'data_path' is given here, so the one in
# Config.defaults applies.
cfg = {
    'global': {
        'nolog': False,
    },
    'featurizer': {
        'last_char': 6,
        'N': 2,
        'use_padding': True,
        'sample_per_class': 20,
        'include_smaller_ngrams': True,
        #'tag_filter': ("NOUN", "VERB", "ADJ"),
        'tag_filter': ("VERB", ),
        # Class label = first of these substrings found in the tag, else "NONE".
        'grep_filter': ("<PERS<1", "<PERS<2" ),
        'max_lines': 20000000,
        'uniq_lines': True,
        # NOTE(review): Featurizer has no early_stopping parameter; this key
        # is swallowed by its **kwargs.  The model section relies on the
        # default from Config.defaults instead.
        'early_stopping': True,
    },
    'model': {
        'architecture': 'FFNN',
        'layers': (40, 40),
        'activations': ('sigmoid', 'sigmoid', 'sigmoid'),
        'optimizer': 'rmsprop',
        'loss': 'categorical_crossentropy',
        'metrics': ['accuracy'],
        'nb_epoch': 300,
        'batch_size': 500,
    },
}

In [None]:
%%time
# Each entry is one experiment: a tuple of substrings used as `grep_filter`.
# Filters applied regardless of part of speech.
general_filters = [
    ("<PLUR", ),
    ("<ACC", ),
]
# Filters applied with tag_filter restricted to VERB.
verb_filters = [
    ("<PERS<1", "<PERS<2"),
    ("<COND", ),
]
# Filters applied with tag_filter restricted to NOUN.
noun_filters = [
    ("<PLUR", ),
    ("<CAS<ACC", "<CAS<DAT", "<CAS<INE", "<CAS<INS", "<CAS<SBL"),
]
# NOTE(review): pos_filters is not used anywhere in this cell.
pos_filters = [
    ("ADV", "ADJ"),
]

# Parameter grid: n-gram orders and number of trailing characters.
N_range = [1, 2]
lc_range = [5, 6, 7]
# Total sample budget; run_filter_with_params reads it as a global and
# divides it by the number of classes.
sample_size = 30000

def run_filter_with_params(cfg, filters, N_range, lc_range, typ='grep_filter'):
    """Run one experiment per (filter, N, last_char) combination.

    `cfg` is modified in place: the filter, `sample_per_class`, the loss,
    `N` and `last_char` are overwritten for every run.

    Parameters
    ----------
    cfg : dict
        Experiment configuration with 'featurizer' and 'model' sections.
    filters : iterable of tuple
        Filter tuples, stored one at a time under ``cfg['featurizer'][typ]``.
    N_range, lc_range : iterable of int
        Values tried for the n-gram order and for `last_char`.
    typ : {'grep_filter', 'tag_filter'}
        Which featurizer option the filter is assigned to.  A grep filter
        has one class more than it has entries (the "NONE" class).

    Notes
    -----
    The total sample budget is read from the global `sample_size` and split
    evenly between classes with integer division, so `sample_per_class`
    stays an int.

    Raises
    ------
    ValueError
        If `typ` is not one of the two supported options.
    """
    if typ not in ('grep_filter', 'tag_filter'):
        raise ValueError("Unsupported filter type [{}]".format(typ))
    for filt in filters:
        if typ == 'grep_filter':
            class_n = len(filt) + 1
        else:
            class_n = len(filt)
        cfg['featurizer'][typ] = filt
        cfg['featurizer']['sample_per_class'] = sample_size // class_n
        # With a single class the loss already set in cfg is left untouched.
        if class_n == 2:
            cfg['model']['loss'] = 'binary_crossentropy'
        elif class_n > 2:
            cfg['model']['loss'] = 'categorical_crossentropy'
        for N in N_range:
            cfg['featurizer']['N'] = N
            for last_char in lc_range:
                cfg['featurizer']['last_char'] = last_char
                e = Experiment(cfg)
                print("Starting experiment with filter [{0}], last_char [{1}], N [{2}], classes [{3}], size: [{4}]".format(
                        filt, last_char, N, class_n, e.featurizer.X_mtx.shape))
                e.run_and_save()
    
    
# Manual switch: set to True to skip the (long) runs below.
done = False
if not done:
    # (heading, tag_filter, grep filters, closing message or None)
    for _title, _tags, _filters, _closing in (
            ("GENERAL filters", None, general_filters, "DONE"),
            ("VERB filters", ("VERB", ), verb_filters, None),
            ("NOUN filters", ("NOUN", ), noun_filters, "Done")):
        print(_title)
        cfg['featurizer']['tag_filter'] = _tags
        run_filter_with_params(cfg, _filters, N_range, lc_range)
        if _closing is not None:
            print(_closing)

In [None]:
%%time
# Second round of filter experiments with a wider last_char range and a
# larger sample budget.  Relies on `cfg` and run_filter_with_params from the
# earlier cells.
verb_filters = [
    ("<PAST", ),
]
noun_filters = [
    ("<CAS<ACC", "<CAS<DAT", "<CAS<INE", "<CAS<INS", "<CAS<SBL", "<CAS<SUE", "<CAS<ALL",
     "<CAS<ILL", "<CAS<ELA", "<CAS<DEL"),
]
# Part-of-speech sets, used as `tag_filter` values.
pos = [
    ("CONJ", "NOUN"),
    ("CONJ", "NOUN", "ADJ", "VERB"),
]
N_range = [1, 2]
lc_range = [2, 3, 4, 5, 6, 7, 8]
# Read as a global by run_filter_with_params.
sample_size = 60000

cfg['global']['nolog'] = False
print("POS filter")
# Clear the grep filter so the class label is the part of speech itself.
cfg['featurizer']['grep_filter'] = None
run_filter_with_params(cfg, pos, N_range, lc_range, 'tag_filter')

print("VERB filters")
cfg['featurizer']['tag_filter'] = ("VERB", )
run_filter_with_params(cfg, verb_filters, N_range, lc_range)

print("NOUN filters")
cfg['featurizer']['tag_filter'] = ("NOUN", )
run_filter_with_params(cfg, noun_filters, N_range, lc_range)

print("Done")

In [None]:
%%time

# ADV vs ADJ classification with trigram features over several values of
# last_char.  Relies on `cfg` from the earlier cells.
print("ADV vs ADJ")
N_range = [3]
lc_range = [5, 6, 7]
sample_size = 60000

cfg['featurizer']['grep_filter'] = None
cfg['featurizer']['sample_per_class'] = 30000
cfg['model']['loss'] = 'binary_crossentropy'
cfg['featurizer']['tag_filter'] = ("ADV", "ADJ")
for N in N_range:
    cfg['featurizer']['N'] = N
    for last_char in lc_range:
        # Previously N was assigned here, so lc_range was never applied and
        # the same experiment ran once per entry of lc_range.
        cfg['featurizer']['last_char'] = last_char
        e = Experiment(cfg)
        e.run_and_save()

In [None]:
# Progress marker: shows that the cells above have finished.
print(12)

In [None]:
%%time

# Experiments on the clustered KR-tag data set.  The data file name is the
# base path below with the replacement limit appended.
cfg = {
    'global': {
        'nolog': False,
        'comment': "limit freq[kr_tag][replace_to] to 5. A single KR tag may have been replaced at most 5 times.",
    },
    'featurizer': {
        'data_path': '/mnt/store/judit/projects/ulm/vitmav45-2016-MorphoDeep/dat/webcorp_kr_clustered.limit_repl',
        'last_char': 6,
        'N': 2,
        'use_padding': True,
        'sample_per_class': 10,
        'include_smaller_ngrams': True,
        #'tag_filter': ("NOUN", "VERB", "ADJ"),
        'tag_filter': None,
        'grep_filter': None,
        'max_lines': 20000000,
    },
    'model': {
        'architecture': 'FFNN',
        'layers': (40, 40),
        'activations': ('sigmoid', 'sigmoid', 'sigmoid'),
        'optimizer': 'rmsprop',
        'loss': 'categorical_crossentropy',
        'metrics': ['accuracy'],
        'nb_epoch': 300,
        'batch_size': 500,
    },
}
sample_range = [500]
N_range = [1, 2]
lc_range = [5, 6]

repl_limit = [30]

# Remember the un-suffixed path: building each file name from the already
# suffixed cfg value would accumulate suffixes as soon as repl_limit has
# more than one entry.
data_path = cfg['featurizer']['data_path']

for r in repl_limit:
    cfg['featurizer']['data_path'] = "{0}{1}".format(data_path, r)
    # Not a Featurizer parameter; it is recorded in the logged configuration.
    cfg['featurizer']['tag_limit_in_cluster'] = r
    for s in sample_range:
        print(r, s)
        cfg['featurizer']['sample_per_class'] = s
        for N in N_range:
            cfg['featurizer']['N'] = N
            for lc in lc_range:
                cfg['featurizer']['last_char'] = lc
                e = Experiment(cfg)
                e.run_and_save()

In [None]:
%%time

# Sweep over replacement limits on the clustered KR-tag data set.  Each
# limit has its own data file: the base path with the limit appended.
cfg = {
    'global': {
        'nolog': False,
    },
    'featurizer': {
        'data_path': '/mnt/store/judit/projects/ulm/vitmav45-2016-MorphoDeep/dat/webcorp_kr_clustered.limit_repl',
        'last_char': 6,
        'N': 2,
        'use_padding': True,
        'sample_per_class': 300,
        'include_smaller_ngrams': True,
        #'tag_filter': ("NOUN", "VERB", "ADJ"),
        'tag_filter': None,
        'grep_filter': None,
        'max_lines': 20000000,
        'uniq_lines': True,
    },
    'model': {
        'architecture': 'FFNN',
        'layers': (40, 40),
        'activations': ('sigmoid', 'sigmoid', 'sigmoid'),
        'optimizer': 'rmsprop',
        'loss': 'categorical_crossentropy',
        'metrics': ['accuracy'],
        'nb_epoch': 300,
        'batch_size': 500,
    },
}
sample_range = [100]
N_range = [2]
lc_range = [6]

# Limits overwritten before use in the original cell:
#   [10, 25, 50, 75, 100, 200, 500, 1000, 2000, 5000, 10000, 50000]
#repl_limit = [14]
repl_limit = [2, 3, 5, 15, 20, 1, 4, 6, 7, 8, 9, 11, 12, 13]
data_path = cfg['featurizer']['data_path']

for r in repl_limit:
    fn = "{0}{1}".format(data_path, r)
    if os.path.exists(fn):
        cfg['featurizer']['data_path'] = fn
        cfg['featurizer']['tag_limit_in_cluster'] = r
        cfg['featurizer']['shuffled'] = True
        cfg['global']['comment'] = (
            "limit freq[kr_tag][replace_to] to {0}. A single KR tag may have been replaced at most "
            "{0} times."
        ).format(r)
        for s in sample_range:
            cfg['featurizer']['sample_per_class'] = s
            for N in N_range:
                cfg['featurizer']['N'] = N
                for lc in lc_range:
                    print(r, s, N, lc)
                    cfg['featurizer']['last_char'] = lc
                    e = Experiment(cfg)
                    e.run_and_save()
                    print(e.result)
    else:
        # No data file for this limit: report it and move on.
        print("FILE NOT FOUND: {}".format(fn))

In [None]:
# Progress marker: shows that the sweep above has finished.
print("DONEEEE")