From a8853f940627fc968622b5b5c5ce765dc0a6b8c0 Mon Sep 17 00:00:00 2001 From: ZheyuYe Date: Thu, 30 Jul 2020 01:10:06 +0800 Subject: [PATCH] Squashed commit of the following: commit 232e0b6836a703f416967a1f864946c55dd7bb03 Author: ZheyuYe Date: Thu Jul 30 01:05:17 2020 +0800 update commit 995e5d758f13310408bb596b7fa74c9aa52f6917 Author: ZheyuYe Date: Thu Jul 30 01:01:56 2020 +0800 fix commit 962324004a60058464d8e55def76b1f5cea6e0bc Author: ZheyuYe Date: Thu Jul 30 00:52:17 2020 +0800 fix commit d9c414029f62018b9d351af47500380f5b2f6311 Author: ZheyuYe Date: Wed Jul 29 23:07:10 2020 +0800 fix transformer commit e49fbe1c5033fe60b81018dac20f84f30da8b487 Author: ZheyuYe Date: Wed Jul 29 22:18:12 2020 +0800 update commit 1f75b263cc55df776637d88172f339a45d8d97f1 Author: ZheyuYe Date: Wed Jul 29 22:04:08 2020 +0800 test bart commit 5bab5163b64bc3ed35069243dbcd1b5a634e0024 Author: ZheyuYe Date: Wed Jul 29 21:34:47 2020 +0800 fix cfg commit 6c62a29934c66f5795153c9343c4027596a86ea2 Merge: 3366cf3 033214e Author: ZheyuYe Date: Wed Jul 29 21:33:10 2020 +0800 Merge remote-tracking branch 'upstream/numpy' into bart commit 033214ec7eb7c36006bc3c6846220166b6bb5a00 Author: Xingjian Shi Date: Wed Jul 29 00:36:57 2020 -0700 [Numpy] Fix SQuAD + Fix GLUE downloading (#1280) * Update run_squad.py * Update run_squad.py * Update prepare_glue.py commit 3c874575bf40e8b1fa2280371131a8f29ebb3e98 Author: Xingjian Shi Date: Tue Jul 28 18:03:21 2020 -0700 Add layout + compute_layout support: TransformerNMT, BERT, ALBERT, ELECTRA, MobileBERT, RoBERTA, XLMR (#1258) * Add layout support * fix test * Update transformer.py * Update transformer.py * Update README.md * try to add set_layout * update test case * fix * update * update * update * Update bert.py * fix bug * update * Update test_models_bert.py * Update tokenizers.py * add compute layout * Update xlmr.py * Update test_models_bert.py * revise test cases * Update layers.py * move jieba to try import * fix * Update transformer.py * fix * Update bert.py * Update setup.py * Update test_models_bert.py * Update test_models_bert.py * fix * update * Revise * Update electra.py * Update electra.py * Update test_models_electra.py * fix * fix bug * Update test_models_albert.py * add more testcases * fix * Update albert.py * Update albert.py * fix bug * fix testcase * Update test_models_electra.py * Update bert.py * update * Update test_models_electra.py * Update mobilebert.py * Update mobilebert.py * update mobilebert * Update test_models_mobilebert.py * Update mobilebert.py * fix bug * Update roberta.py * fix roberta * update * update * fix import * fix bug * update * reduce test workloads * address comment * address comment commit 4d43f82f8f1a9dfa2f7550d20bcc152c13803798 Author: Sheng Zha Date: Mon Jul 27 20:21:00 2020 -0700 add subversion/wget to docker, add readme (#1279) commit d76897b4368f1402672df4c1cc6becfca2df2402 Author: phile Date: Tue Jul 28 10:10:13 2020 +0800 Add embedding related methods in numpy version (#1263) * A draft for embedding * fix embed_loader * add hyperbolic space and some updates * revise evaluation * fix * simple fixes * move l2norm to op.py * new features * fix * update * add tests, update * newline --- README.md | 9 +- .../general_nlp_benchmark/prepare_glue.py | 96 +- scripts/question_answering/run_squad.py | 4 +- setup.py | 2 + src/gluonnlp/__init__.py | 1 + src/gluonnlp/attention_cell.py | 98 +- src/gluonnlp/data/tokenizers.py | 20 +- src/gluonnlp/embedding/__init__.py | 24 + src/gluonnlp/embedding/_constants.py | 1002 +++++++++++++++++ 
src/gluonnlp/embedding/embed_loader.py | 320 ++++++ src/gluonnlp/layers.py | 5 +- src/gluonnlp/models/albert.py | 318 ++++-- src/gluonnlp/models/bart.py | 58 +- src/gluonnlp/models/bert.py | 364 ++++-- src/gluonnlp/models/electra.py | 424 +++++-- src/gluonnlp/models/mobilebert.py | 434 ++++--- src/gluonnlp/models/roberta.py | 309 +++-- src/gluonnlp/models/transformer.py | 452 +++++--- src/gluonnlp/models/transformer_xl.py | 9 +- src/gluonnlp/models/xlmr.py | 52 +- src/gluonnlp/op.py | 19 + src/gluonnlp/utils/testing.py | 152 ++- test_batch.sh | 7 - tests/test_attention_cell.py | 51 +- tests/test_embedding.py | 50 + tests/test_models_albert.py | 68 +- tests/test_models_bart.py | 57 +- tests/test_models_bert.py | 78 +- tests/test_models_electra.py | 59 +- tests/test_models_mobilebert.py | 78 +- tests/test_models_roberta.py | 54 + tests/test_models_transformer.py | 79 +- tests/test_models_xlmr.py | 4 +- tools/batch/docker/Dockerfile | 2 + tools/batch/docker/README.md | 22 + 35 files changed, 3880 insertions(+), 901 deletions(-) create mode 100644 src/gluonnlp/embedding/__init__.py create mode 100644 src/gluonnlp/embedding/_constants.py create mode 100644 src/gluonnlp/embedding/embed_loader.py delete mode 100644 test_batch.sh create mode 100644 tests/test_embedding.py create mode 100644 tools/batch/docker/README.md diff --git a/README.md b/README.md index 34fc069cbc..65b877451a 100644 --- a/README.md +++ b/README.md @@ -19,12 +19,17 @@ This is a work-in-progress. First of all, install the latest MXNet. You may use the following commands: ```bash +# Install the version with CUDA 10.0 +pip install -U --pre "mxnet-cu100>=2.0.0b20200716" -f https://dist.mxnet.io/python # Install the version with CUDA 10.1 -pip install -U --pre mxnet-cu101>=2.0.0b20200716 -f https://dist.mxnet.io/python +pip install -U --pre "mxnet-cu101>=2.0.0b20200716" -f https://dist.mxnet.io/python + +# Install the version with CUDA 10.2 +pip install -U --pre "mxnet-cu102>=2.0.0b20200716" -f https://dist.mxnet.io/python # Install the cpu-only version -pip install -U --pre mxnet>=2.0.0b20200716 -f https://dist.mxnet.io/python +pip install -U --pre "mxnet>=2.0.0b20200716" -f https://dist.mxnet.io/python ``` diff --git a/scripts/datasets/general_nlp_benchmark/prepare_glue.py b/scripts/datasets/general_nlp_benchmark/prepare_glue.py index e747626db6..bbaf01cf48 100644 --- a/scripts/datasets/general_nlp_benchmark/prepare_glue.py +++ b/scripts/datasets/general_nlp_benchmark/prepare_glue.py @@ -68,7 +68,23 @@ def read_tsv_glue(tsv_file, num_skip=1, keep_column_names=False): nrows = len(elements) else: assert nrows == len(elements) - return pd.DataFrame(out, columns=column_names) + df = pd.DataFrame(out, columns=column_names) + series_l = [] + for col_name in df.columns: + idx = df[col_name].first_valid_index() + val = df[col_name][idx] + if isinstance(val, str): + try: + dat = pd.to_numeric(df[col_name]) + series_l.append(dat) + continue + except ValueError: + pass + finally: + pass + series_l.append(df[col_name]) + new_df = pd.DataFrame({name: series for name, series in zip(df.columns, series_l)}) + return new_df def read_jsonl_superglue(jsonl_file): @@ -157,6 +173,13 @@ def read_sts(dir_path): else: df = df[[7, 8, 1, 9]] df.columns = ['sentence1', 'sentence2', 'genre', 'score'] + genre_l = [] + for ele in df['genre'].tolist(): + if ele == 'main-forum': + genre_l.append('main-forums') + else: + genre_l.append(ele) + df['genre'] = pd.Series(genre_l) df_dict[fold] = df return df_dict, None @@ -320,8 +343,8 @@ def 
read_rte_superglue(dir_path): def read_wic(dir_path): df_dict = dict() meta_data = dict() - meta_data['entities1'] = {'type': 'entity', 'parent': 'sentence1'} - meta_data['entities2'] = {'type': 'entity', 'parent': 'sentence2'} + meta_data['entities1'] = {'type': 'entity', 'attrs': {'parent': 'sentence1'}} + meta_data['entities2'] = {'type': 'entity', 'attrs': {'parent': 'sentence2'}} for fold in ['train', 'val', 'test']: if fold != 'test': @@ -340,13 +363,13 @@ def read_wic(dir_path): end2 = row['end2'] if fold == 'test': out.append([sentence1, sentence2, - (start1, end1), - (start2, end2)]) + {'start': start1, 'end': end1}, + {'start': start2, 'end': end2}]) else: label = row['label'] out.append([sentence1, sentence2, - (start1, end1), - (start2, end2), + {'start': start1, 'end': end1}, + {'start': start2, 'end': end2}, label]) df = pd.DataFrame(out, columns=columns) df_dict[fold] = df @@ -357,8 +380,8 @@ def read_wsc(dir_path): df_dict = dict() tokenizer = WhitespaceTokenizer() meta_data = dict() - meta_data['noun'] = {'type': 'entity', 'parent': 'text'} - meta_data['pronoun'] = {'type': 'entity', 'parent': 'text'} + meta_data['noun'] = {'type': 'entity', 'attrs': {'parent': 'text'}} + meta_data['pronoun'] = {'type': 'entity', 'attrs': {'parent': 'text'}} for fold in ['train', 'val', 'test']: jsonl_path = os.path.join(dir_path, '{}.jsonl'.format(fold)) df = read_jsonl_superglue(jsonl_path) @@ -374,7 +397,7 @@ def read_wsc(dir_path): span2_text = target['span2_text'] # Build entity # list of entities - # 'entity': {'start': 0, 'end': 100} + # 'entities': {'start': 0, 'end': 100} tokens, offsets = tokenizer.encode_with_offsets(text, str) pos_start1 = offsets[span1_index][0] pos_end1 = pos_start1 + len(span1_text) @@ -382,12 +405,12 @@ def read_wsc(dir_path): pos_end2 = pos_start2 + len(span2_text) if fold == 'test': samples.append({'text': text, - 'noun': (pos_start1, pos_end1), - 'pronoun': (pos_start2, pos_end2)}) + 'noun': {'start': pos_start1, 'end': pos_end1}, + 'pronoun': {'start': pos_start2, 'end': pos_end2}}) else: samples.append({'text': text, - 'noun': (pos_start1, pos_end1), - 'pronoun': (pos_start2, pos_end2), + 'noun': {'start': pos_start1, 'end': pos_end1}, + 'pronoun': {'start': pos_start2, 'end': pos_end2}, 'label': label}) df = pd.DataFrame(samples) df_dict[fold] = df @@ -406,8 +429,8 @@ def read_boolq(dir_path): def read_record(dir_path): df_dict = dict() meta_data = dict() - meta_data['entities'] = {'type': 'entity', 'parent': 'text'} - meta_data['answers'] = {'type': 'entity', 'parent': 'text'} + meta_data['entities'] = {'type': 'entity', 'attrs': {'parent': 'text'}} + meta_data['answers'] = {'type': 'entity', 'attrs': {'parent': 'text'}} for fold in ['train', 'val', 'test']: if fold != 'test': columns = ['source', 'text', 'entities', 'query', 'answers'] @@ -422,15 +445,11 @@ def read_record(dir_path): passage = row['passage'] text = passage['text'] entities = passage['entities'] - entities = [(ele['start'], ele['end']) for ele in entities] + entities = [{'start': ele['start'], 'end': ele['end']} for ele in entities] for qas in row['qas']: query = qas['query'] if fold != 'test': - answer_entities = [] - for answer in qas['answers']: - start = answer['start'] - end = answer['end'] - answer_entities.append((start, end)) + answer_entities = qas['answers'] out.append((source, text, entities, query, answer_entities)) else: out.append((source, text, entities, query)) @@ -518,11 +537,15 @@ def format_mrpc(data_dir): os.makedirs(mrpc_dir, exist_ok=True) mrpc_train_file = 
os.path.join(mrpc_dir, "msr_paraphrase_train.txt") mrpc_test_file = os.path.join(mrpc_dir, "msr_paraphrase_test.txt") - download(GLUE_TASK2PATH["mrpc"]['train'], mrpc_train_file) - download(GLUE_TASK2PATH["mrpc"]['test'], mrpc_test_file) + download(GLUE_TASK2PATH["mrpc"]['train'], mrpc_train_file, + sha1_hash=_URL_FILE_STATS[GLUE_TASK2PATH["mrpc"]['train']]) + download(GLUE_TASK2PATH["mrpc"]['test'], mrpc_test_file, + sha1_hash=_URL_FILE_STATS[GLUE_TASK2PATH["mrpc"]['test']]) assert os.path.isfile(mrpc_train_file), "Train data not found at %s" % mrpc_train_file assert os.path.isfile(mrpc_test_file), "Test data not found at %s" % mrpc_test_file - download(GLUE_TASK2PATH["mrpc"]['dev'], os.path.join(mrpc_dir, "dev_ids.tsv")) + download(GLUE_TASK2PATH["mrpc"]['dev'], + os.path.join(mrpc_dir, "dev_ids.tsv"), + sha1_hash=_URL_FILE_STATS[GLUE_TASK2PATH["mrpc"]['dev']]) dev_ids = [] with open(os.path.join(mrpc_dir, "dev_ids.tsv"), encoding="utf8") as ids_fh: @@ -575,7 +598,7 @@ def get_tasks(benchmark, task_names): @DATA_PARSER_REGISTRY.register('prepare_glue') def get_parser(): parser = argparse.ArgumentParser() - parser.add_argument("--benchmark", choices=['glue', 'superglue', 'sts'], + parser.add_argument("--benchmark", choices=['glue', 'superglue'], default='glue', type=str) parser.add_argument("-d", "--data_dir", help="directory to save data to", type=str, default=None) @@ -618,22 +641,24 @@ def main(args): base_dir = os.path.join(args.data_dir, 'rte_diagnostic') os.makedirs(base_dir, exist_ok=True) download(TASK2PATH['diagnostic'][0], - path=os.path.join(base_dir, 'diagnostic.tsv')) + path=os.path.join(base_dir, 'diagnostic.tsv'), + sha1_hash=_URL_FILE_STATS[TASK2PATH['diagnostic'][0]]) download(TASK2PATH['diagnostic'][1], - path=os.path.join(base_dir, 'diagnostic-full.tsv')) + path=os.path.join(base_dir, 'diagnostic-full.tsv'), + sha1_hash=_URL_FILE_STATS[TASK2PATH['diagnostic'][1]]) df = reader(base_dir) - df.to_pickle(os.path.join(base_dir, 'diagnostic-full.pd.pkl')) + df.to_parquet(os.path.join(base_dir, 'diagnostic-full.parquet')) else: for key, name in [('broadcoverage-diagnostic', 'AX-b'), ('winogender-diagnostic', 'AX-g')]: data_file = os.path.join(args.cache_path, "{}.zip".format(key)) url = TASK2PATH[key] reader = TASK2READER[key] - download(url, data_file) + download(url, data_file, sha1_hash=_URL_FILE_STATS[url]) with zipfile.ZipFile(data_file) as zipdata: zipdata.extractall(args.data_dir) df = reader(os.path.join(args.data_dir, name)) - df.to_pickle(os.path.join(args.data_dir, name, '{}.pd.pkl'.format(name))) + df.to_parquet(os.path.join(args.data_dir, name, '{}.parquet'.format(name))) elif task == 'mrpc': reader = TASK2READER[task] format_mrpc(args.data_dir) @@ -641,7 +666,7 @@ def main(args): for key, df in df_dict.items(): if key == 'val': key = 'dev' - df.to_pickle(os.path.join(args.data_dir, 'mrpc', '{}.pd.pkl'.format(key))) + df.to_parquet(os.path.join(args.data_dir, 'mrpc', '{}.parquet'.format(key))) with open(os.path.join(args.data_dir, 'mrpc', 'metadata.json'), 'w') as f: json.dump(meta_data, f) else: @@ -649,8 +674,11 @@ def main(args): data_file = os.path.join(args.cache_path, "{}.zip".format(task)) url = TASK2PATH[task] reader = TASK2READER[task] - download(url, data_file) + download(url, data_file, sha1_hash=_URL_FILE_STATS[url]) base_dir = os.path.join(args.data_dir, task) + if os.path.exists(base_dir): + print('Found!') + continue zip_dir_name = None with zipfile.ZipFile(data_file) as zipdata: if zip_dir_name is None: @@ -662,7 +690,7 @@ def main(args): for key, 
df in df_dict.items(): if key == 'val': key = 'dev' - df.to_pickle(os.path.join(base_dir, '{}.pd.pkl'.format(key))) + df.to_parquet(os.path.join(base_dir, '{}.parquet'.format(key))) if meta_data is not None: with open(os.path.join(base_dir, 'metadata.json'), 'w') as f: json.dump(meta_data, f) diff --git a/scripts/question_answering/run_squad.py b/scripts/question_answering/run_squad.py index 820aec0c46..1484aeccd2 100644 --- a/scripts/question_answering/run_squad.py +++ b/scripts/question_answering/run_squad.py @@ -563,8 +563,8 @@ def train(args): segment_ids = sample.segment_ids.as_in_ctx(ctx) if use_segmentation else None valid_length = sample.valid_length.as_in_ctx(ctx) p_mask = sample.masks.as_in_ctx(ctx) - gt_start = sample.gt_start.as_in_ctx(ctx) - gt_end = sample.gt_end.as_in_ctx(ctx) + gt_start = sample.gt_start.as_in_ctx(ctx).astype(np.int32) + gt_end = sample.gt_end.as_in_ctx(ctx).astype(np.int32) is_impossible = sample.is_impossible.as_in_ctx(ctx).astype(np.int32) batch_idx = mx.np.arange(tokens.shape[0], dtype=np.int32, ctx=ctx) p_mask = 1 - p_mask # In the network, we use 1 --> no_mask, 0 --> mask diff --git a/setup.py b/setup.py index 29cbc0c029..3de80f5695 100644 --- a/setup.py +++ b/setup.py @@ -55,6 +55,8 @@ def find_version(*file_paths): 'scripts', )), package_dir={"": "src"}, + package_data={'': [os.path.join('models', 'model_zoo_checksums', '*.txt'), + os.path.join('cli', 'data', 'url_checksums', '*.txt')]}, zip_safe=True, include_package_data=True, install_requires=requirements, diff --git a/src/gluonnlp/__init__.py b/src/gluonnlp/__init__.py index 8eb18ab075..31e7e08557 100644 --- a/src/gluonnlp/__init__.py +++ b/src/gluonnlp/__init__.py @@ -12,3 +12,4 @@ from . import optimizer from . import registry from . import sequence_sampler +from . import embedding diff --git a/src/gluonnlp/attention_cell.py b/src/gluonnlp/attention_cell.py index 5b292f9823..4773f81d46 100644 --- a/src/gluonnlp/attention_cell.py +++ b/src/gluonnlp/attention_cell.py @@ -20,6 +20,7 @@ import mxnet as mx from mxnet.gluon.block import HybridBlock from mxnet.gluon import nn +from .op import l2_normalize from .layers import SinusoidalPositionalEmbedding,\ BucketPositionalEmbedding,\ LearnedPositionalEmbedding @@ -32,7 +33,8 @@ def gen_self_attn_mask(F, data, valid_length=None, dtype: type = np.float32, - attn_type: str = 'full'): + attn_type: str = 'full', + layout: str = 'NT'): """Generate the mask used for the encoder, i.e, self-attention. In our implementation, 1 --> not masked, 0 --> masked @@ -99,25 +101,37 @@ def gen_self_attn_mask(F, data, Parameters ---------- - F : - data : - The data. Shape (batch_size, seq_length, C) - valid_length : + F + data + The data. 
+ - layout = 'NT' + Shape (batch_size, seq_length, C) + - layout = 'TN' + Shape (seq_length, batch_size, C) + valid_length Shape (batch_size,) dtype Data type of the mask - attn_type : str + attn_type Can be 'full' or 'causal' + layout + The layout of the data Returns ------- mask Shape (batch_size, seq_length, seq_length) """ + if layout == 'NT': + batch_axis, time_axis = 0, 1 + elif layout == 'TN': + batch_axis, time_axis = 1, 0 + else: + raise NotImplementedError('Unsupported layout={}'.format(layout)) if attn_type == 'full': if valid_length is not None: valid_length = valid_length.astype(dtype) - steps = F.npx.arange_like(data, axis=1) # (seq_length,) + steps = F.npx.arange_like(data, axis=time_axis) # (seq_length,) mask1 = (F.npx.reshape(steps, (1, 1, -1)) < F.npx.reshape(valid_length, (-2, 1, 1))) mask2 = (F.npx.reshape(steps, (1, -1, 1)) @@ -125,12 +139,12 @@ def gen_self_attn_mask(F, data, mask = mask1 * mask2 else: # TODO(sxjscience) optimize - seq_len_ones = F.np.ones_like(F.npx.arange_like(data, axis=1)) # (seq_length,) - batch_ones = F.np.ones_like(F.npx.arange_like(data, axis=0)) # (batch_size,) + seq_len_ones = F.np.ones_like(F.npx.arange_like(data, axis=time_axis)) # (seq_length,) + batch_ones = F.np.ones_like(F.npx.arange_like(data, axis=batch_axis)) # (batch_size,) mask = batch_ones.reshape((-1, 1, 1)) * seq_len_ones.reshape((1, -1, 1))\ * seq_len_ones.reshape((1, 1, -1)) elif attn_type == 'causal': - steps = F.npx.arange_like(data, axis=1) + steps = F.npx.arange_like(data, axis=time_axis) # mask: (seq_length, seq_length) # batch_mask: (batch_size, seq_length) mask = (F.np.expand_dims(steps, axis=0) <= F.np.expand_dims(steps, axis=1)).astype(dtype) @@ -139,7 +153,8 @@ def gen_self_attn_mask(F, data, batch_mask = (F.np.expand_dims(steps, axis=0) < F.np.expand_dims(valid_length, axis=-1)).astype(dtype) mask = mask * F.np.expand_dims(batch_mask, axis=-1) else: - batch_ones = F.np.ones_like(F.npx.arange_like(data, axis=0), dtype=np.float32) # (batch_size,) + batch_ones = F.np.ones_like(F.npx.arange_like(data, axis=batch_axis), + dtype=dtype) # (batch_size,) mask = mask * batch_ones.reshape((-1, 1, 1)) else: raise NotImplementedError @@ -147,7 +162,8 @@ def gen_self_attn_mask(F, data, return mask -def gen_mem_attn_mask(F, mem, mem_valid_length, data, data_valid_length=None, dtype=np.float32): +def gen_mem_attn_mask(F, mem, mem_valid_length, data, data_valid_length=None, + dtype=np.float32, layout: str = 'NT'): """Generate the mask used for the decoder. All query slots are attended to the memory slots. 
In our implementation, 1 --> not masked, 0 --> masked @@ -182,34 +198,48 @@ def gen_mem_attn_mask(F, mem, mem_valid_length, data, data_valid_length=None, dt Parameters ---------- F : - mem : - Shape (batch_size, mem_length, C_mem) + mem + - layout = 'NT' + Shape (batch_size, mem_length, C_mem) + - layout = 'TN' + Shape (mem_length, batch_size, C_mem) mem_valid_length : Shape (batch_size,) - data : - Shape (batch_size, query_length, C_data) + data + - layout = 'NT' + Shape (batch_size, query_length, C_data) + - layout = 'TN' + Shape (query_length, batch_size, C_data) data_valid_length : Shape (batch_size,) - dtype : type + dtype Data type of the mask + layout + Layout of the data + mem tensor Returns ------- mask : Shape (batch_size, query_length, mem_length) """ + if layout == 'NT': + batch_axis, time_axis = 0, 1 + elif layout == 'TN': + batch_axis, time_axis = 1, 0 + else: + raise NotImplementedError('Unsupported layout={}'.format(layout)) mem_valid_length = mem_valid_length.astype(dtype) - mem_steps = F.npx.arange_like(mem, axis=1) # (mem_length,) + mem_steps = F.npx.arange_like(mem, axis=time_axis) # (mem_length,) + data_steps = F.npx.arange_like(data, axis=time_axis) # (query_length,) mem_mask = (F.npx.reshape(mem_steps, (1, 1, -1)) < F.npx.reshape(mem_valid_length, (-2, 1, 1))).astype(dtype) # (B, 1, mem_length) if data_valid_length is not None: data_valid_length = data_valid_length.astype(dtype) - data_steps = F.npx.arange_like(data, axis=1) # (query_length,) data_mask = (F.npx.reshape(data_steps, (1, -1, 1)) < F.npx.reshape(data_valid_length, (-2, 1, 1))).astype(dtype) # (B, query_length, 1) mask = mem_mask * data_mask else: - query_length_ones = F.np.ones_like(F.npx.arange_like(data, axis=1)) # (query_length,) + query_length_ones = F.np.ones_like(data_steps) mask = query_length_ones.reshape((1, -1, 1)) * mem_mask return mask @@ -300,24 +330,6 @@ def masked_logsoftmax(F, att_score, mask, dtype=np.float32, axis: int = -1): return logits -def l2_normalize(F, data, axis=-1, eps=1E-6): - """Normalize the data by L2 normalization. - - Parameters - ---------- - F : mx.sym or mx.nd - data : symbol or ndarray - axis : int, default -1 - eps : float, default 1E-6 - - Returns - ------- - ret : mx.sym or mx.nd - """ - ret = data / (F.np.linalg.norm(data, axis=axis, keepdims=True) + eps) - return ret - - # TODO(sxjscience) Default to einsum. 
Current it is not the default because # 1) einsum is super-slow: https://github.com/apache/incubator-mxnet/issues/18043 def dot_attn_score(F, query, key, scaled=True, normalized=False, eps=1E-6, @@ -611,6 +623,7 @@ def __init__(self, query_units=None, num_heads=None, attention_dropout=0.0, self._normalized = normalized self._eps = eps self._dtype = dtype + assert layout in ['NTK', 'NKT', 'TNK'] self._layout = layout self._use_einsum = use_einsum if self._query_units is not None: @@ -621,6 +634,10 @@ def __init__(self, query_units=None, num_heads=None, attention_dropout=0.0, else: self._query_head_units = None + @property + def layout(self): + return self._layout + def hybrid_forward(self, F, query, key, value, mask=None, edge_scores=None): return multi_head_dot_attn(F, query=query, key=key, value=value, mask=mask, edge_scores=edge_scores, @@ -781,6 +798,11 @@ def __init__(self, query_units, else: raise NotImplementedError('method="{}" is currently not supported!'.format(method)) + @property + def layout(self) -> str: + """Layout of the cell""" + return self._layout + def hybrid_forward(self, F, rel_positions, query=None): """ diff --git a/src/gluonnlp/data/tokenizers.py b/src/gluonnlp/data/tokenizers.py index a7aa40ee7b..d9579b2d55 100644 --- a/src/gluonnlp/data/tokenizers.py +++ b/src/gluonnlp/data/tokenizers.py @@ -26,21 +26,20 @@ import json from collections import OrderedDict import abc -import sys import warnings import itertools from typing import NewType import sacremoses -import jieba from uuid import uuid4 from .vocab import Vocab from ..registry import TOKENIZER_REGISTRY -from ..utils.lazy_imports import try_import_subword_nmt, \ - try_import_sentencepiece, \ - try_import_huggingface_tokenizers, \ - try_import_yttm, \ - try_import_spacy, \ - try_import_jieba +from ..utils.lazy_imports import try_import_subword_nmt,\ + try_import_sentencepiece,\ + try_import_huggingface_tokenizers,\ + try_import_yttm,\ + try_import_spacy,\ + try_import_jieba + SentencesType = NewType('SentencesType', Union[str, List[str]]) TokensType = NewType('TokensType', Union[List[str], List[List[str]]]) @@ -553,10 +552,10 @@ class JiebaTokenizer(BaseTokenizerWithVocab): """ - def __init__(self, ditionary=None, vocab: Optional[Vocab] = None): + def __init__(self, dictionary=None, vocab: Optional[Vocab] = None): self._vocab = vocab jieba = try_import_jieba() - self._tokenizer = jieba.Tokenizer(ditionary) + self._tokenizer = jieba.Tokenizer(dictionary) self._tokenizer.initialize(self._tokenizer.dictionary) def encode(self, sentences, output_type=str): @@ -626,6 +625,7 @@ def __getstate__(self): return d def __setstate__(self, state): + jieba = try_import_jieba() self._tokenizer = jieba.Tokenizer() for k, v in state.items(): setattr(self._tokenizer, k, v) diff --git a/src/gluonnlp/embedding/__init__.py b/src/gluonnlp/embedding/__init__.py new file mode 100644 index 0000000000..73b1b54178 --- /dev/null +++ b/src/gluonnlp/embedding/__init__.py @@ -0,0 +1,24 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. 
You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. + +# pylint: disable=wildcard-import +"""Word embeddings.""" + +from . import embed_loader +from .embed_loader import * + +__all__ = (embed_loader.__all__ ) diff --git a/src/gluonnlp/embedding/_constants.py b/src/gluonnlp/embedding/_constants.py new file mode 100644 index 0000000000..1c7921d313 --- /dev/null +++ b/src/gluonnlp/embedding/_constants.py @@ -0,0 +1,1002 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. + +# pylint: disable=too-many-lines +"""Constants.""" + +GLOVE_NPZ_SHA1 = \ + {'glove.42B.300d': ('glove.42B.300d.npz', + '7deee8f4860744db53ed9e50892effe9883e6d89'), + 'glove.6B.100d': ('glove.6B.100d.npz', + '01f80f202fcabcc3e0804898349087bfc191dd1c'), + 'glove.6B.200d': ('glove.6B.200d.npz', + '5e6e2bdab346c257f88d80d215d518e680d86e32'), + 'glove.6B.300d': ('glove.6B.300d.npz', + '1db264aa936be62f055dfb72854204450bdf4399'), + 'glove.6B.50d': ('glove.6B.50d.npz', + 'aa16be8d184399d2199f83fd62586f2c30497bfa'), + 'glove.840B.300d': ('glove.840B.300d.npz', + 'b4ba390c1154736e07c0e67d9180935f5930e83c'), + 'glove.twitter.27B.100d': ('glove.twitter.27B.100d.npz', + '0f7b82c223451d0002f79ba23596983cdbe0e2b1'), + 'glove.twitter.27B.200d': ('glove.twitter.27B.200d.npz', + '41cc2d26f58a54622ce96bf6c8434360ab524f20'), + 'glove.twitter.27B.25d': ('glove.twitter.27B.25d.npz', + '9f563d2f296995598cc46812b2fda05ad4c3c879'), + 'glove.twitter.27B.50d': ('glove.twitter.27B.50d.npz', + 'ce9959c056f2a0a780c468feeb4f823af51630e9')} + +WORD2VEC_NPZ_SHA1 = { + 'GoogleNews-vectors-negative300': ('GoogleNews-vectors-negative300-be6d6f98.npz', + 'be6d6f98609bd65db8f6991ecaed923f1b1c8377'), + 'freebase-vectors-skipgram1000-en': ('freebase-vectors-skipgram1000-en-6086803e.npz', + '6086803e4fd0b60e12b79031d585ef2c63ca71e6'), + 'freebase-vectors-skipgram1000': ('freebase-vectors-skipgram1000-827a5d3a.npz', + '827a5d3a777ea3de21be4f61dad4de6510d77ee3') +} + +FAST_TEXT_NPZ_SHA1 = { + 'crawl-300d-2M': ('crawl-300d-2M.npz', '9dd611a1fe280c63050cd546d3595400fc0eede4'), + 'crawl-300d-2M-subword': ('crawl-300d-2M-subword-927782c8e.npz', + '927782c8ec8c2c1deb6a8a550217478e183ca25a'), + 'wiki.aa': ('wiki.aa.npz', '48f163b80eb37f1806142169d3d4c05cf75b7339'), + 'wiki.ab': ('wiki.ab.npz', '860ceff119dd27e5b701b605879037c1310cbc3e'), + 'wiki.ace': ('wiki.ace.npz', '62938287464040491719f56a6f521f8f808beee8'), + 'wiki.ady': ('wiki.ady.npz', 
'646843afa260d018ed711df3f1ca9c3e000447b6'), + 'wiki.af': ('wiki.af.npz', '7b14cd27690b67fea318d0bac2283c16430680e2'), + 'wiki.ak': ('wiki.ak.npz', '20f309adad1c45958c97b6055d5838e05bbaea72'), + 'wiki.als': ('wiki.als.npz', 'a8b03aa133c4f7da12fc27c2b167b7918b1e9805'), + 'wiki.am': ('wiki.am.npz', 'ed3dd10cea64737f7a1623612ee099df9dc19f66'), + 'wiki.ang': ('wiki.ang.npz', '8efe64706d9d6b8eae38b2c7ff0b277e20592bc7'), + 'wiki.an': ('wiki.an.npz', '168046283c719ab96a29b1abae2e25a6575c7be8'), + 'wiki.arc': ('wiki.arc.npz', '049021b7decea4bc009b12936e56b4dbf5b760e7'), + 'wiki.ar': ('wiki.ar.npz', '7e325e1e98dfcdc9368d2ebe40ee834a2ed44912'), + 'wiki.arz': ('wiki.arz.npz', '7d851c2c7be3ee6f7fd896de7b76ea08e3fb08b0'), + 'wiki.as': ('wiki.as.npz', '01d38c29cd4bd99c1a8534abc058822da14a5b9c'), + 'wiki.ast': ('wiki.ast.npz', '9c9846ba5084505a0adea89c95c66e04efbf5ce9'), + 'wiki.av': ('wiki.av.npz', '7ef6a920c364638504e673cfde5f7675503fa81e'), + 'wiki.ay': ('wiki.ay.npz', 'c1202e110930e3902397f5cb64a8359e013b469f'), + 'wiki.azb': ('wiki.azb.npz', '10351b7ef14ec2cb610d290cb6a3f6987ef5d8b3'), + 'wiki.az': ('wiki.az.npz', '74257c3bcd533a606afae509ea835dc036d61546'), + 'wiki.ba': ('wiki.ba.npz', '4a2857ed694d66864df562b376c2fa12fcb03646'), + 'wiki.bar': ('wiki.bar.npz', 'e65c6b7e9ff83798d1eea05d166148837d53e615'), + 'wiki.bat_smg': ('wiki.bat_smg.npz', '6420584ae28ba6c9dd145fea8f096243d457c2d8'), + 'wiki.bcl': ('wiki.bcl.npz', '33606c970ab336b678393e2bdb8af2116d11cf7b'), + 'wiki.be': ('wiki.be.npz', '84487d341e333344cf71bc12c7a205d923762498'), + 'wiki.bg': ('wiki.bg.npz', '56f2a175b1a1d1a9cf9f1cea277cd0b46ffd7f66'), + 'wiki.bh': ('wiki.bh.npz', '07473989853a344a41aaa18f41030dc56d0d01c7'), + 'wiki.bi': ('wiki.bi.npz', '08adfa3c9ef3016d30ef69ea539d217ff67eda09'), + 'wiki.bjn': ('wiki.bjn.npz', '998a551283222931d3a26922308449950bfa3ec7'), + 'wiki.bm': ('wiki.bm.npz', '454ff9fbd4790e4a076d9a2087a51da28aa1332f'), + 'wiki.bn': ('wiki.bn.npz', '1f36f6f39c9a9b33bb8035c9a4dc7e04933604fd'), + 'wiki.bo': ('wiki.bo.npz', 'b9fe87318428de0a7790de175b5fec80c5af482d'), + 'wiki.bpy': ('wiki.bpy.npz', '5c7853173d27e2c018c24eca69de8d5f34511b0d'), + 'wiki.br': ('wiki.br.npz', '7aa66a2034fbfaa1d39e637385d48610238797c9'), + 'wiki.bs': ('wiki.bs.npz', 'a019a4677677c2e9e4d899326b2b6c15ad6c011a'), + 'wiki.bug': ('wiki.bug.npz', '09ae3477941d7a99d1df494368d7efb0b2c18913'), + 'wiki.bxr': ('wiki.bxr.npz', 'b832c691b8ddd95896c052d3d15e1f98d72068d5'), + 'wiki.ca': ('wiki.ca.npz', '391e0d4daad08649251274fa1cc2a5f49c7728b1'), + 'wiki.cbk_zam': ('wiki.cbk_zam.npz', '02e57a763bc9f9eadaba57953383dd12a0a78a37'), + 'wiki.cdo': ('wiki.cdo.npz', 'd6e8f422327e8b2273f1f2662d793707ece6695d'), + 'wiki.ceb': ('wiki.ceb.npz', '23bc0bb9aeaa57dff35092766941a866de142aae'), + 'wiki.ce': ('wiki.ce.npz', '182b2a889256119a6d379d501c55c7621e5855db'), + 'wiki.ch': ('wiki.ch.npz', '82dd77512fcb463481f43c9cef3507e2baa90d7b'), + 'wiki.cho': ('wiki.cho.npz', 'b0b620fc2442d1a6e2440e71a424861c80175f0c'), + 'wiki.chr': ('wiki.chr.npz', '3d62c6b95c5af46abd6234426ae760cca65d5bd0'), + 'wiki.chy': ('wiki.chy.npz', '34a28a22da79aebc100e3714b825c95c8d5f54a3'), + 'wiki.ckb': ('wiki.ckb.npz', 'ad19461e4be583d08b7693ff5b1e9d590ed41add'), + 'wiki.co': ('wiki.co.npz', 'fa60d9f0e79f1c7e15f381aef983a0f4f31c05a8'), + 'wiki.crh': ('wiki.crh.npz', '540270ba6edd9d7b2f7efca52b3b407524ac67d1'), + 'wiki.cr': ('wiki.cr.npz', 'f06b77465a38ec960d7d5a7554b848c37e945c76'), + 'wiki.csb': ('wiki.csb.npz', 'b8b28559cf2541341af98e2aa755856765bdeabf'), + 'wiki.cs': ('wiki.cs.npz', 
'19881e931fe06abf341450f00c342d364313e232'), + 'wiki.cu': ('wiki.cu.npz', '731e0d00abd53bc2a8eb6cf37f6ab883cff34e15'), + 'wiki.cv': ('wiki.cv.npz', 'e60034fcffb7dfef7b236ddba1194c3aa20b7967'), + 'wiki.cy': ('wiki.cy.npz', '5a0fb967b5556f007c0d5065f951a3d3b1c1005a'), + 'wiki.da': ('wiki.da.npz', 'd06258014ba2c7450bc2d55edfdf1731433e42e5'), + 'wiki.de': ('wiki.de.npz', 'a21694dfd2af63bd7bb00f0b60b28e88bd1153f1'), + 'wiki.diq': ('wiki.diq.npz', '4f6c77a86b39834a7130419967759afd8cc26b84'), + 'wiki.dsb': ('wiki.dsb.npz', 'e74f1d346a8db96987bff0c33ee5f886907c380a'), + 'wiki.dv': ('wiki.dv.npz', '5d6fe6f0eec2e7704121d5aba03b4edbb28af873'), + 'wiki.dz': ('wiki.dz.npz', '77c639d36d0355b2de5adead7996eae342b852a6'), + 'wiki.ee': ('wiki.ee.npz', '4b5a76127d57515d3e8a76787cdefde5856b754a'), + 'wiki.el': ('wiki.el.npz', 'a00bcb97e7898931196a1c69f7a492e5b6202661'), + 'wiki.eml': ('wiki.eml.npz', 'b475d626b3d97e7a68c02827fdc7900599e838c6'), + 'wiki.en': ('wiki.en.npz', 'ad5ec6d49db6c6fe76b8e85ff05d34e5d0e1eb6a'), + 'wiki.eo': ('wiki.eo.npz', '18049b0010520d13e676f5a82e8bb90153d99003'), + 'wiki.es': ('wiki.es.npz', 'a6d192ba7d82d762f8367e75ca951aad4d11e410'), + 'wiki.et': ('wiki.et.npz', '4beb7025cf88f1aa62d025b187f0cb09aee61858'), + 'wiki.eu': ('wiki.eu.npz', '5e1a8197e35f20a2476798bbb935b4c131289c4f'), + 'wiki.ext': ('wiki.ext.npz', '049b2d1b0a8b102b45907cf487cac30aa294e0a0'), + 'wiki.fa': ('wiki.fa.npz', '81ed274997c87ef87d73d25e166ca06272ce426f'), + 'wiki.ff': ('wiki.ff.npz', '4867dc74cd53ca0b0f769af4fa1ea420406b59bf'), + 'wiki.fi': ('wiki.fi.npz', '6d1291b854045179f8171ac7d62ede7d8ac159a2'), + 'wiki.fiu_vro': ('wiki.fiu_vro.npz', 'dd87806d9dc8833fa0e21e35a50815ebdbaa6c8b'), + 'wiki.fj': ('wiki.fj.npz', 'cf5c31b0a69276f5dd18ab738ed92444abaeb755'), + 'wiki.fo': ('wiki.fo.npz', 'ffc19807d528af000861a94cfb8097bd686e14fc'), + 'wiki.fr': ('wiki.fr.npz', '8f06d5dbe3cf7214354fe9b2f6eca0ef7419f063'), + 'wiki.frp': ('wiki.frp.npz', 'c8b200ae592478d3cd0bfaafcd7aa19de8a3bfe5'), + 'wiki.frr': ('wiki.frr.npz', 'fa5e5c39ea2a45793c679eacea290a35e37405ea'), + 'wiki.fur': ('wiki.fur.npz', 'a61a8940d059f25000e3fe23933e5ed0d37e65d3'), + 'wiki.fy': ('wiki.fy.npz', '46f9f41bdf6f4fb8e27a753290413d745465963b'), + 'wiki.gag': ('wiki.gag.npz', '49fb01230e6803544122d47ab7d3fe694d1444f2'), + 'wiki.gan': ('wiki.gan.npz', '716b7b26acc15975f30caf3c6effa111516fcca5'), + 'wiki.ga': ('wiki.ga.npz', 'ea934bc1fdc1acf6caf9ac746c6c499251f1fdee'), + 'wiki.gd': ('wiki.gd.npz', '597017b5a32d933f194595d3656f858e37e70a62'), + 'wiki.glk': ('wiki.glk.npz', '91a5834658bc2d48714e8807ef24efb79567b4b5'), + 'wiki.gl': ('wiki.gl.npz', '2fa8e48d6ae1e9c9d542eb3f2156cf9e359e66c2'), + 'wiki.gn': ('wiki.gn.npz', 'e359eef3928e1f1b5d8fcf0ea532e8794c66289a'), + 'wiki.gom': ('wiki.gom.npz', '8cd361481c23f7545cc2bd8f1bf22aa7400edd4d'), + 'wiki.got': ('wiki.got.npz', 'd05daf105611150695e61775fdff2c500b36be3f'), + 'wiki.gu': ('wiki.gu.npz', '0ce175c5fc39bab4032892f70c9d2bb850af0f4a'), + 'wiki.gv': ('wiki.gv.npz', '2c573f873d607831ff01b64603c17b8db79bd7e1'), + 'wiki.hak': ('wiki.hak.npz', 'e6048727799cdf149f5c50037e0fc59300d33a94'), + 'wiki.ha': ('wiki.ha.npz', 'f18ea7286bbd390c5470896b2c99cb1adc740064'), + 'wiki.haw': ('wiki.haw.npz', '18bcd85d2e06b1b889f0835fc5b62697fdf32d72'), + 'wiki.he': ('wiki.he.npz', '76915ff167b6ecb7b7e22ff0ca46914a55d344af'), + 'wiki.hif': ('wiki.hif.npz', '12153aaf98d76d5502ab77a27cd0b9a539f61513'), + 'wiki.hi': ('wiki.hi.npz', '249666a598991f6ec147954c6af9e531fd1cd94e'), + 'wiki.ho': ('wiki.ho.npz', 
'3f804fd69780c0789708b56ea9d48715f8e38f26'), + 'wiki.hr': ('wiki.hr.npz', '9a3de28e69f97048bfb480b4f83eaab6149f66ad'), + 'wiki.hsb': ('wiki.hsb.npz', '7070bf64e13299dd66ac0e9f8e24011a56b6bfe8'), + 'wiki.ht': ('wiki.ht.npz', 'a607093d511afeb584d02dc676bc5a27eff66287'), + 'wiki.hu': ('wiki.hu.npz', '9b2c4750daf1bcf39768572e874b5afda0e2f0bc'), + 'wiki.hy': ('wiki.hy.npz', 'ec0461a102a6fb00bd324f66cefd3c8d55a7093a'), + 'wiki.hz': ('wiki.hz.npz', '5dfb8afbdae6b4148c3e55ab459c56a74b46b463'), + 'wiki.ia': ('wiki.ia.npz', '4cfaaf053b9513bbf5b2423258c0f01d20256de6'), + 'wiki.id': ('wiki.id.npz', 'bace396bb9941cc9e5b2e5f5a19be6db833c5fd4'), + 'wiki.ie': ('wiki.ie.npz', '1bae7256c2e763ce6d692d1c0a603d99a8b22826'), + 'wiki.ig': ('wiki.ig.npz', '23128e54a5e143891d392d621723bad9cfc8cf7b'), + 'wiki.ii': ('wiki.ii.npz', '54bc16d05da512481865a89ecf30260b0acc04dc'), + 'wiki.ik': ('wiki.ik.npz', 'f8015227e893d2375699b7d132b306ba381f02ac'), + 'wiki.ilo': ('wiki.ilo.npz', '185a11f81bd5d24a34558dda81ee4735f5ba150b'), + 'wiki.io': ('wiki.io.npz', 'ddf8180a90aa6ee5be93a2582cc99c535f21363e'), + 'wiki.is': ('wiki.is.npz', '968f8dd2a093b279a6f7aaa734008454bf51d724'), + 'wiki.it': ('wiki.it.npz', 'fdfb857a309b2c3d29482bb5cc55f21b858d2e6f'), + 'wiki.iu': ('wiki.iu.npz', 'fa8896730bd6c24c3473daa22116d1016294e7f7'), + 'wiki.jam': ('wiki.jam.npz', 'a8f0d0b99c89ace0a6401b8fcda261d06065faaf'), + 'wiki.ja': ('wiki.ja.npz', '8d42e5a40e4d1d8645b2d80b873a65cadcf68b5c'), + 'wiki.jbo': ('wiki.jbo.npz', '145fc999ab004b348cf9bf445f0a93a7a145308b'), + 'wiki.jv': ('wiki.jv.npz', '66978770bf06e42414395cf5fd8c596044d72bec'), + 'wiki.kaa': ('wiki.kaa.npz', '624a640ecb9901b2aba2e9f44ab615146ecb2862'), + 'wiki.kab': ('wiki.kab.npz', 'e97f93b6ba65e95c85b7541932cf53c5ad9eb896'), + 'wiki.ka': ('wiki.ka.npz', '1ca8376e1e0cbd58001c1b51a2d488a2874a6743'), + 'wiki.kbd': ('wiki.kbd.npz', 'f2d2a05b06723ac549784ad5470d84f5742a1352'), + 'wiki.kg': ('wiki.kg.npz', 'fa7f6d5f660a173a3e75342d449980eedcdc789e'), + 'wiki.ki': ('wiki.ki.npz', '21a8c7c616c0050c51c288861f3423f313e4f634'), + 'wiki.kj': ('wiki.kj.npz', 'f3c347509a0d81f4f7fdbb8b22889b8d76e5014e'), + 'wiki.kk': ('wiki.kk.npz', 'bc24a3289e1c1e18e16b6789c2f9f92af1e73071'), + 'wiki.kl': ('wiki.kl.npz', 'b8b7e7359f067836e2be2ecfe9f35a820b00fe1d'), + 'wiki.km': ('wiki.km.npz', 'e053799fd01463808432dc035bef3e36620e2f36'), + 'wiki.kn': ('wiki.kn.npz', '2849a0a8b3453e9bf6af05d4c7bd3db881dd1068'), + 'wiki.koi': ('wiki.koi.npz', 'a9b02e9bd41833bcd54769f94626019c03f29997'), + 'wiki.ko': ('wiki.ko.npz', '764d9896e74b5a26c6884d48bce3bed8ed3a7822'), + 'wiki.krc': ('wiki.krc.npz', 'bfe39598c718f1cc95909db7544b3214b308a97c'), + 'wiki.kr': ('wiki.kr.npz', '1e6af853d4a8ea7830e116eb9b61ac5d7d9a315c'), + 'wiki.ksh': ('wiki.ksh.npz', '66cd0e3e0a0b0282a13960571ebe7cddd7706bf2'), + 'wiki.ks': ('wiki.ks.npz', '85f1adaa05b854df4dede745a1aaab3836e60770'), + 'wiki.ku': ('wiki.ku.npz', 'faf90584e5a45e6d0f9eeb88399b82abe037d584'), + 'wiki.kv': ('wiki.kv.npz', '9f2b41822013a412da9c99fac06eed8be03ca192'), + 'wiki.kw': ('wiki.kw.npz', '3eed8a8fc97a2fc79241b8474a458c98d00fc897'), + 'wiki.ky': ('wiki.ky.npz', '0116ff90f10a6c0728e1ea86d8a44896ea83270a'), + 'wiki.lad': ('wiki.lad.npz', '5af2015b3d1c5e8563f0e92721580988ebe2ce50'), + 'wiki.la': ('wiki.la.npz', '7143303a3ea13c7668eb90ea6e3d2ca69857a3be'), + 'wiki.lbe': ('wiki.lbe.npz', 'f206a3c35a184ba5d2b32ee68640eadf66c847da'), + 'wiki.lb': ('wiki.lb.npz', '143dc6337f3690379282034c460c613d7f144923'), + 'wiki.lez': ('wiki.lez.npz', 'b29a680decc6b29f24e8eb9e4f8e11e3419d45f1'), + 
'wiki.lg': ('wiki.lg.npz', '866640ce62cedbc1d453b7ea3c289c291ad76e13'), + 'wiki.lij': ('wiki.lij.npz', '0dcd3d7009ae89b1016ca6cdb99a9f0d70bc4baf'), + 'wiki.li': ('wiki.li.npz', '4666b3c238256d7b7623a136db19b8b9f4754734'), + 'wiki.lmo': ('wiki.lmo.npz', 'ac89fa7cfe0675950bcb31c66bf3f88a3cfc98f0'), + 'wiki.ln': ('wiki.ln.npz', 'fba158719944aabe58e0002a90be0ed77e11702d'), + 'wiki.lo': ('wiki.lo.npz', '1e113e340a8a93d385e14502c9c4e3bcdf6c3101'), + 'wiki.lrc': ('wiki.lrc.npz', '42cb755f398fba6f0da7949c91e92b55654bd482'), + 'wiki.ltg': ('wiki.ltg.npz', '182f75859e228d1162215f28fe7f2dca127624a4'), + 'wiki.lt': ('wiki.lt.npz', '66aa944bd2e777cb82d6d59b1f2f837b6c48cb37'), + 'wiki.lv': ('wiki.lv.npz', '2be8f926da85694fa998bf79d80b61ebb8d67576'), + 'wiki.mai': ('wiki.mai.npz', 'b8a9c36e2a0f1bb84a44dc762250d2a9007ef637'), + 'wiki.map_bms': ('wiki.map_bms.npz', '6f0394d6b3d08a946e3df4b9355efe94148f018a'), + 'wiki.mdf': ('wiki.mdf.npz', '774ee35334641db57f9ac9069961c5372a5d92e8'), + 'wiki.mg': ('wiki.mg.npz', '496c48ef668f08ce95ebb11ce1ce5026b52d935c'), + 'wiki.mh': ('wiki.mh.npz', '352edd84f99c5aa277a7306f6cacea1fab065ed3'), + 'wiki.mhr': ('wiki.mhr.npz', 'dd78b27a674ac10411cdf74ac32f9391506b17e0'), + 'wiki.min': ('wiki.min.npz', '628b406441ab03bc8aa68195ada50bfdc8226f34'), + 'wiki.mi': ('wiki.mi.npz', '754127b473861cd4f9ae034c9f527a34827b1f00'), + 'wiki.mk': ('wiki.mk.npz', 'b09fed4f56c296f13c4020ef1fec498382a38b73'), + 'wiki.ml': ('wiki.ml.npz', '02fb55d97ca2f0408f0e7e8dd6a661bbc3319a2a'), + 'wiki.mn': ('wiki.mn.npz', '08b2c45689aa5d9ec49df96dc7c777ce9b9a0b4b'), + 'wiki.mo': ('wiki.mo.npz', '638c2e8bd2352fd52921b9ae62f578b8357bab49'), + 'wiki.mrj': ('wiki.mrj.npz', 'ec5cf1f4fb8dfdca64d8172974e620eb8fa41626'), + 'wiki.mr': ('wiki.mr.npz', '074dd68c947c2f137a3e84b55012925f00213139'), + 'wiki.ms': ('wiki.ms.npz', '3dbe9e9d70251de8a374776ff1250a9c3103ee59'), + 'wiki.mt': ('wiki.mt.npz', 'f5103998a68d1b178387417436a83123d44aba01'), + 'wiki.multi.ar': ('wiki.multi.ar.npz', 'a010d1d81a465c56ebaf596b3e8e8795e7f0f8e3'), + 'wiki.multi.bg': ('wiki.multi.bg.npz', 'c04018f3a600cee170f12a36cdd35b4727a2aade'), + 'wiki.multi.ca': ('wiki.multi.ca.npz', 'eef52a0cf20c133ca9065de25f0702861a8cfa29'), + 'wiki.multi.cs': ('wiki.multi.cs.npz', 'c5f547aa78c0e3d7dae67a0334d500bf2a86aa30'), + 'wiki.multi.da': ('wiki.multi.da.npz', '24374f2ee169b33327feeee46da31b0de1622fe4'), + 'wiki.multi.de': ('wiki.multi.de.npz', '2e6c119b345bebd34b56eaaf855d6703889b11f7'), + 'wiki.multi.el': ('wiki.multi.el.npz', '9d122beedb80a2e5334946641e5bafd32c01e76b'), + 'wiki.multi.en': ('wiki.multi.en.npz', '8c3c480b4cb2690304173713a646280613b244a8'), + 'wiki.multi.es': ('wiki.multi.es.npz', '483a22656e4fb2a01e9f4ef8156b261e780850ab'), + 'wiki.multi.et': ('wiki.multi.et.npz', '22498c7b91645a3874fa738b5cfb16bf98b6f97c'), + 'wiki.multi.fi': ('wiki.multi.fi.npz', '765a6f0b63777bff4ae6ca2b461c5889c03d6a70'), + 'wiki.multi.fr': ('wiki.multi.fr.npz', 'decd9aacf600114b8a36072535c0309874a37c83'), + 'wiki.multi.he': ('wiki.multi.he.npz', '7eee940c1b85936f59122f4b1a166223dd946674'), + 'wiki.multi.hr': ('wiki.multi.hr.npz', '1673963416af088f8bf15576afb33d58115db35c'), + 'wiki.multi.hu': ('wiki.multi.hu.npz', 'a1fbe6ededf3cbaa3eaa22dd8b20cce4b36cfc6d'), + 'wiki.multi.id': ('wiki.multi.id.npz', '6c3e721febb511ede7db7bf978d65769e4270f5c'), + 'wiki.multi.it': ('wiki.multi.it.npz', 'fc5bfc11e0165e8d95c1708573dad5e456826c73'), + 'wiki.multi.mk': ('wiki.multi.mk.npz', '6cd50198355674f156fc863108d9bebf11cfabd9'), + 'wiki.multi.nl': ('wiki.multi.nl.npz', 
'4fa06b9230c95dfa5a9e9a5d80f1f5ba614d3cbf'), + 'wiki.multi.no': ('wiki.multi.no.npz', '63756168c1101e73fba8d1a5015f32b8892819e6'), + 'wiki.multi.pl': ('wiki.multi.pl.npz', '958b8e8bead965ba1bb1433e1c960fc3e12a10fb'), + 'wiki.multi.pt': ('wiki.multi.pt.npz', '22f07df1609d79b95344ee575ea43141424a1528'), + 'wiki.multi.ro': ('wiki.multi.ro.npz', '73180b3e382519004bf38ea7b86237aacbbe813a'), + 'wiki.multi.ru': ('wiki.multi.ru.npz', '3b2eb9163f35e90bf2ce1cd3c997b354d0c34f59'), + 'wiki.multi.sk': ('wiki.multi.sk.npz', '606a0c3ba9849070c6b6b8c22d920fdeed9a1385'), + 'wiki.multi.sl': ('wiki.multi.sl.npz', '3cfdab5043b8cfe1535cb6dbd4c9e68847ad5904'), + 'wiki.multi.sv': ('wiki.multi.sv.npz', '4f1494885b9a831e87cfa3c15f2204c4a73c0779'), + 'wiki.multi.tr': ('wiki.multi.tr.npz', '54f90d5ddb9a65538a41e37c5a67ed933a5e4885'), + 'wiki.multi.uk': ('wiki.multi.uk.npz', '500fd26b1d7a25b42458012e99f9f76642e0c787'), + 'wiki.multi.vi': ('wiki.multi.vi.npz', '3955809cceb300965c15f9372221417719bb0db8'), + 'wiki.mus': ('wiki.mus.npz', 'a5f48934a3fa6eaf4929098046c93fc94dd6bcb6'), + 'wiki.mwl': ('wiki.mwl.npz', '8a5e2c272166f8a72c5694ca6c3104d5f49179ec'), + 'wiki.my': ('wiki.my.npz', '5e035aca16700d7d6695af8a6d3a88ac847aaeb7'), + 'wiki.myv': ('wiki.myv.npz', 'd4cfaab70c640033e02c0fc0c5a3615ae836c569'), + 'wiki.mzn': ('wiki.mzn.npz', 'ad09ac584ae455b5862b95125ef409360ae18445'), + 'wiki.nah': ('wiki.nah.npz', '2dc454ef37d059f2053af46cfa1f4f0ca939cba0'), + 'wiki.na': ('wiki.na.npz', '401f0f880eb7aa78d21348bc1e0a3953b3e81bf0'), + 'wiki.nap': ('wiki.nap.npz', '996da46aeeab5644ba766d00c5e343b1553361d7'), + 'wiki.nds_nl': ('wiki.nds_nl.npz', '5a9307e16b13a5a82ec19a52b33254537e7198e7'), + 'wiki.nds': ('wiki.nds.npz', 'b249a87c78c52becf51e7b50aaf9f9b6a36585f1'), + 'wiki.ne': ('wiki.ne.npz', 'a601db2647a74ffd2b4b43dcb8584735f555459c'), + 'wiki.new': ('wiki.new.npz', 'c398a3775aba9c68ce765cfdfb6b188f7c47e4c6'), + 'wiki-news-300d-1M': ('wiki-news-300d-1M.npz', '0a03bbd508e5381e140476140fb121afeb0050ed'), + 'wiki-news-300d-1M-subword': ('wiki-news-300d-1M-subword.npz', + '69edae21375407781c727dcb9e534e79d712d137'), + 'wiki.ng': ('wiki.ng.npz', 'befd774d15f69d43547e13e5ea3a97c4cb1ab405'), + 'wiki.nl': ('wiki.nl.npz', '5a7cb6f1dd0a7621202abba9461ac2c5bf905219'), + 'wiki.nn': ('wiki.nn.npz', '8e5059ddeb24050fadaa5cc4622b13feb3e4a226'), + 'wiki.no': ('wiki.no.npz', '5ce6e0f793e66f081652f64013968099de03d9f9'), + 'wiki.nov': ('wiki.nov.npz', '95ed23b4cfd7a65afa1c12c7dbdce6af53923d77'), + 'wiki.vec': ('wiki.npz.npz', '08ebb912efeb9df1c7d05e1af90484d210dff47e'), + 'wiki.nrm': ('wiki.nrm.npz', 'e58614b4508ff9810f0b58fd818f973775bc918d'), + 'wiki.nso': ('wiki.nso.npz', '56a2ebe260241402d117cd89c5c872b9c96ff05b'), + 'wiki.nv': ('wiki.nv.npz', 'c713051fe03ec1f60314bb42161b2a47fb5e169a'), + 'wiki.ny': ('wiki.ny.npz', 'ba5a1725955cbc13e7fd93ab499f8085840c992c'), + 'wiki.oc': ('wiki.oc.npz', '259e7d994c38a4cfc140fb07016b82d6781e5027'), + 'wiki.olo': ('wiki.olo.npz', '0fea70f887def4779ee70a79366b88f1ada65004'), + 'wiki.om': ('wiki.om.npz', '47e2d756b5f8913085d901375c1b4e0b118a4221'), + 'wiki.or': ('wiki.or.npz', '7e274ab060219b019aa02bb97941cc6e162fd01f'), + 'wiki.os': ('wiki.os.npz', '19e8199cc2aaffdb07b6c558dbc5465ac6e03155'), + 'wiki.pag': ('wiki.pag.npz', 'eddf4931547649026c02f893297ef673ec6158bb'), + 'wiki.pam': ('wiki.pam.npz', '40109aa174bd9f0fa657839bb548e2b0646c58d3'), + 'wiki.pa': ('wiki.pa.npz', '8a5870717e9e641b1f757f13259171698118de2e'), + 'wiki.pap': ('wiki.pap.npz', '999c8e5b005ca20d9998fbbe4fa79177f69e24c0'), + 'wiki.pcd': 
('wiki.pcd.npz', 'e975066b323a65cdc5e4c27138ef674d2cf7250b'), + 'wiki.pdc': ('wiki.pdc.npz', '5c770b9d56f276b0aa535845f175c05ee1cea615'), + 'wiki.pfl': ('wiki.pfl.npz', '0063d0b633ee529a75482b36ed4f4da7d64994ec'), + 'wiki.pih': ('wiki.pih.npz', 'ce1d76c94d248545eea0d7436c54849dbb380bfc'), + 'wiki.pi': ('wiki.pi.npz', 'c7d56c334bf529f8b3655693d207a80feaec4aed'), + 'wiki.pl': ('wiki.pl.npz', '0d612fdf871a1a4084c867f394940475be899443'), + 'wiki.pms': ('wiki.pms.npz', 'ca149a2fb138011315bb6d5d61c7a5647e515e51'), + 'wiki.pnb': ('wiki.pnb.npz', '9ec82d02ad8894056c67991cf8ce927bcca74ee2'), + 'wiki.pnt': ('wiki.pnt.npz', '3f90123407bb8fc838a0a0d3700a14e15f5b26aa'), + 'wiki.ps': ('wiki.ps.npz', '7edebc02ac16f5fab83eb10b7d0fab821a9a4d43'), + 'wiki.pt': ('wiki.pt.npz', 'f172fd801edd1ad9d319ba44146d40b5d682a473'), + 'wiki.qu': ('wiki.qu.npz', '68bec60ccfe1826c3b3a8968574488dbc74cdf7b'), + 'wiki.rm': ('wiki.rm.npz', '00fb191fc736ba60cb23e76169dfccde9a9daad0'), + 'wiki.rmy': ('wiki.rmy.npz', 'c5e93cc37ff7293b9a1d9fe55c42d6fbde372b97'), + 'wiki.rn': ('wiki.rn.npz', '57b8e0d6999269be227af6ef2797a9cf8386ff1b'), + 'wiki.roa_rup': ('wiki.roa_rup.npz', 'e06d6b5672a59bb9e83143bc8b28300d23c09546'), + 'wiki.roa_tara': ('wiki.roa_tara.npz', 'c083105f40236dc3711f06c1b40e8ee7a714b99d'), + 'wiki.ro': ('wiki.ro.npz', '766bc0cb58a65b0b1763b9a0d90e91ab982eb20d'), + 'wiki.rue': ('wiki.rue.npz', '9a91fa093cd48d7d658d526b0ccda48dc59cd7f4'), + 'wiki.ru': ('wiki.ru.npz', 'd59d099481c22d5592ab9635c9ee48060aa0bf45'), + 'wiki.rw': ('wiki.rw.npz', 'e99ee87d249f6c157c5c97397d1025d798b85c69'), + 'wiki.sah': ('wiki.sah.npz', '85dae39097b29bc8e2b64f343a77794e4a62f91a'), + 'wiki.sa': ('wiki.sa.npz', '7d1928d7c67400045ac1b35a37a0e3089690d875'), + 'wiki.scn': ('wiki.scn.npz', '27d7b8050bbeed8ce196061c610216760b053c39'), + 'wiki.sc': ('wiki.sc.npz', '69c7b8be0f03a1bbd615695f93bdd78f96a58e16'), + 'wiki.sco': ('wiki.sco.npz', '4880282f59d3338b67fbff75359e2d24896e95bb'), + 'wiki.sd': ('wiki.sd.npz', '0ed8da4d27223db717a612cf0c88582351db6e19'), + 'wiki.se': ('wiki.se.npz', '0f4b2e060d5e29f96ca73aab29c967e79db69c17'), + 'wiki.sg': ('wiki.sg.npz', 'a5e4edf34fe1a88b322da4c3922ec5a470e200c6'), + 'wiki.sh': ('wiki.sh.npz', 'c13f1e94676bc939560193f7aa7ffd7d604707b3'), + 'wiki.simple': ('wiki.simple.npz', '352d0575e7d60b08e1dfce2c5de713906f0ed78f'), + 'wiki.si': ('wiki.si.npz', '204f9ffbe7770a9f56d3b2fb26999165015f5c33'), + 'wiki.sk': ('wiki.sk.npz', '7a9820b5a343b242660bf2595d1ecbf6e00a76d6'), + 'wiki.sl': ('wiki.sl.npz', '85f3186f26d6725317a64e290363a7251b928b81'), + 'wiki.sm': ('wiki.sm.npz', '9e13452cc4bff677f4f15db04f9d2f95f6ec054c'), + 'wiki.sn': ('wiki.sn.npz', 'e8d5f7dcf51280c5f99bc3df849b4889a61e9fcd'), + 'wiki.so': ('wiki.so.npz', '0f5d71b95768b33fd939a870c15344c4478364a9'), + 'wiki.sq': ('wiki.sq.npz', '8b05826df8575e65c87a2fc0b7630cf644d4216d'), + 'wiki.srn': ('wiki.srn.npz', '2711396ef297ac5dde8904508bc002bdecbcc6f4'), + 'wiki.sr': ('wiki.sr.npz', '546edc8e29a5d2e99ed10eb4a552cbef2bb8f417'), + 'wiki.ss': ('wiki.ss.npz', '2e5911bad79bb5270a64f587e326d31c95ec58f3'), + 'wiki.st': ('wiki.st.npz', '23bc954719a2962e891f02efaea754c9ea025894'), + 'wiki.stq': ('wiki.stq.npz', 'dd3ece0c0aa30e53ae0f4b558309bb60ab628652'), + 'wiki.su': ('wiki.su.npz', '7e48732e8a1fcf212e692924a4416a6ac3b3b055'), + 'wiki.sv': ('wiki.sv.npz', 'b9ec52e9423688f195f3145c243226c0e0b51e83'), + 'wiki.sw': ('wiki.sw.npz', '5262f0c645322b10eca73f792a970f10b2719e55'), + 'wiki.szl': ('wiki.szl.npz', 'fdd6d6b291cdbbcec5ff93451a588fdd103bb2d0'), + 'wiki.ta': ('wiki.ta.npz', 
'da7c5bc6e1142306ff2669bf1739832beb6c1763'), + 'wiki.tcy': ('wiki.tcy.npz', 'baa49e1afa2bb0dcaaef0fac1ee75bbe711d1134'), + 'wiki.te': ('wiki.te.npz', 'baf48767ce85e4d41d65d25f2bbf1c5f559ec18f'), + 'wiki.tet': ('wiki.tet.npz', '11e46a893af55344dbe102d530fdfea5d949d3bc'), + 'wiki.tg': ('wiki.tg.npz', 'da66abb72ec9ccc602713161e544963d59cc51d7'), + 'wiki.th': ('wiki.th.npz', '25e54bf2d305779ec9baa5f344410bd75c7702fc'), + 'wiki.ti': ('wiki.ti.npz', '1faf98f3a0eafa7559a4b2a111f43dd1f7b9a05b'), + 'wiki.tk': ('wiki.tk.npz', '34c714fa8275fd6abfe86b2d144a043774552a6c'), + 'wiki.tl': ('wiki.tl.npz', '7d7f8a0485155bce7a74a1d778824375b0029f53'), + 'wiki.tn': ('wiki.tn.npz', 'd0bc3a9b948753ac2283e5e10480c9fa0f6acb53'), + 'wiki.to': ('wiki.to.npz', 'e982fc31bcfcf7339988d7aad21ce29ac9e84b0b'), + 'wiki.tpi': ('wiki.tpi.npz', '448cef043fa4b7f97825dbf8ee205ef05543bcac'), + 'wiki.tr': ('wiki.tr.npz', 'c9830607a4c5134c6191006f1d80bae0ec798fe6'), + 'wiki.ts': ('wiki.ts.npz', '84a0598803712c8a713943447ddb73fc0f39af43'), + 'wiki.tt': ('wiki.tt.npz', '82c29df18f33e6284af3e977a6dda7e132a7a225'), + 'wiki.tum': ('wiki.tum.npz', '358990b894a3fb09d70674465952d828c9b0eda7'), + 'wiki.tw': ('wiki.tw.npz', '1e6d2838a4f271c1808795fb929cfcbf95094d93'), + 'wiki.ty': ('wiki.ty.npz', 'e41ca5192d8cb515b3561c8d6935b150deb027b7'), + 'wiki.tyv': ('wiki.tyv.npz', 'ce062ed32e854604714b65698ae290c99ba28060'), + 'wiki.udm': ('wiki.udm.npz', '9e1c5891ee0c5ac8f65fc457e1b42c7b2bfc8d37'), + 'wiki.ug': ('wiki.ug.npz', '656503e54063e200980e39f00fc011395bcd8551'), + 'wiki.uk': ('wiki.uk.npz', '352b7ee24d9fc6513fff4fe13bc04086c680834a'), + 'wiki.ur': ('wiki.ur.npz', 'a81e55c7adfc2cef779ce9a01fe21319a7e4943b'), + 'wiki.uz': ('wiki.uz.npz', 'd60d1e67bb8574dd71c18c88114aba674fc1eecb'), + 'wiki.ve': ('wiki.ve.npz', '5bfc3dbb3e47d23597df47ef12bd1c64ab8d3ea9'), + 'wiki.vep': ('wiki.vep.npz', '7a94355754fbe56802242c0bf9d7a27335095552'), + 'wiki.vi': ('wiki.vi.npz', 'f118039eb16a4ca3347b6b171eac41113350a041'), + 'wiki.vls': ('wiki.vls.npz', '9a46a2fdc6448aa54f212081643745499ea7d05c'), + 'wiki.vo': ('wiki.vo.npz', '8e2f93c85ac608bcc4ae14093b9ff016061378fb'), + 'wiki.wa': ('wiki.wa.npz', '907074f7743d30cdbb2c48d0c8b4040796ea4164'), + 'wiki.war': ('wiki.war.npz', '928fb410c394b9c18d875326b6a3e750e2611e1b'), + 'wiki.wo': ('wiki.wo.npz', '7bb352be44f7261aa926f49b13e77df30f29312f'), + 'wiki.wuu': ('wiki.wuu.npz', '0d1dc7b05867ff2156a1180ad3da3b4697924e59'), + 'wiki.xal': ('wiki.xal.npz', 'd87f4a131e086dc0bdc2a7e10406820c3c03b6a9'), + 'wiki.xh': ('wiki.xh.npz', 'c64e1d2e77d1c744a628e2bd7353284616e48bea'), + 'wiki.xmf': ('wiki.xmf.npz', '160b9ee9773b9099aaf37ae9bdbc8a4a93b7f6ea'), + 'wiki.yi': ('wiki.yi.npz', '0662542cee29f3392fc905004ac6443b32c1477c'), + 'wiki.yo': ('wiki.yo.npz', '5d12d3b902a1fa19d8548295c3802c0608afa5c8'), + 'wiki.za': ('wiki.za.npz', '536348ff89df62e968739b567a1245bfd4112fbe'), + 'wiki.zea': ('wiki.zea.npz', '61fa192289a7c0f73ffa8035632a38b91c31c224'), + 'wiki.zh_classical': ('wiki.zh_classical.npz', '9acc9eaf8ebe316b945fb1f56ac71a2b7e024854'), + 'wiki.zh_min_nan': ('wiki.zh_min_nan.npz', '5d38bc025c82af578299d60f7df7b399de6ed81a'), + 'wiki.zh': ('wiki.zh.npz', '94007fcf3b105bf2c21b84a3a22bdb7946e74804'), + 'wiki.zh_yue': ('wiki.zh_yue.npz', 'af6f0d94e6418d528d6cedd859e07e6e2fb416ab'), + 'wiki.zu': ('wiki.zu.npz', 'fc9ce07d5d0c49a3c86cf1b26056ada58f9404ca'), + 'cc.af.300': ('cc.af.300-6cf6fb1b.npz', '6cf6fb1b9f890787cbd3b510ef6201de9e02a297'), + 'cc.als.300': ('cc.als.300-479a6674.npz', '479a66746401f6119a7e4cba58ddfac5f9937ba6'), + 
'cc.am.300': ('cc.am.300-0d9530cd.npz', '0d9530cd2b7e4bc9eac96048ed5cbf7d3cc9f799'), + 'cc.an.300': ('cc.an.300-ef9cb799.npz', 'ef9cb799a5a627a9d33b54604aff1593e0de3b40'), + 'cc.ar.300': ('cc.ar.300-e9c5e360.npz', 'e9c5e360d5cd050effd9ce42f831b9c94b7ffbd9'), + 'cc.arz.300': ('cc.arz.300-9e6a80e7.npz', '9e6a80e752f3830b5cc934884d0a65f8cf94eff4'), + 'cc.as.300': ('cc.as.300-8b00a681.npz', '8b00a681b079d56929bedf1ced16b3c9573c5b37'), + 'cc.ast.300': ('cc.ast.300-8681d5cb.npz', '8681d5cbbdfd89bed0a9335a8e6f28617185627a'), + 'cc.az.300': ('cc.az.300-06632ae7.npz', '06632ae75e85f75caa9b87f8327661333a504ccc'), + 'cc.azb.300': ('cc.azb.300-01c48025.npz', '01c480257a8343fd18d1d9011dd14ef5c3f124a2'), + 'cc.ba.300': ('cc.ba.300-afc6b4d1.npz', 'afc6b4d1e77964965fd8cc2ff31e077286af81e4'), + 'cc.bar.300': ('cc.bar.300-67450b87.npz', '67450b879ae9d6d5e1b7c7150baab9173563d38a'), + 'cc.bcl.300': ('cc.bcl.300-261d8d11.npz', '261d8d11ca9fe67a12de812976398722b30e1df2'), + 'cc.be.300': ('cc.be.300-cd32b101.npz', 'cd32b101a860ae629f05ee90beb2a8f137ed8bc2'), + 'cc.bg.300': ('cc.bg.300-088de1c6.npz', '088de1c633cbfe8e06405badd685bc04f2127cfd'), + 'cc.bh.300': ('cc.bh.300-35ced78e.npz', '35ced78e8dc524f17e7ba4a5326b6bbeb92af4e1'), + 'cc.bn.300': ('cc.bn.300-98293882.npz', '98293882fc548c2047f5482ea4dcd5eedef58f13'), + 'cc.bo.300': ('cc.bo.300-7653c3c7.npz', '7653c3c76bfb21a3f4e655b4a3d347d6b5e1ebc3'), + 'cc.bpy.300': ('cc.bpy.300-8225d2db.npz', '8225d2db3bbd34c1f27e491b41d35e1f4394f529'), + 'cc.br.300': ('cc.br.300-ac611b58.npz', 'ac611b58dd6006dc741170ea4066aeff8b8b7a0c'), + 'cc.bs.300': ('cc.bs.300-be17aeed.npz', 'be17aeedde87cdd173a3508edf2762311df5b369'), + 'cc.ca.300': ('cc.ca.300-8e7f57c1.npz', '8e7f57c190a9a4bd5513b9d268b5a26f55202fe0'), + 'cc.ce.300': ('cc.ce.300-7ef28422.npz', '7ef28422df058d6be07c819c195b9ce50be5a985'), + 'cc.ceb.300': ('cc.ceb.300-25801d42.npz', '25801d429ed046998749cb07526533b9c63bded2'), + 'cc.ckb.300': ('cc.ckb.300-c56f75b4.npz', 'c56f75b46cc38357f38f7de51d46e97c66381ca0'), + 'cc.co.300': ('cc.co.300-f9394c12.npz', 'f9394c1285bf6d5d33b1c28387535e1010a1d4c6'), + 'cc.cs.300': ('cc.cs.300-82823bbf.npz', '82823bbfc29ec94051b72079939eb66d5db4d1bf'), + 'cc.cv.300': ('cc.cv.300-6862885d.npz', '6862885d97a2c84792bf7f291338008757eb5790'), + 'cc.cy.300': ('cc.cy.300-c649c74c.npz', 'c649c74c0ca139d80e19b9a98b5f5d15386393d7'), + 'cc.da.300': ('cc.da.300-ee9246dc.npz', 'ee9246dc34da5fa1e2d49d37395b169598488324'), + 'cc.de.300': ('cc.de.300-713dc52f.npz', '713dc52f1c24b5c31e4104aee9d92e7e26bd6db9'), + 'cc.diq.300': ('cc.diq.300-fdc37a8e.npz', 'fdc37a8e2357e922bc27f945f1e080d80fbcae5a'), + 'cc.dv.300': ('cc.dv.300-d37b74b2.npz', 'd37b74b2247761723126b92183857f5e32a4f17c'), + 'cc.el.300': ('cc.el.300-888b3ecf.npz', '888b3ecfc9f3e16349d15cc85b5453a90f524529'), + 'cc.eml.300': ('cc.eml.300-01926a33.npz', '01926a335cb7055270066d07a9d050a786398adf'), + 'cc.en.300': ('cc.en.300-79da8fea.npz', '79da8fea1408d642ce43a5fdf40c3c803a49db2c'), + 'cc.eo.300': ('cc.eo.300-02accc23.npz', '02accc23007b196a0bab9be70dcfe911fb8fa87c'), + 'cc.es.300': ('cc.es.300-a0063528.npz', 'a00635289e65081d50fc46bb39203e48115b5d20'), + 'cc.et.300': ('cc.et.300-2916e309.npz', '2916e309a61ba0b83761ed9b4f75d959ad59247f'), + 'cc.eu.300': ('cc.eu.300-0257399f.npz', '0257399f1433ca579aeaf625f897b01c2f041438'), + 'cc.fa.300': ('cc.fa.300-d5aca585.npz', 'd5aca58546a99513e1f96b6df7a52b95ace2247a'), + 'cc.fi.300': ('cc.fi.300-ed53841e.npz', 'ed53841e29ebf6d701ee4c96a10c25f2ecc8a904'), + 'cc.fr.300': ('cc.fr.300-c87b8969.npz', 
'c87b89697779a76650b5a583a5d682809d73a794'), + 'cc.frr.300': ('cc.frr.300-dd4b3bdf.npz', 'dd4b3bdf9d6df61f7e9e94cdf754dee46ec15fa7'), + 'cc.fy.300': ('cc.fy.300-7eb20794.npz', '7eb20794c65568e6fabe5c9df974d1951035a819'), + 'cc.ga.300': ('cc.ga.300-8d09df0e.npz', '8d09df0e774f7ac3fe7cafb8ed67dd58388743a9'), + 'cc.gd.300': ('cc.gd.300-606435bb.npz', '606435bba611f1c77a59e684ac44c3e852d31beb'), + 'cc.gl.300': ('cc.gl.300-a58a25da.npz', 'a58a25da563958c3ae70fc023c4f25bd2fc8a75d'), + 'cc.gom.300': ('cc.gom.300-ec42b285.npz', 'ec42b285a4cbb43ca76056ea0873f0a3b4c19a2e'), + 'cc.gu.300': ('cc.gu.300-ddfdc7d5.npz', 'ddfdc7d5351cbf95838050af53c76e874a013f3d'), + 'cc.gv.300': ('cc.gv.300-df66ebec.npz', 'df66ebec5580e2e8ce5ed42b0116768ac1e63d43'), + 'cc.he.300': ('cc.he.300-bd197a43.npz', 'bd197a43f2600b73f42480f901f8fafb9056e334'), + 'cc.hi.300': ('cc.hi.300-e8f1a8ee.npz', 'e8f1a8ee11d469ee007ac66f1a6ae2d9cf996fde'), + 'cc.hif.300': ('cc.hif.300-cd787567.npz', 'cd7875675c126764f76394435d114a0405a6341b'), + 'cc.hr.300': ('cc.hr.300-f33745d1.npz', 'f33745d1c8e966932d5034f248bcc22b7f8d2297'), + 'cc.hsb.300': ('cc.hsb.300-2c0e9847.npz', '2c0e9847177614324fd271e9c9fa5524969090e9'), + 'cc.ht.300': ('cc.ht.300-3192d8a6.npz', '3192d8a632af0d617d9a9c9e78f4015d0f594131'), + 'cc.hu.300': ('cc.hu.300-08a106da.npz', '08a106da56e64d2b8db4306cbacc99086b49659d'), + 'cc.hy.300': ('cc.hy.300-935747f2.npz', '935747f2a88dff9edf957723fe4736d6ebcf1d6a'), + 'cc.ia.300': ('cc.ia.300-737f4b78.npz', '737f4b78c4fa857a575ea01e6946217b69616ee9'), + 'cc.id.300': ('cc.id.300-b50ae07a.npz', 'b50ae07a0663023c3c117305c05b09143a167700'), + 'cc.ilo.300': ('cc.ilo.300-02b500a7.npz', '02b500a7e10d206239aa502590ff4768840e29c0'), + 'cc.io.300': ('cc.io.300-aaf228a7.npz', 'aaf228a78ce7dc1138181a32011c4d495e383e81'), + 'cc.is.300': ('cc.is.300-2f612f20.npz', '2f612f20cfdcd68b5eb46dacef4ba30f00069f55'), + 'cc.it.300': ('cc.it.300-5b21ee40.npz', '5b21ee408ab99c35a2bdd25e716975e0b73182ad'), + 'cc.ja.300': ('cc.ja.300-89cf6cb7.npz', '89cf6cb70985ca841246139719028103e7a932f8'), + 'cc.jv.300': ('cc.jv.300-85d4a52b.npz', '85d4a52b83150aa46ea60887d58e12f5b8fbc732'), + 'cc.ka.300': ('cc.ka.300-048778a9.npz', '048778a9ac39f9e4fc2b216f5fe752864c793295'), + 'cc.kk.300': ('cc.kk.300-f29ac700.npz', 'f29ac7000778e5adea0c9aa00f11fdcc47386adf'), + 'cc.km.300': ('cc.km.300-b9a2073f.npz', 'b9a2073f8d325e49934ac919cbe222ac84ef77af'), + 'cc.kn.300': ('cc.kn.300-034e4f17.npz', '034e4f17d08351a890896d9b7d2573a88d0fc230'), + 'cc.ko.300': ('cc.ko.300-28e7ae64.npz', '28e7ae64e994b9989f042323ec6f15f5ad7a53d2'), + 'cc.ku.300': ('cc.ku.300-82496bfd.npz', '82496bfd5e23f697b17ae4f351c1a13b12f482b5'), + 'cc.ky.300': ('cc.ky.300-4efb03dc.npz', '4efb03dc26319fd813edfe7cd1c1f373d134ac97'), + 'cc.la.300': ('cc.la.300-8adf2142.npz', '8adf2142b05825aa040d4b96afee9f08b00b94fe'), + 'cc.lb.300': ('cc.lb.300-8945d3df.npz', '8945d3dfb24fbe5ac9391aa10296f9a18992c380'), + 'cc.li.300': ('cc.li.300-0b6aee43.npz', '0b6aee432ca667db8b06dfb29fd249d00df7a275'), + 'cc.lmo.300': ('cc.lmo.300-a02cf032.npz', 'a02cf032a3af3a035bec24b2b225048bf0f86eb5'), + 'cc.lt.300': ('cc.lt.300-2b682e7d.npz', '2b682e7d30e739c4090fef65593d29f40f1323f9'), + 'cc.lv.300': ('cc.lv.300-c2453825.npz', 'c24538254050ce393b7ed8a018ed27a693c9dfa1'), + 'cc.mai.300': ('cc.mai.300-4bc11fe3.npz', '4bc11fe3470e23c79ad5bc2a16283ede92857f9b'), + 'cc.mg.300': ('cc.mg.300-2b644c0f.npz', '2b644c0fe3d32d89cf4a91a62b30a8cdf84eb69e'), + 'cc.mhr.300': ('cc.mhr.300-f9216c88.npz', 'f9216c883afb62c0fd890f1df1f076fda48d534d'), + 
'cc.min.300': ('cc.min.300-0d8c3a77.npz', '0d8c3a7709acb9386c9bfb29294c56923e2fe160'), + 'cc.mk.300': ('cc.mk.300-bf1caa91.npz', 'bf1caa91a2376a6e4cae5576e84c00bd7b4d53ea'), + 'cc.ml.300': ('cc.ml.300-0c3baa74.npz', '0c3baa74f062367e00c2df09446480df4ef45b79'), + 'cc.mn.300': ('cc.mn.300-4ce878ea.npz', '4ce878ea1e15afffefa257374a8e265bc26b9d19'), + 'cc.mr.300': ('cc.mr.300-a9e08f9d.npz', 'a9e08f9d21627dc34318208aebb0b7a8f78aeb47'), + 'cc.mrj.300': ('cc.mrj.300-1ebb04f1.npz', '1ebb04f175c1e115c77167222cbfc434474dbf11'), + 'cc.ms.300': ('cc.ms.300-05216b2f.npz', '05216b2f3f76af940bd343135be243c8ed5f0de5'), + 'cc.mt.300': ('cc.mt.300-2a3ba408.npz', '2a3ba408205f3552f432d97fca3de797aa086c62'), + 'cc.mwl.300': ('cc.mwl.300-e758f2b4.npz', 'e758f2b42482ad9d1caf1d4f46f8873a61f6e0ce'), + 'cc.my.300': ('cc.my.300-443b1674.npz', '443b16746ef8f23d64a397b21b1d0ae5707d1e5c'), + 'cc.myv.300': ('cc.myv.300-67d19cef.npz', '67d19ceffa23cb376c309f9e985222579abb9ac1'), + 'cc.mzn.300': ('cc.mzn.300-ce441f50.npz', 'ce441f50a28f0b1f02b47ee1608297750ef0cabd'), + 'cc.nah.300': ('cc.nah.300-ba4c46c0.npz', 'ba4c46c0089109c65cfc25735c5000845345d56e'), + 'cc.nap.300': ('cc.nap.300-489727c7.npz', '489727c7e8241f98c0c7049387a12a101f625226'), + 'cc.nds.300': ('cc.nds.300-6265356d.npz', '6265356d5822372b15ca5103b5ef43f8b3ff4a1c'), + 'cc.ne.300': ('cc.ne.300-6b66b354.npz', '6b66b3542ec091054c2735cb3358b349c8ca3a87'), + 'cc.new.300': ('cc.new.300-f4747761.npz', 'f4747761d9827340fd8074a08e61769ab3a7cc11'), + 'cc.nl.300': ('cc.nl.300-1867fc6d.npz', '1867fc6d6f466fb0d2f46623530c21f4c149197b'), + 'cc.nn.300': ('cc.nn.300-3ce324ef.npz', '3ce324eff15cd595e81bb1882d23b7948772f400'), + 'cc.no.300': ('cc.no.300-6e39d0d5.npz', '6e39d0d5e205c7106be0a601dbd59455f17c97e1'), + 'cc.nso.300': ('cc.nso.300-90ba6fda.npz', '90ba6fdac1eb6d14d6de69b6a794a2a6194ea5a2'), + 'cc.oc.300': ('cc.oc.300-c66e395b.npz', 'c66e395bce5ad97815b7273556b6225e72b76a0d'), + 'cc.or.300': ('cc.or.300-8b14cde1.npz', '8b14cde1cee9858052430c9ecc6cfa8a16fcab39'), + 'cc.os.300': ('cc.os.300-1a888846.npz', '1a8888460713455b33f9cd2ddd957a1afda6ce45'), + 'cc.pa.300': ('cc.pa.300-185e0f43.npz', '185e0f4361f927fca6c0ff27cf8e1963d627041c'), + 'cc.pam.300': ('cc.pam.300-9ac2a111.npz', '9ac2a1112ffbe96cc9ba226a4cc9331f696db6e9'), + 'cc.pfl.300': ('cc.pfl.300-ca0cddb4.npz', 'ca0cddb47b061380b2522041d9472ca06240b92f'), + 'cc.pl.300': ('cc.pl.300-98c4c23c.npz', '98c4c23c78824a3801d05f9d67a696f9fcc40683'), + 'cc.pms.300': ('cc.pms.300-cbd33047.npz', 'cbd3304720df204b5c8ec576e3abf9306728f5d4'), + 'cc.pnb.300': ('cc.pnb.300-66b69add.npz', '66b69addc40b633af8d06c7d5fd8066ac64c8a2c'), + 'cc.ps.300': ('cc.ps.300-ce11c971.npz', 'ce11c97193ee28cdb53ed72c68b20465a607f957'), + 'cc.pt.300': ('cc.pt.300-985866df.npz', '985866df8e19f52f410e6251b82e3bb4494b0d1d'), + 'cc.qu.300': ('cc.qu.300-0ae2e211.npz', '0ae2e2111ee2e7aab1116b587fe302d86ab3641e'), + 'cc.rm.300': ('cc.rm.300-b2367a7f.npz', 'b2367a7f72e26ab5a844b7d8483a455dc73e1994'), + 'cc.ro.300': ('cc.ro.300-682eda30.npz', '682eda308c2a14041e71d0d0588dd17681294ec5'), + 'cc.ru.300': ('cc.ru.300-0f9c3b90.npz', '0f9c3b905669e0f2a32196bc2a391fe2ef40f5a4'), + 'cc.sa.300': ('cc.sa.300-1d0f5bfd.npz', '1d0f5bfdc3118f9ad77e704a7d808c9231cb13eb'), + 'cc.sah.300': ('cc.sah.300-f8532800.npz', 'f85328005fd0d2f0f09cbdc24760b20180b537c4'), + 'cc.sc.300': ('cc.sc.300-dcb4fb26.npz', 'dcb4fb26874067e449f609f9b2cd138b60606d33'), + 'cc.scn.300': ('cc.scn.300-e72c5de7.npz', 'e72c5de7693efab12d6f9a6508c0a5fa8845bd04'), + 'cc.sco.300': ('cc.sco.300-13d0662d.npz', 
'13d0662d930c18bf0c8dff7efc01219712d38351'), + 'cc.sd.300': ('cc.sd.300-baf6b5af.npz', 'baf6b5afa56fd93d78175307b3d84dbad85d2710'), + 'cc.sh.300': ('cc.sh.300-b6958a18.npz', 'b6958a18612551a7b325022fcbbdfa2637fb4411'), + 'cc.si.300': ('cc.si.300-c846206f.npz', 'c846206fd77cdb5619752e3cb448dbcc0dac61b6'), + 'cc.sk.300': ('cc.sk.300-4fc2be73.npz', '4fc2be73972027f55f88e288dc3c0ecb2a6fba42'), + 'cc.sl.300': ('cc.sl.300-6b735538.npz', '6b7355380cee5320c42b1aa8d41226a562bc6407'), + 'cc.so.300': ('cc.so.300-f09c2019.npz', 'f09c2019099cd0ae488810a26ceea3437c450711'), + 'cc.sq.300': ('cc.sq.300-c55b576c.npz', 'c55b576c14d06489d6105a7a1d9126e181877030'), + 'cc.sr.300': ('cc.sr.300-887cf83d.npz', '887cf83d9525573407f4206b268e6e3c31266403'), + 'cc.su.300': ('cc.su.300-224e9974.npz', '224e99745371a890c6c944aae94d6c5d637876b5'), + 'cc.sv.300': ('cc.sv.300-c5266ab8.npz', 'c5266ab8ee8a1c093b9437c6b598659c72dd1f7e'), + 'cc.sw.300': ('cc.sw.300-829d1ca0.npz', '829d1ca09c78d52280497fd257883292b53e8bc7'), + 'cc.ta.300': ('cc.ta.300-f8e9bee4.npz', 'f8e9bee4ea31e6952ff582855200ade217c11d9b'), + 'cc.te.300': ('cc.te.300-19b7470c.npz', '19b7470cd59e6a67986267cf9171c6d02340c1d1'), + 'cc.tg.300': ('cc.tg.300-069920da.npz', '069920da67fcaba58e2bd5f4bd505e3cd0b325ab'), + 'cc.th.300': ('cc.th.300-17f28dd9.npz', '17f28dd9fabe987d6017ed4ed0897e3f01ab18a2'), + 'cc.tk.300': ('cc.tk.300-ac4be8fe.npz', 'ac4be8fe39901913ac0ae5b9ec023ffdb2c4ccdd'), + 'cc.tl.300': ('cc.tl.300-919ed791.npz', '919ed791c7cc2f7f1a214611fa12d32187f186c5'), + 'cc.tr.300': ('cc.tr.300-cbd537c6.npz', 'cbd537c606cd41c6a90e2de71ad7dc2902e63363'), + 'cc.tt.300': ('cc.tt.300-c38d9317.npz', 'c38d93176d6fa35658352bd74a0fb81e0edf3d3a'), + 'cc.ug.300': ('cc.ug.300-d99daa00.npz', 'd99daa007459f8129508fdd3ffcab135083309d5'), + 'cc.uk.300': ('cc.uk.300-a20e9dab.npz', 'a20e9dab63d727dc4c03a99caf66c8a194a2f96e'), + 'cc.ur.300': ('cc.ur.300-57ca0636.npz', '57ca06364b7413195d75adda61c00f972ab1a43e'), + 'cc.uz.300': ('cc.uz.300-0f2eef78.npz', '0f2eef78cc68f14a89b6b6a2d6ac4b6613d080e9'), + 'cc.vec.300': ('cc.vec.300-c9bfa76a.npz', 'c9bfa76a5dc2787923f26ea314f3a5a957fcd00c'), + 'cc.vi.300': ('cc.vi.300-44d740f4.npz', '44d740f4044cb2cd8d442327c17b99c359a8ca43'), + 'cc.vls.300': ('cc.vls.300-08e7eaba.npz', '08e7eaba113d6084d1a25958165428486caf7d19'), + 'cc.vo.300': ('cc.vo.300-792d3a79.npz', '792d3a7983bb4560d05932d1a2de35203c6b9479'), + 'cc.wa.300': ('cc.wa.300-bfc87d4c.npz', 'bfc87d4c6738770afd3b9f0f5e7adccd79131a03'), + 'cc.war.300': ('cc.war.300-d8a9082f.npz', 'd8a9082f98ebd8b312cafbc90c50a8ab421dc06b'), + 'cc.xmf.300': ('cc.xmf.300-8bc1fdf1.npz', '8bc1fdf1db5f2716a2d6542c3ae6d5a4abbd2506'), + 'cc.yi.300': ('cc.yi.300-33533193.npz', '33533193c9b710d2ca283e8978b83e3842ea8f5d'), + 'cc.yo.300': ('cc.yo.300-9dc5edde.npz', '9dc5eddeee3354f3f587ecdca73caf01080c399d'), + 'cc.zea.300': ('cc.zea.300-c0a4fb02.npz', 'c0a4fb025aab6774db52d6a845108f5cf8738508'), + 'cc.zh.300': ('cc.zh.300-355cfcaf.npz', '355cfcafe71536226a1737aafb4530c9ba4fd09f'), +} + +FAST_TEXT_BIN_SHA1 = { + 'wiki-news-300d-1M-subword': ('wiki-news-300d-1M-subword-c8853bda.bin', + 'c8853bdae00318097b6337c4631d342879d6b18c'), + 'crawl-300d-2M-subword': ('crawl-300d-2M-subword-e6b07293f.bin', + 'e6b07293f7b0095e3c72c2a12bc09464b69444b0'), + 'cc.af.300': ('cc.af.300-33115ff8.bin', '33115ff8e4c8f439757c819399177f1f58f07f12'), + 'cc.als.300': ('cc.als.300-d6579933.bin', 'd65799331a03895d68a3fbe7611b181d7e7cc916'), + 'cc.am.300': ('cc.am.300-999b3e95.bin', '999b3e95a2c490d7fcab2a6e08074746303d3c17'), + 'cc.an.300': 
('cc.an.300-65f5c5b8.bin', '65f5c5b88d1c8181ce60aff4275d14e8a7c4ae53'), + 'cc.ar.300': ('cc.ar.300-44333e53.bin', '44333e5344fe66e78322b05bf53d6047925097ee'), + 'cc.arz.300': ('cc.arz.300-430f08ff.bin', '430f08ffc7f9391ed09c781fabc30baf568b8d47'), + 'cc.as.300': ('cc.as.300-e85d59f6.bin', 'e85d59f6fe2e908b3caab3a8bc9bfd23d6885eb2'), + 'cc.ast.300': ('cc.ast.300-4bdc4520.bin', '4bdc452067cc838e49a1544902941470ff685b12'), + 'cc.az.300': ('cc.az.300-10a62cca.bin', '10a62cca45f99e977accd28912ee18c74332080e'), + 'cc.azb.300': ('cc.azb.300-5f148a4f.bin', '5f148a4f2d8feecb217da604ca02fabd0fc112fd'), + 'cc.ba.300': ('cc.ba.300-9310a2c1.bin', '9310a2c11fda72ec87493bd4d65330537911b09a'), + 'cc.bar.300': ('cc.bar.300-35ab084b.bin', '35ab084b3e3972419534bd60197a564c27ca90e0'), + 'cc.bcl.300': ('cc.bcl.300-cf0fb2f8.bin', 'cf0fb2f8f1cbd04ad12bebb9846d7636333de556'), + 'cc.be.300': ('cc.be.300-ac4ef017.bin', 'ac4ef017d975f0649c294f57fb83a3bddf55e137'), + 'cc.bg.300': ('cc.bg.300-a5a375ef.bin', 'a5a375ef5f670c0a0926aa1a8025df3190cfc2d0'), + 'cc.bh.300': ('cc.bh.300-776d7f4d.bin', '776d7f4d102a574cffba45a43b1913b2e23c6d94'), + 'cc.bn.300': ('cc.bn.300-e327bd67.bin', 'e327bd678adbda1b4ace3e020a0329f6146d9f6f'), + 'cc.bo.300': ('cc.bo.300-33174d4f.bin', '33174d4f9ffa87f71c401260a5a6008cdaac61cb'), + 'cc.bpy.300': ('cc.bpy.300-4f8f3598.bin', '4f8f35987bc35b30d11b189f7066c41510331d4e'), + 'cc.br.300': ('cc.br.300-7a48b869.bin', '7a48b869104057ba097c210d847de2f76ec748fb'), + 'cc.bs.300': ('cc.bs.300-8a237bd9.bin', '8a237bd9a530f8feee7feaab583d89028e26be8d'), + 'cc.ca.300': ('cc.ca.300-db0f7120.bin', 'db0f7120e03604c8dcedb57582cee4f7d5d9c90c'), + 'cc.ce.300': ('cc.ce.300-5f8cebac.bin', '5f8cebac11c1fbbb23540655f83d8afe1b1a7760'), + 'cc.ceb.300': ('cc.ceb.300-89a4764f.bin', '89a4764f1ff3dc073a76fda3290f04fe5adf83ac'), + 'cc.ckb.300': ('cc.ckb.300-990d1cef.bin', '990d1cef7d7d36c12f7b9afe2381251169417499'), + 'cc.co.300': ('cc.co.300-836763a0.bin', '836763a0b4e40facde79983f5156d8c6a875dffb'), + 'cc.cs.300': ('cc.cs.300-884c693a.bin', '884c693a557633cd711bcd2888a4088bdc74723e'), + 'cc.cv.300': ('cc.cv.300-0dfbf016.bin', '0dfbf0168205c4ca02fd7f249c8a4f7caec6ea2a'), + 'cc.cy.300': ('cc.cy.300-cb4b9534.bin', 'cb4b953463170fa209c2ce9991bea3a07575e9de'), + 'cc.da.300': ('cc.da.300-6b65b204.bin', '6b65b204ff034184c785678655ffc9fa7b642b34'), + 'cc.de.300': ('cc.de.300-fc6e4385.bin', 'fc6e438502a3b8aadf119d117f85120a3cc28bae'), + 'cc.diq.300': ('cc.diq.300-490f18c4.bin', '490f18c4a8963ca511a8d064da68eddd05f44e7f'), + 'cc.dv.300': ('cc.dv.300-4ffe23d6.bin', '4ffe23d6bd18a1ba6273601e543932abf69d4651'), + 'cc.el.300': ('cc.el.300-7a89986b.bin', '7a89986b681f178b92f3af015aaa4900ba6dd6a6'), + 'cc.eml.300': ('cc.eml.300-8a6221bf.bin', '8a6221bfd1d98e1d14c89db54436d17f505b065d'), + 'cc.en.300': ('cc.en.300-53588c22.bin', '53588c22cac7f8bf504169f671206b60da21d9b2'), + 'cc.eo.300': ('cc.eo.300-5d9aeebb.bin', '5d9aeebb2c19807839ef68b5c5f7897d1e8ddd3a'), + 'cc.es.300': ('cc.es.300-e9f8c041.bin', 'e9f8c04142005cadae449f016e1bebf7ae254307'), + 'cc.et.300': ('cc.et.300-3f4391ed.bin', '3f4391edec8cf6aafcf9857bf465439f00b84a1a'), + 'cc.eu.300': ('cc.eu.300-142f1337.bin', '142f1337d51569f9254a50bdcfe125c028f28bb5'), + 'cc.fa.300': ('cc.fa.300-3d6ad675.bin', '3d6ad6750c27ad94e3498314a985d81bf20130f0'), + 'cc.fi.300': ('cc.fi.300-edbd8e6e.bin', 'edbd8e6e56ab951429911ce7a16d51260773e81c'), + 'cc.fr.300': ('cc.fr.300-35ea5d6b.bin', '35ea5d6b86011a5b85d0671d133acf8aded5fc54'), + 'cc.frr.300': ('cc.frr.300-d87f646a.bin', 
'd87f646a6c3559263217941255856da48d159e4d'), + 'cc.fy.300': ('cc.fy.300-fd96db60.bin', 'fd96db60715adb8aaddc85123e14b3d081ef0ad3'), + 'cc.ga.300': ('cc.ga.300-520acbd7.bin', '520acbd7771703194c8e99b28094ea54fa86a3c7'), + 'cc.gd.300': ('cc.gd.300-781ceb1c.bin', '781ceb1cceaa107adea7c0434677c74906c05e4c'), + 'cc.gl.300': ('cc.gl.300-b71ae11d.bin', 'b71ae11d25dfecfa3dfe83e49b24a85037e83b43'), + 'cc.gom.300': ('cc.gom.300-65ba9b91.bin', '65ba9b9172c78600b5fcccd7514e6f5cb6b34750'), + 'cc.gu.300': ('cc.gu.300-d717959d.bin', 'd717959de35ffdc4be47ea282181f3118fa6af05'), + 'cc.gv.300': ('cc.gv.300-15fb06cb.bin', '15fb06cbfed61516a6014cb04f45e3876b154ae2'), + 'cc.he.300': ('cc.he.300-743fbd32.bin', '743fbd320942c5c48bb4347beb9f24aa5d3b46f4'), + 'cc.hi.300': ('cc.hi.300-75e919aa.bin', '75e919aa43832d6a7f08b8e05d9ddff562ead072'), + 'cc.hif.300': ('cc.hif.300-0c25528b.bin', '0c25528b1f156a61205b96817b0fa9995fa5a2b3'), + 'cc.hr.300': ('cc.hr.300-ab167ebb.bin', 'ab167ebb9a5cdd999500fd1beac2229796923795'), + 'cc.hsb.300': ('cc.hsb.300-62fb0705.bin', '62fb07054f659ce5d9f2e2dda67133649b432611'), + 'cc.ht.300': ('cc.ht.300-292d0eeb.bin', '292d0eebf256811b9cc7d6cd5dccf039d5083cf9'), + 'cc.hu.300': ('cc.hu.300-9d660157.bin', '9d660157bc371de60ead317cce852d506544f0e2'), + 'cc.hy.300': ('cc.hy.300-fa5ac6a1.bin', 'fa5ac6a1eb9e1e4e047bbf8343ea042ded75dd40'), + 'cc.ia.300': ('cc.ia.300-a01758dc.bin', 'a01758dcab7138e5f67e9fd58c23b18e88142b4b'), + 'cc.id.300': ('cc.id.300-609f02da.bin', '609f02daa0c13e544c52314452bf077f6f769019'), + 'cc.ilo.300': ('cc.ilo.300-199068ee.bin', '199068ee56ce25ac16b6ba70c3ae337f8eed9d96'), + 'cc.io.300': ('cc.io.300-80565e7e.bin', '80565e7e7e71b28e247ebb85da1e767cc62e7c38'), + 'cc.is.300': ('cc.is.300-b228019a.bin', 'b228019ac716a60a4da057e787a64b0b53c1a1ec'), + 'cc.it.300': ('cc.it.300-411f0ed7.bin', '411f0ed74448758f25c66699eec582ff5f9d2cc2'), + 'cc.ja.300': ('cc.ja.300-806f7e68.bin', '806f7e68c0c832afb5d70f0c072189fbb4d44108'), + 'cc.jv.300': ('cc.jv.300-af9d3f82.bin', 'af9d3f823a4d87c0dcf85d8b7ba753e1145cc4f9'), + 'cc.ka.300': ('cc.ka.300-7189ff30.bin', '7189ff30be4d8e45b394149b1fd9f3db794e3b42'), + 'cc.kk.300': ('cc.kk.300-621de409.bin', '621de40935e740a063a945402111b3bb7c619c08'), + 'cc.km.300': ('cc.km.300-6410e183.bin', '6410e1832df131f309337416e4ed07a19bd22d9f'), + 'cc.kn.300': ('cc.kn.300-dfcf2d68.bin', 'dfcf2d68bf43a1dcbb5d01e5076db16132e27d8f'), + 'cc.ko.300': ('cc.ko.300-b7990877.bin', 'b7990877d498f084adf300f63b53565bc868b520'), + 'cc.ku.300': ('cc.ku.300-eb62ada0.bin', 'eb62ada0e5bf9cf0535f1fe80d47b136665a8e3a'), + 'cc.ky.300': ('cc.ky.300-01ae0d23.bin', '01ae0d2346e12e30b7ac0422cfd1f3ad6cb701da'), + 'cc.la.300': ('cc.la.300-08e402f3.bin', '08e402f3d0d10da67444890fe15ca09e563f11a6'), + 'cc.lb.300': ('cc.lb.300-c3b5e0a3.bin', 'c3b5e0a3ee790f21f12c17f6302e73ba0ee644f7'), + 'cc.li.300': ('cc.li.300-b7c9c792.bin', 'b7c9c79273458c4110786f4a89c1fa0ec9bcaa80'), + 'cc.lmo.300': ('cc.lmo.300-b7da2fe8.bin', 'b7da2fe85b58341e63379a0e22ccb84a7d2466ba'), + 'cc.lt.300': ('cc.lt.300-73413b3f.bin', '73413b3f0072abf2eb7666795d77e9f0e85b327a'), + 'cc.lv.300': ('cc.lv.300-725f5e2a.bin', '725f5e2a1e66173d73cbf103fceb5f86844e2278'), + 'cc.mai.300': ('cc.mai.300-3de31332.bin', '3de31332d7afde6a93e3d05c05212d27ea538d3d'), + 'cc.mg.300': ('cc.mg.300-0c7757e2.bin', '0c7757e2b3417cea49c679291e6e7bfe8f3653d5'), + 'cc.mhr.300': ('cc.mhr.300-1013afe9.bin', '1013afe9cd1428e5915feebc3a2b189d8d77f9d0'), + 'cc.min.300': ('cc.min.300-1d684a9b.bin', '1d684a9bead229e94c9b538fc9aebb1235c6e68f'), + 
'cc.mk.300': ('cc.mk.300-f9ba6f8e.bin', 'f9ba6f8eddb4e577bf44475f831984c70e371719'), + 'cc.ml.300': ('cc.ml.300-bc6a2b1c.bin', 'bc6a2b1c2743bc2749fc8072a8276e2beb3f9a22'), + 'cc.mn.300': ('cc.mn.300-7637ae47.bin', '7637ae47bb925fa77fe82dbe2d20eb3c56b517ee'), + 'cc.mr.300': ('cc.mr.300-3e5eb45e.bin', '3e5eb45e7475dd3115ef5cc91e7b0257989fde18'), + 'cc.mrj.300': ('cc.mrj.300-1593ea78.bin', '1593ea786cfcf70ba4feffda09de2dc2f2bcf80d'), + 'cc.ms.300': ('cc.ms.300-a743adf6.bin', 'a743adf6420ad8a7aa146d9218f14d1bdf5c3285'), + 'cc.mt.300': ('cc.mt.300-87c3b72c.bin', '87c3b72cfcd0383d7edb8f4075106f51d5e6b03c'), + 'cc.mwl.300': ('cc.mwl.300-5d3cc773.bin', '5d3cc7739062030b9733f5dcbd64fdb1f3d397ec'), + 'cc.my.300': ('cc.my.300-b84b8c93.bin', 'b84b8c93cbb60178ada74e23caf85cf443208739'), + 'cc.myv.300': ('cc.myv.300-ccf32608.bin', 'ccf32608c23258ff0b381b07ee6b4a1374fac29f'), + 'cc.mzn.300': ('cc.mzn.300-00c010f4.bin', '00c010f4c43e3ddb4fcdc29b4a946dafd5196151'), + 'cc.nah.300': ('cc.nah.300-052fcbbc.bin', '052fcbbc5fd6891ef38250b3731987e480ba072d'), + 'cc.nap.300': ('cc.nap.300-aa45c158.bin', 'aa45c158119e095eb186098e47b8037dcca0c847'), + 'cc.nds.300': ('cc.nds.300-c934b13a.bin', 'c934b13ab1a06ea288461b12b9065a13d6e6438e'), + 'cc.ne.300': ('cc.ne.300-7f70c5b9.bin', '7f70c5b9b7b9f598c041c7d8454d1d12e41005df'), + 'cc.new.300': ('cc.new.300-4f8f8762.bin', '4f8f876293ff7096f4fe0ed13148dd116bf57ce5'), + 'cc.nl.300': ('cc.nl.300-fb2cb6e7.bin', 'fb2cb6e75fff23b26d220395b6e2869be083722e'), + 'cc.nn.300': ('cc.nn.300-085e9ef7.bin', '085e9ef79e6bb147d53081a407d598658562dab1'), + 'cc.no.300': ('cc.no.300-d3028680.bin', 'd3028680f0e5458d2272ea14ee56a10820e4e406'), + 'cc.nso.300': ('cc.nso.300-6cc24a78.bin', '6cc24a78780f4da1a18d5da310217dc21acc1977'), + 'cc.oc.300': ('cc.oc.300-8cee765a.bin', '8cee765a77d21044792895b4fe32d56e8287c200'), + 'cc.or.300': ('cc.or.300-64fb17ff.bin', '64fb17ffcd76db9836be3a8c553b0c973232b4fa'), + 'cc.os.300': ('cc.os.300-e5c880f6.bin', 'e5c880f6499b1ea9f9d554d0ea356f914e4c4657'), + 'cc.pa.300': ('cc.pa.300-3673544d.bin', '3673544dea157cfcae180fe5a444457c7bed462e'), + 'cc.pam.300': ('cc.pam.300-1e894611.bin', '1e894611ec170839348af1f767164230f7225c94'), + 'cc.pfl.300': ('cc.pfl.300-ac9babfd.bin', 'ac9babfd17941341fdd06f9dc23aeb3dd315952a'), + 'cc.pl.300': ('cc.pl.300-ea55590b.bin', 'ea55590b385ca9c8ff409a807a4635624c73693e'), + 'cc.pms.300': ('cc.pms.300-523564e9.bin', '523564e993e7925c706c039d444c4048fa19658e'), + 'cc.pnb.300': ('cc.pnb.300-d09b6003.bin', 'd09b6003f0852f698f9589523e717c3be9b0e230'), + 'cc.ps.300': ('cc.ps.300-0cb19e87.bin', '0cb19e874d83664980312fa659f1f7269f1459e4'), + 'cc.pt.300': ('cc.pt.300-e69e6c5b.bin', 'e69e6c5b1ba0e802755c227d2161106caabb6b3d'), + 'cc.qu.300': ('cc.qu.300-f99c269d.bin', 'f99c269de57ff62ec2a580307e239d80d6c0ac1a'), + 'cc.rm.300': ('cc.rm.300-20d2cdcd.bin', '20d2cdcd8fbb49e000eb588e969046f4a4058c9b'), + 'cc.ro.300': ('cc.ro.300-30900544.bin', '309005440433a108017444689d8605709c5bd0ee'), + 'cc.ru.300': ('cc.ru.300-fd892a10.bin', 'fd892a10914cde02c4f1348f9b03d25d45e0d2d3'), + 'cc.sa.300': ('cc.sa.300-91f3b393.bin', '91f3b3931e2b6f4ab6fb092032df7218d400d330'), + 'cc.sah.300': ('cc.sah.300-ea2c7d00.bin', 'ea2c7d00ccfd6b02a928d1f9326986c64cc6e558'), + 'cc.sc.300': ('cc.sc.300-6879c580.bin', '6879c58057dd2eeff50ef158fe53a26ae9050070'), + 'cc.scn.300': ('cc.scn.300-4fb8dad7.bin', '4fb8dad71966dffe5c70efe330a1f881df6227dd'), + 'cc.sco.300': ('cc.sco.300-191f6929.bin', '191f6929dcf10e7d6198529108156b9dd48b23be'), + 'cc.sd.300': ('cc.sd.300-de045844.bin', 
'de045844a43e931db0183bdce996110a8593aa63'), + 'cc.sh.300': ('cc.sh.300-529f81f1.bin', '529f81f1d5fec8c2208e976a11406c93de3e8920'), + 'cc.si.300': ('cc.si.300-c66d404a.bin', 'c66d404a889326a06e20c3cfe7eea80d866a1d13'), + 'cc.sk.300': ('cc.sk.300-2ed40f6a.bin', '2ed40f6aa0bcd369d1450bcea9b0cfab16e6d6d0'), + 'cc.sl.300': ('cc.sl.300-da689ced.bin', 'da689cedcfef7914985e6085df7d0c4ef68da657'), + 'cc.so.300': ('cc.so.300-07b7260a.bin', '07b7260aff73dc829cb49eb2bd72584122337b7f'), + 'cc.sq.300': ('cc.sq.300-440b0444.bin', '440b04440edb4d26751c4b13010d0335972808d8'), + 'cc.sr.300': ('cc.sr.300-23f9d7d9.bin', '23f9d7d93f7f4d0bfb73dea047eae5f4d67aef23'), + 'cc.su.300': ('cc.su.300-5d7d8243.bin', '5d7d82438fb71594a31ea46c0b5580ac41b37ece'), + 'cc.sv.300': ('cc.sv.300-6fafdc44.bin', '6fafdc4452a30350ded92e9309bea658f2a31279'), + 'cc.sw.300': ('cc.sw.300-73909439.bin', '7390943941f25d75fe2bdd9894b2d49f32b1a74b'), + 'cc.ta.300': ('cc.ta.300-2e0386c4.bin', '2e0386c410927b53eafbc63b07ab45ff27d6dac9'), + 'cc.te.300': ('cc.te.300-e77f5ea9.bin', 'e77f5ea9e2e726607bdfc6634cf5eac0b9f7d5b5'), + 'cc.tg.300': ('cc.tg.300-ba451c18.bin', 'ba451c18ba027b5e12fb1d129aefe1dc8e10b451'), + 'cc.th.300': ('cc.th.300-5b8a7299.bin', '5b8a729925df8059de767a393e1c9cfef8d94a41'), + 'cc.tk.300': ('cc.tk.300-3f602443.bin', '3f602443ebee0d49cc181f4f21f21bd9590a31d5'), + 'cc.tl.300': ('cc.tl.300-afee5714.bin', 'afee5714639cfe4d145bf1ef6294da065bc65b37'), + 'cc.tr.300': ('cc.tr.300-5ac2d698.bin', '5ac2d698881a330dfeb554c43cd3737605f04e66'), + 'cc.tt.300': ('cc.tt.300-8b467e9d.bin', '8b467e9d9834df62075829c00a007618280a3980'), + 'cc.ug.300': ('cc.ug.300-8dd88596.bin', '8dd88596669dba3822a701baab0fabb5c97ed7cb'), + 'cc.uk.300': ('cc.uk.300-89630e2d.bin', '89630e2d47dac2e0c7a2036e4a6021b5323dd5aa'), + 'cc.ur.300': ('cc.ur.300-997b377c.bin', '997b377c148c50f9e39ccf085e2316e23da54228'), + 'cc.uz.300': ('cc.uz.300-7f1e67da.bin', '7f1e67dae218977ffc2da9c7160a4ac268fa4199'), + 'cc.vec.300': ('cc.vec.300-21e1d068.bin', '21e1d068086afcaecc3fa585b72697eb5ca3aeee'), + 'cc.vi.300': ('cc.vi.300-3c52cba2.bin', '3c52cba2d0c5fbf781eef4068e31f6c53ba7ed8f'), + 'cc.vls.300': ('cc.vls.300-6ffd43bb.bin', '6ffd43bb11eceec01fd5f0d6fefc96f9c14a17f1'), + 'cc.vo.300': ('cc.vo.300-70751ce3.bin', '70751ce3c3867fa9ddc5fd7e435fcc9f1334e796'), + 'cc.wa.300': ('cc.wa.300-eaca4696.bin', 'eaca46968edb721849fc99e15647f9f0f2df3eca'), + 'cc.war.300': ('cc.war.300-a89f1676.bin', 'a89f1676dba8d7beae42828c71b43f749da7cbfd'), + 'cc.xmf.300': ('cc.xmf.300-bb054a64.bin', 'bb054a64c6e287173224fbc7bf19f7a365a5866f'), + 'cc.yi.300': ('cc.yi.300-38a25707.bin', '38a257077225bf544e5ac95d36125ccd26f1e45a'), + 'cc.yo.300': ('cc.yo.300-cecf6563.bin', 'cecf6563658de3082db9a197e8e5382d4a9c5b25'), + 'cc.zea.300': ('cc.zea.300-ac403268.bin', 'ac4032686216c76784c743c33e703109584d0a3f'), + 'cc.zh.300': ('cc.zh.300-bbab54e0.bin', 'bbab54e09aa1de478a02de1c2c7c71c3a8d1f4a9'), + 'wiki.aa': ('wiki.aa-19450d26.bin', '19450d26509c90a0a6f00114fb8d25f58e108d90'), + 'wiki.ab': ('wiki.ab-4c3cc463.bin', '4c3cc4637cf8c75abc4377c40e5238d15e506264'), + 'wiki.ace': ('wiki.ace-1d107b15.bin', '1d107b158e94cff010021775ad4d440035d375c0'), + 'wiki.ady': ('wiki.ady-568aebce.bin', '568aebce6b718077f8faa8441a7bd6ffdbbae821'), + 'wiki.af': ('wiki.af-e4c40da8.bin', 'e4c40da87e6628c32d82a736d0178298b8dc612d'), + 'wiki.ak': ('wiki.ak-c1c81013.bin', 'c1c81013d6ec19ad97fbf145df71f72341ff95f0'), + 'wiki.als': ('wiki.als-a77c3e58.bin', 'a77c3e58b0e13eb2a43dc43b4a519dc9f1a10fe8'), + 'wiki.am': ('wiki.am-18ab66cf.bin', 
'18ab66cfc9d1ae84679acc2edb4696ac79b77aec'), + 'wiki.an': ('wiki.an-ad4f3886.bin', 'ad4f3886c2f4794349bf45ddc7af700ce3941aa6'), + 'wiki.ang': ('wiki.ang-1053783b.bin', '1053783bb06e698f1705bd7244cad81134dae6ff'), + 'wiki.ar': ('wiki.ar-48738c73.bin', '48738c73a1438a8b615335deff77864556e783eb'), + 'wiki.arc': ('wiki.arc-9e0740db.bin', '9e0740dbf20d39fccd4b7bed2be329b06cf7270b'), + 'wiki.arz': ('wiki.arz-32384b81.bin', '32384b8102596a459436caf02031d29a2fb31b2a'), + 'wiki.as': ('wiki.as-50765f8c.bin', '50765f8c2a6bb827b843165aa6ab4b25ffb340c2'), + 'wiki.ast': ('wiki.ast-0b3e9cd0.bin', '0b3e9cd0c14ad6e72847f549cf4e2d684c23e2fc'), + 'wiki.av': ('wiki.av-26427883.bin', '2642788376154d8fc10e0f50eefd6ebd956d8211'), + 'wiki.ay': ('wiki.ay-3926337d.bin', '3926337dc3162e5eb369d2c4141aa19d43de7e77'), + 'wiki.az': ('wiki.az-7e1aa3b5.bin', '7e1aa3b54e75aa381104a36d32e9b2b943c58416'), + 'wiki.azb': ('wiki.azb-2a3112fe.bin', '2a3112fedf2eeee37d929f2e24e6ce93651c1e58'), + 'wiki.ba': ('wiki.ba-ddb26431.bin', 'ddb2643124e4d67da1099a6bb7c34ccda9c4d54b'), + 'wiki.bar': ('wiki.bar-76a67a05.bin', '76a67a057330963a89845248718e9c0cf43042e6'), + 'wiki.bat_smg': ('wiki.bat_smg-e6bb57b0.bin', 'e6bb57b0c61e2e4486b29c8437f073893111bc12'), + 'wiki.bcl': ('wiki.bcl-f9b50b40.bin', 'f9b50b40c7398b441db0d023b9546c5623ca81e6'), + 'wiki.be': ('wiki.be-9de13c85.bin', '9de13c852c5e283ac3362d3debc9343dcaf851ed'), + 'wiki.bg': ('wiki.bg-fa0e36e7.bin', 'fa0e36e702301e091dab5b50353f5b93ec99eda1'), + 'wiki.bh': ('wiki.bh-4ea0c4ce.bin', '4ea0c4ce9fb9e7fcec9b102a1d21e51cbea54860'), + 'wiki.bi': ('wiki.bi-d756a260.bin', 'd756a26035e8a4f4c556bfd40470a3504bf47380'), + 'wiki.bjn': ('wiki.bjn-a6bda749.bin', 'a6bda7490f87734de0e9cb0996e6edd68b32d097'), + 'wiki.bm': ('wiki.bm-ddae0aee.bin', 'ddae0aee51f99812ec418ca32ebc7bcb3d7d7afe'), + 'wiki.bn': ('wiki.bn-84a5663f.bin', '84a5663fdeb61edf7a2076b313970d5cef5a1e58'), + 'wiki.bo': ('wiki.bo-ee189a77.bin', 'ee189a7723d6d89088299b2f7b2b7c5c81c1a83c'), + 'wiki.bpy': ('wiki.bpy-7c9cab8b.bin', '7c9cab8bc9317b2c9b782f3684956f62ce5253c9'), + 'wiki.br': ('wiki.br-750d7016.bin', '750d7016b22ac7293214cb26e25562b1fc333165'), + 'wiki.bs': ('wiki.bs-0a2fdd98.bin', '0a2fdd987687a7485f4a1030d013f7b707af2d56'), + 'wiki.bug': ('wiki.bug-3937c2af.bin', '3937c2afd0b6bea60d31b600858002657319dfff'), + 'wiki.bxr': ('wiki.bxr-2b522edb.bin', '2b522edb5fef85bf5c2818f84cc1ffd13e4ddf95'), + 'wiki.ca': ('wiki.ca-fc711e4b.bin', 'fc711e4b5b67de1ebf6cd1f5f99bb09822953781'), + 'wiki.cbk_zam': ('wiki.cbk_zam-b7832a19.bin', 'b7832a1932382f1ce89562042009055a914f7c1e'), + 'wiki.cdo': ('wiki.cdo-e75244c2.bin', 'e75244c2d992bd14c0900546b950a707ee99c79a'), + 'wiki.ce': ('wiki.ce-06dcd3ba.bin', '06dcd3bab08d2caabe5b8e5b7e7fcce233858a43'), + 'wiki.ceb': ('wiki.ceb-35c5cd0f.bin', '35c5cd0f8aeef5a78a9113d0ef856e5470eb6400'), + 'wiki.ch': ('wiki.ch-9bfefcab.bin', '9bfefcab247cef190bbcc711e2982c033df8faab'), + 'wiki.cho': ('wiki.cho-7087a54a.bin', '7087a54a087863d5d7058d35478740cd3fdd716d'), + 'wiki.chr': ('wiki.chr-5e93d639.bin', '5e93d6398b44467f8a57593656127cc3a601d361'), + 'wiki.chy': ('wiki.chy-f119f436.bin', 'f119f43617ba2adbf6f7b10f573c3d6c8daa63a9'), + 'wiki.ckb': ('wiki.ckb-49d6d997.bin', '49d6d99772cac2e25beeb02d9f8f055739d1f369'), + 'wiki.co': ('wiki.co-dd8e6c6c.bin', 'dd8e6c6ce7d740c9a87f10a90b597df2d1a0d883'), + 'wiki.cr': ('wiki.cr-a60b68fc.bin', 'a60b68fc163412717af9d50ce6f8cce90de6089a'), + 'wiki.crh': ('wiki.crh-ae73e838.bin', 'ae73e83881604df8dfeb7ae558be351a57051080'), + 'wiki.cs': ('wiki.cs-a41ff81a.bin', 
'a41ff81af6ef6ff5f692d9719c90fac2261b7c21'), + 'wiki.csb': ('wiki.csb-13121dd7.bin', '13121dd7558b3ea692d73c964bc721db6ccb8d9b'), + 'wiki.cu': ('wiki.cu-968dea66.bin', '968dea66e5856289724c4d1c8290c1012ea97df3'), + 'wiki.cv': ('wiki.cv-a87c66dc.bin', 'a87c66dc67a57af38333aeb375dd33e4c42f327f'), + 'wiki.cy': ('wiki.cy-4cc3571e.bin', '4cc3571ed974dd877daa3b5ffaf486725d4436a0'), + 'wiki.da': ('wiki.da-53f0da01.bin', '53f0da01b102ff17499678e0a3876146365b5de7'), + 'wiki.de': ('wiki.de-2da44d3d.bin', '2da44d3d5ac758a7c1a169f66db4953a020b1df4'), + 'wiki.diq': ('wiki.diq-f31f5534.bin', 'f31f5534d63a6adaf8fce1426fd2b8efa5dbb88e'), + 'wiki.dsb': ('wiki.dsb-1b26e0af.bin', '1b26e0af41d67a4a8a0d4e2e97ff7d8f958daaac'), + 'wiki.dv': ('wiki.dv-32a8ebf5.bin', '32a8ebf59405ad8f9919b18e27f2a2a79bdd3f3f'), + 'wiki.dz': ('wiki.dz-594a371f.bin', '594a371f17bec9ab45514bf5cf26252be7bb8396'), + 'wiki.ee': ('wiki.ee-e0003d72.bin', 'e0003d7287640101f6b9ecb69452cb47afa0d438'), + 'wiki.el': ('wiki.el-9c824bd0.bin', '9c824bd0e0e6888e2bbc065a27e2b45ec4164e8b'), + 'wiki.eml': ('wiki.eml-84490c6b.bin', '84490c6bedef5204c703b5ac5c9ee008147ddaa2'), + 'wiki.en': ('wiki.en-8ca82682.bin', '8ca8268250f81b88119949e0fea5a6b81bcac809'), + 'wiki.eo': ('wiki.eo-7baf04e3.bin', '7baf04e353a607bbddb36f439c4033097d854747'), + 'wiki.es': ('wiki.es-422e6f75.bin', '422e6f7582adff418f527ceb01763296c60e1f31'), + 'wiki.et': ('wiki.et-9cf101e3.bin', '9cf101e3cdb6cdf0b0cb16364769cbb26ef8875a'), + 'wiki.eu': ('wiki.eu-f5637868.bin', 'f56378689a3c14d26b0b1df0483c5b96c9a1ec9f'), + 'wiki.ext': ('wiki.ext-daefc0bc.bin', 'daefc0bc266f14f48a58e5ea632796adb9a36540'), + 'wiki.fa': ('wiki.fa-0b8559e6.bin', '0b8559e6b6506e262de3fb55ea9dec03244badca'), + 'wiki.ff': ('wiki.ff-4d6b11b3.bin', '4d6b11b3d6ccdfa2b06d91c78fd05d7a46106582'), + 'wiki.fi': ('wiki.fi-d1d2f60d.bin', 'd1d2f60da48564f659072dee5b2de1306cf2b590'), + 'wiki.fiu_vro': ('wiki.fiu_vro-fc73c1f3.bin', 'fc73c1f3caec0bff7ca7e3ec006c0c9757d2c8f2'), + 'wiki.fj': ('wiki.fj-d3f97816.bin', 'd3f9781664886cd2e332623615a1db8b2781c925'), + 'wiki.fo': ('wiki.fo-04aeaf7c.bin', '04aeaf7cef283cfdd6766af0c65f6a6b13f6040a'), + 'wiki.fr': ('wiki.fr-ee1dde08.bin', 'ee1dde0800113dcd6124ccb643bd1004184b7559'), + 'wiki.frp': ('wiki.frp-0f64bb1b.bin', '0f64bb1b389e30e3af163e526b72bfc4cab8eb7d'), + 'wiki.frr': ('wiki.frr-576ebf02.bin', '576ebf02e6b9b0acf2ea9da6d35e1a5b2c98648b'), + 'wiki.fur': ('wiki.fur-5ebed3c9.bin', '5ebed3c9e39243479326b288202b9e11d1c434c2'), + 'wiki.fy': ('wiki.fy-811bc386.bin', '811bc3864418110fe914f555216372b2f79b7fb5'), + 'wiki.ga': ('wiki.ga-77b3aa66.bin', '77b3aa6640cc25536b965f5dc512503d8d0c2a6a'), + 'wiki.gag': ('wiki.gag-a732c376.bin', 'a732c376f771309bc196fa8758fd5100c67627f6'), + 'wiki.gan': ('wiki.gan-40a8cfd9.bin', '40a8cfd9889646aa722c32dac2e0a5c76689c6a1'), + 'wiki.gd': ('wiki.gd-0dcdb67d.bin', '0dcdb67d346f0f9abc41554b7ca13190aa8fab16'), + 'wiki.gl': ('wiki.gl-44a91a4c.bin', '44a91a4cf3aaaa68feaa3dc10e16f03cb7d41e53'), + 'wiki.glk': ('wiki.glk-43f0bf43.bin', '43f0bf43b98d2e0477b0671ee9fce7f2cb8da6a3'), + 'wiki.gn': ('wiki.gn-29975179.bin', '2997517997b93f71613b9d008e6d95f68a01ad4b'), + 'wiki.gom': ('wiki.gom-08ba082b.bin', '08ba082b769cf631bfcd100631bfdda77980aa54'), + 'wiki.got': ('wiki.got-bfe0a90d.bin', 'bfe0a90d91343f24c8f9faea325f40395ab8bb8e'), + 'wiki.gu': ('wiki.gu-7d49d055.bin', '7d49d05551425e5661f968b9cc0354e15ea0405f'), + 'wiki.gv': ('wiki.gv-eeea71f6.bin', 'eeea71f64e24c80ff07482825f1ce26be19a69fd'), + 'wiki.ha': ('wiki.ha-87a99090.bin', 
'87a990900d4a74055303585b4d4a89ab7bf0aa47'), + 'wiki.hak': ('wiki.hak-c652e7ff.bin', 'c652e7ff3d63676307bcbd9f4241fb1c6b8cf7ff'), + 'wiki.haw': ('wiki.haw-4f2d842b.bin', '4f2d842b730bec90d0268bff9ca2c8f41fc33987'), + 'wiki.he': ('wiki.he-9c5eb5cd.bin', '9c5eb5cda37954c481d2053bd4e553bbfd34deb4'), + 'wiki.hi': ('wiki.hi-1ca0898a.bin', '1ca0898af562c2ec90a06860ebb27b5a7b0b8cf4'), + 'wiki.hif': ('wiki.hif-8876acba.bin', '8876acbaf8195b94724179aab516234b06f3812a'), + 'wiki.ho': ('wiki.ho-8bc406a1.bin', '8bc406a1defb5703a743a5b424c169b26ffb347c'), + 'wiki.hr': ('wiki.hr-b06384ed.bin', 'b06384ede2bbacae89cb7e94cc9457b0905b410c'), + 'wiki.hsb': ('wiki.hsb-c9cc78b6.bin', 'c9cc78b6e9c1eb2ec3ad57e0f47f2312134de86c'), + 'wiki.ht': ('wiki.ht-da38ff9e.bin', 'da38ff9e8e8e61672422316cba888f2f35fbf9f5'), + 'wiki.hu': ('wiki.hu-a7cd92e6.bin', 'a7cd92e6880b53ee880a7b9de6767a8bd77c9f1a'), + 'wiki.hy': ('wiki.hy-e23e7c36.bin', 'e23e7c36fe63418d46621efee4d4a5248fa7b9ce'), + 'wiki.hz': ('wiki.hz-1a43df11.bin', '1a43df118a1a21f3c9c2d7d43f1cc30900569fdb'), + 'wiki.ia': ('wiki.ia-439e6f2f.bin', '439e6f2f0bf209b7e36e166c8c7e0af6aba34cd2'), + 'wiki.id': ('wiki.id-4ed7d4aa.bin', '4ed7d4aabb54f0af97ca35e31b79711bec2a033e'), + 'wiki.ie': ('wiki.ie-7b0a9761.bin', '7b0a97617ddaf155b898ed9281ff5d2e78e428ef'), + 'wiki.ig': ('wiki.ig-f588d85a.bin', 'f588d85a10ba426ab4825aaaf66d4a87e82b22dd'), + 'wiki.ii': ('wiki.ii-3214212c.bin', '3214212c59f85e40a6fcd1ceb00b36428c1dcc17'), + 'wiki.ik': ('wiki.ik-6bf795cf.bin', '6bf795cf8233e6bfd312f943d301ff5ce70d70b7'), + 'wiki.ilo': ('wiki.ilo-17eb1eff.bin', '17eb1eff170510e85874f83136ead4cc9a2121a6'), + 'wiki.io': ('wiki.io-3f7d30f3.bin', '3f7d30f3abed949dbd089c67e0131f0412fdc84f'), + 'wiki.is': ('wiki.is-e246137d.bin', 'e246137db426f11ec6eb3cbf1bcd1152b0ce0aab'), + 'wiki.it': ('wiki.it-d3019ee2.bin', 'd3019ee2bdafaac7fbb3b9590ce4af35887e3ecc'), + 'wiki.iu': ('wiki.iu-25d55802.bin', '25d558026b0f65cbadd485f63f335faacd192dbd'), + 'wiki.ja': ('wiki.ja-7f4f37fa.bin', '7f4f37fad9c4cff36b221d73bdd8a2d6c5d96518'), + 'wiki.jam': ('wiki.jam-32f692c1.bin', '32f692c1f817d4a5f64e98942e50cfe24356cb43'), + 'wiki.jbo': ('wiki.jbo-2cada509.bin', '2cada509f2e32a01af248c11b76f1e3de313997f'), + 'wiki.jv': ('wiki.jv-2def005b.bin', '2def005b119888a0940302704639c6d47c452839'), + 'wiki.ka': ('wiki.ka-deed211e.bin', 'deed211e28a45a7cec32e9a4a6e0b8015bb5c75a'), + 'wiki.kaa': ('wiki.kaa-31a0f80c.bin', '31a0f80cb3a6e6d1d435489cfbc6a778e6177cbc'), + 'wiki.kab': ('wiki.kab-01dfdf1f.bin', '01dfdf1fcaf0888df6e43bb12e6f680c33181032'), + 'wiki.kbd': ('wiki.kbd-b753b4d4.bin', 'b753b4d4584d3ee158a0e65de8cf1e684b9204a6'), + 'wiki.kg': ('wiki.kg-bd1f271d.bin', 'bd1f271dc512d5e5e6100ddeb09d92ef20f5735b'), + 'wiki.ki': ('wiki.ki-6044fff2.bin', '6044fff24b0951468438c10bd17a67c3042d3df5'), + 'wiki.kj': ('wiki.kj-b4c8d6ad.bin', 'b4c8d6ad7b89e596e92b4ab275758837961c074a'), + 'wiki.kk': ('wiki.kk-3357eee9.bin', '3357eee9c3193859b2a254e7e03167f9e133aefa'), + 'wiki.kl': ('wiki.kl-2ae394ee.bin', '2ae394ee6b61bac8732b97a8b09783e2d49e7a64'), + 'wiki.km': ('wiki.km-ef0eed3b.bin', 'ef0eed3b83c8f54c7f124fd3f8d9c0200d981f8a'), + 'wiki.kn': ('wiki.kn-7ba6e9f2.bin', '7ba6e9f2da563dbd9a657032c8eb99c5b56cdba3'), + 'wiki.ko': ('wiki.ko-ae46f52b.bin', 'ae46f52bd2534a01b601449af7f2eccfa1c06719'), + 'wiki.koi': ('wiki.koi-fa0c7bdd.bin', 'fa0c7bdd198e6afd88b0f735796faa43a8dc33fb'), + 'wiki.kr': ('wiki.kr-08696f70.bin', '08696f7016cc384fd8170e6cf485da5f17d1b7cd'), + 'wiki.krc': ('wiki.krc-ff8a4631.bin', 'ff8a46318685e56cb5cf5eed87ec899c5352b963'), 
+ 'wiki.ks': ('wiki.ks-19d31479.bin', '19d31479421121b35d4f1281846033125f8c4ad7'), + 'wiki.ksh': ('wiki.ksh-97112af8.bin', '97112af8d32fae08270e7a88cd4da3af85e23b35'), + 'wiki.ku': ('wiki.ku-0ef76a6f.bin', '0ef76a6ff71586b155eda3273205b1b7b08cd73d'), + 'wiki.kv': ('wiki.kv-a50d93bf.bin', 'a50d93bf0db87356b6e8f794a37c989392195277'), + 'wiki.kw': ('wiki.kw-77ef77b3.bin', '77ef77b3323955a9825c992868ba83cb80775e8e'), + 'wiki.ky': ('wiki.ky-3f5928bc.bin', '3f5928bca0dc2551c2a9d2b25a90aa957e9742bf'), + 'wiki.la': ('wiki.la-3fe4d514.bin', '3fe4d51458c4b10878b671c8a7c0550b4e7d6baa'), + 'wiki.lad': ('wiki.lad-753bb201.bin', '753bb20136b3372cdcddf3bf5cbbef2910039490'), + 'wiki.lb': ('wiki.lb-827a0e46.bin', '827a0e46d57af4b8b59cd921a072ee0c0dd713ca'), + 'wiki.lbe': ('wiki.lbe-2934dfa5.bin', '2934dfa5f791f36f46abff43fcafce5ea6726a59'), + 'wiki.lez': ('wiki.lez-f344710f.bin', 'f344710ffb5eb75b59fb97b9a91be1d6c01329eb'), + 'wiki.lg': ('wiki.lg-3c31935e.bin', '3c31935eeb7165c4bdc28a77644c82703e61deb7'), + 'wiki.li': ('wiki.li-a9214c96.bin', 'a9214c96d8df517c68bf95c0e832905e85c7a6a9'), + 'wiki.lij': ('wiki.lij-55fead2b.bin', '55fead2bf98a2ee9e19ba5890ac2d5d82679b21e'), + 'wiki.lmo': ('wiki.lmo-b3a6ce73.bin', 'b3a6ce73aa41892ded828247f22cf41c71c5044d'), + 'wiki.ln': ('wiki.ln-403279f9.bin', '403279f918ab9409df69f1582ad850aa02046696'), + 'wiki.lo': ('wiki.lo-a5903d28.bin', 'a5903d2807e8088df6cef9e88ad9e3fc42dbe17e'), + 'wiki.lrc': ('wiki.lrc-07de075f.bin', '07de075f09cb54767ec7e79114311ca5e58f6d3e'), + 'wiki.lt': ('wiki.lt-62e95727.bin', '62e957278fe6c45f73071c4c91e68d4f02a9fe20'), + 'wiki.ltg': ('wiki.ltg-1351a4b3.bin', '1351a4b3a671bd208f5a6527d9d9047124007f78'), + 'wiki.lv': ('wiki.lv-991eae2a.bin', '991eae2ae2cbe77a89133bb42c5829f15cf59c9e'), + 'wiki.mai': ('wiki.mai-56cee5cb.bin', '56cee5cbeb259bc26927e15baf68c7e0b88786ab'), + 'wiki.map_bms': ('wiki.map_bms-87356c93.bin', '87356c93b166304986df7b5df52728e896d58180'), + 'wiki.mdf': ('wiki.mdf-5c5f1c2c.bin', '5c5f1c2c499bed790b81c1954edad644b1d137aa'), + 'wiki.mg': ('wiki.mg-a1b18be8.bin', 'a1b18be8864cc40d2ef5be324783de32e35d3c40'), + 'wiki.mh': ('wiki.mh-14147c18.bin', '14147c18cd451b7693babf995edeafb1a03a6f01'), + 'wiki.mhr': ('wiki.mhr-204edb17.bin', '204edb1741f0b04566f0e2a9bce46e8a2411ce32'), + 'wiki.mi': ('wiki.mi-ef6b4e35.bin', 'ef6b4e35b9f3dc7aa8803bb422a48c72eaea64ec'), + 'wiki.min': ('wiki.min-9a0a9ebc.bin', '9a0a9ebc607286f43cb6dc8cd06e2fa8c8986f12'), + 'wiki.mk': ('wiki.mk-4100301f.bin', '4100301fafb6ff4c5531187c54c11e9f24679657'), + 'wiki.ml': ('wiki.ml-e08b01b3.bin', 'e08b01b34318ca3eebedcfd6d44c832a0f16dd37'), + 'wiki.mn': ('wiki.mn-f6a269d2.bin', 'f6a269d2225d518acbb45cc3f8c5cbc2a318ee30'), + 'wiki.mo': ('wiki.mo-394239a3.bin', '394239a3d27bed433cad8c70e5bba16a0a30838c'), + 'wiki.mr': ('wiki.mr-37b4bb82.bin', '37b4bb82fa499080e015b3081b85c8c3c5e3e6b9'), + 'wiki.mrj': ('wiki.mrj-37ca837d.bin', '37ca837dde630945217e46ff4065e5a2e795bc93'), + 'wiki.ms': ('wiki.ms-5b4ddb79.bin', '5b4ddb79ab02d5638ee69d60a6191b65fda5ce86'), + 'wiki.mt': ('wiki.mt-b3323fa1.bin', 'b3323fa1038143a3e791bc352e6964f197213af5'), + 'wiki.mus': ('wiki.mus-f005e240.bin', 'f005e2408799819e6337aa62afb8d92e713f60d6'), + 'wiki.mwl': ('wiki.mwl-f0838820.bin', 'f0838820a819b3de537d6e2e01c543ffea2cf2fc'), + 'wiki.my': ('wiki.my-d64aad4e.bin', 'd64aad4e6f5c6c7319d794e657e96718c752277f'), + 'wiki.myv': ('wiki.myv-0464a2f4.bin', '0464a2f4ffde9637f93d59f2004e350b15b1f4eb'), + 'wiki.mzn': ('wiki.mzn-872ffc98.bin', '872ffc987bff427a3bd7edab5850658f07611fed'), + 'wiki.na': 
('wiki.na-d7b31b79.bin', 'd7b31b7991767f4d328347dcc871d11dffdd629d'), + 'wiki.nah': ('wiki.nah-712f2493.bin', '712f2493b87f6ce3f8991f842e91ae18f93e7357'), + 'wiki.nap': ('wiki.nap-11b97cb5.bin', '11b97cb5d41fa479322a569dbaf79b7e3c26f819'), + 'wiki.nds': ('wiki.nds-305ba618.bin', '305ba618d97b6b6b334a28df888ffe624ae4b9c9'), + 'wiki.nds_nl': ('wiki.nds_nl-fb880749.bin', 'fb880749b3326a03de850e064a65f89a4e069e40'), + 'wiki.ne': ('wiki.ne-72e739fd.bin', '72e739fd99cfda922ff3b8262550df087d59581d'), + 'wiki.new': ('wiki.new-8f66c97a.bin', '8f66c97ab6e30c0ee9ff350818e7df2da18636a0'), + 'wiki.ng': ('wiki.ng-bc569540.bin', 'bc569540c4fd3d4ddb6ab7bf1c63d3e576b0d0fa'), + 'wiki.nl': ('wiki.nl-40d9776a.bin', '40d9776a8466d3eebdc9c8e6b385bdd3271ea126'), + 'wiki.nn': ('wiki.nn-d4e5918c.bin', 'd4e5918cde9a4354dcc3c4174ae4fca97392b250'), + 'wiki.no': ('wiki.no-18db3b96.bin', '18db3b96776c453b37c343c663516a9b937635ae'), + 'wiki.nov': ('wiki.nov-3056416e.bin', '3056416e12e7d08fc4fdd85f0173a6dd72c40ab6'), + 'wiki.nrm': ('wiki.nrm-1a0a8daf.bin', '1a0a8daf48da8476dcef8018bae70e20dd027e36'), + 'wiki.nso': ('wiki.nso-977d9079.bin', '977d9079818970d709f50a1cd6a5917875b805fc'), + 'wiki.nv': ('wiki.nv-4dfd868a.bin', '4dfd868ae77b006a34e873180698e7152aef910e'), + 'wiki.ny': ('wiki.ny-ad7f85e7.bin', 'ad7f85e700fd386d88e5c82a53773afd493c7a3d'), + 'wiki.oc': ('wiki.oc-289fe81d.bin', '289fe81dbadcbd7ea052545b73c46206c34864ec'), + 'wiki.olo': ('wiki.olo-9721fefd.bin', '9721fefd1e1b1c3e51a0d8543f8c5473984fe2c4'), + 'wiki.om': ('wiki.om-71345bb8.bin', '71345bb8c3186e27bd4d1d4808a8cb097903e1bf'), + 'wiki.or': ('wiki.or-f55c1ca8.bin', 'f55c1ca84126f315312e2a7ed45e32ab91b8e636'), + 'wiki.os': ('wiki.os-c0148462.bin', 'c0148462d638e5dc94cef3ec607d8b6ef9672156'), + 'wiki.pa': ('wiki.pa-a0903f78.bin', 'a0903f78b47f32e11e4c62173bbedebb64a510e0'), + 'wiki.pag': ('wiki.pag-00e02108.bin', '00e021082f3a14b5603be9cc201b145405498f2a'), + 'wiki.pam': ('wiki.pam-dfe4c21c.bin', 'dfe4c21c29e4022f1ab848c060c3acb697d279f9'), + 'wiki.pap': ('wiki.pap-a3766e93.bin', 'a3766e936218cbd5e8ac0c279209960a7908b15a'), + 'wiki.pcd': ('wiki.pcd-986d648c.bin', '986d648ce805d15c3fa0e706c7a96c5d02e22298'), + 'wiki.pdc': ('wiki.pdc-031d7283.bin', '031d7283179b75f0f62e427b58eef4ecc7211243'), + 'wiki.pfl': ('wiki.pfl-826fc525.bin', '826fc525d7542d9c9a6ac3b72cfb01f438f6d33f'), + 'wiki.pi': ('wiki.pi-c0ddd653.bin', 'c0ddd653f5c89cea4eca90de1b0a8f78111e77a4'), + 'wiki.pih': ('wiki.pih-a71523db.bin', 'a71523dba29a0f98e6fd52c4bac3f984cdfcd0b2'), + 'wiki.pl': ('wiki.pl-b600fdc2.bin', 'b600fdc2f1d62c062ec6028d0652ede539081381'), + 'wiki.pms': ('wiki.pms-7e429bdf.bin', '7e429bdf7094257c372008d5ef1638f813f60da0'), + 'wiki.pnb': ('wiki.pnb-08ab3d18.bin', '08ab3d185ebc76a6636cb4da169a6904361ba048'), + 'wiki.pnt': ('wiki.pnt-14458f30.bin', '14458f3083992b5cddd87f2b9cbcd152377ff97b'), + 'wiki.ps': ('wiki.ps-b86afb94.bin', 'b86afb94162924807dc8e4c75d80fcfb71acda76'), + 'wiki.pt': ('wiki.pt-f971330c.bin', 'f971330cf54742dd17d1cb6245c1b3b468b42978'), + 'wiki.qu': ('wiki.qu-7cbfa5fd.bin', '7cbfa5fdb2d41e0569209c4a297a54b80743bc17'), + 'wiki.rm': ('wiki.rm-15cc9e74.bin', '15cc9e74f2ea0db93f8914a357e9b60840607b08'), + 'wiki.rmy': ('wiki.rmy-fdc5afea.bin', 'fdc5afeae1f185a2a551e091bf3d7731ba92223d'), + 'wiki.rn': ('wiki.rn-9859ac6f.bin', '9859ac6f33ae14670772f0fa06957127ffba60d8'), + 'wiki.ro': ('wiki.ro-be264092.bin', 'be264092886bd9cdfa3e0fe43896ee7e7b8039ad'), + 'wiki.roa_rup': ('wiki.roa_rup-ab463b13.bin', 'ab463b13e26f3c9ab99598ddcb722e70cf5bf35a'), + 'wiki.roa_tara': 
('wiki.roa_tara-073b0d19.bin', '073b0d192290b9c45463ddc11f043ff1ec42371e'), + 'wiki.ru': ('wiki.ru-5fbe8dd5.bin', '5fbe8dd58e6f6f58fb430e948e4d6a3b6cd6b603'), + 'wiki.rue': ('wiki.rue-1d09f6c5.bin', '1d09f6c53dc669f2fed52b511ba551c9ebb5caa4'), + 'wiki.rw': ('wiki.rw-e5e6abb9.bin', 'e5e6abb981ce2898a10f19c092033111d881917b'), + 'wiki.sa': ('wiki.sa-a651ca36.bin', 'a651ca363e7b431f9a2340557fa54163ff523764'), + 'wiki.sah': ('wiki.sah-0a0642a3.bin', '0a0642a36a863726bf8009957505101b74e2d8ef'), + 'wiki.sc': ('wiki.sc-9a6d6d82.bin', '9a6d6d82cb0bf57cf0b05ecaadfa0614712ddca7'), + 'wiki.scn': ('wiki.scn-2beb5569.bin', '2beb556906fc10a70a5dc5d8359c57ddbb567620'), + 'wiki.sco': ('wiki.sco-99406b00.bin', '99406b00be5c98fc69e561b2cd6bf29549514d5d'), + 'wiki.sd': ('wiki.sd-f9bafcac.bin', 'f9bafcacfc1b99aa5235ef6ff23f0532a638a89d'), + 'wiki.se': ('wiki.se-4ec410b7.bin', '4ec410b7963d85535bbc40429e3e74597aa102a3'), + 'wiki.sg': ('wiki.sg-05416c3e.bin', '05416c3e636d6b4612966de3476262760dc1b6af'), + 'wiki.sh': ('wiki.sh-d1262628.bin', 'd126262872b90d3d9d3d4a91c009a1a4a8ba6c38'), + 'wiki.si': ('wiki.si-cba0721f.bin', 'cba0721fa9b6d6ef470da3f852c475cef2af793e'), + 'wiki.simple': ('wiki.simple-67424112.bin', '67424112bc1879dc1288c9081ac3292aed1027e9'), + 'wiki.sk': ('wiki.sk-42878f2a.bin', '42878f2afd7633bdb5c2b4f742c4d1b13377eb00'), + 'wiki.sl': ('wiki.sl-3df13aa9.bin', '3df13aa9cf7c2c5a8b7a655e129cca46bc7635d7'), + 'wiki.sm': ('wiki.sm-0b04a63e.bin', '0b04a63e3cdc04722530f7b1d60dfa01368191b8'), + 'wiki.sn': ('wiki.sn-30f67c3f.bin', '30f67c3f26a074888813ebf67d9ae9c4d5863d2f'), + 'wiki.so': ('wiki.so-8466ae2e.bin', '8466ae2ea133cac06cceb240934e826a9e304d89'), + 'wiki.sq': ('wiki.sq-4068a1bc.bin', '4068a1bcce63aee6af1aabfd8139dc6508c017cf'), + 'wiki.sr': ('wiki.sr-082e3132.bin', '082e3132151777aa9f20f005dbb59b78f415cb5f'), + 'wiki.srn': ('wiki.srn-b7f15abf.bin', 'b7f15abf361445aa2bb621b71f7e70bbf88cb547'), + 'wiki.ss': ('wiki.ss-d84b4c58.bin', 'd84b4c58fc78d5bfd8e0ae3909fa6ad8966ed5a2'), + 'wiki.st': ('wiki.st-41bbfe88.bin', '41bbfe88fcc624496d35ed1801da3c51dd444c43'), + 'wiki.stq': ('wiki.stq-7869010b.bin', '7869010b7edd26d51849f676ef1a4f39dc83ec55'), + 'wiki.su': ('wiki.su-522015ae.bin', '522015ae575b367ba73baed722168c6035bf1a4f'), + 'wiki.sv': ('wiki.sv-2b51c008.bin', '2b51c00867be3483aa123b82af93bab8fc596886'), + 'wiki.sw': ('wiki.sw-b19842d4.bin', 'b19842d48289ca2a81b7b22e34b467cf8dfb9a26'), + 'wiki.szl': ('wiki.szl-19f27783.bin', '19f277833eea6f5d878114e54dc097764f963efb'), + 'wiki.ta': ('wiki.ta-0746a2ef.bin', '0746a2ef062efecd5268146d64c060ef1e9144f2'), + 'wiki.tcy': ('wiki.tcy-0ff14866.bin', '0ff1486683d6b736a04305e8625f8d3850c6a3a8'), + 'wiki.te': ('wiki.te-0dc2bd97.bin', '0dc2bd97553cc270d3d4154c35fcfce6998a88bd'), + 'wiki.tet': ('wiki.tet-670940fc.bin', '670940fc5658e21740f8d9ce442e6133b5e3e260'), + 'wiki.tg': ('wiki.tg-488952ce.bin', '488952ce406c4deafd1d5dfb1336f49341faef55'), + 'wiki.th': ('wiki.th-7332f0f8.bin', '7332f0f8c42f5b60cd4e2676f7b806cc942a4ab0'), + 'wiki.ti': ('wiki.ti-398f25bb.bin', '398f25bb3685800cc46aab59b084726c8cb6cd74'), + 'wiki.tk': ('wiki.tk-c281e62a.bin', 'c281e62aa025f18f4c0006939cf42446b12cd0f3'), + 'wiki.tl': ('wiki.tl-929953fd.bin', '929953fd9b07fbad5bc79cd62703b3fb1a59ba1a'), + 'wiki.tn': ('wiki.tn-ea8fac80.bin', 'ea8fac801735804eb98971cf823c89af496be814'), + 'wiki.to': ('wiki.to-bcc965ca.bin', 'bcc965cace5fe8c359b97a9389188ea04077c1bf'), + 'wiki.tpi': ('wiki.tpi-efd371af.bin', 'efd371af61d7ce4d1b7922e33790b72bc8f3c4d5'), + 'wiki.tr': ('wiki.tr-9c2c0a70.bin', 
'9c2c0a7008cd89cbe211a909431f7896a630f686'), + 'wiki.ts': ('wiki.ts-8958dcb3.bin', '8958dcb31274f4fa1ac96cff7de2ad12b0594ff2'), + 'wiki.tt': ('wiki.tt-4835092f.bin', '4835092f1ee35ece995f73712c11b128d7e6d8e3'), + 'wiki.tum': ('wiki.tum-20684599.bin', '206845992b8ec51f673408f11c6383a7ddc75faf'), + 'wiki.tw': ('wiki.tw-6fcf965e.bin', '6fcf965e4f74a6bccac892ebebb7e305e4d747c3'), + 'wiki.ty': ('wiki.ty-cf5c1022.bin', 'cf5c10220157851050babc0086af7e0812be9fb4'), + 'wiki.tyv': ('wiki.tyv-dd1284c9.bin', 'dd1284c9f847ebcc171616e0e60b5d97bdfa1808'), + 'wiki.udm': ('wiki.udm-85b368ec.bin', '85b368ec3981bd3fb0599f6dc8287ff8c9dac2aa'), + 'wiki.ug': ('wiki.ug-57acb78f.bin', '57acb78f7370a0766145cd9ca6fc11e0543f1385'), + 'wiki.uk': ('wiki.uk-747aa3ab.bin', '747aa3ab24fd860f6304fc021ef281facc4c039d'), + 'wiki.ur': ('wiki.ur-b89e16db.bin', 'b89e16dbc9ec62f9f91bce2a44447f7cdc830453'), + 'wiki.uz': ('wiki.uz-e8a38b09.bin', 'e8a38b091ff4ecdb8ded1a20bb182ba3ef73c6aa'), + 'wiki.ve': ('wiki.ve-5c1b48c3.bin', '5c1b48c309cb43717bcaeecb33e26cdcfd46cdec'), + 'wiki.vec': ('wiki.vec-34e6f3d9.bin', '34e6f3d94843381be488b37eb06600aff3ab3d6b'), + 'wiki.vep': ('wiki.vep-660b5d1e.bin', '660b5d1e7f74f97fe18f310736bf8950d4696c67'), + 'wiki.vi': ('wiki.vi-99ab162f.bin', '99ab162f703a0a6fb03a64142b6d47013e0314a3'), + 'wiki.vls': ('wiki.vls-07e0742e.bin', '07e0742e4946f2ae057be2ce84e327053a916c91'), + 'wiki.vo': ('wiki.vo-562905a3.bin', '562905a3c920bdf4b9bf4a9fb1b3a1293883a905'), + 'wiki.wa': ('wiki.wa-727a61c7.bin', '727a61c7115e7093ce8e2c1dc1f3e164eb2654cc'), + 'wiki.war': ('wiki.war-bcad746f.bin', 'bcad746f062fa166ceebc47df3449f4752448be5'), + 'wiki.wo': ('wiki.wo-5a3815d8.bin', '5a3815d82f535f6d35cd7058cc757117f2a41ac2'), + 'wiki.wuu': ('wiki.wuu-b114fb8d.bin', 'b114fb8d1ca2ba54ec3e4295f93ae6f33b7eed16'), + 'wiki.xal': ('wiki.xal-45449f93.bin', '45449f936d0ea7a57e7b287702fd09a715180efb'), + 'wiki.xh': ('wiki.xh-7b5a743d.bin', '7b5a743dca7f9ed9d0a8760169e681ccc5d00e54'), + 'wiki.xmf': ('wiki.xmf-755644bf.bin', '755644bfb0223e56e4d3ef7ae113f057b9143a18'), + 'wiki.yi': ('wiki.yi-191d3a6a.bin', '191d3a6a6676566f21ddf2b9a645e7dadd719600'), + 'wiki.yo': ('wiki.yo-2629d292.bin', '2629d292d16a7a6f80a12c1ca446b7ba4be56508'), + 'wiki.za': ('wiki.za-0f26bdfb.bin', '0f26bdfb74dacc74b3690d8b00c9ce92e2598152'), + 'wiki.zea': ('wiki.zea-f40226d3.bin', 'f40226d3c27c013b0690a3dba70b36cec1233aa6'), + 'wiki.zh': ('wiki.zh-69e1fa5f.bin', '69e1fa5f3a7a1625789e5eeb47d3bfe72506f403'), + 'wiki.zh_classical': ('wiki.zh_classical-ac01671b.bin', + 'ac01671b3fd0baadbc5fd850132ef4c9891b7e55'), + 'wiki.zh_min_nan': ('wiki.zh_min_nan-5b773206.bin', '5b773206277e3c47fdfb110e43797bad644c1feb'), + 'wiki.zh_yue': ('wiki.zh_yue-2e504f07.bin', '2e504f07395f4ac5f732d8c3ee3594b431c2eb64'), + 'wiki.zu': ('wiki.zu-642b157b.bin', '642b157b3b799cfb50b13eda0b7d156698cdde83'), +} + diff --git a/src/gluonnlp/embedding/embed_loader.py b/src/gluonnlp/embedding/embed_loader.py new file mode 100644 index 0000000000..5a349595eb --- /dev/null +++ b/src/gluonnlp/embedding/embed_loader.py @@ -0,0 +1,320 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. 
You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. + +# pylint: disable=consider-iterating-dictionary, too-many-lines +"""Load token embedding""" + +__all__ = [ + 'list_sources', 'load_embeddings', 'get_fasttext_model' +] + +import io +import logging +import os +import warnings +import fasttext + +import numpy as np +from mxnet.gluon.utils import download, check_sha1, _get_repo_file_url + +from . import _constants as C +from ..base import get_home_dir +from ..data import Vocab + +text_embedding_reg = { + 'glove' : C.GLOVE_NPZ_SHA1, + 'word2vec' : C.WORD2VEC_NPZ_SHA1, + 'fasttext' : C.FAST_TEXT_NPZ_SHA1 +} +def list_sources(embedding_name=None): + """Get valid token embedding names and their pre-trained file names. + + Parameters + ---------- + embedding_name : str or None, default None + The pre-trained token embedding name. + + Returns + ------- + dict or list: + A list of all the valid pre-trained token embedding file names (`source`) for the + specified token embedding name (`embedding_name`). If the text embedding name is set to + None, returns a dict mapping each valid token embedding name to a list of valid pre-trained + files (`source`). + """ + if embedding_name is not None: + embedding_name = embedding_name.lower() + if embedding_name == 'fasttext.bin': + return list(C.FAST_TEXT_BIN_SHA1.keys()) + if embedding_name not in text_embedding_reg: + raise KeyError('Cannot find `embedding_name` {}. Use ' + '`list_sources(embedding_name=None).keys()` to get all the valid' + 'embedding names.'.format(embedding_name)) + return list(text_embedding_reg[embedding_name].keys()) + else: + return {embedding_name: list(embedding_cls.keys()) + for embedding_name, embedding_cls in text_embedding_reg.items()} + +def _append_unk_vecs(matrix, vocab_size): + append_dim = vocab_size - len(matrix) + assert append_dim in [0, 1], "Error occurs in the embedding file." 
+ if append_dim == 1: + # the unknown_token is missing from the embedding file; sample a vector for it from the loaded vectors + mean = np.mean(matrix, axis=0, keepdims=True) + std = np.std(matrix, axis=0, keepdims=True) + vecs = np.random.randn(append_dim, matrix.shape[-1]).astype('float32') * std + mean + return np.concatenate([matrix, vecs], axis=0) + return matrix + +def _load_embedding_txt(file_path, vocab, unknown_token): + if vocab is not None: + result = np.zeros(len(vocab), dtype=bool) + else: + result = [] + with open(file_path, 'r', encoding='utf-8') as f: + line = f.readline().strip() + parts = line.split() + start_idx = 0 + if len(parts) == 2: + dim = int(parts[1]) + start_idx += 1 + else: + dim = len(parts) - 1 + f.seek(0) + if vocab is None: + matrix = [] + else: matrix = np.random.randn(len(vocab), dim).astype('float32') + for idx, line in enumerate(f, start_idx): + try: + parts = line.strip().split() + word = ''.join(parts[:-dim]) + nums = parts[-dim:] + if vocab is None: + result.append(word) + matrix.append(np.fromstring(' '.join(nums), sep=' ', dtype='float32', count=dim)) + else: + if word == unknown_token and vocab.unk_token is not None: + word = vocab.unk_token + if word in vocab: + index = vocab[word] + matrix[index] = np.fromstring(' '.join(nums), sep=' ', dtype='float32', count=dim) + result[index] = True + except Exception as e: + logging.error("Error occurred at line {} of the embedding file.".format(idx)) + raise e + if vocab is None: + result = Vocab(result, unk_token=unknown_token) + matrix = _append_unk_vecs(np.array(matrix), len(result)) + return matrix, result + +def _load_embedding_npz(file_path, vocab, unknown): + if vocab is not None: + result = np.zeros(len(vocab), dtype=bool) + else: + result = [] + npz_dict = np.load(file_path, allow_pickle=True) + unknown_token = npz_dict['unknown_token'] + if not unknown_token: + unknown_token = unknown + else: + if isinstance(unknown_token, np.ndarray): + if unknown_token.dtype.kind == 'S': + unknown_token = unknown_token.tobytes().decode() + else: + unknown_token = str(unknown_token) + if unknown != unknown_token: + warnings.warn("The specified unknown token does not match the one stored in the pretrained file. " + "Using {} as the unknown token.".format(unknown_token)) + + idx_to_token = npz_dict['idx_to_token'].tolist() + token2idx = {x : i for i, x in enumerate(idx_to_token)} + idx_to_vec = npz_dict['idx_to_vec'] + if vocab is None: + result = Vocab(idx_to_token, unk_token=unknown_token) + idx_to_vec = _append_unk_vecs(idx_to_vec, len(result)) + return idx_to_vec, result + else: + matrix = np.random.randn(len(vocab), idx_to_vec.shape[-1]).astype('float32') + for i, token in enumerate(vocab.all_tokens): + if token == vocab.unk_token and unknown_token is not None: + word = unknown_token + else: + word = token + if word in token2idx: + index = token2idx[word] + matrix[i] = idx_to_vec[index] + result[i] = True + return matrix, result + +def _get_file_url(cls_name, file_name): + namespace = 'gluon/embeddings/{}'.format(cls_name) + return _get_repo_file_url(namespace, file_name) + +def _get_file_path(cls_name, file_name, file_hash): + root_path = os.path.expanduser(os.path.join(get_home_dir(), 'embedding')) + embedding_dir = os.path.join(root_path, cls_name) + url = _get_file_url(cls_name, file_name) + file_path = os.path.join(embedding_dir, file_name) + if not os.path.exists(file_path) or not check_sha1(file_path, file_hash): + logging.info('Embedding file {} was not found. Downloading it from the Gluon repository. '
+ 'This may take some time.'.format(file_name)) + download(url, file_path, sha1_hash=file_hash) + return file_path + +def _check_and_get_path(pretrained_name_or_dir): + if os.path.exists(pretrained_name_or_dir): + return pretrained_name_or_dir + for cls_name, embedding_cls in text_embedding_reg.items(): + if pretrained_name_or_dir in embedding_cls: + source = pretrained_name_or_dir + file_name, file_hash = embedding_cls[source] + return _get_file_path(cls_name, file_name, file_hash) + + return None + +def load_embeddings(vocab=None, pretrained_name_or_dir='glove.6B.50d', unknown='', + unk_method=None): + """Load pretrained word embeddings to build an embedding matrix for a given Vocab. + + This function supports loading GloVe, Word2Vec and FastText word embeddings from remote sources. + You can also load your own embedding file (txt in Word2Vec or GloVe format) from a given file path. + + GloVe: an unsupervised learning algorithm for obtaining vector representations for words. + Training is performed on aggregated global word-word co-occurrence statistics from a corpus, and + the resulting representations showcase interesting linear substructures of the word vector + space. (Source: https://nlp.stanford.edu/projects/glove/) + Available sources: + ['glove.42B.300d', 'glove.6B.100d', 'glove.6B.200d', 'glove.6B.300d', 'glove.6B.50d', \ + 'glove.840B.300d', 'glove.twitter.27B.100d', 'glove.twitter.27B.200d', \ + 'glove.twitter.27B.25d', 'glove.twitter.27B.50d'] + Word2Vec: an unsupervised learning algorithm for obtaining vector representations for words. + Training is performed with the continuous bag-of-words or skip-gram architecture. + Available sources: + ['GoogleNews-vectors-negative300', 'freebase-vectors-skipgram1000', + 'freebase-vectors-skipgram1000-en'] + FastText: an open-source, free, lightweight library that allows users to learn text + representations and text classifiers. It works on standard, generic hardware. Models can later + be reduced in size to even fit on mobile devices. (Source: https://fasttext.cc/) + Available sources: + ['cc.af.300', ..., 'cc.en.300', ..., 'crawl-300d-2M', 'crawl-300d-2M-subword', \ + 'wiki-news-300d-1M', 'wiki-news-300d-1M-subword', \ + 'wiki.aa', ..., 'wiki.multi.ar', ..., 'wiki.zu'] + The full list of sources can be found via `gluonnlp.embedding.list_sources('FastText')`. + The 'wiki.multi' embeddings come from: + + Word Translation Without Parallel Data + Alexis Conneau, Guillaume Lample, Marc'Aurelio Ranzato, Ludovic Denoyer, and Herve Jegou. + https://arxiv.org/abs/1710.04087 + + Parameters + ---------- + vocab : gluonnlp.data.Vocab object, default None + A vocabulary on which the embedding matrix is built. + If `vocab` is `None`, all tokens in the pretrained file will be used. + pretrained_name_or_dir : str, default 'glove.6B.50d' + A file path to a pretrained embedding file or the name of a pretrained token embedding file. + This function first checks whether the argument is a file path; + if not, it loads the file from the cache or downloads it. + unknown : str, default '' + The unknown token used in the pretrained file. + unk_method : Callable, default None + A function which receives `List[str]` and returns `numpy.ndarray`. + The input of the function is the list of words that are in the `vocab` + but do not occur in the pretrained file, + and it should return an embedding matrix for these words.
+ If `unk_method` is None, we generate vectors for these words, + by sampling from a normal distribution with the same mean and std as the embedding matrix. + It is only useful when `vocab` is not `None`. + + Returns + ------- + If `vocab` is `None` + numpy.ndarray: + An embedding matrix in the pretrained file. + gluonnlp.data.Vocab: + The vocabulary in the pretrained file. + Otherwise, + numpy.ndarray: + An embedding matrix for the given vocabulary. + """ + assert isinstance(vocab, (Vocab, type(None))), "Only gluonnlp.data.Vocab is supported." + file_path = _check_and_get_path(pretrained_name_or_dir) + if file_path is None: + raise ValueError("Cannot recognize `{}`".format(pretrained_name_or_dir)) + + if file_path.endswith('.npz'): + matrix, result = _load_embedding_npz(file_path, vocab, unknown) + else: + matrix, result = _load_embedding_txt(file_path, vocab, unknown) + dim = matrix.shape[-1] + logging.info("Pre-trained embedding dim: {}".format(dim)) + if vocab is None: + return matrix, result + else: + hit_flags = result + total_hits = sum(hit_flags) + logging.info("Found {} out of {} words in the pretrained embedding.".format(total_hits, len(vocab))) + if total_hits != len(vocab): + if unk_method is None: + found_vectors = matrix[hit_flags] + mean = np.mean(found_vectors, axis=0, keepdims=True) + std = np.std(found_vectors, axis=0, keepdims=True) + unfound_vec_num = len(vocab) - total_hits + r_vecs = np.random.randn(unfound_vec_num, dim).astype('float32') * std + mean + matrix[hit_flags == False] = r_vecs + else: + unk_idxs = (hit_flags == False).nonzero()[0] + matrix[hit_flags == False] = unk_method(vocab.to_tokens(unk_idxs)) + + return matrix + +def get_fasttext_model(model_name_or_dir='cc.en.300'): + """ Load the fasttext model from a binary file + + This method will load a fasttext model binary file from a given file path or remote sources, + and return a `fasttext` model object. See `fasttext.cc` for more usage information. + + Available sources: + ['wiki-news-300d-1M-subword', 'crawl-300d-2M-subword', \ + 'cc.af.300', ..., 'cc.en.300', ..., 'wiki.aa', ..., 'wiki.en', ..., 'wiki.zu'] + Detailed sources can be found via `gluonnlp.embedding.list_sources('FastText.bin')` + + Parameters + ---------- + model_name_or_dir : str, default 'cc.en.300' + A file path for a FastText binary file or the name of the FastText model. + This method would first check if it is a file path. + If not, the method will load from cache or download. + + Returns + ------- + fasttext.FastText._FastText: + A FastText model based on the `fasttext` package. 
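For completeness, a short sketch of the corresponding fastText usage, assuming `get_fasttext_model` is re-exported from `gluonnlp.embedding`; `get_word_vector` is the standard `fasttext` package API, not something added by this patch, and the `.bin` files are large:

```python
from gluonnlp.embedding import get_fasttext_model

# Downloads and caches the binary model on first use.
ft_model = get_fasttext_model('cc.en.300')
vec = ft_model.get_word_vector('gluonnlp')  # subword-aware lookup, shape (300,)
```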
+ """ + if os.path.exists(model_name_or_dir): + file_path = model_name_or_dir + else: + source = model_name_or_dir + root_path = os.path.expanduser(os.path.join(get_home_dir(), 'embedding')) + embedding_dir = os.path.join(root_path, 'fasttext') + if source not in C.FAST_TEXT_BIN_SHA1: + raise ValueError('Cannot recognize {} for the bin file'.format(source)) + file_name, file_hash = C.FAST_TEXT_BIN_SHA1[source] + file_path = _get_file_path('fasttext', file_name, file_hash) + return fasttext.load_model(file_path) + diff --git a/src/gluonnlp/layers.py b/src/gluonnlp/layers.py index f19553fd5e..a6ea6b181e 100644 --- a/src/gluonnlp/layers.py +++ b/src/gluonnlp/layers.py @@ -356,9 +356,10 @@ def __init__(self, mode='erf'): def hybrid_forward(self, F, x): if self._mode == 'erf': - return x * 0.5 * (1.0 + F.npx.erf(x / math.sqrt(2.0))) + return F.npx.leaky_relu(x, act_type='gelu') elif self._mode == 'tanh': - return 0.5 * x * (1.0 + F.np.tanh(math.sqrt(2.0 / math.pi) * (x + 0.044715 * (x ** 3)))) + return 0.5 * x\ + * (1.0 + F.np.tanh(math.sqrt(2.0 / math.pi) * (x + 0.044715 * (x ** 3)))) elif self._mode == 'sigmoid': return x * F.npx.sigmoid(1.702 * x) else: diff --git a/src/gluonnlp/models/albert.py b/src/gluonnlp/models/albert.py index 1eb504c643..1b4efa16e2 100644 --- a/src/gluonnlp/models/albert.py +++ b/src/gluonnlp/models/albert.py @@ -25,7 +25,8 @@ """ __all__ = ['AlbertModel', 'AlbertForMLM', 'AlbertForPretrain', - 'list_pretrained_albert', 'get_pretrained_albert'] + 'list_pretrained_albert', 'get_pretrained_albert', + 'albert_cfg_reg'] import os from typing import Tuple @@ -38,16 +39,89 @@ from ..base import get_model_zoo_home_dir, get_repo_model_zoo_url, get_model_zoo_checksum_dir from ..utils.config import CfgNode as CN from ..utils.misc import load_checksum_stats, download +from ..utils.registry import Registry from ..initializer import TruncNorm from ..attention_cell import gen_self_attn_mask from ..layers import get_activation, PositionalEmbedding from ..op import select_vectors_by_position from ..data.tokenizers import SentencepieceTokenizer +albert_cfg_reg = Registry('albert_cfg') + + +@albert_cfg_reg.register() +def google_albert_base(): + cfg = CN() + # Model Parameters + cfg.MODEL = CN() + cfg.MODEL.vocab_size = 30000 + cfg.MODEL.embed_size = 128 + cfg.MODEL.units = 768 + cfg.MODEL.hidden_size = 3072 + cfg.MODEL.max_length = 512 + cfg.MODEL.num_heads = 12 + cfg.MODEL.num_layers = 12 + cfg.MODEL.pos_embed_type = 'learned' + cfg.MODEL.activation = 'gelu(tanh)' + cfg.MODEL.layer_norm_eps = 1E-12 + cfg.MODEL.num_groups = 1 + cfg.MODEL.num_token_types = 2 + cfg.MODEL.hidden_dropout_prob = 0.0 + cfg.MODEL.attention_dropout_prob = 0.0 + cfg.MODEL.dtype = 'float32' + cfg.MODEL.layout = 'NT' + cfg.MODEL.compute_layout = 'auto' + # Hyper-parameters of the Initializers + cfg.INITIALIZER = CN() + cfg.INITIALIZER.embed = ['truncnorm', 0, 0.02] + cfg.INITIALIZER.weight = ['truncnorm', 0, 0.02] # TruncNorm(0, 0.02) + cfg.INITIALIZER.bias = ['zeros'] + # Version of the model. This helps ensure backward compatibility. 
+ # Also, we can not use string here due to https://github.com/rbgirshick/yacs/issues/26 + cfg.VERSION = 1 + cfg.freeze() + return cfg + + +@albert_cfg_reg.register() +def google_albert_large(): + cfg = google_albert_base() + cfg.defrost() + cfg.MODEL.hidden_size = 4096 + cfg.MODEL.num_heads = 16 + cfg.MODEL.num_layers = 24 + cfg.MODEL.units = 1024 + cfg.freeze() + return cfg + + +@albert_cfg_reg.register() +def google_albert_xlarge(): + cfg = google_albert_base() + cfg.defrost() + cfg.MODEL.hidden_size = 8192 + cfg.MODEL.num_heads = 32 + cfg.MODEL.num_layers = 24 + cfg.MODEL.units = 2048 + cfg.freeze() + return cfg + + +@albert_cfg_reg.register() +def google_albert_xxlarge(): + cfg = google_albert_base() + cfg.defrost() + cfg.MODEL.hidden_size = 16384 + cfg.MODEL.num_heads = 64 + cfg.MODEL.num_layers = 12 + cfg.MODEL.units = 4096 + cfg.freeze() + return cfg + PRETRAINED_URL = { 'google_albert_base_v2': { - 'cfg': 'google_albert_base_v2/model-8767fdc9.yml', + 'cfg': google_albert_base(), 'spm_model': 'google_albert_base_v2/spm-65999e5d.model', 'vocab': 'google_albert_base_v2/vocab-2ee53ae7.json', 'params': 'google_albert_base_v2/model-125be477.params', @@ -55,7 +129,7 @@ 'lowercase': True, }, 'google_albert_large_v2': { - 'cfg': 'google_albert_large_v2/model-e2e9b974.yml', + 'cfg': google_albert_large(), 'spm_model': 'google_albert_large_v2/spm-65999e5d.model', 'vocab': 'google_albert_large_v2/vocab-2ee53ae7.json', 'params': 'google_albert_large_v2/model-ad60bcd5.params', @@ -63,7 +137,7 @@ 'lowercase': True, }, 'google_albert_xlarge_v2': { - 'cfg': 'google_albert_xlarge_v2/model-8123bffd.yml', + 'cfg': google_albert_xlarge(), 'spm_model': 'google_albert_xlarge_v2/spm-65999e5d.model', 'vocab': 'google_albert_xlarge_v2/vocab-2ee53ae7.json', 'params': 'google_albert_xlarge_v2/model-4149c9e2.params', @@ -71,7 +145,7 @@ 'lowercase': True, }, 'google_albert_xxlarge_v2': { - 'cfg': 'google_albert_xxlarge_v2/model-07fbeebc.yml', + 'cfg': google_albert_xxlarge(), 'spm_model': 'google_albert_xxlarge_v2/spm-65999e5d.model', 'vocab': 'google_albert_xxlarge_v2/vocab-2ee53ae7.json', 'params': 'google_albert_xxlarge_v2/model-5601a0ed.params', @@ -97,7 +171,8 @@ def __init__(self, units=512, hidden_size=2048, layer_norm_eps=1E-12, weight_initializer=TruncNorm(stdev=0.02), bias_initializer='zeros', - activation='gelu'): + activation='gelu', + layout='NT'): super().__init__() assert units % num_heads == 0,\ 'In AlbertEncoder, The units should be divided exactly ' \ @@ -112,6 +187,8 @@ def __init__(self, units=512, hidden_size=2048, self._output_attention = output_attention self._output_all_encodings = output_all_encodings + self._layout = layout + self.all_encoder_groups = nn.HybridSequential() for group_idx in range(num_groups): @@ -124,7 +201,13 @@ def __init__(self, units=512, hidden_size=2048, layer_norm_eps=layer_norm_eps, weight_initializer=weight_initializer, bias_initializer=bias_initializer, - activation=activation)) + activation=activation, + dtype=dtype, + layout=layout)) + + @property + def layout(self): + return self._layout def hybrid_forward(self, F, data, valid_length): """ @@ -135,18 +218,26 @@ def hybrid_forward(self, F, data, valid_length): Parameters ---------- F - data : - Shape (batch_size, seq_length, C) + data + - layout = 'NT' + Shape (batch_size, seq_length, C) + - layout = 'TN' + Shape (seq_length, batch_size, C) valid_length : Shape (batch_size,) Returns ------- - out : - Shape (batch_size, seq_length, C_out) + out + - layout = 'NT' + Shape (batch_size, seq_length, C_out) + - 
layout = 'TN' + Shape (seq_length, batch_size, C) """ # 1. Embed the data - attn_mask = gen_self_attn_mask(F, data, valid_length, dtype=self._dtype, attn_type='full') + time_axis = 1 if self.layout == 'NT' else 0 + attn_mask = gen_self_attn_mask(F, data, valid_length, dtype=self._dtype, + attn_type='full', layout=self.layout) out = data all_encodings_outputs = [] additional_outputs = [] @@ -159,7 +250,8 @@ def hybrid_forward(self, F, data, valid_length): if self._output_all_encodings: out = F.npx.sequence_mask(out, sequence_length=valid_length, - use_sequence_length=True, axis=1) + use_sequence_length=True, + axis=time_axis) all_encodings_outputs.append(out) if self._output_attention: @@ -168,7 +260,8 @@ def hybrid_forward(self, F, data, valid_length): if not self._output_all_encodings: # if self._output_all_encodings, SequenceMask is already applied above out = F.npx.sequence_mask(out, sequence_length=valid_length, - use_sequence_length=True, axis=1) + use_sequence_length=True, + axis=time_axis) return out, additional_outputs else: return all_encodings_outputs, additional_outputs @@ -195,7 +288,9 @@ def __init__(self, weight_initializer=TruncNorm(stdev=0.02), bias_initializer='zeros', dtype='float32', - use_pooler=True): + use_pooler=True, + layout='NT', + compute_layout='auto'): super().__init__() self._dtype = dtype self.use_pooler = use_pooler @@ -210,6 +305,11 @@ def __init__(self, self.weight_initializer = weight_initializer self.bias_initializer = bias_initializer self.layer_norm_eps = layer_norm_eps + self._layout = layout + if compute_layout is None or compute_layout == 'auto': + self._compute_layout = layout + else: + self._compute_layout = compute_layout # Construct AlbertEncoder self.encoder = AlbertEncoder( units=units, @@ -226,6 +326,7 @@ def __init__(self, weight_initializer=weight_initializer, bias_initializer=bias_initializer, dtype=dtype, + layout=self._compute_layout ) self.encoder.hybridize() # Construct word embedding @@ -257,6 +358,10 @@ def __init__(self, weight_initializer=weight_initializer, bias_initializer=bias_initializer) + @property + def layout(self): + return self._layout + def hybrid_forward(self, F, inputs, token_types, valid_length=None): # pylint: disable=arguments-differ """Generate the representation given the inputs. @@ -266,10 +371,16 @@ def hybrid_forward(self, F, inputs, token_types, valid_length=None): Parameters ---------- F - inputs : - Shape (batch_size, seq_length) - token_types : - Shape (batch_size, seq_length) + inputs + - layout = 'NT' + Shape (batch_size, seq_length) + - layout = 'TN' + Shape (seq_length, batch_size) + token_types + - layout = 'NT' + Shape (batch_size, seq_length) + - layout = 'TN' + Shape (seq_length, batch_size) If the inputs contain two sequences, we will set different token types for the first sentence and the second sentence. @@ -279,8 +390,11 @@ def hybrid_forward(self, F, inputs, token_types, valid_length=None): Returns ------- - contextual_embedding : - Shape (batch_size, seq_length, units). + contextual_embedding + - layout = 'NT' + Shape (batch_size, seq_length, units) + - layout = 'TN' + Shape (seq_length, batch_size, units) pooled_output : This is optional. 
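To make the new `layout`/`compute_layout` flags concrete, a hedged sketch of constructing a time-major ALBERT backbone, assuming `google_albert_base` can be imported from `gluonnlp.models.albert` and that `from_cfg` accepts a config node directly, as its body suggests:

```python
import mxnet as mx
from gluonnlp.models.albert import AlbertModel, google_albert_base

cfg = google_albert_base()
cfg.defrost()
cfg.MODEL.layout = 'TN'          # inputs/outputs are time-major
cfg.MODEL.compute_layout = 'NT'  # but the encoder itself runs batch-major
cfg.freeze()

model = AlbertModel.from_cfg(cfg)
model.initialize()

seq_length, batch_size = 16, 2
inputs = mx.np.random.randint(0, cfg.MODEL.vocab_size, (seq_length, batch_size))
token_types = mx.np.zeros_like(inputs)
valid_length = mx.np.full((batch_size,), seq_length)
contextual_emb, pooled_out = model(inputs, token_types, valid_length)
# contextual_emb: (seq_length, batch_size, units); pooled_out: (batch_size, units)
```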
Shape (batch_size, units) """ @@ -290,7 +404,13 @@ def hybrid_forward(self, F, inputs, token_types, valid_length=None): if self.embed_size != self.units: prev_out = self.embed_factorized_proj(prev_out) outputs = [] - contextual_embeddings, additional_outputs = self.encoder(prev_out, valid_length) + if self._compute_layout != self._layout: + # Swap input to reflect the compute_layout + contextual_embeddings, additional_outputs = self.encoder(F.np.swapaxes(prev_out, 0, 1), + valid_length) + contextual_embeddings = F.np.swapaxes(contextual_embeddings, 0, 1) + else: + contextual_embeddings, additional_outputs = self.encoder(prev_out, valid_length) outputs.append(contextual_embeddings) if self.use_pooler: pooled_out = self.apply_pooling(contextual_embeddings) @@ -304,24 +424,37 @@ def get_initial_embedding(self, F, inputs, token_types=None): ---------- F inputs - Shape (batch_size, seq_length) + - layout = 'NT' + Shape (batch_size, seq_length) + - layout = 'TN' + Shape (seq_length, batch_size) token_types - Shape (batch_size, seq_length) + - layout = 'NT' + Shape (batch_size, seq_length) + - layout = 'TN' If None, it will be initialized as all zero Returns ------- embedding The initial embedding that will be fed into the encoder + - layout = 'NT' + Shape (batch_size, seq_length, C_embed) + - layout = 'TN' + Shape (seq_length, batch_size, C_embed) """ + if self.layout == 'NT': + batch_axis, time_axis = 0, 1 + else: + batch_axis, time_axis = 1, 0 embedding = self.word_embed(inputs) if token_types is None: token_types = F.np.zeros_like(inputs) type_embedding = self.token_type_embed(token_types) embedding = embedding + type_embedding if self.pos_embed_type is not None: - positional_embedding = self.token_pos_embed(F.npx.arange_like(inputs, axis=1)) - positional_embedding = F.np.expand_dims(positional_embedding, axis=0) + positional_embedding = self.token_pos_embed(F.npx.arange_like(inputs, axis=time_axis)) + positional_embedding = F.np.expand_dims(positional_embedding, axis=batch_axis) embedding = embedding + positional_embedding # Extra layer normalization plus dropout embedding = self.embed_layer_norm(embedding) @@ -334,50 +467,34 @@ def apply_pooling(self, sequence): This is used for pre-training or fine-tuning a Bert model. 
Get the first token of the whole sequence which is [CLS] - sequence: - Shape (batch_size, sequence_length, units) - return: + Parameters + ---------- + sequence + - layout = 'NT' + Shape (batch_size, sequence_length, units) + - layout = 'TN' + Shape (sequence_length, batch_size, units) + + Returns + ------- + pooled_out Shape (batch_size, units) """ - outputs = sequence[:, 0, :] + if self.layout == 'NT': + outputs = sequence[:, 0, :] + else: + outputs = sequence[0, :, :] return self.pooler(outputs) @staticmethod def get_cfg(key=None): - if key is None: - cfg = CN() - # Model Parameters - cfg.MODEL = CN() - cfg.MODEL.vocab_size = 30000 - cfg.MODEL.embed_size = 128 - cfg.MODEL.units = 768 - cfg.MODEL.hidden_size = 3072 - cfg.MODEL.max_length = 512 - cfg.MODEL.num_heads = 12 - cfg.MODEL.num_layers = 12 - cfg.MODEL.pos_embed_type = 'learned' - cfg.MODEL.activation = 'gelu' - cfg.MODEL.layer_norm_eps = 1E-12 - cfg.MODEL.num_groups = 1 - cfg.MODEL.num_token_types = 2 - cfg.MODEL.hidden_dropout_prob = 0.0 - cfg.MODEL.attention_dropout_prob = 0.0 - cfg.MODEL.dtype = 'float32' - # Hyper-parameters of the Initializers - cfg.INITIALIZER = CN() - cfg.INITIALIZER.embed = ['truncnorm', 0, 0.02] - cfg.INITIALIZER.weight = ['truncnorm', 0, 0.02] # TruncNorm(0, 0.02) - cfg.INITIALIZER.bias = ['zeros'] - # Version of the model. This helps ensure backward compatibility. - # Also, we can not use string here due to https://github.com/rbgirshick/yacs/issues/26 - cfg.VERSION = 1 + if key is not None: + return albert_cfg_reg.create(key) else: - raise NotImplementedError - cfg.freeze() - return cfg + return google_albert_base() @classmethod - def from_cfg(cls, cfg, use_pooler=True, dtype='float32') -> 'AlbertModel': + def from_cfg(cls, cfg, use_pooler=True, dtype=None) -> 'AlbertModel': """ Parameters @@ -385,6 +502,8 @@ def from_cfg(cls, cfg, use_pooler=True, dtype='float32') -> 'AlbertModel': cfg use_pooler Whether to use pooler + dtype + The dtype of the backbone model Returns ------- @@ -396,6 +515,8 @@ def from_cfg(cls, cfg, use_pooler=True, dtype='float32') -> 'AlbertModel': embed_initializer = mx.init.create(*cfg.INITIALIZER.embed) weight_initializer = mx.init.create(*cfg.INITIALIZER.weight) bias_initializer = mx.init.create(*cfg.INITIALIZER.bias) + if dtype is None: + dtype = cfg.MODEL.dtype return cls(vocab_size=cfg.MODEL.vocab_size, units=cfg.MODEL.units, hidden_size=cfg.MODEL.hidden_size, @@ -411,6 +532,7 @@ def from_cfg(cls, cfg, use_pooler=True, dtype='float32') -> 'AlbertModel': activation=cfg.MODEL.activation, layer_norm_eps=cfg.MODEL.layer_norm_eps, dtype=dtype, + layout=cfg.MODEL.layout, embed_initializer=embed_initializer, weight_initializer=weight_initializer, bias_initializer=bias_initializer, @@ -453,6 +575,10 @@ def __init__(self, backbone_cfg, self.mlm_decoder[-1].weight = self.backbone_model.word_embed.weight self.mlm_decoder.hybridize() + @property + def layout(self): + return self.backbone_model.layout + def hybrid_forward(self, F, inputs, token_types, valid_length, masked_positions): """Getting the scores of the masked positions. @@ -460,10 +586,16 @@ def hybrid_forward(self, F, inputs, token_types, valid_length, Parameters ---------- F - inputs : - Shape (batch_size, seq_length) - token_types : - Shape (batch_size, seq_length) + inputs + - layout = 'NT' + Shape (batch_size, seq_length) + - layout = 'TN' + Shape (seq_length, batch_size) + token_types + - layout = 'NT' + Shape (batch_size, seq_length) + - layout = 'TN' + Shape (seq_length, batch_size) The type of the token. 
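A quick illustration of the registry-backed `get_cfg` above; the key names come from the functions registered earlier in this file, and `Registry.create` is the call `get_cfg` itself uses, so this is a sketch of the intended lookup path rather than a new API:

```python
from gluonnlp.models.albert import AlbertModel, albert_cfg_reg

# Fetch a registered configuration by key ...
cfg = AlbertModel.get_cfg('google_albert_large')
# ... or go through the registry directly, which is what get_cfg does.
same_cfg = albert_cfg_reg.create('google_albert_large')
assert cfg.MODEL.units == same_cfg.MODEL.units == 1024
```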
For example, if the inputs contain two sequences, we will set different token types for the first sentence and the second sentence. valid_length : @@ -476,14 +608,21 @@ def hybrid_forward(self, F, inputs, token_types, valid_length, Returns ------- contextual_embedding - Shape (batch_size, seq_length, units). + - layout = 'NT' + Shape (batch_size, seq_length, units) + - layout = 'TN' + Shape (seq_length, batch_size, units) pooled_out Shape (batch_size, units) mlm_scores : Shape (batch_size, num_masked_positions, vocab_size) """ contextual_embeddings, pooled_out = self.backbone_model(inputs, token_types, valid_length) - mlm_features = select_vectors_by_position(F, contextual_embeddings, masked_positions) + if self.layout == 'NT': + mlm_features = select_vectors_by_position(F, contextual_embeddings, masked_positions) + else: + mlm_features = select_vectors_by_position(F, F.np.swapaxes(contextual_embeddings, 0, 1), + masked_positions) mlm_scores = self.mlm_decoder(mlm_features) return contextual_embeddings, pooled_out, mlm_scores @@ -528,6 +667,10 @@ def __init__(self, backbone_cfg, self.mlm_decoder[-1].weight = self.backbone_model.word_embed.weight self.mlm_decoder.hybridize() + @property + def layout(self): + return self.backbone_model.layout + def hybrid_forward(self, F, inputs, token_types, valid_length, masked_positions): """Generate the representation given the inputs. @@ -537,10 +680,16 @@ def hybrid_forward(self, F, inputs, token_types, valid_length, Parameters ---------- F - inputs : - Shape (batch_size, seq_length) + inputs + - layout = 'NT' + Shape (batch_size, seq_length) + - layout = 'TN' + Shape (seq_length, batch_size) token_types : - Shape (batch_size, seq_length) + - layout = 'NT' + Shape (batch_size, seq_length) + - layout = 'TN' + Shape (seq_length, batch_size) If the inputs contain two sequences, we will set different token types for the first sentence and the second sentence. @@ -554,7 +703,10 @@ def hybrid_forward(self, F, inputs, token_types, valid_length, Returns ------- contextual_embedding - Shape (batch_size, seq_length, units). + - layout = 'NT' + Shape (batch_size, seq_length, units). + - layout = 'TN' + Shape (seq_length, batch_size, units). pooled_out Shape (batch_size, units) sop_score : @@ -564,7 +716,11 @@ def hybrid_forward(self, F, inputs, token_types, valid_length, """ contextual_embeddings, pooled_out = self.backbone_model(inputs, token_types, valid_length) sop_score = self.sop_classifier(pooled_out) - mlm_features = select_vectors_by_position(F, contextual_embeddings, masked_positions) + if self.layout == 'NT': + mlm_features = select_vectors_by_position(F, contextual_embeddings, masked_positions) + else: + mlm_features = select_vectors_by_position(F, F.np.swapaxes(contextual_embeddings, 0, 1), + masked_positions) mlm_scores = self.mlm_decoder(mlm_features) return contextual_embeddings, pooled_out, sop_score, mlm_scores @@ -604,15 +760,22 @@ def get_pretrained_albert(model_name: str = 'google_albert_base_v2', assert model_name in PRETRAINED_URL, '{} is not found. 
All available are {}'.format( model_name, list_pretrained_albert()) cfg_path = PRETRAINED_URL[model_name]['cfg'] + if isinstance(cfg_path, CN): + cfg = cfg_path + else: + cfg = None spm_model_path = PRETRAINED_URL[model_name]['spm_model'] vocab_path = PRETRAINED_URL[model_name]['vocab'] params_path = PRETRAINED_URL[model_name]['params'] mlm_params_path = PRETRAINED_URL[model_name]['mlm_params'] local_paths = dict() - for k, path in [('cfg', cfg_path), ('spm_model', spm_model_path), ('vocab', vocab_path)]: - local_paths[k] = download(url=get_repo_model_zoo_url() + path, - path=os.path.join(root, path), - sha1_hash=FILE_STATS[path]) + download_jobs = [('spm_model', spm_model_path), ('vocab', vocab_path)] + if cfg is None: + download_jobs.append(('cfg', cfg_path)) + for key, path in download_jobs: + local_paths[key] = download(url=get_repo_model_zoo_url() + path, + path=os.path.join(root, path), + sha1_hash=FILE_STATS[path]) if load_backbone: local_params_path = download(url=get_repo_model_zoo_url() + params_path, path=os.path.join(root, params_path), @@ -630,7 +793,8 @@ def get_pretrained_albert(model_name: str = 'google_albert_base_v2', tokenizer = SentencepieceTokenizer(local_paths['spm_model'], vocab=local_paths['vocab'], lowercase=do_lower) - cfg = AlbertModel.get_cfg().clone_merge(local_paths['cfg']) + if cfg is None: + cfg = AlbertModel.get_cfg().clone_merge(local_paths['cfg']) return cfg, tokenizer, local_params_path, local_mlm_params_path diff --git a/src/gluonnlp/models/bart.py b/src/gluonnlp/models/bart.py index 3f2ae159c9..f4e70d1bc7 100644 --- a/src/gluonnlp/models/bart.py +++ b/src/gluonnlp/models/bart.py @@ -68,6 +68,7 @@ def fair_bart_base(): cfg.MODEL.layer_norm_eps = 1E-5 cfg.MODEL.pooler_activation = 'tanh' cfg.MODEL.layernorm_embedding = True + cfg.MODEL.layout = 'NT' cfg.MODEL.dtype = 'float32' # Parameters for the encoder @@ -191,27 +192,43 @@ def hybrid_forward(self, F, src_data, src_valid_length, tgt_data, tgt_valid_leng Parameters ---------- F - src_data : - Shape (batch_size, src_length) - src_valid_length : + src_data + - layout = 'NT' + Shape (batch_size, src_length) + - layout = 'TN' + Shape (src_length, batch_size) + src_valid_length Shape (batch_size,) - tgt_data : - Shape (batch_size, tgt_length) - tgt_valid_length : + tgt_data + - layout = 'NT' + Shape (batch_size, tgt_length) + - layout = 'TN' + Shape (tgt_length, batch_size) + tgt_valid_length Shape (batch_size,) Returns ------- - out : - Shape (batch_size, tgt_length, tgt_vocab_size) + (contextual_embedding) + - layout = 'NT' + Shape (batch_size, tgt_length, units) + - layout = 'TN' + Shape (tgt_length, batch_size, units) + (pooled_output) + This is optional. 
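Since the pretrained-model entries now carry an in-code config node instead of a YAML path, downstream usage stays the same; a brief sketch of the intended flow, using the model name from the table above and the function's documented return tuple:

```python
from gluonnlp.models.albert import AlbertModel, get_pretrained_albert

cfg, tokenizer, params_path, mlm_params_path = get_pretrained_albert('google_albert_base_v2')
model = AlbertModel.from_cfg(cfg)
model.load_parameters(params_path)
```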
Shape (batch_size, units) + (dec_out) + - layout = 'NT' + Shape (batch_size, tgt_length, tgt_vocab_size) + - layout = 'TN' + Shape (tgt_length, batch_size, tgt_vocab_size) """ enc_out = self.encode(F, src_data, src_valid_length) - dec_out = self.decode_seq(F, tgt_data, tgt_valid_length, enc_out, src_valid_length) + contextual_embedding = self.decode_seq(F, tgt_data, tgt_valid_length, enc_out, src_valid_length) if self.use_pooler: - pooled_out = self.apply_pooling(dec_out) - return dec_out, pooled_out + pooled_output = self.apply_pooling(contextual_embedding) + return contextual_embedding, pooled_output else: - dec_out = self.tgt_final_layer(dec_out) + dec_out = self.tgt_final_layer(contextual_embedding) return dec_out def apply_pooling(self, sequence): @@ -231,6 +248,10 @@ def apply_pooling(self, sequence): else: return outputs + @property + def layout(self) -> str: + return self._layout + @property def vocab_size(self): return self._vocab_size @@ -315,13 +336,19 @@ def get_pretrained_bart(model_name: str = 'fairseq_roberta_base', assert model_name in PRETRAINED_URL, '{} is not found. All available are {}'.format( model_name, list_pretrained_bart()) cfg_path = PRETRAINED_URL[model_name]['cfg'] + if isinstance(cfg_path, CN): + cfg = cfg_path + else: + cfg = None merges_path = PRETRAINED_URL[model_name]['merges'] vocab_path = PRETRAINED_URL[model_name]['vocab'] params_path = PRETRAINED_URL[model_name]['params'] local_paths = dict() - for k, path in [('cfg', cfg_path), ('vocab', vocab_path), - ('merges', merges_path)]: + download_jobs = [('vocab', vocab_path), ('merges', merges_path)] + if cfg is None: + download_jobs.append(('cfg', cfg_path)) + for k, path in download_jobs: local_paths[k] = download(url=get_repo_model_zoo_url() + path, path=os.path.join(root, path), sha1_hash=FILE_STATS[path]) @@ -339,7 +366,8 @@ def get_pretrained_bart(model_name: str = 'fairseq_roberta_base', merges_file=local_paths['merges'], vocab_file=local_paths['vocab'], lowercase=do_lower) - cfg = BartModel.get_cfg().clone_merge(local_paths['cfg']) + if cfg is None: + cfg = BartModel.get_cfg().clone_merge(local_paths['cfg']) return cfg, tokenizer, local_params_path, local_mlm_params_path diff --git a/src/gluonnlp/models/bert.py b/src/gluonnlp/models/bert.py index fd53ae3b5c..84a1d5ee2e 100644 --- a/src/gluonnlp/models/bert.py +++ b/src/gluonnlp/models/bert.py @@ -39,16 +39,108 @@ from ..base import get_model_zoo_home_dir, get_repo_model_zoo_url, get_model_zoo_checksum_dir from ..utils.config import CfgNode as CN from ..utils.misc import load_checksum_stats, download +from ..utils.registry import Registry from ..initializer import TruncNorm from ..attention_cell import MultiHeadAttentionCell, gen_self_attn_mask from ..layers import get_activation, PositionalEmbedding, PositionwiseFFN, InitializerType from ..op import select_vectors_by_position from ..data.tokenizers import HuggingFaceWordPieceTokenizer +bert_cfg_reg = Registry('bert_cfg') + + +@bert_cfg_reg.register() +def google_en_uncased_bert_base(): + cfg = CN() + # Parameters for thr small model + cfg.MODEL = CN() + cfg.MODEL.vocab_size = 30522 + cfg.MODEL.units = 768 + cfg.MODEL.hidden_size = 3072 + cfg.MODEL.max_length = 512 + cfg.MODEL.num_heads = 12 + cfg.MODEL.num_layers = 12 + cfg.MODEL.pos_embed_type = 'learned' + cfg.MODEL.activation = 'gelu' + cfg.MODEL.layer_norm_eps = 1E-12 + cfg.MODEL.num_token_types = 2 + cfg.MODEL.hidden_dropout_prob = 0.1 + cfg.MODEL.attention_dropout_prob = 0.1 + cfg.MODEL.dtype = 'float32' + cfg.MODEL.layout = 'NT' + 
cfg.MODEL.compute_layout = 'auto' + # Hyper-parameters of the Initializers + cfg.INITIALIZER = CN() + cfg.INITIALIZER.embed = ['truncnorm', 0, 0.02] + cfg.INITIALIZER.weight = ['truncnorm', 0, 0.02] # TruncNorm(0, 0.02) + cfg.INITIALIZER.bias = ['zeros'] + # Version of the model. This helps ensure backward compatibility. + # Also, we can not use string here due to https://github.com/rbgirshick/yacs/issues/26 + cfg.VERSION = 1 + cfg.freeze() + return cfg + + +@bert_cfg_reg.register() +def google_en_uncased_bert_large(): + cfg = google_en_uncased_bert_base() + cfg.defrost() + cfg.MODEL.hidden_size = 4096 + cfg.MODEL.num_heads = 16 + cfg.MODEL.num_layers = 24 + cfg.MODEL.units = 1024 + cfg.freeze() + return cfg + + +@bert_cfg_reg.register() +def google_en_cased_bert_base(): + cfg = google_en_uncased_bert_base() + cfg.defrost() + cfg.MODEL.vocab_size = 28996 + cfg.freeze() + return cfg + + +@bert_cfg_reg.register() +def google_en_cased_bert_large(): + cfg = google_en_uncased_bert_large() + cfg.defrost() + cfg.MODEL.vocab_size = 28996 + cfg.freeze() + return cfg + + +@bert_cfg_reg.register() +def google_zh_bert_base(): + cfg = google_en_uncased_bert_base() + cfg.defrost() + cfg.MODEL.vocab_size = 21128 + cfg.freeze() + return cfg + + +@bert_cfg_reg.register() +def google_multi_cased_bert_base(): + cfg = google_en_uncased_bert_base() + cfg.defrost() + cfg.MODEL.vocab_size = 119547 + cfg.freeze() + return cfg + + +@bert_cfg_reg.register() +def google_multi_cased_bert_large(): + cfg = google_en_uncased_bert_large() + cfg.defrost() + cfg.MODEL.vocab_size = 119547 + cfg.freeze() + return cfg + PRETRAINED_URL = { 'google_en_cased_bert_base': { - 'cfg': 'google_en_cased_bert_base/model-5620839a.yml', + 'cfg': google_en_cased_bert_base(), 'vocab': 'google_en_cased_bert_base/vocab-c1defaaa.json', 'params': 'google_en_cased_bert_base/model-c566c289.params', 'mlm_params': 'google_en_cased_bert_base/model_mlm-bde14bee.params', @@ -56,49 +148,49 @@ }, 'google_en_uncased_bert_base': { - 'cfg': 'google_en_uncased_bert_base/model-4d8422ad.yml', + 'cfg': google_en_uncased_bert_base(), 'vocab': 'google_en_uncased_bert_base/vocab-e6d2b21d.json', 'params': 'google_en_uncased_bert_base/model-3712e50a.params', 'mlm_params': 'google_en_uncased_bert_base/model_mlm-04e88b58.params', 'lowercase': True, }, 'google_en_cased_bert_large': { - 'cfg': 'google_en_cased_bert_large/model-9e127fee.yml', + 'cfg': google_en_cased_bert_large(), 'vocab': 'google_en_cased_bert_large/vocab-c1defaaa.json', 'params': 'google_en_cased_bert_large/model-7aa93704.params', 'mlm_params': 'google_en_cased_bert_large/model_mlm-59ff3f6a.params', 'lowercase': False, }, 'google_en_uncased_bert_large': { - 'cfg': 'google_en_uncased_bert_large/model-d0c37dcc.yml', + 'cfg': google_en_uncased_bert_large(), 'vocab': 'google_en_uncased_bert_large/vocab-e6d2b21d.json', 'params': 'google_en_uncased_bert_large/model-e53bbc57.params', 'mlm_params': 'google_en_uncased_bert_large/model_mlm-44bc70c0.params', 'lowercase': True, }, 'google_zh_bert_base': { - 'cfg': 'google_zh_bert_base/model-9b16bda6.yml', + 'cfg': google_zh_bert_base(), 'vocab': 'google_zh_bert_base/vocab-711c13e4.json', 'params': 'google_zh_bert_base/model-2efbff63.params', 'mlm_params': 'google_zh_bert_base/model_mlm-75339658.params', 'lowercase': False, }, 'google_multi_cased_bert_base': { - 'cfg': 'google_multi_cased_bert_base/model-881ad607.yml', + 'cfg': google_multi_cased_bert_base(), 'vocab': 'google_multi_cased_bert_base/vocab-016e1169.json', 'params': 
'google_multi_cased_bert_base/model-c2110078.params', 'mlm_params': 'google_multi_cased_bert_base/model_mlm-4611e7a3.params', 'lowercase': False, }, 'google_en_cased_bert_wwm_large': { - 'cfg': 'google_en_cased_bert_wwm_large/model-9e127fee.yml', + 'cfg': google_en_cased_bert_large(), 'vocab': 'google_en_cased_bert_wwm_large/vocab-c1defaaa.json', 'params': 'google_en_cased_bert_wwm_large/model-0fe841cf.params', 'mlm_params': None, 'lowercase': False, }, 'google_en_uncased_bert_wwm_large': { - 'cfg': 'google_en_uncased_bert_wwm_large/model-d0c37dcc.yml', + 'cfg': google_en_uncased_bert_large(), 'vocab': 'google_en_uncased_bert_wwm_large/vocab-e6d2b21d.json', 'params': 'google_en_uncased_bert_wwm_large/model-cb3ad3c2.params', 'mlm_params': None, @@ -124,7 +216,8 @@ def __init__(self, units: int = 512, layer_norm_eps: float = 1E-12, weight_initializer: InitializerType = TruncNorm(stdev=0.02), bias_initializer: InitializerType = 'zeros', - activation='gelu'): + activation='gelu', + layout='NT'): super().__init__() assert units % num_heads == 0,\ 'In BertTransformer, The units should be divided exactly ' \ @@ -135,6 +228,7 @@ def __init__(self, units: int = 512, self._num_layers = num_layers self._output_attention = output_attention self._output_all_encodings = output_all_encodings + self._layout = layout self.all_layers = nn.HybridSequential() for layer_idx in range(num_layers): @@ -147,7 +241,13 @@ def __init__(self, units: int = 512, layer_norm_eps=layer_norm_eps, weight_initializer=weight_initializer, bias_initializer=bias_initializer, - activation=activation)) + activation=activation, + layout=layout, + dtype=dtype)) + + @property + def layout(self): + return self._layout def hybrid_forward(self, F, data, valid_length): """ @@ -158,30 +258,41 @@ def hybrid_forward(self, F, data, valid_length): Parameters ---------- F - data : - Shape (batch_size, seq_length, C) - valid_length : + data + - layout = 'NT' + Shape (batch_size, seq_length, C) + - layout = 'TN' + Shape (seq_length, batch_size, C) + valid_length Shape (batch_size,) Returns ------- - out : - Shape (batch_size, seq_length, C_out) + out + - layout = 'NT' + Shape (batch_size, seq_length, C_out) + - layout = 'TN' + Shape (seq_length, batch_size, C_out) """ + if self.layout == 'NT': + time_axis, batch_axis = 1, 0 + else: + time_axis, batch_axis = 0, 1 # 1. 
Embed the data - attn_mask = gen_self_attn_mask(F, data, valid_length, dtype=self._dtype, attn_type='full') + attn_mask = gen_self_attn_mask(F, data, valid_length, dtype=self._dtype, + attn_type='full', layout=self.layout) out = data all_encodings_outputs = [] additional_outputs = [] for layer_idx in range(self._num_layers): layer = self.all_layers[layer_idx] out, attention_weights = layer(out, attn_mask) - # out : [batch_size, seq_len, units] + # out : [batch_size, seq_len, units] or [seq_len, batch_size, units] # attention_weights : [batch_size, num_heads, seq_len, seq_len] if self._output_all_encodings: out = F.npx.sequence_mask(out, sequence_length=valid_length, - use_sequence_length=True, axis=1) + use_sequence_length=True, axis=time_axis) all_encodings_outputs.append(out) if self._output_attention: @@ -190,7 +301,7 @@ def hybrid_forward(self, F, data, valid_length): if not self._output_all_encodings: # if self._output_all_encodings, SequenceMask is already applied above out = F.npx.sequence_mask(out, sequence_length=valid_length, - use_sequence_length=True, axis=1) + use_sequence_length=True, axis=time_axis) return out, additional_outputs else: return all_encodings_outputs, additional_outputs @@ -215,7 +326,9 @@ def __init__(self, weight_initializer=TruncNorm(stdev=0.02), bias_initializer='zeros', dtype='float32', - use_pooler=True): + use_pooler=True, + layout='NT', + compute_layout='auto'): super().__init__() self._dtype = dtype self.use_pooler = use_pooler @@ -229,6 +342,11 @@ def __init__(self, self.weight_initializer = weight_initializer self.bias_initializer = bias_initializer self.layer_norm_eps = layer_norm_eps + self._layout = layout + if compute_layout is None or compute_layout == 'auto': + self._compute_layout = layout + else: + self._compute_layout = compute_layout # Construct BertTransformer self.encoder = BertTransformer( units=units, @@ -244,6 +362,7 @@ def __init__(self, weight_initializer=weight_initializer, bias_initializer=bias_initializer, dtype=dtype, + layout=self._compute_layout ) self.encoder.hybridize() # Construct word embedding @@ -270,6 +389,10 @@ def __init__(self, weight_initializer=weight_initializer, bias_initializer=bias_initializer) + @property + def layout(self): + return self._layout + def hybrid_forward(self, F, inputs, token_types, valid_length): # pylint: disable=arguments-differ """Generate the representation given the inputs. @@ -279,10 +402,16 @@ def hybrid_forward(self, F, inputs, token_types, valid_length): Parameters ---------- F - inputs : - Shape (batch_size, seq_length) - token_types : - Shape (batch_size, seq_length) + inputs + - layout = 'NT' + Shape (batch_size, seq_length) + - layout = 'TN' + Shape (seq_length, batch_size) + token_types + - layout = 'NT' + Shape (batch_size, seq_length) + - layout = 'TN' + Shape (batch_size, seq_length) If the inputs contain two sequences, we will set different token types for the first sentence and the second sentence. @@ -292,16 +421,24 @@ def hybrid_forward(self, F, inputs, token_types, valid_length): Returns ------- - contextual_embedding : - Shape (batch_size, seq_length, units). + contextual_embedding + - layout = 'NT' + Shape (batch_size, seq_length, units). + - layout = 'TN' + Shape (seq_length, batch_size, units). pooled_output : This is optional. 
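The repeated `axis=time_axis` changes above all reduce to masking along whichever axis holds time; a standalone sketch of that pattern with toy shapes, nothing model-specific, assuming `mx.npx.sequence_mask` mirrors the `F.npx.sequence_mask` calls in the patch:

```python
import mxnet as mx

def mask_padding(out, valid_length, layout='NT'):
    # Zero out padded positions along the time axis for either layout.
    time_axis = 1 if layout == 'NT' else 0
    return mx.npx.sequence_mask(out, sequence_length=valid_length,
                                use_sequence_length=True, axis=time_axis)

nt = mx.np.ones((2, 5, 4))        # (batch, time, channels)
tn = mx.np.swapaxes(nt, 0, 1)     # (time, batch, channels)
valid = mx.np.array([3, 5], dtype='float32')
assert mask_padding(nt, valid, 'NT').shape == (2, 5, 4)
assert mask_padding(tn, valid, 'TN').shape == (5, 2, 4)
```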
Shape (batch_size, units) """ initial_embedding = self.get_initial_embedding(F, inputs, token_types) prev_out = initial_embedding outputs = [] - - contextual_embeddings, additional_outputs = self.encoder(prev_out, valid_length) + if self._compute_layout != self._layout: + # Swap the axes if the compute_layout and layout mismatch + contextual_embeddings, additional_outputs = self.encoder(F.np.swapaxes(prev_out, 0, 1), + valid_length) + contextual_embeddings = F.np.swapaxes(contextual_embeddings, 0, 1) + else: + contextual_embeddings, additional_outputs = self.encoder(prev_out, valid_length) outputs.append(contextual_embeddings) if self.use_pooler: pooled_out = self.apply_pooling(contextual_embeddings) @@ -315,24 +452,38 @@ def get_initial_embedding(self, F, inputs, token_types=None): ---------- F inputs - Shape (batch_size, seq_length) + - layout = 'NT' + Shape (batch_size, seq_length) + - layout = 'TN' + Shape (seq_length, batch_size) token_types - Shape (batch_size, seq_length) + - layout = 'NT' + Shape (batch_size, seq_length) + - layout = 'TN' + Shape (seq_length, batch_size) If None, it will be initialized as all zero Returns ------- embedding The initial embedding that will be fed into the encoder + - layout = 'NT' + Shape (batch_size, seq_length, C_emb) + - layout = 'TN' + Shape (seq_length, batch_size, C_emb) """ + if self.layout == 'NT': + time_axis, batch_axis = 1, 0 + else: + time_axis, batch_axis = 0, 1 embedding = self.word_embed(inputs) if token_types is None: token_types = F.np.zeros_like(inputs) type_embedding = self.token_type_embed(token_types) embedding = embedding + type_embedding if self.pos_embed_type is not None: - positional_embedding = self.token_pos_embed(F.npx.arange_like(inputs, axis=1)) - positional_embedding = F.np.expand_dims(positional_embedding, axis=0) + positional_embedding = self.token_pos_embed(F.npx.arange_like(inputs, axis=time_axis)) + positional_embedding = F.np.expand_dims(positional_embedding, axis=batch_axis) embedding = embedding + positional_embedding # Extra layer normalization plus dropout embedding = self.embed_layer_norm(embedding) @@ -345,53 +496,52 @@ def apply_pooling(self, sequence): This is used for pre-training or fine-tuning a bert model. Get the first token of the whole sequence which is [CLS] - sequence: - Shape (batch_size, sequence_length, units) + sequence + - layout = 'NT' + Shape (batch_size, sequence_length, units) + - layout = 'TN' + Shape (sequence_length, batch_size, units) return: Shape (batch_size, units) """ - outputs = sequence[:, 0, :] + if self.layout == 'NT': + outputs = sequence[:, 0, :] + else: + outputs = sequence[0, :, :] return self.pooler(outputs) @staticmethod def get_cfg(key=None): - if key is None: - cfg = CN() - # Parameters for thr small model - cfg.MODEL = CN() - cfg.MODEL.vocab_size = 30000 - cfg.MODEL.units = 256 - cfg.MODEL.hidden_size = 1024 - cfg.MODEL.max_length = 512 - cfg.MODEL.num_heads = 4 - cfg.MODEL.num_layers = 12 - cfg.MODEL.pos_embed_type = 'learned' - cfg.MODEL.activation = 'gelu' - cfg.MODEL.layer_norm_eps = 1E-12 - cfg.MODEL.num_token_types = 2 - cfg.MODEL.hidden_dropout_prob = 0.1 - cfg.MODEL.attention_dropout_prob = 0.1 - cfg.MODEL.dtype = 'float32' - # Hyper-parameters of the Initializers - cfg.INITIALIZER = CN() - cfg.INITIALIZER.embed = ['truncnorm', 0, 0.02] - cfg.INITIALIZER.weight = ['truncnorm', 0, 0.02] # TruncNorm(0, 0.02) - cfg.INITIALIZER.bias = ['zeros'] - # Version of the model. This helps ensure backward compatibility. 
- # Also, we can not use string here due to https://github.com/rbgirshick/yacs/issues/26 - cfg.VERSION = 1 + if key is not None: + return bert_cfg_reg.create(key) else: - raise NotImplementedError - cfg.freeze() - return cfg + return google_en_uncased_bert_base() @classmethod - def from_cfg(cls, cfg, use_pooler=True, dtype='float32') -> 'BertModel': + def from_cfg(cls, cfg, use_pooler=True, dtype=None) -> 'BertModel': + """ + + Parameters + ---------- + cfg + Configuration + use_pooler + Whether to output the pooled feature + dtype + data type of the model + + Returns + ------- + ret + The constructed BertModel + """ cfg = BertModel.get_cfg().clone_merge(cfg) assert cfg.VERSION == 1, 'Wrong version!' embed_initializer = mx.init.create(*cfg.INITIALIZER.embed) weight_initializer = mx.init.create(*cfg.INITIALIZER.weight) bias_initializer = mx.init.create(*cfg.INITIALIZER.bias) + if dtype is None: + dtype = cfg.MODEL.dtype return cls(vocab_size=cfg.MODEL.vocab_size, units=cfg.MODEL.units, hidden_size=cfg.MODEL.hidden_size, @@ -408,7 +558,9 @@ def from_cfg(cls, cfg, use_pooler=True, dtype='float32') -> 'BertModel': embed_initializer=embed_initializer, weight_initializer=weight_initializer, bias_initializer=bias_initializer, - use_pooler=use_pooler) + use_pooler=use_pooler, + layout=cfg.MODEL.layout, + compute_layout=cfg.MODEL.compute_layout) @use_np @@ -447,6 +599,10 @@ def __init__(self, backbone_cfg, self.mlm_decoder[-1].weight = self.backbone_model.word_embed.weight self.mlm_decoder.hybridize() + @property + def layout(self): + return self.backbone_model.layout + def hybrid_forward(self, F, inputs, token_types, valid_length, masked_positions): """Getting the scores of the masked positions. @@ -454,10 +610,16 @@ def hybrid_forward(self, F, inputs, token_types, valid_length, Parameters ---------- F - inputs : - Shape (batch_size, seq_length) - token_types : - Shape (batch_size, seq_length) + inputs + - layout = 'NT' + Shape (batch_size, seq_length) + - layout = 'TN' + Shape (seq_length, batch_size) + token_types + - layout = 'NT' + Shape (batch_size, seq_length) + - layout = 'TN' + Shape (seq_length, batch_size) If the inputs contain two sequences, we will set different token types for the first sentence and the second sentence. @@ -471,14 +633,21 @@ def hybrid_forward(self, F, inputs, token_types, valid_length, Returns ------- contextual_embedding - Shape (batch_size, seq_length, units). - pooled_out + - layout = 'NT' + Shape (batch_size, seq_length, units). + - layout = 'TN' + Shape (seq_length, batch_size, units) + cfg.MODEL.compute_layout = 'auto' Shape (batch_size, units) mlm_scores : Shape (batch_size, num_masked_positions, vocab_size) """ contextual_embeddings, pooled_out = self.backbone_model(inputs, token_types, valid_length) - mlm_features = select_vectors_by_position(F, contextual_embeddings, masked_positions) + if self.layout == 'NT': + mlm_features = select_vectors_by_position(F, contextual_embeddings, masked_positions) + else: + mlm_features = select_vectors_by_position(F, F.np.swapaxes(contextual_embeddings, 0, 1), + masked_positions) mlm_scores = self.mlm_decoder(mlm_features) return contextual_embeddings, pooled_out, mlm_scores @@ -523,6 +692,10 @@ def __init__(self, backbone_cfg, self.mlm_decoder[-1].weight = self.backbone_model.word_embed.weight self.mlm_decoder.hybridize() + @property + def layout(self): + return self.backbone_model.layout + def hybrid_forward(self, F, inputs, token_types, valid_length, masked_positions): """Generate the representation given the inputs. 
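One practical consequence of `from_cfg(..., dtype=None)` above: the dtype now defaults to whatever the config specifies and can still be overridden per instance. A small sketch, with the key name taken from the registry above:

```python
from gluonnlp.models.bert import BertModel

cfg = BertModel.get_cfg('google_en_uncased_bert_base')
model_fp32 = BertModel.from_cfg(cfg)                   # falls back to cfg.MODEL.dtype
model_fp16 = BertModel.from_cfg(cfg, dtype='float16')  # per-instance override
```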
@@ -532,24 +705,33 @@ def hybrid_forward(self, F, inputs, token_types, valid_length, Parameters ---------- F - inputs : - Shape (batch_size, seq_length) - token_types : - Shape (batch_size, seq_length) + inputs + - layout = 'NT' + Shape (batch_size, seq_length) + - layout = 'TN' + Shape (seq_length, batch_size) + token_types + - layout = 'NT' + Shape (batch_size, seq_length) + - layout = 'TN' + Shape (seq_length, batch_size) If the inputs contain two sequences, we will set different token types for the first sentence and the second sentence. - valid_length : + valid_length The valid length of each sequence Shape (batch_size,) - masked_positions : + masked_positions The masked position of the sequence Shape (batch_size, num_masked_positions). Returns ------- contextual_embedding - Shape (batch_size, seq_length, units). + - layout = 'NT' + Shape (batch_size, seq_length, units). + - layout = 'TN' + Shape (seq_length, batch_size, units). pooled_out Shape (batch_size, units) nsp_score : @@ -559,7 +741,11 @@ def hybrid_forward(self, F, inputs, token_types, valid_length, """ contextual_embeddings, pooled_out = self.backbone_model(inputs, token_types, valid_length) nsp_score = self.nsp_classifier(pooled_out) - mlm_features = select_vectors_by_position(F, contextual_embeddings, masked_positions) + if self.layout == 'NT': + mlm_features = select_vectors_by_position(F, contextual_embeddings, masked_positions) + else: + mlm_features = select_vectors_by_position(F, F.np.swapaxes(contextual_embeddings, 0, 1), + masked_positions) mlm_scores = self.mlm_decoder(mlm_features) return contextual_embeddings, pooled_out, nsp_score, mlm_scores @@ -599,14 +785,21 @@ def get_pretrained_bert(model_name: str = 'google_en_cased_bert_base', assert model_name in PRETRAINED_URL, '{} is not found. 
All available are {}'.format( model_name, list_pretrained_bert()) cfg_path = PRETRAINED_URL[model_name]['cfg'] + if isinstance(cfg_path, CN): + cfg = cfg_path + else: + cfg = None vocab_path = PRETRAINED_URL[model_name]['vocab'] params_path = PRETRAINED_URL[model_name]['params'] mlm_params_path = PRETRAINED_URL[model_name]['mlm_params'] local_paths = dict() - for k, path in [('cfg', cfg_path), ('vocab', vocab_path)]: - local_paths[k] = download(url=get_repo_model_zoo_url() + path, - path=os.path.join(root, path), - sha1_hash=FILE_STATS[path]) + download_jobs = [('vocab', vocab_path)] + if cfg is None: + download_jobs.append(('cfg', cfg_path)) + for key, path in download_jobs: + local_paths[key] = download(url=get_repo_model_zoo_url() + path, + path=os.path.join(root, path), + sha1_hash=FILE_STATS[path]) if load_backbone: local_params_path = download(url=get_repo_model_zoo_url() + params_path, path=os.path.join(root, params_path), @@ -629,7 +822,8 @@ def get_pretrained_bert(model_name: str = 'google_en_cased_bert_base', sep_token='[SEP]', mask_token='[MASK]', lowercase=do_lower) - cfg = BertModel.get_cfg().clone_merge(local_paths['cfg']) + if cfg is None: + cfg = BertModel.get_cfg().clone_merge(local_paths['cfg']) return cfg, tokenizer, local_params_path, local_mlm_params_path diff --git a/src/gluonnlp/models/electra.py b/src/gluonnlp/models/electra.py index a56d7879dc..b8d4e44029 100644 --- a/src/gluonnlp/models/electra.py +++ b/src/gluonnlp/models/electra.py @@ -43,9 +43,12 @@ from ..initializer import TruncNorm from ..utils.config import CfgNode as CN from ..utils.misc import load_checksum_stats, download +from ..utils.registry import Registry from ..attention_cell import gen_self_attn_mask from ..data.tokenizers import HuggingFaceWordPieceTokenizer +electra_cfg_reg = Registry('electra_cfg') + def get_generator_cfg(model_config): """ @@ -66,9 +69,73 @@ def get_generator_cfg(model_config): return generator_cfg +@electra_cfg_reg.register() +def google_electra_small(): + cfg = CN() + # Model + cfg.MODEL = CN() + cfg.MODEL.vocab_size = 30522 + cfg.MODEL.embed_size = 128 + cfg.MODEL.units = 256 + cfg.MODEL.hidden_size = 1024 + cfg.MODEL.max_length = 512 + cfg.MODEL.num_heads = 4 + cfg.MODEL.num_layers = 12 + cfg.MODEL.pos_embed_type = 'learned' + cfg.MODEL.activation = 'gelu' + cfg.MODEL.layer_norm_eps = 1E-12 + cfg.MODEL.num_token_types = 2 + # Dropout regularization + cfg.MODEL.hidden_dropout_prob = 0.1 + cfg.MODEL.attention_dropout_prob = 0.1 + cfg.MODEL.dtype = 'float32' + # Layout flags + cfg.MODEL.layout = 'NT' + cfg.MODEL.compute_layout = 'auto' + # Generator hyper-parameters + cfg.MODEL.generator_layers_scale = 1.0 + cfg.MODEL.generator_units_scale = 1.0 + # Initializer + cfg.INITIALIZER = CN() + cfg.INITIALIZER.embed = ['truncnorm', 0, 0.02] + cfg.INITIALIZER.weight = ['truncnorm', 0, 0.02] # TruncNorm(0, 0.02) + cfg.INITIALIZER.bias = ['zeros'] + cfg.VERSION = 1 + cfg.freeze() + return cfg + + +@electra_cfg_reg.register() +def google_electra_base(): + cfg = google_electra_small() + cfg.defrost() + cfg.MODEL.embed_size = 768 + cfg.MODEL.units = 768 + cfg.MODEL.hidden_size = 3072 + cfg.MODEL.num_heads = 12 + cfg.MODEL.num_layers = 12 + cfg.MODEL.generator_units_scale = 0.33333 + cfg.freeze() + return cfg + + +@electra_cfg_reg.register() +def google_electra_large(): + cfg = google_electra_small() + cfg.defrost() + cfg.MODEL.embed_size = 1024 + cfg.MODEL.units = 1024 + cfg.MODEL.hidden_size = 4096 + cfg.MODEL.num_heads = 16 + cfg.MODEL.num_layers = 24 + cfg.MODEL.generator_units_scale = 
0.25 + cfg.freeze() + return cfg + + PRETRAINED_URL = { 'google_electra_small': { - 'cfg': 'google_electra_small/model-9ffb21c8.yml', + 'cfg': google_electra_small(), 'vocab': 'google_electra_small/vocab-e6d2b21d.json', 'params': 'google_electra_small/model-2654c8b4.params', 'disc_model': 'google_electra_small/disc_model-137714b6.params', @@ -76,7 +143,7 @@ def get_generator_cfg(model_config): 'lowercase': True, }, 'google_electra_base': { - 'cfg': 'google_electra_base/model-5b35ca0b.yml', + 'cfg': google_electra_base(), 'vocab': 'google_electra_base/vocab-e6d2b21d.json', 'params': 'google_electra_base/model-31c235cc.params', 'disc_model': 'google_electra_base/disc_model-514bd353.params', @@ -84,7 +151,7 @@ def get_generator_cfg(model_config): 'lowercase': True, }, 'google_electra_large': { - 'cfg': 'google_electra_large/model-31b7dfdd.yml', + 'cfg': google_electra_large(), 'vocab': 'google_electra_large/vocab-e6d2b21d.json', 'params': 'google_electra_large/model-9baf9ff5.params', 'disc_model': 'google_electra_large/disc_model-5b820c02.params', @@ -96,6 +163,7 @@ def get_generator_cfg(model_config): FILE_STATS = load_checksum_stats(os.path.join(get_model_zoo_checksum_dir(), 'electra.txt')) +# TODO(sxjscience) Use BertTransformer @use_np class ElectraEncoder(HybridBlock): def __init__(self, units=512, @@ -110,7 +178,35 @@ def __init__(self, units=512, layer_norm_eps=1E-12, weight_initializer=TruncNorm(stdev=0.02), bias_initializer='zeros', - activation='gelu'): + activation='gelu', + layout='NT'): + """ + + Parameters + ---------- + units + The number of units + hidden_size + The hidden size + num_layers + Number of layers + num_heads + Number of heads + attention_dropout_prob + Dropout probability of the attention layer + hidden_dropout_prob + Dropout probability + output_attention + Whether to output the attention weights + dtype + Data type of the weights + output_all_encodings + layer_norm_eps + weight_initializer + bias_initializer + activation + layout + """ super().__init__() assert units % num_heads == 0, \ 'In ElectraEncoder, The units should be divisible ' \ @@ -118,6 +214,7 @@ def __init__(self, units=512, .format(units, num_heads) self._dtype = dtype + self._layout = layout self._num_layers = num_layers self._output_attention = output_attention @@ -134,7 +231,13 @@ def __init__(self, units=512, layer_norm_eps=layer_norm_eps, weight_initializer=weight_initializer, bias_initializer=bias_initializer, - activation=activation)) + activation=activation, + dtype=dtype, + layout=layout)) + + @property + def layout(self): + return self._layout def hybrid_forward(self, F, data, valid_length): """ @@ -145,18 +248,31 @@ def hybrid_forward(self, F, data, valid_length): Parameters ---------- F - data : - Shape (batch_size, seq_length, C) - valid_length : + data + - layout = 'NT' + Shape (batch_size, seq_length, C) + - layout = 'TN' + Shape (seq_length, batch_size, C) + valid_length Shape (batch_size,) Returns ------- - out : - Shape (batch_size, seq_length, C_out) + out + - layout = 'NT' + Shape (batch_size, seq_length, C_out) + - layout = 'TN' + Shape (seq_length, batch_size, C_out) """ + if self.layout == 'NT': + time_axis, batch_axis = 1, 0 + else: + time_axis, batch_axis = 0, 1 # 1. 
Embed the data - attn_mask = gen_self_attn_mask(F, data, valid_length, dtype=self._dtype, attn_type='full') + attn_mask = gen_self_attn_mask(F, data, valid_length, + dtype=self._dtype, + layout=self._layout, + attn_type='full') out = data all_encodings_outputs = [] additional_outputs = [] @@ -168,7 +284,8 @@ def hybrid_forward(self, F, data, valid_length): if self._output_all_encodings: out = F.npx.sequence_mask(out, sequence_length=valid_length, - use_sequence_length=True, axis=1) + use_sequence_length=True, + axis=time_axis) all_encodings_outputs.append(out) if self._output_attention: @@ -177,7 +294,7 @@ def hybrid_forward(self, F, data, valid_length): if not self._output_all_encodings: # if self._output_all_encodings, SequenceMask is already applied above out = F.npx.sequence_mask(out, sequence_length=valid_length, - use_sequence_length=True, axis=1) + use_sequence_length=True, axis=time_axis) return out, additional_outputs else: return all_encodings_outputs, additional_outputs @@ -208,7 +325,9 @@ def __init__(self, weight_initializer=TruncNorm(stdev=0.02), bias_initializer='zeros', dtype='float32', - use_pooler=True): + use_pooler=True, + layout='NT', + compute_layout='auto'): super().__init__() self._dtype = dtype self.use_pooler = use_pooler @@ -223,6 +342,11 @@ def __init__(self, self.weight_initializer = weight_initializer self.bias_initializer = bias_initializer self.layer_norm_eps = layer_norm_eps + self._layout = layout + if compute_layout is None or compute_layout == 'auto': + self._compute_layout = layout + else: + self._compute_layout = compute_layout # Construct ElectraEncoder self.encoder = ElectraEncoder( units=units, @@ -238,6 +362,7 @@ def __init__(self, weight_initializer=weight_initializer, bias_initializer=bias_initializer, dtype=dtype, + layout=self._compute_layout, ) self.encoder.hybridize() @@ -262,6 +387,10 @@ def __init__(self, weight_initializer=weight_initializer, bias_initializer=bias_initializer) + @property + def layout(self): + return self._layout + def hybrid_forward(self, F, inputs, token_types, valid_length=None): # pylint: disable=arguments-differ """Generate the representation given the inputs. @@ -271,22 +400,31 @@ def hybrid_forward(self, F, inputs, token_types, valid_length=None): Parameters ---------- F - inputs : - Shape (batch_size, seq_length) - token_types : - Shape (batch_size, seq_length) + inputs + - layout = 'NT' + Shape (batch_size, seq_length) + - layout = 'TN' + Shape (seq_length, batch_size) + token_types + - layout = 'NT' + Shape (batch_size, seq_length) + - layout = 'TN' + Shape (seq_length, batch_size) If the inputs contain two sequences, we will set different token types for the first sentence and the second sentence. - valid_length : + valid_length The valid length of each sequence Shape (batch_size,) Returns ------- - contextual_embedding : - Shape (batch_size, seq_length, units). - pooled_output : + contextual_embedding + - layout = 'NT' + Shape (batch_size, seq_length, units). + - layout = 'TN' + Shape (seq_length, batch_size, units). + pooled_output This is optional. 
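The `layout` argument threaded into `gen_self_attn_mask` above means callers no longer transpose before building the mask. A hedged sketch, calling the helper outside a HybridBlock by passing `mx` for `F` on the assumption that it only uses `F.np`/`F.npx`; treat the exact mask shape as an assumption rather than a guarantee:

```python
import mxnet as mx
from gluonnlp.attention_cell import gen_self_attn_mask

data_nt = mx.np.ones((2, 5, 8))   # (batch, time, channels)
valid_length = mx.np.array([3, 5], dtype='float32')

mask_from_nt = gen_self_attn_mask(mx, data_nt, valid_length,
                                  dtype='float32', attn_type='full', layout='NT')
mask_from_tn = gen_self_attn_mask(mx, mx.np.swapaxes(data_nt, 0, 1), valid_length,
                                  dtype='float32', attn_type='full', layout='TN')
# Both calls should describe the same attention pattern for the same data.
```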
Shape (batch_size, units) """ initial_embedding = self.get_initial_embedding(F, inputs, token_types) @@ -295,17 +433,27 @@ def hybrid_forward(self, F, inputs, token_types, valid_length=None): if self.embed_size != self.units: prev_out = self.embed_factorized_proj(prev_out) outputs = [] - contextual_embeddings, additional_outputs = self.encoder(prev_out, valid_length) + if self._compute_layout != self._layout: + # Swap the axes if the compute_layout and layout mismatch + contextual_embeddings, additional_outputs = self.encoder(F.np.swapaxes(prev_out, 0, 1), + valid_length) + contextual_embeddings = F.np.swapaxes(contextual_embeddings, 0, 1) + else: + contextual_embeddings, additional_outputs = self.encoder(prev_out, valid_length) outputs.append(contextual_embeddings) if self.use_pooler: # Here we just get the first token ([CLS]) without any pooling strategy, - # which is slightly different between bert model with the pooled_out + # which is slightly different from bert model with the pooled_out # the attribute name is keeping the same as bert and albert model with defualt # use_pooler=True - pooled_out = contextual_embeddings[:, 0, :] + if self._layout == 'NT': + pooled_out = contextual_embeddings[:, 0, :] + else: + pooled_out = contextual_embeddings[0, :, :] outputs.append(pooled_out) return tuple(outputs) if len(outputs) > 1 else outputs[0] + #TODO(sxjscience) Move to a `common.py` def get_initial_embedding(self, F, inputs, token_types=None): """Get the initial token embeddings that considers the token type and positional embeddings @@ -313,24 +461,38 @@ def get_initial_embedding(self, F, inputs, token_types=None): ---------- F inputs - Shape (batch_size, seq_length) + - layout = 'NT' + Shape (batch_size, seq_length) + - layout = 'TN' + Shape (seq_length, batch_size) token_types - Shape (batch_size, seq_length) + - layout = 'NT' + Shape (batch_size, seq_length) + - layout = 'TN' + Shape (seq_length, batch_size) If None, it will be initialized as all zero Returns ------- embedding The initial embedding that will be fed into the encoder + - layout = 'NT' + Shape (batch_size, seq_length, C_embed) + - layout = 'TN' + Shape (seq_length, batch_size, C_embed) """ + if self.layout == 'NT': + time_axis, batch_axis = 1, 0 + else: + time_axis, batch_axis = 0, 1 embedding = self.word_embed(inputs) if token_types is None: token_types = F.np.zeros_like(inputs) type_embedding = self.token_type_embed(token_types) embedding = embedding + type_embedding if self.pos_embed_type is not None: - positional_embedding = self.token_pos_embed(F.npx.arange_like(inputs, axis=1)) - positional_embedding = F.np.expand_dims(positional_embedding, axis=0) + positional_embedding = self.token_pos_embed(F.npx.arange_like(inputs, axis=time_axis)) + positional_embedding = F.np.expand_dims(positional_embedding, axis=batch_axis) embedding = embedding + positional_embedding # Extra layer normalization plus dropout embedding = self.embed_layer_norm(embedding) @@ -339,48 +501,20 @@ def get_initial_embedding(self, F, inputs, token_types=None): @staticmethod def get_cfg(key=None): - if key is None: - cfg = CN() - # Model Parameters for the electra small - cfg.MODEL = CN() - cfg.MODEL.vocab_size = 30522 - cfg.MODEL.embed_size = 128 - cfg.MODEL.units = 256 - cfg.MODEL.hidden_size = 1024 - cfg.MODEL.max_length = 512 - cfg.MODEL.num_heads = 4 - cfg.MODEL.num_layers = 12 - cfg.MODEL.pos_embed_type = 'learned' - # Unlike BERT and ALBERT, which ues gelu(tanh), the gelu(erf) is used in Electra. 
- cfg.MODEL.activation = 'gelu' - cfg.MODEL.layer_norm_eps = 1E-12 - cfg.MODEL.num_token_types = 2 - cfg.MODEL.hidden_dropout_prob = 0.1 - cfg.MODEL.attention_dropout_prob = 0.1 - cfg.MODEL.dtype = 'float32' - cfg.MODEL.generator_layers_scale = 1.0 - # multiplier for units, hidden_size, and num_heads - cfg.MODEL.generator_units_scale = 1.0 - # Hyper-parameters of the Initializers - cfg.INITIALIZER = CN() - cfg.INITIALIZER.embed = ['truncnorm', 0, 0.02] - cfg.INITIALIZER.weight = ['truncnorm', 0, 0.02] # TruncNorm(0, 0.02) - cfg.INITIALIZER.bias = ['zeros'] - # Version of the model. This helps ensure backward compatibility. - # Also, we can not use string here due to https://github.com/rbgirshick/yacs/issues/26 - cfg.VERSION = 1 - cfg.freeze() + if key is not None: + return electra_cfg_reg.create(key) else: - raise NotImplementedError - return cfg + return google_electra_base() @classmethod - def from_cfg(cls, cfg, use_pooler=True, dtype='float32') -> 'ElectraModel': + def from_cfg(cls, cfg, use_pooler=True, dtype=None) -> 'ElectraModel': cfg = ElectraModel.get_cfg().clone_merge(cfg) assert cfg.VERSION == 1, 'Wrong version!' embed_initializer = mx.init.create(*cfg.INITIALIZER.embed) weight_initializer = mx.init.create(*cfg.INITIALIZER.weight) bias_initializer = mx.init.create(*cfg.INITIALIZER.bias) + if dtype is None: + dtype = cfg.MODEL.dtype return cls(vocab_size=cfg.MODEL.vocab_size, units=cfg.MODEL.units, hidden_size=cfg.MODEL.hidden_size, @@ -398,7 +532,9 @@ def from_cfg(cls, cfg, use_pooler=True, dtype='float32') -> 'ElectraModel': embed_initializer=embed_initializer, weight_initializer=weight_initializer, bias_initializer=bias_initializer, - use_pooler=use_pooler) + use_pooler=use_pooler, + layout=cfg.MODEL.layout, + compute_layout=cfg.MODEL.compute_layout) @use_np @@ -447,25 +583,37 @@ def hybrid_forward(self, F, inputs, token_types, valid_length): Parameters ---------- F - inputs : - Shape (batch_size, seq_length) - token_types : - Shape (batch_size, seq_length) + inputs + - layout = 'NT' + Shape (batch_size, seq_length) + - layout = 'TN' + Shape (seq_length, batch_size) + token_types + - layout = 'NT' + Shape (batch_size, seq_length) + - layout = 'TN' + Shape (seq_length, batch_size) If the inputs contain two sequences, we will set different token types for the first sentence and the second sentence. - valid_length : + valid_length The valid length of each sequence Shape (batch_size,) Returns ------- contextual_embedding - Shape (batch_size, seq_length, units). + - layout = 'NT' + Shape (batch_size, seq_length, units). + - layout = 'TN' + Shape (seq_length, batch_size, units). 
pooled_out Shape (batch_size, units) rtd_scores - Shape (batch_size, seq_length) + - layout = 'NT' + Shape (batch_size, seq_length) + - layout = 'TN' + Shape (seq_length, batch_size) """ contextual_embeddings, pooled_out = self.backbone_model(inputs, token_types, valid_length) rtd_scores = self.rtd_encoder(contextual_embeddings).squeeze(-1) @@ -515,8 +663,21 @@ def __init__(self, backbone_cfg, self.mlm_decoder[-1].weight = self.backbone_model.word_embed.weight self.mlm_decoder.hybridize() - def tie_embeddings(self, word_embed_params=None, token_type_embed_params=None, - token_pos_embed_params=None, embed_layer_norm_params=None): + # TODO(sxjscience,zheyu) Should design a better API + def tie_embeddings(self, word_embed_params=None, + token_type_embed_params=None, + token_pos_embed_params=None, + embed_layer_norm_params=None): + """Tie the embedding layers between the backbone and the MLM decoder + + Parameters + ---------- + word_embed_params + token_type_embed_params + token_pos_embed_params + embed_layer_norm_params + + """ self.backbone_model.word_embed.share_parameters(word_embed_params) self.mlm_decoder[-1].share_parameters(word_embed_params) self.backbone_model.token_type_embed.share_parameters(token_type_embed_params) @@ -529,10 +690,16 @@ def hybrid_forward(self, F, inputs, token_types, valid_length, masked_positions) Parameters ---------- F - inputs : - Shape (batch_size, seq_length) - token_types : - Shape (batch_size, seq_length) + inputs + - layout = 'NT' + Shape (batch_size, seq_length) + - layout = 'TN' + Shape (seq_length, batch_size) + token_types + - layout = 'NT' + Shape (batch_size, seq_length) + - layout = 'TN' + Shape (seq_length, batch_size) If the inputs contain two sequences, we will set different token types for the first sentence and the second sentence. @@ -546,14 +713,21 @@ def hybrid_forward(self, F, inputs, token_types, valid_length, masked_positions) Returns ------- contextual_embedding - Shape (batch_size, seq_length, units). + - layout = 'NT' + Shape (batch_size, seq_length, units). + - layout = 'TN' + Shape (seq_length, batch_size, units). pooled_out Shape (batch_size, units) mlm_scores : Shape (batch_size, num_masked_positions, vocab_size) """ contextual_embeddings, pooled_out = self.backbone_model(inputs, token_types, valid_length) - mlm_features = select_vectors_by_position(F, contextual_embeddings, masked_positions) + if self.backbone_model.layout == 'NT': + mlm_features = select_vectors_by_position(F, contextual_embeddings, masked_positions) + else: + mlm_features = select_vectors_by_position(F, F.np.swapaxes(contextual_embeddings, 0, 1), + masked_positions) mlm_scores = self.mlm_decoder(mlm_features) return contextual_embeddings, pooled_out, mlm_scores @@ -561,7 +735,7 @@ def hybrid_forward(self, F, inputs, token_types, valid_length, masked_positions) @use_np class ElectraForPretrain(HybridBlock): """ - A integrated model combined with a generator and a discriminator. Generator here + An integrated model combined with a generator and a discriminator. Generator here produces a corrupted tokens playing as fake data to fool a discriminator whose objective is to distinguish whether each token in the input sentence it accepts is the same as the original. 
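A NumPy sketch of the masked-position gathering used by the MLM heads above. `select_vectors_by_position_np` is only an illustrative stand-in for `select_vectors_by_position`, and the swap to batch-major mirrors the `F.np.swapaxes` call applied when the backbone runs in the 'TN' layout; all sizes are made up.

```python
import numpy as np

def select_vectors_by_position_np(contextual_embeddings, masked_positions):
    """Gather the feature vectors at `masked_positions` from a batch-major
    (batch_size, seq_length, units) tensor (illustrative stand-in only)."""
    idx = masked_positions[:, :, None]                                  # (B, M, 1)
    idx = np.broadcast_to(idx, idx.shape[:2] + (contextual_embeddings.shape[-1],))
    return np.take_along_axis(contextual_embeddings, idx, axis=1)       # (B, M, U)

batch_size, seq_length, units = 2, 6, 4
contextual_tn = np.random.rand(seq_length, batch_size, units)           # layout='TN'
masked_positions = np.array([[1, 4], [0, 2]])

# With layout='TN' the embeddings are swapped to batch-major before the
# gather, mirroring the swapaxes call in the forward pass above.
mlm_features = select_vectors_by_position_np(
    np.swapaxes(contextual_tn, 0, 1), masked_positions)
assert mlm_features.shape == (batch_size, masked_positions.shape[1], units)
```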
It is a classification task instead of prediction @@ -612,11 +786,15 @@ def __init__(self, self.disc_cfg = disc_cfg self.vocab_size = disc_cfg.MODEL.vocab_size self.gen_cfg = get_generator_cfg(disc_cfg) - self.discriminator = ElectraDiscriminator(disc_cfg) + self.discriminator = ElectraDiscriminator(disc_cfg, + weight_initializer=weight_initializer, + bias_initializer=bias_initializer) self.disc_backbone = self.discriminator.backbone_model if not uniform_generator and not tied_generator: - self.generator = ElectraGenerator(self.gen_cfg) + self.generator = ElectraGenerator(self.gen_cfg, + weight_initializer=weight_initializer, + bias_initializer=bias_initializer) if tied_embeddings: self.generator.tie_embeddings(self.disc_backbone.word_embed.collect_params(), self.disc_backbone.token_type_embed.collect_params(), @@ -626,7 +804,10 @@ def __init__(self, elif tied_generator: # Reuse the weight of the discriminator backbone model - self.generator = ElectraGenerator(self.gen_cfg) + self.generator = ElectraGenerator(self.gen_cfg, + weight_initializer=weight_initializer, + bias_initializer=bias_initializer) + # TODO(sxjscience, zheyu) Verify self.generator.backbone_model = self.disc_backbone self.generator.hybridize() elif uniform_generator: @@ -650,18 +831,24 @@ def hybrid_forward(self, F, inputs, token_types, valid_length, Parameters ---------- F - inputs : + inputs The masked input - Shape (batch_size, seq_length) - token_types : - Shape (batch_size, seq_length) + - layout = 'NT' + Shape (batch_size, seq_length) + - layout = 'TN' + Shape (seq_length, batch_size) + token_types + - layout = 'NT' + Shape (batch_size, seq_length) + - layout = 'TN' + Shape (seq_length, batch_size) If the inputs contain two sequences, we will set different token types for the first sentence and the second sentence. - valid_length : + valid_length The valid length of each sequence Shape (batch_size,) - unmasked_tokens : + unmasked_tokens The original tokens that appear in the unmasked input sequence Shape (batch_size, num_masked_positions). masked_positions : @@ -670,20 +857,26 @@ def hybrid_forward(self, F, inputs, token_types, valid_length, Returns ------- - mlm_scores : + mlm_scores Shape (batch_size, num_masked_positions, vocab_size) - rtd_scores : - Shape (batch_size, seq_length) + rtd_scores + - layout = 'NT' + Shape (batch_size, seq_length) + - layout = 'TN' + Shape (seq_length, batch_size) replaced_inputs : Shape (batch_size, num_masked_positions) - labels : - Shape (batch_size, seq_length) + labels + - layout = 'NT' + Shape (batch_size, seq_length) + - layout = 'TN' + Shape (seq_length, batch_size) """ if self._uniform_generator: # generate the corrupt tokens randomly with a mlm_scores vector whose value is all 0 - zero_logits = F.np.zeros(self.vocab_size) - zero_logits = F.np.expand_dims(F.np.expand_dims(zero_logits, axis=0), axis=0) - mlm_scores = F.np.expand_dims(F.np.zeros_like(masked_positions), axis=-1) + zero_logits = F.np.zeros((1, 1, self.vocab_size), dtype=self._dtype) + mlm_scores = F.np.expand_dims(F.np.zeros_like(masked_positions, dtype=self._dtype), + axis=-1) mlm_scores = mlm_scores + zero_logits else: _, _, mlm_scores = self.generator(inputs, token_types, valid_length, masked_positions) @@ -698,12 +891,16 @@ def hybrid_forward(self, F, inputs, token_types, valid_length, def get_corrupted_tokens(self, F, inputs, unmasked_tokens, masked_positions, logits): """ Sample from the generator to create corrupted input. 
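The uniform-generator branch above materializes all-zero MLM logits purely by broadcasting, without allocating a full `(batch, seq, vocab)` buffer up front. A small NumPy check of the resulting shape and of the uniform distribution those logits induce (the sizes below are toy values, not the real config):

```python
import numpy as np

batch_size, num_masked, vocab_size = 2, 3, 7     # toy sizes for illustration
masked_positions = np.zeros((batch_size, num_masked), dtype=np.int32)

# All-zero logits broadcast from (1, 1, vocab_size) up to
# (batch_size, num_masked, vocab_size); after a softmax they yield a
# uniform distribution, so corrupted tokens are drawn uniformly at random.
zero_logits = np.zeros((1, 1, vocab_size), dtype=np.float32)
mlm_scores = np.expand_dims(np.zeros_like(masked_positions, dtype=np.float32), axis=-1)
mlm_scores = mlm_scores + zero_logits
assert mlm_scores.shape == (batch_size, num_masked, vocab_size)

probs = np.exp(mlm_scores) / np.exp(mlm_scores).sum(axis=-1, keepdims=True)
assert np.allclose(probs, 1.0 / vocab_size)
```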
+ Parameters ---------- F inputs The masked input - Shape (batch_size, seq_length) + - layout = 'NT' + Shape (batch_size, seq_length) + - layout = 'TN' + Shape (seq_length, batch_size) unmasked_tokens The original tokens that appear in the unmasked input sequence Shape (batch_size, num_masked_positions). @@ -715,10 +912,18 @@ def get_corrupted_tokens(self, F, inputs, unmasked_tokens, masked_positions, log Returns ------- + corrupted_tokens + The corrupted tokens fake_data - Shape (batch_size, seq_length) + - layout = 'NT' + Shape (batch_size, seq_length) + - layout = 'TN' + Shape (seq_length, batch_size) labels - Shape (batch_size, seq_length) + - layout = 'NT' + Shape (batch_size, seq_length) + - layout = 'TN' + Shape (seq_length, batch_size) """ if self._disallow_correct: @@ -734,6 +939,8 @@ def get_corrupted_tokens(self, F, inputs, unmasked_tokens, masked_positions, log use_np_gumbel=False) corrupted_tokens = F.np.argmax(prob, axis=-1).astype(np.int32) + if self.disc_backbone.layout == 'TN': + inputs = inputs.T # Following the Official electra to deal with duplicate positions as # https://github.com/google-research/electra/issues/41 original_data, updates_mask = updated_vectors_by_position(F, @@ -742,7 +949,10 @@ def get_corrupted_tokens(self, F, inputs, unmasked_tokens, masked_positions, log inputs, corrupted_tokens, masked_positions) labels = updates_mask * F.np.not_equal(fake_data, original_data) - return corrupted_tokens, fake_data, labels + if self.disc_backbone.layout == 'TN': + return corrupted_tokens, fake_data.T, labels.T + else: + return corrupted_tokens, fake_data, labels def list_pretrained_electra(): @@ -787,13 +997,20 @@ def get_pretrained_electra(model_name: str = 'google_electra_small', assert model_name in PRETRAINED_URL, '{} is not found. 
All available are {}'.format( model_name, list_pretrained_electra()) cfg_path = PRETRAINED_URL[model_name]['cfg'] + if isinstance(cfg_path, CN): + cfg = cfg_path + else: + cfg = None vocab_path = PRETRAINED_URL[model_name]['vocab'] params_path = PRETRAINED_URL[model_name]['params'] disc_params_path = PRETRAINED_URL[model_name]['disc_model'] gen_params_path = PRETRAINED_URL[model_name]['gen_model'] local_paths = dict() - for k, path in [('cfg', cfg_path), ('vocab', vocab_path)]: + download_jobs = [('vocab', vocab_path)] + if cfg is None: + download_jobs.append(('cfg', cfg_path)) + for k, path in download_jobs: local_paths[k] = download(url=get_repo_model_zoo_url() + path, path=os.path.join(root, path), sha1_hash=FILE_STATS[path]) @@ -827,7 +1044,8 @@ def get_pretrained_electra(model_name: str = 'google_electra_small', sep_token='[SEP]', mask_token='[MASK]', lowercase=do_lower) - cfg = ElectraModel.get_cfg().clone_merge(local_paths['cfg']) + if cfg is None: + cfg = ElectraModel.get_cfg().clone_merge(local_paths['cfg']) return cfg, tokenizer, local_params_path, (local_disc_params_path, local_gen_params_path) diff --git a/src/gluonnlp/models/mobilebert.py b/src/gluonnlp/models/mobilebert.py index 502d7f4750..5a81de7c64 100644 --- a/src/gluonnlp/models/mobilebert.py +++ b/src/gluonnlp/models/mobilebert.py @@ -41,6 +41,7 @@ from ..initializer import TruncNorm from ..utils.config import CfgNode as CN from ..utils.misc import load_checksum_stats, download +from ..utils.registry import Registry from ..registry import BACKBONE_REGISTRY from ..attention_cell import MultiHeadAttentionCell, gen_self_attn_mask from ..data.tokenizers import HuggingFaceWordPieceTokenizer @@ -48,9 +49,51 @@ __all__ = ['MobileBertModel', 'MobileBertForMLM', 'MobileBertForPretrain', 'list_pretrained_mobilebert', 'get_pretrained_mobilebert'] +mobilebert_cfg_reg = Registry('mobilebert_cfg') + + +@mobilebert_cfg_reg.register() +def google_uncased_mobilebert(): + cfg = CN() + cfg.MODEL = CN() + cfg.MODEL.vocab_size = 30522 + cfg.MODEL.units = 512 + cfg.MODEL.embed_size = 128 + cfg.MODEL.inner_size = 128 + cfg.MODEL.hidden_size = 512 + cfg.MODEL.max_length = 512 + cfg.MODEL.num_heads = 4 + cfg.MODEL.num_layers = 24 + + cfg.MODEL.use_bottleneck = True # Whether to use bottleneck + cfg.MODEL.trigram_embed = True # Trigram embedding + cfg.MODEL.classifier_activation = False # Whether to use an additional pooling layer + cfg.MODEL.bottleneck_strategy = 'qk_sharing' + cfg.MODEL.num_stacked_ffn = 4 + cfg.MODEL.pos_embed_type = 'learned' + cfg.MODEL.activation = 'relu' + cfg.MODEL.num_token_types = 2 + cfg.MODEL.hidden_dropout_prob = 0.0 + cfg.MODEL.attention_dropout_prob = 0.1 + cfg.MODEL.normalization = 'no_norm' + cfg.MODEL.layer_norm_eps = 1E-12 + cfg.MODEL.dtype = 'float32' + # Layout flags + cfg.MODEL.layout = 'NT' + cfg.MODEL.compute_layout = 'auto' + # Initializer + cfg.INITIALIZER = CN() + cfg.INITIALIZER.embed = ['truncnorm', 0, 0.02] + cfg.INITIALIZER.weight = ['truncnorm', 0, 0.02] # TruncNorm(0, 0.02) + cfg.INITIALIZER.bias = ['zeros'] + cfg.VERSION = 1 + cfg.freeze() + return cfg + + PRETRAINED_URL = { 'google_uncased_mobilebert': { - 'cfg': 'google_uncased_mobilebert/model-1c33216b.yml', + 'cfg': google_uncased_mobilebert(), 'vocab': 'google_uncased_mobilebert/vocab-e6d2b21d.json', 'params': 'google_uncased_mobilebert/model-c8346cf2.params', 'mlm_params': 'google_uncased_mobilebert/model_mlm-53948e82.params', @@ -66,7 +109,7 @@ class MobileBertEncoderLayer(HybridBlock): """The Transformer Encoder Layer in Mobile Bert""" # 
TODO(zheyuye), use stacked groups for single ffn layer in transformer.TransformerEncoderLayer - # and revise the other models and scripts, masking sure their are compatible. + # and revise the other models and scripts, making sure they are compatible. def __init__(self, use_bottleneck: bool = True, @@ -85,12 +128,14 @@ def __init__(self, use_qkv_bias: bool = True, weight_initializer: Optional[InitializerType] = None, bias_initializer: Optional[InitializerType] = 'zeros', - dtype='float32'): + dtype='float32', + layout='NT'): """ Parameters ---------- use_bottleneck + Whether to use the bottleneck layer. units size of inter-bottleneck real_units @@ -110,6 +155,9 @@ def __init__(self, weight_initializer bias_initializer dtype + Data type of the block + layout + Layout of the input + output """ super().__init__() self._use_bottleneck = use_bottleneck @@ -119,6 +167,7 @@ def __init__(self, self._num_stacked_ffn = num_stacked_ffn self._bottleneck_strategy = bottleneck_strategy self._dtype = dtype + self._layout = layout assert real_units % num_heads == 0, 'units must be divisive by the number of heads' self.dropout_layer = nn.Dropout(hidden_dropout_prob) if use_bottleneck: @@ -159,24 +208,47 @@ def __init__(self, bias_initializer=bias_initializer, dtype=self._dtype) # The in_units of qkv varies according to the sharing strategy + if self._use_bottleneck: + if self._bottleneck_strategy == 'qk_sharing': + attn_query_in_units = real_units + attn_key_in_units = real_units + attn_value_in_units = units + elif self._bottleneck_strategy == 'from_bottleneck': + attn_query_in_units = real_units + attn_key_in_units = real_units + attn_value_in_units = real_units + elif self._bottleneck_strategy == 'from_input': + attn_query_in_units = units + attn_key_in_units = units + attn_value_in_units = units + else: + raise NotImplementedError + else: + attn_query_in_units = units + attn_key_in_units = units + attn_value_in_units = units self.attn_query = nn.Dense(units=real_units, + in_units=attn_query_in_units, flatten=False, use_bias=use_qkv_bias, weight_initializer=weight_initializer, bias_initializer=bias_initializer, dtype=self._dtype) self.attn_key = nn.Dense(units=real_units, + in_units=attn_key_in_units, flatten=False, use_bias=use_qkv_bias, weight_initializer=weight_initializer, bias_initializer=bias_initializer, dtype=self._dtype) self.attn_value = nn.Dense(units=real_units, + in_units=attn_value_in_units, flatten=False, use_bias=use_qkv_bias, weight_initializer=weight_initializer, bias_initializer=bias_initializer, dtype=self._dtype) + attention_layout = 'NTK' if self._layout == 'NT' else 'TNK' self.attention_cell = \ MultiHeadAttentionCell( query_units=real_units, @@ -184,7 +256,7 @@ def __init__(self, attention_dropout=attention_dropout_prob, scaled=True, dtype=self._dtype, - layout='NTK' + layout=attention_layout ) self.layer_norm = get_layer_norm(normalization=normalization, in_channels=real_units, @@ -209,26 +281,35 @@ def __init__(self, layer_norm_eps=layer_norm_eps, dtype=self._dtype)) + @property + def layout(self): + return self._layout + def hybrid_forward(self, F, data, attn_mask): """ Parameters ---------- F - data : - Shape (batch_size, seq_length, C_in) - attn_mask : + data + - layout = 'NT' + Shape (batch_size, seq_length, C_in) + - layout = 'TN' + Shape (seq_length, batch_size, C_in) + attn_mask + The attention mask Shape (batch_size, seq_length, seq_length) Returns ------- - out : - Shape (batch_size, seq_length, C_out) - attn_weight : + out + - layout = 'NT' + Shape (batch_size, 
seq_length, C_out) + - layout = 'TN' + Shape (seq_length, batch_size, C_out) + attn_weight Shape (batch_size, seq_length, seq_length) """ - # TODO(sxjscience) Cannot use negative axis due to - # https://github.com/apache/incubator-mxnet/issues/18132 if self._use_bottleneck: bn_proj = self.in_bottleneck_proj(data) bn_proj = self.in_bottleneck_ln(bn_proj) @@ -241,7 +322,7 @@ def hybrid_forward(self, F, data, attn_mask): key = qk_shared value = data elif self._bottleneck_strategy == 'from_bottleneck': - # for Mobile mobile bert Tiny + # for Mobile Bert Tiny query = bn_proj key = bn_proj value = bn_proj @@ -298,12 +379,14 @@ def __init__(self, layer_norm_eps: float = 1E-12, weight_initializer: InitializerType = TruncNorm(stdev=0.02), bias_initializer: InitializerType = 'zeros', - dtype='float32'): + dtype='float32', + layout='NT'): super().__init__() self._dtype = dtype self._num_layers = num_layers self._output_attention = output_attention self._output_all_encodings = output_all_encodings + self._layout = layout assert bottleneck_strategy in ['qk_sharing', 'from_bottleneck', 'from_input'], \ 'The bottleneck strategy={} is not supported.'.format(bottleneck_strategy) @@ -329,7 +412,12 @@ def __init__(self, weight_initializer=weight_initializer, bias_initializer=bias_initializer, normalization=normalization, - activation=activation)) + activation=activation, + layout=layout)) + + @property + def layout(self): + return self._layout def hybrid_forward(self, F, data, valid_length): """ @@ -340,18 +428,34 @@ def hybrid_forward(self, F, data, valid_length): Parameters ---------- F - data : - Shape (batch_size, seq_length, C) - valid_length : + data + - layout = 'NT' + Shape (batch_size, seq_length, C) + - layout = 'TN' + Shape (seq_length, batch_size, C) + valid_length Shape (batch_size,) Returns ------- - out : - Shape (batch_size, seq_length, C_out) + out + - layout = 'NT' + Shape (batch_size, seq_length, C_out) + - layout = 'TN' + Shape (seq_length, batch_size, C_out) """ + if self._layout == 'NT': + batch_axis, time_axis = 0, 1 + elif self._layout == 'TN': + batch_axis, time_axis = 1, 0 + else: + raise NotImplementedError('Received layout="{}". ' + 'Only "NT" and "TN" are supported.'.format(self._layout)) # 1. 
Embed the data - attn_mask = gen_self_attn_mask(F, data, valid_length, dtype=self._dtype, attn_type='full') + attn_mask = gen_self_attn_mask(F, data, valid_length, + dtype=self._dtype, + layout=self._layout, + attn_type='full') out = data all_encodings_outputs = [] additional_outputs = [] @@ -364,7 +468,8 @@ def hybrid_forward(self, F, data, valid_length): if self._output_all_encodings: out = F.npx.sequence_mask(out, sequence_length=valid_length, - use_sequence_length=True, axis=1) + use_sequence_length=True, + axis=time_axis) all_encodings_outputs.append(out) if self._output_attention: @@ -373,7 +478,8 @@ def hybrid_forward(self, F, data, valid_length): if not self._output_all_encodings: # if self._output_all_encodings, SequenceMask is already applied above out = F.npx.sequence_mask(out, sequence_length=valid_length, - use_sequence_length=True, axis=1) + use_sequence_length=True, + axis=time_axis) return out, additional_outputs else: return all_encodings_outputs, additional_outputs @@ -406,7 +512,9 @@ def __init__(self, trigram_embed=True, use_pooler=True, classifier_activation=False, - dtype='float32'): + dtype='float32', + layout='NT', + compute_layout='auto'): super().__init__() self._dtype = dtype self.use_bottleneck = use_bottleneck @@ -428,6 +536,12 @@ def __init__(self, self.weight_initializer = weight_initializer self.bias_initializer = bias_initializer self.layer_norm_eps = layer_norm_eps + self._layout = layout + if compute_layout == 'auto' or compute_layout is None: + self._compute_layout = layout + else: + assert compute_layout in ['TN', 'NT'] + self._compute_layout = compute_layout # Construct MobileBertTransformer self.encoder = MobileBertTransformer( units=units, @@ -447,6 +561,7 @@ def __init__(self, weight_initializer=weight_initializer, bias_initializer=bias_initializer, dtype=dtype, + layout=self._compute_layout, ) self.encoder.hybridize() # Construct word embedding @@ -455,7 +570,12 @@ def __init__(self, weight_initializer=embed_initializer, dtype=dtype) if trigram_embed or embed_size != units: + if trigram_embed: + in_units = 3 * embed_size + else: + in_units = embed_size self.embed_factorized_proj = nn.Dense(units=units, + in_units=in_units, flatten=False, weight_initializer=weight_initializer, bias_initializer=bias_initializer) @@ -467,7 +587,8 @@ def __init__(self, # Construct token type embedding self.token_type_embed = nn.Embedding(input_dim=num_token_types, output_dim=units, - weight_initializer=weight_initializer) + weight_initializer=weight_initializer, + dtype=self._dtype) self.token_pos_embed = PositionalEmbedding(units=units, max_length=max_length, dtype=self._dtype, @@ -478,9 +599,18 @@ def __init__(self, in_units=units, flatten=False, activation='tanh', + dtype=self._dtype, weight_initializer=weight_initializer, bias_initializer=bias_initializer) + @property + def layout(self): + return self._layout + + @property + def dtype(self): + return self._dtype + def hybrid_forward(self, F, inputs, token_types, valid_length): # pylint: disable=arguments-differ """Generate the representation given the inputs. 
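To make the `time_axis` bookkeeping in the encoders above concrete, here is a NumPy-only sketch of building a 'full' self-attention mask from `valid_length` and of zeroing padded steps along the correct axis for the 'NT' and 'TN' layouts. It is an illustration under toy sizes, not the library's `gen_self_attn_mask` or `sequence_mask` implementation.

```python
import numpy as np

def full_attn_mask(valid_length, seq_length):
    """(batch_size, seq_length, seq_length) mask that is 1 where the key
    position lies within the sample's valid length, 0 elsewhere."""
    steps = np.arange(seq_length)
    keep = (steps[None, :] < valid_length[:, None]).astype(np.float32)   # (B, T)
    return np.broadcast_to(keep[:, None, :],
                           (valid_length.shape[0], seq_length, seq_length))

def mask_padded_outputs(out, valid_length, layout='NT'):
    """Zero the padded time steps, masking along axis 1 for 'NT' and
    axis 0 for 'TN' (the time_axis choice in the code above)."""
    time_axis = 1 if layout == 'NT' else 0
    steps = np.arange(out.shape[time_axis])
    keep = steps[None, :] < valid_length[:, None]                        # (B, T)
    if layout == 'TN':
        keep = keep.T                                                    # (T, B)
    return out * np.expand_dims(keep, axis=-1)

valid_length = np.array([3, 5])
out_nt = np.ones((2, 5, 4))                                              # layout='NT'
assert full_attn_mask(valid_length, 5).shape == (2, 5, 5)
assert mask_padded_outputs(out_nt, valid_length, 'NT')[0, 3:].sum() == 0
```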
@@ -490,11 +620,16 @@ def hybrid_forward(self, F, inputs, token_types, valid_length): Parameters ---------- F - inputs : - Shape (batch_size, seq_length) - token_types : - Shape (batch_size, seq_length) - + inputs + - layout = 'NT' + Shape (batch_size, seq_length) + - layout = 'TN' + Shape (seq_length, batch_size) + token_types + - layout = 'NT' + Shape (batch_size, seq_length) + - layout = 'TN' + Shape (seq_length, batch_size) If the inputs contain two sequences, we will set different token types for the first sentence and the second sentence. valid_length : @@ -510,24 +645,34 @@ def hybrid_forward(self, F, inputs, token_types, valid_length): """ embedding = self.get_initial_embedding(F, inputs, token_types) - contextual_embeddings, additional_outputs = self.encoder(embedding, valid_length) - outputs = [] - outputs.append(contextual_embeddings) + if self._compute_layout != self._layout: + contextual_embeddings, additional_outputs = self.encoder(F.np.swapaxes(embedding, 0, 1), + valid_length) + contextual_embeddings = F.np.swapaxes(contextual_embeddings, 0, 1) + else: + contextual_embeddings, additional_outputs = self.encoder(embedding, valid_length) if self.use_pooler: pooled_out = self.apply_pooling(contextual_embeddings) - outputs.append(pooled_out) - return tuple(outputs) if len(outputs) > 1 else outputs[0] + return contextual_embeddings, pooled_out + else: + return contextual_embeddings - def get_initial_embedding(self, F, inputs, token_types=None, trigram_embed=True): + def get_initial_embedding(self, F, inputs, token_types=None): """Get the initial token embeddings that considers the token type and positional embeddings Parameters ---------- F inputs - Shape (batch_size, seq_length) + - layout = 'NT' + Shape (batch_size, seq_length) + - layout = 'TN' + Shape (seq_length, batch_size) token_types - Shape (batch_size, seq_length) + - layout = 'NT' + Shape (batch_size, seq_length) + - layout = 'TN' + Shape (seq_length, batch_size) If None, it will be initialized as all zero Returns @@ -535,24 +680,39 @@ def get_initial_embedding(self, F, inputs, token_types=None, trigram_embed=True) embedding The initial embedding that will be fed into the encoder """ + if self._layout == 'NT': + batch_axis, time_axis = 0, 1 + elif self._layout == 'TN': + batch_axis, time_axis = 1, 0 + else: + raise NotImplementedError word_embedding = self.word_embed(inputs) - if trigram_embed: - word_embedding = F.np.concatenate( - [F.np.pad(word_embedding[:, 1:], ((0, 0), (0, 1), (0, 0))), - word_embedding, - F.np.pad(word_embedding[:, :-1], ((0, 0), (1, 0), (0, 0)))], axis=-1) + if self.trigram_embed: + if self._layout == 'NT': + word_embedding = F.np.concatenate( + [F.np.pad(word_embedding[:, 1:], ((0, 0), (0, 1), (0, 0))), + word_embedding, + F.np.pad(word_embedding[:, :-1], ((0, 0), (1, 0), (0, 0)))], axis=-1) + elif self._layout == 'TN': + word_embedding = F.np.concatenate( + [F.np.pad(word_embedding[1:, :], ((0, 1), (0, 0), (0, 0))), + word_embedding, + F.np.pad(word_embedding[:-1, :], ((1, 0), (0, 0), (0, 0)))], axis=-1) + else: + raise NotImplementedError # Projecting the embedding into units only for word embedding - if trigram_embed or self.embed_size != self.units: - embedding = self.embed_factorized_proj(word_embedding) + if self.trigram_embed or self.embed_size != self.units: + word_embedding = self.embed_factorized_proj(word_embedding) if token_types is None: - token_types = F.np.zeros_like(embedding) + token_types = F.np.zeros_like(inputs) type_embedding = self.token_type_embed(token_types) - embedding 
= embedding + type_embedding + embedding = word_embedding + type_embedding if self.pos_embed_type is not None: - positional_embedding = self.token_pos_embed(F.npx.arange_like(embedding, axis=1)) - positional_embedding = F.np.expand_dims(positional_embedding, axis=0) + positional_embedding =\ + self.token_pos_embed(F.npx.arange_like(embedding, axis=time_axis)) + positional_embedding = F.np.expand_dims(positional_embedding, axis=batch_axis) embedding = embedding + positional_embedding # Extra layer normalization plus dropout embedding = self.embed_layer_norm(embedding) @@ -565,12 +725,23 @@ def apply_pooling(self, sequence): This is used for pre-training or fine-tuning a mobile bert model. Get the first token of the whole sequence which is [CLS] - sequence: - Shape (batch_size, sequence_length, units) - return: + Parameters + ---------- + sequence + - layout = 'NT' + Shape (batch_size, sequence_length, units) + - layout = 'TN' + Shape (sequence_length, batch_size, units) + + Returns + ------- + outputs Shape (batch_size, units) """ - outputs = sequence[:, 0, :] + if self._layout == 'NT': + outputs = sequence[:, 0, :] + else: + outputs = sequence[0, :, :] if self.classifier_activation: return self.pooler(outputs) else: @@ -578,53 +749,23 @@ def apply_pooling(self, sequence): @staticmethod def get_cfg(key=None): - if key is None: - cfg = CN() - cfg.MODEL = CN() - cfg.MODEL.vocab_size = 30522 - cfg.MODEL.embed_size = 128 - cfg.MODEL.units = 512 - cfg.MODEL.hidden_size = 512 - cfg.MODEL.inner_size = 128 - cfg.MODEL.max_length = 512 - cfg.MODEL.num_heads = 4 - cfg.MODEL.num_layers = 12 - cfg.MODEL.num_stacked_ffn = 4 - cfg.MODEL.pos_embed_type = 'learned' - cfg.MODEL.activation = 'relu' - cfg.MODEL.normalization = 'no_norm' - cfg.MODEL.layer_norm_eps = 1E-12 - cfg.MODEL.bottleneck_strategy = 'qk_sharing' - cfg.MODEL.num_token_types = 2 - cfg.MODEL.hidden_dropout_prob = 0.0 - cfg.MODEL.attention_dropout_prob = 0.1 - cfg.MODEL.dtype = 'float32' - # Hyper-parameters of the Initializers - cfg.INITIALIZER = CN() - cfg.INITIALIZER.embed = ['truncnorm', 0, 0.02] - cfg.INITIALIZER.weight = ['truncnorm', 0, 0.02] # TruncNorm(0, 0.02) - cfg.INITIALIZER.bias = ['zeros'] - # Version of the model. This helps ensure backward compatibility. - # Also, we can not use string here due to https://github.com/rbgirshick/yacs/issues/26 - cfg.VERSION = 1 + if key is not None: + return mobilebert_cfg_reg.create(key) else: - raise NotImplementedError - cfg.freeze() - return cfg + return google_uncased_mobilebert() @classmethod def from_cfg(cls, cfg, use_pooler=True, - dtype='float32', - use_bottleneck=True, - trigram_embed=True, - classifier_activation=False) -> 'MobileBertModel': + dtype=None) -> 'MobileBertModel': cfg = MobileBertModel.get_cfg().clone_merge(cfg) assert cfg.VERSION == 1, 'Wrong version!' 
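The trigram embedding assembled a few hunks above concatenates the next-token and previous-token embeddings with the current one along the channel axis, which is why the factorized projection now declares `in_units = 3 * embed_size` whenever `trigram_embed` is enabled. A NumPy sketch of the 'NT' case with toy sizes:

```python
import numpy as np

batch_size, seq_length, embed_size = 2, 4, 3    # toy sizes for illustration
word_embedding = np.random.rand(batch_size, seq_length, embed_size)

# 'NT' trigram embedding: zero-pad at the sequence boundaries and stack the
# left-shifted (next token), current, and right-shifted (previous token)
# embeddings along the last axis, tripling the channel width.
trigram = np.concatenate(
    [np.pad(word_embedding[:, 1:], ((0, 0), (0, 1), (0, 0))),    # next token
     word_embedding,                                             # current token
     np.pad(word_embedding[:, :-1], ((0, 0), (1, 0), (0, 0)))],  # previous token
    axis=-1)
assert trigram.shape == (batch_size, seq_length, 3 * embed_size)
```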
embed_initializer = mx.init.create(*cfg.INITIALIZER.embed) weight_initializer = mx.init.create(*cfg.INITIALIZER.weight) bias_initializer = mx.init.create(*cfg.INITIALIZER.bias) + if dtype is None: + dtype = cfg.MODEL.dtype return cls(vocab_size=cfg.MODEL.vocab_size, units=cfg.MODEL.units, hidden_size=cfg.MODEL.hidden_size, @@ -646,17 +787,17 @@ def from_cfg(cls, embed_initializer=embed_initializer, weight_initializer=weight_initializer, bias_initializer=bias_initializer, - use_bottleneck=use_bottleneck, - trigram_embed=trigram_embed, + use_bottleneck=cfg.MODEL.use_bottleneck, + trigram_embed=cfg.MODEL.trigram_embed, use_pooler=use_pooler, - classifier_activation=classifier_activation) + classifier_activation=cfg.MODEL.classifier_activation, + layout=cfg.MODEL.layout, + compute_layout=cfg.MODEL.compute_layout) @use_np class MobileBertForMLM(HybridBlock): def __init__(self, backbone_cfg, - use_bottleneck=True, - trigram_embed=True, weight_initializer=None, bias_initializer=None): """ @@ -668,9 +809,7 @@ def __init__(self, backbone_cfg, bias_initializer """ super().__init__() - self.backbone_model = MobileBertModel.from_cfg(backbone_cfg, - use_bottleneck=use_bottleneck, - trigram_embed=trigram_embed) + self.backbone_model = MobileBertModel.from_cfg(backbone_cfg) if weight_initializer is None: weight_initializer = self.backbone_model.weight_initializer if bias_initializer is None: @@ -680,7 +819,8 @@ def __init__(self, backbone_cfg, self.mlm_decoder.add(nn.Dense(units=self.backbone_model.units, flatten=False, weight_initializer=weight_initializer, - bias_initializer=bias_initializer)) + bias_initializer=bias_initializer, + dtype=self.backbone_model.dtype)) self.mlm_decoder.add(get_activation(self.backbone_model.activation)) # use basic layer normalization for pretaining self.mlm_decoder.add(nn.LayerNorm(epsilon=self.backbone_model.layer_norm_eps)) @@ -692,14 +832,14 @@ def __init__(self, backbone_cfg, units=self.backbone_model.vocab_size, in_units=self.backbone_model.embed_size, flatten=False, + dtype=self.backbone_model.dtype, bias_initializer=bias_initializer) self.embedding_table.weight = self.backbone_model.word_embed.weight if self.backbone_model.embed_size != self.backbone_model.units: self.extra_table = nn.Dense( units=self.backbone_model.vocab_size, use_bias=False, - in_units=self.backbone_model.units - - self.backbone_model.embed_size, + in_units=self.backbone_model.units - self.backbone_model.embed_size, flatten=False) def hybrid_forward(self, F, inputs, token_types, valid_length, @@ -709,30 +849,43 @@ def hybrid_forward(self, F, inputs, token_types, valid_length, Parameters ---------- F - inputs : - Shape (batch_size, seq_length) - token_types : - Shape (batch_size, seq_length) + inputs + - layout = 'NT' + Shape (batch_size, seq_length) + - layout = 'TN' + Shape (seq_length, batch_size) + token_types + - layout = 'NT' + Shape (batch_size, seq_length) + - layout = 'TN' + Shape (seq_length, batch_size) The type of the token. For example, if the inputs contain two sequences, we will set different token types for the first sentence and the second sentence. - valid_length : + valid_length The valid length of each sequence Shape (batch_size,) - masked_positions : + masked_positions The masked position of the sequence Shape (batch_size, num_masked_positions). Returns ------- contextual_embedding - Shape (batch_size, seq_length, units). + - layout = 'NT' + Shape (batch_size, seq_length, units). + - layout = 'TN' + Shape (seq_length, batch_size, units). 
pooled_out Shape (batch_size, units) - mlm_scores : + mlm_scores Shape (batch_size, num_masked_positions, vocab_size) """ contextual_embeddings, pooled_out = self.backbone_model(inputs, token_types, valid_length) - mlm_features = select_vectors_by_position(F, contextual_embeddings, masked_positions) + if self.backbone_model.layout == 'TN': + mlm_features = select_vectors_by_position(F, F.np.swapaxes(contextual_embeddings, 0, 1), + masked_positions) + else: + mlm_features = select_vectors_by_position(F, contextual_embeddings, masked_positions) intermediate_output = self.mlm_decoder(mlm_features) if self.backbone_model.embed_size != self.backbone_model.units: scores = self.embedding_table( @@ -748,8 +901,6 @@ def hybrid_forward(self, F, inputs, token_types, valid_length, @use_np class MobileBertForPretrain(HybridBlock): def __init__(self, backbone_cfg, - use_bottleneck=True, - trigram_embed=True, weight_initializer=None, bias_initializer=None): """ @@ -762,22 +913,22 @@ def __init__(self, backbone_cfg, bias_initializer """ super().__init__() - self.backbone_model = MobileBertModel.from_cfg(backbone_cfg, - use_bottleneck=use_bottleneck, - trigram_embed=trigram_embed) + self.backbone_model = MobileBertModel.from_cfg(backbone_cfg) if weight_initializer is None: weight_initializer = self.backbone_model.weight_initializer if bias_initializer is None: bias_initializer = self.backbone_model.bias_initializer # Construct nsp_classifier for next sentence prediction self.nsp_classifier = nn.Dense(units=2, - weight_initializer=weight_initializer) + weight_initializer=weight_initializer, + dtype=self.backbone_model.dtype) self.mlm_decoder = nn.HybridSequential() # Extra non-linear layer self.mlm_decoder.add(nn.Dense(units=self.backbone_model.units, flatten=False, weight_initializer=weight_initializer, - bias_initializer=bias_initializer)) + bias_initializer=bias_initializer, + dtype=self.backbone_model.dtype)) self.mlm_decoder.add(get_activation(self.backbone_model.activation)) # use basic layer normalization for pretaining self.mlm_decoder.add(nn.LayerNorm(epsilon=self.backbone_model.layer_norm_eps)) @@ -789,7 +940,8 @@ def __init__(self, backbone_cfg, units=self.backbone_model.vocab_size, in_units=self.backbone_model.embed_size, flatten=False, - bias_initializer=bias_initializer) + bias_initializer=bias_initializer, + dtype=self.backbone_model.dtype) self.embedding_table.weight = self.backbone_model.word_embed.weight if self.backbone_model.embed_size != self.backbone_model.units: self.extra_table = nn.Dense( @@ -798,7 +950,8 @@ def __init__(self, backbone_cfg, self.backbone_model.embed_size, flatten=False, use_bias=False, - bias_initializer=bias_initializer) + bias_initializer=bias_initializer, + dtype=self.backbone_model.dtype) def hybrid_forward(self, F, inputs, token_types, valid_length, masked_positions): @@ -809,34 +962,47 @@ def hybrid_forward(self, F, inputs, token_types, valid_length, Parameters ---------- F - inputs : - Shape (batch_size, seq_length) - token_types : - Shape (batch_size, seq_length) + inputs + - layout = 'NT' + Shape (batch_size, seq_length) + - layout = 'TN' + Shape (seq_length, batch_size) + token_types + - layout = 'NT' + Shape (batch_size, seq_length) + - layout = 'TN' + Shape (seq_length, batch_size) If the inputs contain two sequences, we will set different token types for the first sentence and the second sentence. 
- valid_length : + valid_length The valid length of each sequence Shape (batch_size,) - masked_positions : + masked_positions The masked position of the sequence Shape (batch_size, num_masked_positions). Returns ------- contextual_embedding - Shape (batch_size, seq_length, units). + - layout = 'NT' + Shape (batch_size, seq_length, units). + - layout = 'TN' + Shape (seq_length, batch_size, units). pooled_out Shape (batch_size, units) - nsp_score : + nsp_score Shape (batch_size, 2) - mlm_scores : + mlm_scores Shape (batch_size, num_masked_positions, vocab_size) """ contextual_embeddings, pooled_out = self.backbone_model(inputs, token_types, valid_length) nsp_score = self.nsp_classifier(pooled_out) - mlm_features = select_vectors_by_position(F, contextual_embeddings, masked_positions) + if self.backbone_model.layout == 'NT': + mlm_features = select_vectors_by_position(F, contextual_embeddings, masked_positions) + else: + mlm_features = select_vectors_by_position(F, F.np.swapaxes(contextual_embeddings, 0, 1), + masked_positions) intermediate_output = self.mlm_decoder(mlm_features) if self.backbone_model.embed_size != self.backbone_model.units: scores = self.embedding_table( @@ -884,11 +1050,18 @@ def get_pretrained_mobilebert(model_name: str = 'google_uncased_mobilebert', assert model_name in PRETRAINED_URL, '{} is not found. All available are {}'.format( model_name, list_pretrained_mobilebert()) cfg_path = PRETRAINED_URL[model_name]['cfg'] + if isinstance(cfg_path, CN): + cfg = cfg_path + else: + cfg = None vocab_path = PRETRAINED_URL[model_name]['vocab'] params_path = PRETRAINED_URL[model_name]['params'] mlm_params_path = PRETRAINED_URL[model_name]['mlm_params'] local_paths = dict() - for k, path in [('cfg', cfg_path), ('vocab', vocab_path)]: + download_jobs = [('vocab', vocab_path)] + if cfg is None: + download_jobs.append(('cfg', cfg_path)) + for k, path in download_jobs: local_paths[k] = download(url=get_repo_model_zoo_url() + path, path=os.path.join(root, path), sha1_hash=FILE_STATS[path]) @@ -914,7 +1087,8 @@ def get_pretrained_mobilebert(model_name: str = 'google_uncased_mobilebert', sep_token='[SEP]', mask_token='[MASK]', lowercase=do_lower) - cfg = MobileBertModel.get_cfg().clone_merge(local_paths['cfg']) + if cfg is None: + cfg = MobileBertModel.get_cfg().clone_merge(local_paths['cfg']) return cfg, tokenizer, local_params_path, local_mlm_params_path diff --git a/src/gluonnlp/models/roberta.py b/src/gluonnlp/models/roberta.py index 8400f89fbd..b9af04dafd 100644 --- a/src/gluonnlp/models/roberta.py +++ b/src/gluonnlp/models/roberta.py @@ -42,31 +42,13 @@ from ..layers import PositionalEmbedding, get_activation from ..registry import BACKBONE_REGISTRY from ..utils.misc import download, load_checksum_stats +from ..utils.registry import Registry from .transformer import TransformerEncoderLayer from ..initializer import TruncNorm from ..utils.config import CfgNode as CN from ..attention_cell import gen_self_attn_mask -from ..utils.registry import Registry from ..data.tokenizers import HuggingFaceByteBPETokenizer -PRETRAINED_URL = { - 'fairseq_roberta_base': { - 'cfg': 'fairseq_roberta_base/model-565d1db7.yml', - 'merges': 'fairseq_roberta_base/gpt2-396d4d8e.merges', - 'vocab': 'fairseq_roberta_base/gpt2-f1335494.vocab', - 'params': 'fairseq_roberta_base/model-09a1520a.params', - 'mlm_params': 'fairseq_roberta_base/model_mlm-29889e2b.params', - 'lowercase': False, - }, - 'fairseq_roberta_large': { - 'cfg': 'fairseq_roberta_large/model-6e66dc4a.yml', - 'merges': 
'fairseq_roberta_large/gpt2-396d4d8e.merges', - 'vocab': 'fairseq_roberta_large/gpt2-f1335494.vocab', - 'params': 'fairseq_roberta_large/model-6b043b91.params', - 'mlm_params': 'fairseq_roberta_large/model_mlm-119f38e1.params', - 'lowercase': False, - } -} FILE_STATS = load_checksum_stats(os.path.join(get_model_zoo_checksum_dir(), 'roberta.txt')) roberta_cfg_reg = Registry('roberta_cfg') @@ -90,6 +72,10 @@ def roberta_base(): cfg.MODEL.hidden_dropout_prob = 0.1 cfg.MODEL.attention_dropout_prob = 0.1 cfg.MODEL.dtype = 'float32' + # Layout + cfg.MODEL.layout = 'NT' + cfg.MODEL.compute_layout = 'auto' + # Initialization method cfg.INITIALIZER = CN() cfg.INITIALIZER.embed = ['truncnorm', 0, 0.02] cfg.INITIALIZER.weight = ['truncnorm', 0, 0.02] @@ -111,6 +97,97 @@ def roberta_large(): return cfg +PRETRAINED_URL = { + 'fairseq_roberta_base': { + 'cfg': roberta_base(), + 'merges': 'fairseq_roberta_base/gpt2-396d4d8e.merges', + 'vocab': 'fairseq_roberta_base/gpt2-f1335494.vocab', + 'params': 'fairseq_roberta_base/model-09a1520a.params', + 'mlm_params': 'fairseq_roberta_base/model_mlm-29889e2b.params', + 'lowercase': False, + }, + 'fairseq_roberta_large': { + 'cfg': roberta_large(), + 'merges': 'fairseq_roberta_large/gpt2-396d4d8e.merges', + 'vocab': 'fairseq_roberta_large/gpt2-f1335494.vocab', + 'params': 'fairseq_roberta_large/model-6b043b91.params', + 'mlm_params': 'fairseq_roberta_large/model_mlm-119f38e1.params', + 'lowercase': False, + } +} + + +@use_np +class RobertaEncoder(HybridBlock): + def __init__(self, + units=768, + hidden_size=3072, + num_layers=12, + num_heads=12, + attention_dropout_prob=0.1, + hidden_dropout_prob=0.1, + layer_norm_eps=1E-5, + weight_initializer=TruncNorm(stdev=0.02), + bias_initializer='zeros', + activation='gelu', + dtype='float32', + output_all_encodings=False, + output_attention=False, + layout='NT'): + super().__init__() + self.units = units + self.hidden_size = hidden_size + self.num_layers = num_layers + self.num_heads = num_heads + self.attention_dropout_prob = attention_dropout_prob + self.hidden_dropout_prob = hidden_dropout_prob + self.layer_norm_eps = layer_norm_eps + self.activation = activation + self._dtype = dtype + self._layout = layout + self._output_all_encodings = output_all_encodings + self._output_attention = output_attention + self.all_layers = nn.HybridSequential() + for layer_idx in range(self.num_layers): + self.all_layers.add( + TransformerEncoderLayer( + units=self.units, + hidden_size=self.hidden_size, + num_heads=self.num_heads, + attention_dropout_prob=self.attention_dropout_prob, + hidden_dropout_prob=self.hidden_dropout_prob, + layer_norm_eps=self.layer_norm_eps, + weight_initializer=weight_initializer, + bias_initializer=bias_initializer, + activation=self.activation, + dtype=self._dtype, + layout=layout) + ) + + @property + def layout(self): + return self._layout + + def hybrid_forward(self, F, x, valid_length): + atten_mask = gen_self_attn_mask(F, x, valid_length, + layout=self._layout, + dtype=self._dtype, attn_type='full') + all_encodings_outputs = [x] + additional_outputs = [] + for layer_idx in range(self.num_layers): + layer = self.all_layers[layer_idx] + x, attention_weights = layer(x, atten_mask) + if self._output_all_encodings: + all_encodings_outputs.append(x) + if self._output_attention: + additional_outputs.append(attention_weights) + # sequence_mask is not necessary here because masking could be performed in downstream tasks + if self._output_all_encodings: + return all_encodings_outputs, additional_outputs + else: + 
return x, additional_outputs + + @use_np class RobertaModel(HybridBlock): def __init__(self, @@ -133,7 +210,9 @@ def __init__(self, use_pooler=True, classifier_activation=False, encoder_normalize_before=True, - output_all_encodings=False): + output_all_encodings=False, + layout='NT', + compute_layout='auto'): """ Parameters @@ -159,7 +238,13 @@ def __init__(self, classifier_activation Whether to use classification head encoder_normalize_before + Whether to normalize before the output_all_encodings + Whether to output all encodings + layout + The layout + compute_layout + The computation layout """ super().__init__() self._dtype = dtype @@ -181,7 +266,11 @@ def __init__(self, self.encoder_normalize_before = encoder_normalize_before self.weight_initializer = weight_initializer self.bias_initializer = bias_initializer - + self._layout = layout + if compute_layout == 'auto' or compute_layout is None: + self._compute_layout = layout + else: + self._compute_layout = compute_layout self.word_embed = nn.Embedding( input_dim=self.vocab_size, output_dim=self.units, @@ -211,7 +300,8 @@ def __init__(self, bias_initializer=bias_initializer, activation=self.activation, dtype=self._dtype, - output_all_encodings=self._output_all_encodings + output_all_encodings=self._output_all_encodings, + layout=self._compute_layout, ) self.encoder.hybridize() @@ -224,20 +314,26 @@ def __init__(self, weight_initializer=weight_initializer, bias_initializer=bias_initializer) + @property + def layout(self): + return self._layout + def hybrid_forward(self, F, tokens, valid_length): - outputs = [] embedding = self.get_initial_embedding(F, tokens) - - contextual_embeddings, additional_outputs = self.encoder(embedding, valid_length) - outputs.append(contextual_embeddings) - if self._output_all_encodings: - contextual_embeddings = contextual_embeddings[-1] - + if self._layout != self._compute_layout: + contextual_embeddings, additional_outputs = self.encoder(F.np.swapaxes(embedding, 0, 1), + valid_length) + contextual_embeddings = F.np.swapaxes(contextual_embeddings, 0, 1) + else: + contextual_embeddings, additional_outputs = self.encoder(embedding, valid_length) if self.use_pooler: - pooled_out = self.apply_pooling(contextual_embeddings) - outputs.append(pooled_out) - - return tuple(outputs) if len(outputs) > 1 else outputs[0] + if isinstance(contextual_embeddings, list): + pooled_out = self.apply_pooling(contextual_embeddings[-1]) + else: + pooled_out = self.apply_pooling(contextual_embeddings) + return contextual_embeddings, pooled_out + else: + return contextual_embeddings def get_initial_embedding(self, F, inputs): """Get the initial token embeddings that considers the token type and positional embeddings @@ -246,17 +342,28 @@ def get_initial_embedding(self, F, inputs): ---------- F inputs - Shape (batch_size, seq_length) + - layout = 'NT' + Shape (batch_size, seq_length) + - layout = 'TN' + Shape (seq_length, batch_size) Returns ------- embedding The initial embedding that will be fed into the encoder + - layout = 'NT' + Shape (batch_size, seq_length, C) + - layout = 'TN' + Shape (seq_length, batch_size, C) """ + if self._layout == 'NT': + batch_axis, time_axis = 0, 1 + else: + batch_axis, time_axis = 1, 0 embedding = self.word_embed(inputs) if self.pos_embed_type: - positional_embedding = self.pos_embed(F.npx.arange_like(inputs, axis=1)) - positional_embedding = F.np.expand_dims(positional_embedding, axis=0) + positional_embedding = self.pos_embed(F.npx.arange_like(inputs, axis=time_axis)) + positional_embedding = 
F.np.expand_dims(positional_embedding, axis=batch_axis) embedding = embedding + positional_embedding if self.encoder_normalize_before: embedding = self.embed_ln(embedding) @@ -270,12 +377,25 @@ def apply_pooling(self, sequence): This is used for pre-training or fine-tuning a mobile bert model. Get the first token of the whole sequence which is [CLS] - sequence: - Shape (batch_size, sequence_length, units) - return: + Parameters + ---------- + sequence + - layout = 'NT' + Shape (batch_size, sequence_length, units) + - layout = 'TN' + Shape (sequence_length, batch_size, units) + + Returns + ------- + ret Shape (batch_size, units) """ - outputs = sequence[:, 0, :] + if self._layout == 'NT': + outputs = sequence[:, 0, :] + elif self._layout == 'TN': + outputs = sequence[0, :, :] + else: + raise NotImplementedError if self.classifier_activation: return self.pooler(outputs) else: @@ -283,7 +403,7 @@ def apply_pooling(self, sequence): @staticmethod def get_cfg(key=None): - if key: + if key is not None: return roberta_cfg_reg.create(key) else: return roberta_base() @@ -292,14 +412,14 @@ def get_cfg(key=None): def from_cfg(cls, cfg, use_pooler=True, - dtype='float32', - classifier_activation=False, - encoder_normalize_before=True, + dtype=None, output_all_encodings=False) -> 'RobertaModel': cfg = RobertaModel.get_cfg().clone_merge(cfg) embed_initializer = mx.init.create(*cfg.INITIALIZER.embed) weight_initializer = mx.init.create(*cfg.INITIALIZER.weight) bias_initializer = mx.init.create(*cfg.INITIALIZER.bias) + if dtype is None: + dtype = cfg.MODEL.dtype return cls(vocab_size=cfg.MODEL.vocab_size, units=cfg.MODEL.units, hidden_size=cfg.MODEL.hidden_size, @@ -317,71 +437,9 @@ def from_cfg(cls, bias_initializer=bias_initializer, dtype=dtype, use_pooler=use_pooler, - encoder_normalize_before=encoder_normalize_before, - output_all_encodings=output_all_encodings) - - -@use_np -class RobertaEncoder(HybridBlock): - def __init__(self, - units=768, - hidden_size=3072, - num_layers=12, - num_heads=12, - attention_dropout_prob=0.1, - hidden_dropout_prob=0.1, - layer_norm_eps=1E-5, - weight_initializer=TruncNorm(stdev=0.02), - bias_initializer='zeros', - activation='gelu', - dtype='float32', - output_all_encodings=False, - output_attention=False): - super().__init__() - self.units = units - self.hidden_size = hidden_size - self.num_layers = num_layers - self.num_heads = num_heads - self.attention_dropout_prob = attention_dropout_prob - self.hidden_dropout_prob = hidden_dropout_prob - self.layer_norm_eps = layer_norm_eps - self.activation = activation - self._dtype = dtype - self._output_all_encodings = output_all_encodings - self._output_attention = output_attention - self.all_layers = nn.HybridSequential() - for layer_idx in range(self.num_layers): - self.all_layers.add( - TransformerEncoderLayer( - units=self.units, - hidden_size=self.hidden_size, - num_heads=self.num_heads, - attention_dropout_prob=self.attention_dropout_prob, - hidden_dropout_prob=self.hidden_dropout_prob, - layer_norm_eps=self.layer_norm_eps, - weight_initializer=weight_initializer, - bias_initializer=bias_initializer, - activation=self.activation, - dtype=self._dtype) - ) - - def hybrid_forward(self, F, x, valid_length): - atten_mask = gen_self_attn_mask(F, x, valid_length, - dtype=self._dtype, attn_type='full') - all_encodings_outputs = [x] - additional_outputs = [] - for layer_idx in range(self.num_layers): - layer = self.all_layers[layer_idx] - x, attention_weights = layer(x, atten_mask) - if self._output_all_encodings: - 
all_encodings_outputs.append(x) - if self._output_attention: - additional_outputs.append(attention_weights) - # sequence_mask is not necessary here because masking could be performed in downstream tasks - if self._output_all_encodings: - return all_encodings_outputs, additional_outputs - else: - return x, additional_outputs + output_all_encodings=output_all_encodings, + layout=cfg.MODEL.layout, + compute_layout=cfg.MODEL.compute_layout) @use_np @@ -432,19 +490,25 @@ def hybrid_forward(self, F, inputs, valid_length, masked_positions): Parameters ---------- F - inputs : - Shape (batch_size, seq_length) - valid_length : + inputs + - layout = 'NT' + Shape (batch_size, seq_length) + - layout = 'TN' + Shape (seq_length, batch_size) + valid_length The valid length of each sequence Shape (batch_size,) - masked_positions : + masked_positions The masked position of the sequence Shape (batch_size, num_masked_positions). Returns ------- contextual_embedding - Shape (batch_size, seq_length, units). + - layout = 'NT' + Shape (batch_size, seq_length, units). + - layout = 'TN' + Shape (seq_length, batch_size, units). pooled_out Shape (batch_size, units) mlm_scores : @@ -456,6 +520,8 @@ def hybrid_forward(self, F, inputs, valid_length, masked_positions): contextual_embeddings = all_encodings_outputs[-1] else: contextual_embeddings = all_encodings_outputs + if self.backbone_model.layout == 'TN': + contextual_embeddings = F.np.swapaxes(contextual_embeddings, 0, 1) mlm_features = select_vectors_by_position(F, contextual_embeddings, masked_positions) mlm_scores = self.mlm_decoder(mlm_features) return all_encodings_outputs, pooled_out, mlm_scores @@ -469,7 +535,7 @@ def get_pretrained_roberta(model_name: str = 'fairseq_roberta_base', root: str = get_model_zoo_home_dir(), load_backbone: bool = True, load_mlm: bool = False) \ - -> Tuple[CN, HuggingFaceByteBPETokenizer, str]: + -> Tuple[CN, HuggingFaceByteBPETokenizer, str, str]: """Get the pretrained RoBERTa weights Parameters @@ -497,14 +563,20 @@ def get_pretrained_roberta(model_name: str = 'fairseq_roberta_base', assert model_name in PRETRAINED_URL, '{} is not found. 
All available are {}'.format( model_name, list_pretrained_roberta()) cfg_path = PRETRAINED_URL[model_name]['cfg'] + if isinstance(cfg_path, CN): + cfg = cfg_path + else: + cfg = None merges_path = PRETRAINED_URL[model_name]['merges'] vocab_path = PRETRAINED_URL[model_name]['vocab'] params_path = PRETRAINED_URL[model_name]['params'] mlm_params_path = PRETRAINED_URL[model_name]['mlm_params'] local_paths = dict() - for k, path in [('cfg', cfg_path), ('vocab', vocab_path), - ('merges', merges_path)]: + download_jobs = [('vocab', vocab_path), ('merges', merges_path)] + if cfg is None: + download_jobs.append(('cfg', cfg_path)) + for k, path in download_jobs: local_paths[k] = download(url=get_repo_model_zoo_url() + path, path=os.path.join(root, path), sha1_hash=FILE_STATS[path]) @@ -526,7 +598,8 @@ def get_pretrained_roberta(model_name: str = 'fairseq_roberta_base', merges_file=local_paths['merges'], vocab_file=local_paths['vocab'], lowercase=do_lower) - cfg = RobertaModel.get_cfg().clone_merge(local_paths['cfg']) + if cfg is None: + cfg = RobertaModel.get_cfg().clone_merge(local_paths['cfg']) return cfg, tokenizer, local_params_path, local_mlm_params_path diff --git a/src/gluonnlp/models/transformer.py b/src/gluonnlp/models/transformer.py index c83708917e..ea8940d8d3 100644 --- a/src/gluonnlp/models/transformer.py +++ b/src/gluonnlp/models/transformer.py @@ -1,3 +1,5 @@ +from abc import ABC + import numpy as np import mxnet as mx from mxnet import use_np @@ -31,6 +33,7 @@ def transformer_base(): cfg.MODEL.attention_dropout = 0.0 cfg.MODEL.activation_dropout = 0.0 cfg.MODEL.dropout = 0.1 + cfg.MODEL.layout = 'NT' cfg.MODEL.dtype = 'float32' # Parameters for the encoder @@ -53,10 +56,6 @@ def transformer_base(): cfg.MODEL.DECODER.activation = 'relu' cfg.MODEL.DECODER.pre_norm = False - # Parameters for mixture of models - cfg.MODEL.method = 'hMoElp' - cfg.MODEL.num_experts = 3 - # Parameters for the initializer cfg.INITIALIZER = CN() cfg.INITIALIZER.embed = ['xavier', 'gaussian', 'in', 1.0] @@ -141,7 +140,8 @@ def __init__(self, weight_initializer: Optional[InitializerType] = None, bias_initializer: Optional[InitializerType] = 'zeros', activation: str = 'relu', - dtype='float32'): + dtype='float32', + layout='NT'): """ Parameters @@ -165,6 +165,7 @@ def __init__(self, bias_initializer activation dtype + layout """ super().__init__() self._units = units @@ -175,6 +176,9 @@ def __init__(self, self._activation_dropout_prob = activation_dropout_prob self._pre_norm = pre_norm self._dtype = dtype + self._layout = layout + assert layout in ['TN', 'NT'], 'Invalid layout received = {}. 
' \ + 'Only "TN" and "NT" are accepted!'.format(layout) assert self._units % self._num_heads == 0, 'units must be divisive by the number of heads' self.dropout_layer = nn.Dropout(hidden_dropout_prob) self.attn_qkv = nn.Dense(3 * units, @@ -191,14 +195,15 @@ def __init__(self, weight_initializer=weight_initializer, bias_initializer=bias_initializer, dtype=self._dtype) - self.self_attention =\ + attention_layout = 'NTK' if self._layout == 'NT' else 'TNK' + self.self_attention = \ MultiHeadAttentionCell( query_units=self._units, num_heads=self._num_heads, attention_dropout=self._attention_dropout_prob, scaled=True, dtype=self._dtype, - layout='NTK' + layout=attention_layout ) self.layer_norm = nn.LayerNorm(epsilon=layer_norm_eps, in_channels=units) @@ -213,6 +218,10 @@ def __init__(self, pre_norm=pre_norm, dtype=self._dtype) + @property + def layout(self) -> str: + return self._layout + def hybrid_forward(self, F, data, attn_mask): """ @@ -220,19 +229,23 @@ def hybrid_forward(self, F, data, attn_mask): ---------- F data : - Shape (batch_size, seq_length, C_in) + If layout == 'NT' + Shape (batch_size, seq_length, C_in) + Else + Shape (seq_length, batch_size, C_in) attn_mask : Shape (batch_size, seq_length, seq_length) Returns ------- out : - Shape (batch_size, seq_length, C_out) + If layout == 'NT' + Shape (batch_size, seq_length, C_out) + Else + Shape (seq_length, batch_size, C_out) attn_weight : Shape (batch_size, seq_length, seq_length) """ - # TODO(sxjscience) Cannot use negative axis due to - # https://github.com/apache/incubator-mxnet/issues/18132 if self._pre_norm: data = self.layer_norm(data) query, key, value = F.np.split(self.attn_qkv(data), 3, axis=-1) @@ -256,7 +269,7 @@ def __init__(self, num_layers=6, recurrent=False, activation_dropout=0.0, dropout=0.1, attention_dropout=0.1, layer_norm_eps=1E-5, data_norm=False, pre_norm=False, weight_initializer=None, bias_initializer='zeros', - activation='relu', dtype='float32'): + activation='relu', dtype='float32', layout='NT'): """ Parameters @@ -277,6 +290,8 @@ def __init__(self, num_layers=6, recurrent=False, weight_initializer bias_initializer activation + dtype + layout """ super().__init__() self._dtype = dtype @@ -284,6 +299,9 @@ def __init__(self, num_layers=6, recurrent=False, self._recurrent = recurrent self._data_norm = data_norm self._pre_norm = pre_norm + self._layout = layout + assert layout in ['TN', 'NT'], 'Invalid layout received = {}. ' \ + 'Only "TN" and "NT" are accepted!'.format(layout) self.dropout_layer = nn.Dropout(dropout) if self._pre_norm: self.ln_final = nn.LayerNorm(epsilon=layer_norm_eps, @@ -307,8 +325,13 @@ def __init__(self, num_layers=6, recurrent=False, bias_initializer=bias_initializer, pre_norm=pre_norm, activation=activation, + layout=self._layout, dtype=dtype)) + @property + def layout(self) -> str: + return self._layout + def hybrid_forward(self, F, data, valid_length): """ @@ -316,18 +339,26 @@ def hybrid_forward(self, F, data, valid_length): ---------- F data : - Shape (batch_size, seq_length, C) + - layout = 'NT' + Shape (batch_size, seq_length, C) + - layout = 'TN' + Shape (seq_length, batch_size, C) valid_length : Shape (batch_size,) Returns ------- out : - Shape (batch_size, seq_length, C_out) + - layout = 'NT' + Shape (batch_size, seq_length, C_out) + - layout = 'TN' + Shape (seq_length, batch_size, C_out) """ # 1. 
Embed the data attn_mask = gen_self_attn_mask(F, data, valid_length, - dtype=self._dtype, attn_type='full') + dtype=self._dtype, + layout=self.layout, + attn_type='full') out = self.dropout_layer(data) if self._data_norm: out = self.ln_data(out) @@ -356,7 +387,8 @@ def __init__(self, units: int = 512, pre_norm: bool = False, weight_initializer=None, bias_initializer='zeros', - dtype='float32'): + dtype='float32', + layout='NT'): """ Parameters @@ -377,6 +409,9 @@ def __init__(self, units: int = 512, weight_initializer bias_initializer dtype + Data type + layout + Layout of the input """ super().__init__() self._dtype = dtype @@ -388,6 +423,10 @@ def __init__(self, units: int = 512, self._num_heads = num_heads self._attention_dropout = attention_dropout self._dtype = dtype + self._layout = layout + assert layout in ['TN', 'NT'], 'Invalid layout received = {}. ' \ + 'Only "TN" and "NT" are accepted!'.format(layout) + attention_layout = 'NTK' if layout == 'NT' else 'TNK' self.dropout_layer = nn.Dropout(dropout) if units % num_heads: raise ValueError('In Transformer, units should be divided exactly by the number of ' @@ -430,7 +469,7 @@ def __init__(self, units: int = 512, num_heads=num_heads, attention_dropout=self._attention_dropout, dtype=dtype, - layout='NTK') + layout=attention_layout) self.proj_inter = nn.Dense(units=units, in_units=units, flatten=False, use_bias=True, weight_initializer=weight_initializer, @@ -449,6 +488,10 @@ def __init__(self, units: int = 512, pre_norm=pre_norm, dtype=dtype) + @property + def layout(self) -> str: + return self._layout + def hybrid_forward(self, F, data, mem, self_causal_mask, mem_attn_mask): """ @@ -456,9 +499,15 @@ def hybrid_forward(self, F, data, mem, self_causal_mask, mem_attn_mask): ---------- F data : - Shape (batch_size, seq_length, C_in) + - layout = 'NT' + Shape (batch_size, seq_length, C_in) + - layout = 'TN' + Shape (seq_length, batch_size, C_in) mem : - Shape (batch_size, mem_length, C_mem) + - layout = 'NT' + Shape (batch_size, mem_length, C_mem) + - layout = 'TN' + Shape (mem_length, batch_size, C_mem) self_causal_mask : Shape (batch_size, seq_length, seq_length) Mask for the causal self-attention. @@ -485,11 +534,11 @@ def hybrid_forward(self, F, data, mem, self_causal_mask, mem_attn_mask): Returns ------- out : - Shape (batch_size, seq_length, C_out) + - layout = 'NT' + Shape (batch_size, seq_length, C_out) + - layout = 'TN' + Shape (seq_length, batch_size, C_out) """ - # TODO(szhengac) - # Try the architecture in the "[ECCV2016] Identity Mappings in Deep Residual Networks". - # Shuai proposed to switch the order of the activation layer. # 1. 
Get the causal self-attention value if self._pre_norm: data = self.ln_in(data) @@ -524,22 +573,37 @@ def hybrid_forward(self, F, data, mem, self_causal_mask, mem_attn_mask): @property def state_batch_axis(self): - return 0, 0 + if self.layout == 'NT': + return 0, 0 + else: + return 1, 1 def init_states(self, batch_size, ctx, dtype='float32'): """Initialize the states required for incremental decoding Returns ------- - init_key : - Shape (batch_size, 0, N, C_key) + init_key + - layout = 'NT' + Shape (batch_size, 0, N, C_key) + - layout = 'TN' + Shape (0, batch_size, N, C_key) init_value : - Shape (batch_size, 0, N, C_value) + - layout = 'NT' + Shape (batch_size, 0, N, C_value) + - layout = 'TN' + Shape (0, batch_size, N, C_value) """ - init_key = mx.np.zeros(shape=(batch_size, 0, self._num_heads, - self._units // self._num_heads), ctx=ctx, dtype=dtype) - init_value = mx.np.zeros(shape=(batch_size, 0, self._num_heads, - self._units // self._num_heads), ctx=ctx, dtype=dtype) + if self.layout == 'NT': + init_key = mx.np.zeros(shape=(batch_size, 0, self._num_heads, + self._units // self._num_heads), ctx=ctx, dtype=dtype) + init_value = mx.np.zeros(shape=(batch_size, 0, self._num_heads, + self._units // self._num_heads), ctx=ctx, dtype=dtype) + else: + init_key = mx.np.zeros(shape=(0, batch_size, self._num_heads, + self._units // self._num_heads), ctx=ctx, dtype=dtype) + init_value = mx.np.zeros(shape=(0, batch_size, self._num_heads, + self._units // self._num_heads), ctx=ctx, dtype=dtype) return init_key, init_value def incremental_decode(self, F, data, states, mem, mem_valid_length, mem_attn_mask=None): @@ -549,16 +613,25 @@ def incremental_decode(self, F, data, states, mem, mem_valid_length, mem_attn_ma ---------- F data - Shape (batch_size, 1, C_in) + Shape (batch_size, C_in) states The previous states, contains - - prev_multi_key - Shape (batch_size, prev_seq_length, num_heads, C_key) - - prev_multi_value - Shape (batch_size, prev_seq_length, num_heads, C_value) + 1. layout = 'NT': + - prev_multi_key + Shape (batch_size, prev_seq_length, num_heads, C_key) + - prev_multi_value + Shape (batch_size, prev_seq_length, num_heads, C_value) + 2. layout = 'TN' + - prev_multi_key + Shape (prev_seq_length, batch_size, num_heads, C_key) + - prev_multi_value + Shape (prev_seq_length, batch_size, num_heads, C_value) mem The memory - Shape (batch_size, mem_length, C_mem) + 1. layout = 'NT': + Shape (batch_size, mem_length, C_mem) + 2. layout = 'TN' + Shape (mem_length, batch_size, C_mem) mem_valid_length Valid length of the memory Shape (batch_size,) @@ -569,7 +642,7 @@ def incremental_decode(self, F, data, states, mem, mem_valid_length, mem_attn_ma Returns ------- out - Shape (batch_size, 1, C_out) + Shape (batch_size, C_out) updated_states - new_key Shape (batch_size, prev_seq_length + 1, num_heads, C_key) @@ -578,19 +651,28 @@ def incremental_decode(self, F, data, states, mem, mem_valid_length, mem_attn_ma """ if self._pre_norm: data = self.ln_in(data) - prev_key, prev_value = states # Shape (B, prev_L, #Head, C_K), (B, prev_L, #Head, C_V) + if self.layout == 'NT': + time_axis = 1 + else: + time_axis = 0 + data = F.np.expand_dims(data, axis=time_axis) + # Shape (B, prev_L, #Head, C_K), (B, prev_L, #Head, C_V) + # or (prev_L, B, #Head, C_K), (prev_L, B, #Head, C_V) + prev_key, prev_value = states if mem_attn_mask is None: mem_attn_mask = gen_mem_attn_mask(F, mem, mem_valid_length, data, None, - dtype=self._dtype) + dtype=self._dtype, layout=self.layout) # 1. 
Get the causal self-attention value, we need to attend to both the current data # and the previous stored key/values - step_qkv = self.attn_in_qkv(data) # Shape (B, 1, 3 * num_heads * C_key) + # Shape (B, 1, 3 * num_heads * C_key) + # or (1, B, 3 * num_heads * C_key) + step_qkv = self.attn_in_qkv(data) step_query, step_key, step_value = F.np.split(step_qkv, 3, axis=-1) step_query = F.npx.reshape(step_query, (-2, -2, self._num_heads, -1)) step_key = F.npx.reshape(step_key, (-2, -2, self._num_heads, -1)) step_value = F.npx.reshape(step_value, (-2, -2, self._num_heads, -1)) - new_key = F.np.concatenate([prev_key, step_key], axis=1) - new_value = F.np.concatenate([prev_value, step_value], axis=1) + new_key = F.np.concatenate([prev_key, step_key], axis=time_axis) + new_value = F.np.concatenate([prev_value, step_value], axis=time_axis) out, [_, attn_weight] = self.self_attention(step_query, new_key, new_value, None) out = self.proj_in(out) out = self.dropout_layer(out) @@ -615,6 +697,7 @@ def incremental_decode(self, F, data, states, mem, mem_valid_length, mem_attn_ma out = self.ln_inter(out) # 3. Encode the output via an FFN layer out = self.ffn(out) + out = F.npx.reshape(out, (-5, -1)) return out, (new_key, new_value) @@ -625,7 +708,8 @@ def __init__(self, num_layers=6, recurrent=False, num_heads=8, max_shift=None, activation_dropout=0.0, dropout=0.1, attention_dropout=0.1, layer_norm_eps=1E-5, data_norm=False, pre_norm=False, weight_initializer=None, bias_initializer=None, - activation='relu', dtype='float32'): + activation='relu', dtype='float32', + layout='NT'): super().__init__() self._dtype = dtype self._units = units @@ -635,6 +719,9 @@ def __init__(self, num_layers=6, recurrent=False, self.max_shift = max_shift self._data_norm = data_norm self._pre_norm = pre_norm + self._layout = layout + assert layout in ['TN', 'NT'], 'Invalid layout received = {}. ' \ + 'Only "TN" and "NT" are accepted!'.format(layout) self.dropout_layer = nn.Dropout(dropout) if self._data_norm: self.ln_data = nn.LayerNorm(epsilon=layer_norm_eps, @@ -658,35 +745,53 @@ def __init__(self, num_layers=6, recurrent=False, bias_initializer=bias_initializer, activation=activation, pre_norm=pre_norm, + layout=layout, dtype=dtype)) + @property + def layout(self) -> str: + return self._layout + def hybrid_forward(self, F, data, valid_length, mem_data, mem_valid_length): """ Parameters ---------- F - data : - Shape (batch_size, seq_length, C_in) - valid_length : + data + - layout = 'NT' + Shape (batch_size, seq_length, C_in) + - layout = 'TN' + Shape (seq_length, batch_size, C_in) + valid_length Shape (batch_size,) - mem_data : - Shape (batch_size, mem_length, C_mem) - mem_valid_length : + mem_data + - layout = 'NT' + Shape (batch_size, mem_length, C_mem) + - layout = 'TN' + Shape (mem_length, batch_size, C_mem) + mem_valid_length Shape (batch_size,) + Returns ------- - out : - Shape (batch_size, seq_length, C_out) + out + - layout = 'NT' + Shape (batch_size, seq_length, C_out) + - layout = 'TN' + Shape (seq_length, batch_size, C_out) """ # 1. 
Embed the data out = self.dropout_layer(data) if self._data_norm: out = self.ln_data(out) self_causal_mask = gen_self_attn_mask(F, data, valid_length, - dtype=self._dtype, attn_type='causal') + dtype=self._dtype, + attn_type='causal', + layout=self._layout) mem_attn_mask = gen_mem_attn_mask(F, mem_data, mem_valid_length, data, valid_length, - dtype=self._dtype) + dtype=self._dtype, + layout=self._layout) for i in range(self.num_layers): if self.recurrent: layer = self.layers[0] @@ -708,15 +813,19 @@ def state_batch_axis(self): ret.append(layer.state_batch_axis) return ret - def init_states(self, batch_size, ctx, dtype): + def init_states(self, batch_size, ctx, dtype='float32'): """Initialize the states required for incremental decoding Returns ------- - init_key : - Shape (batch_size, 0, N, C_key) - init_value : - Shape (batch_size, 0, N, C_value) + states + A list of states, each includes: + - init_key : + layout = 'NT': + Shape (batch_size, 0, N, C_key) + - init_value : + layout = 'TN': + Shape (0, batch_size, N, C_value) """ states = [] for i in range(self.num_layers): @@ -736,16 +845,25 @@ def incremental_decode(self, F, data, states, mem, mem_valid_length): ---------- F data - Shape (batch_size, 1, C_in) + Shape (batch_size, C_in) states The previous states, contain a list of - - prev_multi_key - Shape (batch_size, prev_seq_length, num_heads, C_key) - - prev_multi_value - Shape (batch_size, prev_seq_length, num_heads, C_value) + 1. layout = 'NT' + - prev_multi_key + Shape (batch_size, prev_seq_length, num_heads, C_key) + - prev_multi_value + Shape (batch_size, prev_seq_length, num_heads, C_value) + 2. layout = 'TN' + - prev_multi_key + Shape (prev_seq_length, batch_size, num_heads, C_key) + - prev_multi_value + Shape (prev_seq_length, batch_size, num_heads, C_value) mem The memory - Shape (batch_size, mem_length, C_mem) + 1. layout = 'NT' + Shape (batch_size, mem_length, C_mem) + 2. layout = 'TN' + Shape (mem_length, batch_size, C_mem) mem_valid_length Valid length of the memory Shape (batch_size,) @@ -753,20 +871,27 @@ def incremental_decode(self, F, data, states, mem, mem_valid_length): Returns ------- out - Shape (batch_size, 1, C_out) + Shape (batch_size, C_out) new_states The updated states, contain a list of - - new_key - Shape (batch_size, prev_seq_length + 1, num_heads, C_key) - - new_value - Shape (batch_size, prev_seq_length + 1, num_heads, C_value) + 1. layout = 'NT' + - new_key + Shape (batch_size, prev_seq_length + 1, num_heads, C_key) + 2. layout = 'TN' + - new_value + Shape (prev_seq_length + 1, batch_size, num_heads, C_value) """ # 1. 
Embed the data out = self.dropout_layer(data) if self._data_norm: out = self.ln_data(out) - mem_attn_mask = gen_mem_attn_mask(F, mem, mem_valid_length, data, None, - dtype=self._dtype) + time_axis = 0 if self.layout == 'TN' else 1 + # Generate the mem_attn_mask + time_steps = F.npx.arange_like(mem, axis=time_axis) # (mem_length,) + mem_attn_mask = F.np.reshape(time_steps, (1, 1, -1))\ + < F.np.reshape(mem_valid_length, (-1, 1, 1)) + # TODO(sxjscience) Try with boolean masking + mem_attn_mask = mem_attn_mask.astype(self._dtype) new_states = [] for i in range(self.num_layers): if self.recurrent: @@ -814,7 +939,8 @@ def __init__(self, src_vocab_size: int, embed_initializer=mx.init.Xavier('gaussian', 'in', 1), weight_initializer=mx.init.Xavier('uniform', 'avg', 3), bias_initializer='zeros', - dtype='float32'): + dtype='float32', + layout='NT'): """ Parameters @@ -885,6 +1011,8 @@ def __init__(self, src_vocab_size: int, Initializer of the bias dtype Data type of the weights + layout + The layout of the input + target """ super().__init__() assert src_vocab_size > 0 and tgt_vocab_size > 0,\ @@ -904,6 +1032,9 @@ def __init__(self, src_vocab_size: int, self.enc_units = enc_units self.dec_units = dec_units self.weight_initializer = weight_initializer + self._layout = layout + assert layout in ['TN', 'NT'], 'Invalid layout received = {}. ' \ + 'Only "TN" and "NT" are accepted!'.format(layout) if max_src_length is not None and max_src_length < 0: max_src_length = None if max_tgt_length is not None and max_tgt_length < 0: @@ -950,7 +1081,8 @@ def __init__(self, src_vocab_size: int, activation=enc_activation, data_norm=data_norm, pre_norm=enc_pre_norm, - dtype=self._dtype) + dtype=self._dtype, + layout=layout) self.decoder = TransformerDecoder(num_layers=dec_num_layers, recurrent=dec_recurrent, units=dec_units, @@ -966,7 +1098,8 @@ def __init__(self, src_vocab_size: int, activation=dec_activation, data_norm=data_norm, pre_norm=dec_pre_norm, - dtype=self._dtype) + dtype=self._dtype, + layout=layout) if tie_weights: self.tgt_final_layer = \ nn.Dense(tgt_vocab_size, flatten=False, @@ -985,6 +1118,10 @@ def __init__(self, src_vocab_size: int, self.encoder.hybridize() self.decoder.hybridize() + @property + def layout(self) -> str: + return self._layout + @property def src_vocab_size(self): return self._src_vocab_size @@ -1001,25 +1138,35 @@ def encode(self, F, src_data, src_valid_length): Parameters ---------- F - src_data : - Shape (batch_size, src_length) - src_valid_length : + src_data + - layout = 'NT' + Shape (batch_size, src_length) + - layout = 'TN' + Shape (src_length, batch_size) + src_valid_length Shape (batch_size,) Returns ------- - enc_out : - Shape (batch_size, src_length, C_out) + enc_out + - layout = 'NT' + Shape (batch_size, src_length, C_out) + - layout = 'TN' + Shape (src_length, batch_size, C_out) """ - embeddings = self.src_embed_layer(src_data) + src_data = self.src_embed_layer(src_data) if self.scaled_embed: - embeddings = embeddings * np.sqrt(self.enc_units) + src_data = src_data * np.sqrt(self.enc_units) if self.pos_embed_type is not None: - positional_embedding = self.src_pos_embed_layer(F.npx.arange_like(src_data, axis=1)) - embeddings = embeddings + positional_embedding + if self.layout == 'NT': + src_data = src_data + self.src_pos_embed_layer(F.npx.arange_like(src_data, axis=1)) + else: + src_data = src_data + F.np.expand_dims(self.src_pos_embed_layer( + F.npx.arange_like(src_data, axis=0)), axis=1) if self.layernorm_embedding: - embeddings = self.src_embed_ln(embeddings) - 
enc_out = self.encoder(embeddings, src_valid_length) + src_data = self.src_embed_ln(src_data) + + enc_out = self.encoder(src_data, src_valid_length) return enc_out def decode_seq(self, F, tgt_data, tgt_valid_length, mem_data, mem_valid_length): @@ -1028,29 +1175,42 @@ def decode_seq(self, F, tgt_data, tgt_valid_length, mem_data, mem_valid_length): Parameters ---------- F - tgt_data : - Shape (batch_size, tgt_length) - tgt_valid_length : + tgt_data + - layout = 'NT' + Shape (batch_size, tgt_length) + - layout = 'TN' + Shape (tgt_length, batch_size) + tgt_valid_length Shape (batch_size,) - mem_data : - Shape (batch_size, src_length, C_out) + mem_data + - layout = 'NT' + Shape (batch_size, src_length, C_out) + - layout = 'TN' + Shape (src_length, batch_size, C_out) mem_valid_length : Shape (batch_size,) Returns ------- - dec_out : - Shape (batch_size, tgt_length, tgt_vocab_size) + dec_out + - layout = 'NT' + Shape (batch_size, tgt_length, tgt_vocab_size) + - layout = 'TN' + Shape (tgt_length, batch_size, tgt_vocab_size) """ - embeddings = self.tgt_embed_layer(tgt_data) + tgt_data = self.tgt_embed_layer(tgt_data) if self.scaled_embed: - embeddings = embeddings * np.sqrt(self.dec_units) - if self.tgt_pos_embed_layer is not None: - positional_embedding = self.tgt_pos_embed_layer(F.npx.arange_like(tgt_data, axis=1)) - embeddings = embeddings + positional_embedding + tgt_data = tgt_data * np.sqrt(self.dec_units) + if self.pos_embed_type is not None: + if self.layout == 'NT': + tgt_data = tgt_data + self.tgt_pos_embed_layer( + F.npx.arange_like(tgt_data, axis=1)) + else: + tgt_data = tgt_data + F.np.expand_dims(self.tgt_pos_embed_layer( + F.npx.arange_like(tgt_data, axis=0)), axis=1) if self.layernorm_embedding: - embeddings = self.tgt_embed_ln(embeddings) - dec_out = self.decoder(embeddings, tgt_valid_length, mem_data, mem_valid_length) + tgt_data = self.tgt_embed_ln(tgt_data) + dec_out = self.decoder(tgt_data, tgt_valid_length, mem_data, mem_valid_length) return dec_out def hybrid_forward(self, F, src_data, src_valid_length, tgt_data, tgt_valid_length): @@ -1059,19 +1219,28 @@ def hybrid_forward(self, F, src_data, src_valid_length, tgt_data, tgt_valid_leng Parameters ---------- F - src_data : - Shape (batch_size, src_length) - src_valid_length : + src_data + - layout = 'NT' + Shape (batch_size, src_length) + - layout = 'TN' + Shape (src_length, batch_size) + src_valid_length Shape (batch_size,) - tgt_data : - Shape (batch_size, tgt_length) - tgt_valid_length : + tgt_data + - layout = 'NT' + Shape (batch_size, tgt_length) + - layout = 'TN' + Shape (tgt_length, batch_size) + tgt_valid_length Shape (batch_size,) Returns ------- - out : - Shape (batch_size, tgt_length, tgt_vocab_size) + out + - layout = 'NT' + Shape (batch_size, tgt_length, tgt_vocab_size) + - layout = 'TN' + Shape (tgt_length, batch_size, tgt_vocab_size) """ enc_out = self.encode(F, src_data, src_valid_length) dec_out = self.decode_seq(F, tgt_data, tgt_valid_length, enc_out, src_valid_length) @@ -1087,11 +1256,13 @@ def get_cfg(cls, key=None): return transformer_cfg_reg.create(key) @classmethod - def from_cfg(cls, cfg): + def from_cfg(cls, cfg, dtype=None): cfg = cls.get_cfg().clone_merge(cfg) embed_initializer = mx.init.create(*cfg.INITIALIZER.embed) weight_initializer = mx.init.create(*cfg.INITIALIZER.weight) bias_initializer = mx.init.create(*cfg.INITIALIZER.bias) + if dtype is None: + dtype = cfg.MODEL.dtype return cls(src_vocab_size=cfg.MODEL.src_vocab_size, tgt_vocab_size=cfg.MODEL.tgt_vocab_size, 
max_src_length=cfg.MODEL.max_src_length, @@ -1117,10 +1288,11 @@ def from_cfg(cls, cfg): dec_recurrent=cfg.MODEL.DECODER.recurrent, dec_activation=cfg.MODEL.DECODER.activation, dec_pre_norm=cfg.MODEL.DECODER.pre_norm, + layout=cfg.MODEL.layout, embed_initializer=embed_initializer, weight_initializer=weight_initializer, bias_initializer=bias_initializer, - dtype=cfg.MODEL.dtype) + dtype=dtype) @use_np @@ -1154,33 +1326,45 @@ def state_batch_axis(self) -> Tuple[int, int, int, List]: position_batch_axis : int dec_layer_batch_axis : list """ - return 0, 0, 0, self.model.decoder.state_batch_axis + if self.model.layout == 'NT': + return 0, 0, 0, self.model.decoder.state_batch_axis + else: + return 1, 0, 0, self.model.decoder.state_batch_axis def init_states(self, src_data, src_valid_length): # TODO(sxjscience) Revisit here, support auxiliary states? """Initialize the states required for sequence sampling Parameters ---------- - src_data : - Shape (batch_size, src_length) - src_valid_length : + src_data + - layout = 'NT' + Shape (batch_size, src_length) + - layout = 'TN' + Shape (src_length, batch_size) + src_valid_length Shape (batch_size,) Returns ------- - enc_out : - Shape (batch_size, src_length, C_mem) - src_valid_length : + enc_out + - layout = 'NT' + Shape (batch_size, src_length, C_mem) + - layout = 'TN' + Shape (src_length, batch_size, C_mem) + src_valid_length Shape (batch_size,) - position : + position Shape (batch_size,) dec_states: list The states of the decoder """ - batch_size = src_data.shape[0] + if self.model.layout == 'NT': + batch_size = src_data.shape[0] + else: + batch_size = src_data.shape[1] ctx = src_data.ctx - enc_out = self.model.encode(mx.nd, src_data, src_valid_length) - position = mx.np.zeros((batch_size, 1), dtype=np.int32, ctx=ctx) + enc_out = self.model.encode(mx, src_data, src_valid_length) + position = mx.np.zeros((batch_size,), dtype=np.int32, ctx=ctx) dtype = enc_out.dtype dec_states = self.model.decoder.init_states(batch_size, ctx, dtype) return enc_out, src_valid_length, position, dec_states @@ -1190,32 +1374,38 @@ def hybrid_forward(self, F, step_data, states): Parameters ---------- - step_data : + step_data Shape (batch_size,) - states : tuple + states It includes : - mem_data : (batch_size, src_length, C_mem) - mem_valid_length : (batch_size,) - position : (batch_size,) - dec_states : list + - layout = 'NT' + mem_data : (batch_size, src_length, C_mem) + mem_valid_length : (batch_size,) + position : (batch_size,) + dec_states : list + - layout = 'TN' + mem_data : (src_length, batch_size, C_mem) + mem_valid_length : (batch_size,) + position : (batch_size,) + dec_states : list Returns ------- - out : + out Shape (batch_size, C) - new_states : tuple + new_states Has the same structure as the states """ mem_data, mem_valid_length, position, dec_states = states # 1. 
Get the embedding - step_data = F.np.expand_dims(step_data, axis=1) step_data = self.model.tgt_embed_layer(step_data) if self.model.scaled_embed: step_data = step_data * np.sqrt(self.model.dec_units) if self.model.pos_embed_type is not None: step_data = step_data + self.model.tgt_pos_embed_layer(position) + if self.model.layernorm_embedding: + step_data = self.tgt_embed_ln(step_data) out, new_states =\ self.model.decoder.incremental_decode(F, step_data, dec_states, mem_data, mem_valid_length) out = self.model.tgt_final_layer(out) - out = F.npx.reshape(out, (-2, -1)) return out, (mem_data, mem_valid_length, position + 1, new_states) diff --git a/src/gluonnlp/models/transformer_xl.py b/src/gluonnlp/models/transformer_xl.py index a232ec8c37..b6ff44c5df 100644 --- a/src/gluonnlp/models/transformer_xl.py +++ b/src/gluonnlp/models/transformer_xl.py @@ -81,6 +81,10 @@ def __init__(self, units: int = 512, pre_norm=pre_norm, dtype=dtype) + @property + def layout(self): + return self._layout + def hybrid_forward(self, F, data, mem, rel_positions, mask, query_r_bias, query_k_bias): """ @@ -118,7 +122,10 @@ def hybrid_forward(self, F, data, mem, rel_positions, mask, query_r_bias, query_ Returns ------- out - Shape (batch_size, query_length, units) + - layout = 'NT' + Shape (batch_size, query_length, units) + - layout = 'TN' + Shape (query_length, batch_size, units) """ if self._layout == 'NT': context = F.np.concatenate([mem, data], axis=1) diff --git a/src/gluonnlp/models/xlmr.py b/src/gluonnlp/models/xlmr.py index b433d34157..66a3784557 100644 --- a/src/gluonnlp/models/xlmr.py +++ b/src/gluonnlp/models/xlmr.py @@ -39,23 +39,6 @@ from ..data.tokenizers import SentencepieceTokenizer -PRETRAINED_URL = { - 'fairseq_xlmr_base': { - 'cfg': 'fairseq_xlmr_base/model-b893d178.yml', - 'sentencepiece.model': 'fairseq_xlmr_base/sentencepiece-18e17bae.model', - 'params': 'fairseq_xlmr_base/model-3fa134e9.params', - 'mlm_params': 'fairseq_xlmr_base/model_mlm-86e37954.params', - 'lowercase': False, - }, - 'fairseq_xlmr_large': { - 'cfg': 'fairseq_xlmr_large/model-01fc59fb.yml', - 'sentencepiece.model': 'fairseq_xlmr_large/sentencepiece-18e17bae.model', - 'params': 'fairseq_xlmr_large/model-b62b074c.params', - 'mlm_params': 'fairseq_xlmr_large/model_mlm-887506c2.params', - 'lowercase': False, - } -} - FILE_STATS = load_checksum_stats(os.path.join(get_model_zoo_checksum_dir(), 'xlmr.txt')) xlmr_cfg_reg = Registry('xlmr_cfg') @@ -86,10 +69,31 @@ def get_cfg(key=None): return xlmr_cfg_reg.create(key) else: return xlmr_base() + + +PRETRAINED_URL = { + 'fairseq_xlmr_base': { + 'cfg': xlmr_base(), + 'sentencepiece.model': 'fairseq_xlmr_base/sentencepiece-18e17bae.model', + 'params': 'fairseq_xlmr_base/model-3fa134e9.params', + 'mlm_params': 'fairseq_xlmr_base/model_mlm-86e37954.params', + 'lowercase': False, + }, + 'fairseq_xlmr_large': { + 'cfg': xlmr_large(), + 'sentencepiece.model': 'fairseq_xlmr_large/sentencepiece-18e17bae.model', + 'params': 'fairseq_xlmr_large/model-b62b074c.params', + 'mlm_params': 'fairseq_xlmr_large/model_mlm-887506c2.params', + 'lowercase': False, + } +} + + @use_np class XLMRForMLM(RobertaForMLM): pass + def list_pretrained_xlmr(): return sorted(list(PRETRAINED_URL.keys())) @@ -98,7 +102,7 @@ def get_pretrained_xlmr(model_name: str = 'fairseq_xlmr_base', root: str = get_model_zoo_home_dir(), load_backbone: bool = True, load_mlm: bool = False) \ - -> Tuple[CN, SentencepieceTokenizer, str]: + -> Tuple[CN, SentencepieceTokenizer, str, str]: """Get the pretrained XLM-R weights Parameters @@ 
-126,11 +130,18 @@ def get_pretrained_xlmr(model_name: str = 'fairseq_xlmr_base', assert model_name in PRETRAINED_URL, '{} is not found. All available are {}'.format( model_name, list_pretrained_xlmr()) cfg_path = PRETRAINED_URL[model_name]['cfg'] + if isinstance(cfg_path, CN): + cfg = cfg_path + else: + cfg = None sp_model_path = PRETRAINED_URL[model_name]['sentencepiece.model'] params_path = PRETRAINED_URL[model_name]['params'] mlm_params_path = PRETRAINED_URL[model_name]['mlm_params'] local_paths = dict() - for k, path in [('cfg', cfg_path), ('sentencepiece.model', sp_model_path)]: + download_jobs = [('sentencepiece.model', sp_model_path)] + if cfg is None: + download_jobs.append(('cfg', cfg_path)) + for k, path in download_jobs: local_paths[k] = download(url=get_repo_model_zoo_url() + path, path=os.path.join(root, path), sha1_hash=FILE_STATS[path]) @@ -152,7 +163,8 @@ def get_pretrained_xlmr(model_name: str = 'fairseq_xlmr_base', tokenizer = SentencepieceTokenizer( model_path=local_paths['sentencepiece.model'], lowercase=do_lower) - cfg = XLMRModel.get_cfg().clone_merge(local_paths['cfg']) + if cfg is None: + cfg = XLMRModel.get_cfg().clone_merge(local_paths['cfg']) return cfg, tokenizer, local_params_path, local_mlm_params_path diff --git a/src/gluonnlp/op.py b/src/gluonnlp/op.py index ba5bf7607a..a4762b4ad3 100644 --- a/src/gluonnlp/op.py +++ b/src/gluonnlp/op.py @@ -290,3 +290,22 @@ def relative_position_bucket(F, relative_position, val_if_large = F.np.minimum(val_if_large, num_buckets - 1) ret = ret + F.np.where(is_small, relative_position, val_if_large) return ret + + +def l2_normalize(F, data, axis=-1, eps=1e-6): + """Normalize the data by L2 normalization. + + Parameters + ---------- + F : mx.sym or mx.nd + data : symbol or ndarray + axis : int, default -1 + eps : float, default 1e-6 + + Returns + ------- + ret : mx.sym or mx.nd + """ + ret = data / (F.np.linalg.norm(data, axis=axis, keepdims=True) + eps) + return ret + diff --git a/src/gluonnlp/utils/testing.py b/src/gluonnlp/utils/testing.py index 00b2d4901d..abae1a804e 100644 --- a/src/gluonnlp/utils/testing.py +++ b/src/gluonnlp/utils/testing.py @@ -3,19 +3,56 @@ from mxnet.util import use_np +def is_match_states_batch_size(states, states_batch_axis, batch_size) -> bool: + """Test whether the generated states have the specified batch size + + Parameters + ---------- + states + The states structure + states_batch_axis + The states batch axis structure + batch_size + The batch size + + Returns + ------- + ret + """ + if states_batch_axis is None: + return True + if isinstance(states_batch_axis, int): + if states.shape[states_batch_axis] == batch_size: + return True + for ele_states_batch_axis, ele_states in zip(states_batch_axis, states): + ret = is_match_states_batch_size(ele_states, ele_states_batch_axis, batch_size) + if ret is False: + return False + return True + + @use_np -def verify_nmt_model(model, batch_size=4, src_seq_length=5, tgt_seq_length=10, - atol=1E-5, rtol=1E-5): +def verify_nmt_model(model, batch_size: int = 4, + src_seq_length: int = 5, + tgt_seq_length: int = 10, + atol: float = 1E-4, + rtol: float = 1E-4): """Verify the correctness of an NMT model. Raise error message if it detects problems. 
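For reference, a minimal sketch of how this verification helper is meant to be driven. The backbone class name (`TransformerModel`) and the tiny config values are assumptions for illustration, not part of this patch:

```python
import mxnet as mx
from gluonnlp.models.transformer import TransformerModel
from gluonnlp.utils.testing import verify_nmt_model

mx.npx.set_np()

# Shrink the base config so the consistency check runs quickly.
cfg = TransformerModel.get_cfg()
cfg.defrost()
cfg.MODEL.src_vocab_size = 32
cfg.MODEL.tgt_vocab_size = 32
cfg.MODEL.layout = 'TN'   # the helper transposes its random inputs for 'TN' models
cfg.freeze()

model = TransformerModel.from_cfg(cfg)
model.initialize()
model.hybridize()
verify_nmt_model(model)   # raises AssertionError if partial and full outputs diverge
```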
Parameters ---------- - model : - batch_size : - src_seq_length : - tgt_seq_length : - atol : - rtol : + model + The machine translation model + batch_size + The batch size to test the nmt model + src_seq_length + Length of the source sequence + tgt_seq_length + Length of the target sequence + atol + Absolute tolerance. + rtol + Relative tolerance. """ src_word_sequence = mx.np.random.randint(0, model.src_vocab_size, (batch_size, src_seq_length)) @@ -23,7 +60,13 @@ def verify_nmt_model(model, batch_size=4, src_seq_length=5, tgt_seq_length=10, src_valid_length = mx.np.random.randint(1, src_seq_length, (batch_size,)) min_tgt_seq_length = max(1, tgt_seq_length - 5) tgt_valid_length = mx.np.random.randint(min_tgt_seq_length, tgt_seq_length, (batch_size,)) - full_out = model(src_word_sequence, src_valid_length, tgt_word_sequence, tgt_valid_length) + + if model.layout == 'NT': + full_out = model(src_word_sequence, src_valid_length, tgt_word_sequence, tgt_valid_length) + else: + full_out = model(src_word_sequence.T, src_valid_length, + tgt_word_sequence.T, tgt_valid_length) + full_out = mx.np.swapaxes(full_out, 0, 1) if full_out.shape != (batch_size, tgt_seq_length, model.tgt_vocab_size): raise AssertionError('The output of NMT model does not match the expected output.' ' Model output shape = {}, Expected (B, T, V) = {}' @@ -31,11 +74,19 @@ def verify_nmt_model(model, batch_size=4, src_seq_length=5, tgt_seq_length=10, (batch_size, tgt_seq_length, model.tgt_vocab_size))) for partial_batch_size in range(1, batch_size + 1): for i in range(1, min_tgt_seq_length): - partial_out = model(src_word_sequence[:partial_batch_size, :], - src_valid_length[:partial_batch_size], - tgt_word_sequence[:partial_batch_size, :(-i)], - tgt_valid_length[:partial_batch_size] - - mx.np.array(i, dtype=tgt_valid_length.dtype)) + if model.layout == 'NT': + partial_out = model(src_word_sequence[:partial_batch_size, :], + src_valid_length[:partial_batch_size], + tgt_word_sequence[:partial_batch_size, :(-i)], + tgt_valid_length[:partial_batch_size] + - mx.np.array(i, dtype=tgt_valid_length.dtype)) + else: + partial_out = model(src_word_sequence[:partial_batch_size, :].T, + src_valid_length[:partial_batch_size], + tgt_word_sequence[:partial_batch_size, :(-i)].T, + tgt_valid_length[:partial_batch_size] + - mx.np.array(i, dtype=tgt_valid_length.dtype)) + partial_out = mx.np.swapaxes(partial_out, 0, 1) # Verify that the partial output matches the full output for b in range(partial_batch_size): partial_vl = tgt_valid_length.asnumpy()[b] - i @@ -45,37 +96,66 @@ def verify_nmt_model(model, batch_size=4, src_seq_length=5, tgt_seq_length=10, @use_np def verify_nmt_inference(train_model, inference_model, - batch_size=4, src_seq_length=5, tgt_seq_length=10, atol=1E-5, rtol=1E-5): + batch_size=4, src_seq_length=5, + tgt_seq_length=10, atol=1E-4, rtol=1E-4): """Verify the correctness of an NMT inference model. Raise error message if it detects any problems. 
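The inference check below relies on the new `is_match_states_batch_size` helper to confirm that sliced decoder states keep the expected batch size. A small illustration of the intended call pattern (the shapes are made up for the example):

```python
import mxnet as mx
from gluonnlp.utils.testing import is_match_states_batch_size

mx.npx.set_np()

batch_size, num_heads, key_dim = 4, 8, 16
# Two decoder layers, each caching an (init_key, init_value) pair with the batch on axis 0.
states = [(mx.np.zeros((batch_size, 0, num_heads, key_dim)),
           mx.np.zeros((batch_size, 0, num_heads, key_dim))) for _ in range(2)]
states_batch_axis = [(0, 0), (0, 0)]

assert is_match_states_batch_size(states, states_batch_axis, batch_size)
```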
Parameters ---------- - train_model : - inference_model : - batch_size : - src_seq_length : - tgt_seq_length : - atol : - rtol : + train_model + inference_model + batch_size + src_seq_length + tgt_seq_length + atol + Absolute tolerance + rtol + Relative tolerance """ - src_word_sequences = mx.np.random.randint(0, train_model.src_vocab_size, - (batch_size, src_seq_length)) - tgt_word_sequences = mx.np.random.randint(0, train_model.tgt_vocab_size, - (batch_size, tgt_seq_length)) + if train_model.layout == 'NT': + src_word_sequences = mx.np.random.randint(0, train_model.src_vocab_size, + (batch_size, src_seq_length)) + tgt_word_sequences = mx.np.random.randint(0, train_model.tgt_vocab_size, + (batch_size, tgt_seq_length)) + else: + src_word_sequences = mx.np.random.randint(0, train_model.src_vocab_size, + (src_seq_length, batch_size)) + tgt_word_sequences = mx.np.random.randint(0, train_model.tgt_vocab_size, + (tgt_seq_length, batch_size)) src_valid_length = mx.np.random.randint(1, src_seq_length, (batch_size,)) min_tgt_seq_length = max(1, tgt_seq_length - 5) tgt_valid_length = mx.np.random.randint(min_tgt_seq_length, tgt_seq_length, (batch_size,)) full_out = train_model(src_word_sequences, src_valid_length, tgt_word_sequences, tgt_valid_length) - for partial_batch_size in range(1, batch_size + 1): - step_out_l = [] - states = inference_model.init_states(src_word_sequences[:partial_batch_size, :], - src_valid_length[:partial_batch_size]) - for i in range(min_tgt_seq_length): - step_out, states = inference_model(tgt_word_sequences[:partial_batch_size, i], states) - step_out_l.append(step_out) - partial_out = mx.np.stack(step_out_l, axis=1) - npt.assert_allclose(full_out[:partial_batch_size, :min_tgt_seq_length].asnumpy(), - partial_out[:partial_batch_size, :].asnumpy(), atol, rtol) + if train_model.layout == 'NT': + for partial_batch_size in range(1, batch_size + 1): + step_out_l = [] + states = inference_model.init_states(src_word_sequences[:partial_batch_size, :], + src_valid_length[:partial_batch_size]) + assert is_match_states_batch_size(states, inference_model.state_batch_axis, + partial_batch_size) + for i in range(min_tgt_seq_length): + step_out, states = inference_model(tgt_word_sequences[:partial_batch_size, i], + states) + step_out_l.append(step_out) + partial_out = mx.np.stack(step_out_l, axis=1) + npt.assert_allclose(full_out[:partial_batch_size, :min_tgt_seq_length].asnumpy(), + partial_out[:partial_batch_size, :].asnumpy(), atol, rtol) + elif train_model.layout == 'TN': + for partial_batch_size in range(1, batch_size + 1): + step_out_l = [] + states = inference_model.init_states(src_word_sequences[:, :partial_batch_size], + src_valid_length[:partial_batch_size]) + assert is_match_states_batch_size(states, inference_model.state_batch_axis, + partial_batch_size) + for i in range(min_tgt_seq_length): + step_out, states = inference_model(tgt_word_sequences[i, :partial_batch_size], + states) + step_out_l.append(step_out) + partial_out = mx.np.stack(step_out_l, axis=0) + npt.assert_allclose(full_out[:min_tgt_seq_length, :partial_batch_size].asnumpy(), + partial_out[:, :partial_batch_size].asnumpy(), atol, rtol) + else: + raise NotImplementedError diff --git a/test_batch.sh b/test_batch.sh deleted file mode 100644 index e12efd96a6..0000000000 --- a/test_batch.sh +++ /dev/null @@ -1,7 +0,0 @@ -python3 tools/batch/submit-job.py \ - --region us-east-1 \ - --job-type g4dn.4x \ - --name test_conversion \ - --work-dir scripts/conversion_toolkits/ \ - --command 'bash 
convert_bert_from_tf_hub.sh | tee stdout.log' \ - --wait diff --git a/tests/test_attention_cell.py b/tests/test_attention_cell.py index 489f566beb..3b874b0d55 100644 --- a/tests/test_attention_cell.py +++ b/tests/test_attention_cell.py @@ -173,23 +173,27 @@ def test_dot_product_attention(scaled, normalized): @pytest.mark.seed(123) def test_gen_attn_mask(): class GenSelfAttnMask(HybridBlock): - def __init__(self, dtype, attn_type): + def __init__(self, dtype, layout, attn_type): super().__init__() self._dtype = dtype + self._layout = layout self._attn_type = attn_type def hybrid_forward(self, F, data, valid_length): return gen_self_attn_mask(F, data, valid_length, - dtype=self._dtype, attn_type=self._attn_type) + dtype=self._dtype, + layout=self._layout, + attn_type=self._attn_type) class GenMemAttnMask(HybridBlock): - def __init__(self, dtype): + def __init__(self, dtype, layout): super().__init__() self._dtype = dtype + self._layout = layout def hybrid_forward(self, F, mem, mem_valid_length, data, valid_length): return gen_mem_attn_mask(F, mem, mem_valid_length, data, valid_length, - dtype=self._dtype) + dtype=self._dtype, layout=self._layout) batch_size = 4 query_length = 8 @@ -203,11 +207,17 @@ def hybrid_forward(self, F, mem, mem_valid_length, data, valid_length): for hybridize in [False, True]: # Test Full Attention Mask - mask_gen = GenSelfAttnMask(dtype=np.float32, attn_type='full') + mask_gen_nt = GenSelfAttnMask(dtype=np.float32, layout='NT', attn_type='full') + mask_gen_tn = GenSelfAttnMask(dtype=np.float32, layout='TN', attn_type='full') if hybridize: - mask_gen.hybridize() - mask = mask_gen(data, valid_length) - mask = mask.asnumpy() + mask_gen_nt.hybridize() + mask_gen_tn.hybridize() + mask_nt = mask_gen_nt(data, valid_length) + mask_nt = mask_nt.asnumpy() + mask_tn = mask_gen_tn(mx.np.swapaxes(data, 0, 1), valid_length) + mask_tn = mask_tn.asnumpy() + mask = mask_nt + assert_allclose(mask_nt, mask_tn) for b in range(batch_size): v_l = valid_length.asnumpy()[b] for i in range(v_l): @@ -217,11 +227,15 @@ def hybrid_forward(self, F, mem, mem_valid_length, data, valid_length): assert (mask[b, i, :] == 0).all() # Test Causal Attention Mask - mask_gen = GenSelfAttnMask(dtype=np.float32, attn_type='causal') + mask_gen_nt = GenSelfAttnMask(dtype=np.float32, layout='NT', attn_type='causal') + mask_gen_tn = GenSelfAttnMask(dtype=np.float32, layout='TN', attn_type='causal') if hybridize: - mask_gen.hybridize() - mask = mask_gen(data, valid_length) - mask = mask.asnumpy() + mask_gen_nt.hybridize() + mask_gen_tn.hybridize() + mask_nt = mask_gen_nt(data, valid_length) + mask_tn = mask_gen_tn(mx.np.swapaxes(data, 0, 1), valid_length) + assert_allclose(mask_nt.asnumpy(), mask_tn.asnumpy()) + mask = mask_nt.asnumpy() for b in range(batch_size): v_l = valid_length.asnumpy()[b] for i in range(v_l): @@ -231,11 +245,16 @@ def hybrid_forward(self, F, mem, mem_valid_length, data, valid_length): assert (mask[b, i, :] == 0).all() # Test Mem Attention Mask - mask_gen = GenMemAttnMask(dtype=np.float32) + mask_gen_nt = GenMemAttnMask(dtype=np.float32, layout='NT') + mask_gen_tn = GenMemAttnMask(dtype=np.float32, layout='TN') if hybridize: - mask_gen.hybridize() - mask = mask_gen(mem, mem_valid_length, data, valid_length) - mask = mask.asnumpy() + mask_gen_nt.hybridize() + mask_gen_tn.hybridize() + mask_nt = mask_gen_nt(mem, mem_valid_length, data, valid_length) + mask_tn = mask_gen_tn(mx.np.swapaxes(mem, 0, 1), mem_valid_length, + mx.np.swapaxes(data, 0, 1), valid_length) + mask = mask_nt.asnumpy() + 
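Outside of the parametrized test, the layout equivalence of the mask helpers can also be checked directly in imperative mode. A minimal sketch, assuming `gen_self_attn_mask` is importable from `gluonnlp.attention_cell` and passing `mx` as the `F` handle, as the rest of this patch does:

```python
import mxnet as mx
import numpy as np
from numpy.testing import assert_allclose
from gluonnlp.attention_cell import gen_self_attn_mask

mx.npx.set_np()

batch_size, seq_length, units = 4, 8, 16
data_nt = mx.np.random.normal(0, 1, (batch_size, seq_length, units))
valid_length = mx.np.random.randint(1, seq_length, (batch_size,))

# The mask depends only on valid_length and the time axis, so the 'NT' and 'TN'
# variants should agree elementwise; both have shape (batch_size, seq_length, seq_length).
mask_nt = gen_self_attn_mask(mx, data_nt, valid_length,
                             dtype=np.float32, layout='NT', attn_type='causal')
mask_tn = gen_self_attn_mask(mx, mx.np.swapaxes(data_nt, 0, 1), valid_length,
                             dtype=np.float32, layout='TN', attn_type='causal')
assert_allclose(mask_nt.asnumpy(), mask_tn.asnumpy())
```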
assert_allclose(mask_nt.asnumpy(), mask_tn.asnumpy()) for b in range(batch_size): data_v_l = valid_length.asnumpy()[b] mem_v_l = mem_valid_length.asnumpy()[b] diff --git a/tests/test_embedding.py b/tests/test_embedding.py new file mode 100644 index 0000000000..b9be912339 --- /dev/null +++ b/tests/test_embedding.py @@ -0,0 +1,50 @@ +import numpy as np +import collections +import os +import tempfile +import pytest +from gluonnlp.embedding import load_embeddings, get_fasttext_model +from gluonnlp.data import Vocab + +def test_load_embeddings(): + text_data = ['hello', 'world', 'hello', 'nice', 'world', 'hi', 'world', 'sadgood'] + counter = collections.Counter(text_data) + vocab1 = Vocab(counter) + # load with vocab + matrix1 = load_embeddings(vocab1) + assert len(matrix1) == len(vocab1) + # load without vocab + matrix2, vocab2 = load_embeddings() + assert len(matrix2) == len(vocab2) + np.testing.assert_almost_equal(matrix1[vocab1["hello"]], matrix2[vocab2["hello"]]) + + # test_unk_method + def simple(words): + return np.ones((len(words), 50)) + matrix3 = load_embeddings(vocab1, unk_method=simple) + assert sum(matrix3[vocab1['sadgood']] == 1) == matrix3.shape[-1] + np.testing.assert_almost_equal(matrix3[vocab1["hello"]], matrix2[vocab2["hello"]]) + + # load txt + with tempfile.TemporaryDirectory() as root: + path = os.path.join(root, "tmp.txt") + with open(path, "w") as f: + f.write("{} {}\n".format(matrix1.shape[0], matrix1.shape[1])) + for word, vec in zip(vocab1.all_tokens, matrix1): + f.write(word + " ") + f.write(" ".join([str(num) for num in vec.tolist()])) + f.write("\n") + matrix4 = load_embeddings(vocab1, path) + np.testing.assert_almost_equal(matrix4, matrix1) + + +def test_get_fasttext_model(): + text_data = ['hello', 'world', 'hello', 'nice', 'world', 'hi', 'world'] + counter = collections.Counter(text_data) + vocab1 = Vocab(counter) + matrix1 = load_embeddings(vocab1, 'wiki.en') + ft = get_fasttext_model('wiki.en') + np.testing.assert_almost_equal(matrix1[vocab1["hello"]], ft['hello'], decimal=4) + with pytest.raises(ValueError): + get_fasttext_model('wiki.multi.ar') + diff --git a/tests/test_models_albert.py b/tests/test_models_albert.py index 2fd7bbdba5..f428a85569 100644 --- a/tests/test_models_albert.py +++ b/tests/test_models_albert.py @@ -30,17 +30,36 @@ def get_test_cfg(): return cfg -def test_albert_backbone(): +@pytest.mark.parametrize('static_alloc,static_shape', [(False, False), + (True, True)]) +@pytest.mark.parametrize('compute_layout', ['auto', 'NT', 'TN']) +def test_albert_backbone(static_alloc, static_shape, compute_layout): batch_size = 3 cfg = get_test_cfg() + cfg.defrost() + cfg.MODEL.compute_layout = compute_layout + cfg.freeze() model = AlbertModel.from_cfg(cfg, use_pooler=True) model.initialize() - model.hybridize(static_alloc=True, static_shape=True) + model.hybridize(static_alloc=static_alloc, static_shape=static_shape) + cfg_tn = cfg.clone() + cfg_tn.defrost() + cfg_tn.MODEL.layout = 'TN' + cfg_tn.freeze() + model_tn = AlbertModel.from_cfg(cfg_tn, use_pooler=True) + model_tn.share_parameters(model.collect_params()) + model_tn.hybridize(static_alloc=static_alloc, static_shape=static_shape) + for seq_length in [64, 96]: valid_length = mx.np.random.randint(seq_length // 2, seq_length, (batch_size,)) inputs = mx.np.random.randint(0, cfg.MODEL.vocab_size, (batch_size, seq_length)) token_types = mx.np.random.randint(0, cfg.MODEL.num_token_types, (batch_size, seq_length)) contextual_embedding, pooled_out = model(inputs, token_types, valid_length) + 
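For reference, the new embedding loader exercised in `tests/test_embedding.py` above is used roughly as follows. This is a sketch; the 50-dimensional filler mirrors the default pretrained source assumed by that test:

```python
import collections
import numpy as np
from gluonnlp.data import Vocab
from gluonnlp.embedding import load_embeddings

# Build a toy vocabulary and attach pretrained vectors to it (downloaded on first use).
vocab = Vocab(collections.Counter(['hello', 'world', 'hello', 'nice']))
matrix = load_embeddings(vocab)          # one row per token in `vocab`
assert len(matrix) == len(vocab)

# Tokens absent from the pretrained file are filled by `unk_method`, a callable that
# receives the missing words and returns their vectors.
matrix_filled = load_embeddings(vocab, unk_method=lambda words: np.ones((len(words), 50)))
```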
contextual_embedding_tn, pooled_out_tn = model_tn(inputs.T, token_types.T, valid_length) + # Verify layout + assert_allclose(np.swapaxes(contextual_embedding_tn.asnumpy(), 0, 1), + contextual_embedding.asnumpy(), 1E-4, 1E-4) + assert_allclose(pooled_out_tn.asnumpy(), pooled_out.asnumpy(), 1E-4, 1E-4) assert contextual_embedding.shape == (batch_size, seq_length, cfg.MODEL.units) assert pooled_out.shape == (batch_size, cfg.MODEL.units) # Ensure the embeddings that exceed valid_length are masked @@ -65,35 +84,72 @@ def test_albert_backbone(): assert_allclose(new_pooled_out_np, pooled_out_np, 1E-4, 1E-4) -def test_albert_for_mlm_model(): +@pytest.mark.parametrize('compute_layout', ['auto', 'NT', 'TN']) +def test_albert_for_mlm_model(compute_layout): batch_size = 3 cfg = get_test_cfg() + cfg.defrost() + cfg.MODEL.compute_layout = compute_layout + cfg.freeze() albert_mlm_model = AlbertForMLM(backbone_cfg=cfg) albert_mlm_model.initialize() albert_mlm_model.hybridize() + cfg_tn = cfg.clone() + cfg_tn.defrost() + cfg_tn.MODEL.layout = 'TN' + cfg_tn.freeze() + albert_mlm_tn_model = AlbertForMLM(backbone_cfg=cfg_tn) + albert_mlm_tn_model.share_parameters(albert_mlm_model.collect_params()) + albert_mlm_tn_model.hybridize() + num_mask = 16 seq_length = 64 inputs = mx.np.random.randint(0, cfg.MODEL.vocab_size, (batch_size, seq_length)) token_types = mx.np.random.randint(0, cfg.MODEL.num_token_types, (batch_size, seq_length)) valid_length = mx.np.random.randint(seq_length // 2, seq_length, (batch_size,)) masked_positions = mx.np.random.randint(0, seq_length // 2, (batch_size, num_mask)) - _, _, mlm_scores = albert_mlm_model(inputs, token_types, valid_length, masked_positions) + contextual_embeddings, pooled_out, mlm_scores = albert_mlm_model(inputs, token_types, valid_length, masked_positions) + contextual_embeddings_tn, pooled_out_tn, mlm_scores_tn = albert_mlm_tn_model(inputs.T, token_types.T, valid_length, masked_positions) + assert_allclose(np.swapaxes(contextual_embeddings_tn.asnumpy(), 0, 1), + contextual_embeddings.asnumpy(), 1E-4, 1E-4) + assert_allclose(pooled_out_tn.asnumpy(), pooled_out.asnumpy(), 1E-4, 1E-4) + assert_allclose(mlm_scores_tn.asnumpy(), mlm_scores.asnumpy(), 1E-4, 1E-4) assert mlm_scores.shape == (batch_size, num_mask, cfg.MODEL.vocab_size) -def test_albert_for_pretrain_model(): +@pytest.mark.parametrize('compute_layout', ['auto', 'NT', 'TN']) +def test_albert_for_pretrain_model(compute_layout): batch_size = 3 cfg = get_test_cfg() + cfg.defrost() + cfg.MODEL.compute_layout = compute_layout + cfg.freeze() albert_pretrain_model = AlbertForPretrain(backbone_cfg=cfg) albert_pretrain_model.initialize() albert_pretrain_model.hybridize() + cfg_tn = cfg.clone() + cfg_tn.defrost() + cfg_tn.MODEL.layout = 'TN' + cfg_tn.freeze() + albert_pretrain_model_tn = AlbertForPretrain(backbone_cfg=cfg_tn) + albert_pretrain_model_tn.share_parameters(albert_pretrain_model.collect_params()) + albert_pretrain_model_tn.hybridize() + num_mask = 16 seq_length = 64 inputs = mx.np.random.randint(0, cfg.MODEL.vocab_size, (batch_size, seq_length)) token_types = mx.np.random.randint(0, cfg.MODEL.num_token_types, (batch_size, seq_length)) valid_length = mx.np.random.randint(seq_length // 2, seq_length, (batch_size,)) masked_positions = mx.np.random.randint(0, seq_length // 2, (batch_size, num_mask)) - _, _, sop_score, mlm_scores = albert_pretrain_model(inputs, token_types, valid_length, masked_positions) + contextual_embeddings, pooled_out, sop_score, mlm_scores =\ + albert_pretrain_model(inputs, token_types, 
valid_length, masked_positions) + contextual_embeddings_tn, pooled_out_tn, sop_score_tn, mlm_scores_tn = \ + albert_pretrain_model_tn(inputs.T, token_types.T, valid_length, masked_positions) + assert_allclose(np.swapaxes(contextual_embeddings_tn.asnumpy(), 0, 1), + contextual_embeddings.asnumpy(), 1E-4, 1E-4) + assert_allclose(pooled_out_tn.asnumpy(), pooled_out.asnumpy(), 1E-4, 1E-4) + assert_allclose(sop_score.asnumpy(), sop_score_tn.asnumpy(), 1E-4, 1E-4) + assert_allclose(mlm_scores.asnumpy(), mlm_scores_tn.asnumpy(), 1E-4, 1E-4) assert mlm_scores.shape == (batch_size, num_mask, cfg.MODEL.vocab_size) assert sop_score.shape == (batch_size, 2) diff --git a/tests/test_models_bart.py b/tests/test_models_bart.py index 9ff66f7c0b..d6130b63fb 100644 --- a/tests/test_models_bart.py +++ b/tests/test_models_bart.py @@ -3,8 +3,8 @@ import mxnet as mx import tempfile from gluonnlp.models.bart import BartModel, \ - list_pretrained_bart, get_pretrained_bart -from gluonnlp.loss import LabelSmoothCrossEntropyLoss + list_pretrained_bart, get_pretrained_bart, bart_cfg_reg +from gluonnlp.utils.testing import verify_nmt_model mx.npx.set_np() @@ -22,42 +22,31 @@ def test_bart(model_name): cfg, tokenizer, params_path, _ =\ get_pretrained_bart(model_name, load_backbone=True, root=root) assert cfg.MODEL.vocab_size == len(tokenizer.vocab) - # test backbone + # test standard bart encoder and decoder bart_model = BartModel.from_cfg(cfg) bart_model.load_parameters(params_path) - # test mlm model + # test bart encoder and decoder with pooler bart_model_with_pooler = BartModel.from_cfg( cfg, use_pooler=True, classifier_activation=False) bart_model_with_pooler.load_parameters(params_path) - # test forward - batch_size = 3 - seq_length = 32 - vocab_size = len(tokenizer.vocab) - input_ids = mx.np.array( - np.random.randint( - 2, - vocab_size, - (batch_size, seq_length) - ), - dtype=np.int32 - ) - valid_length = mx.np.array( - np.random.randint( - seq_length // 2, - seq_length, - (batch_size,) - ), - dtype=np.int32 - ) - contextual_embeddings, pooled_out = bart_model_with_pooler( - input_ids, valid_length, input_ids, valid_length) - mx.npx.waitall() - # test backward - label_smooth_loss = LabelSmoothCrossEntropyLoss(num_labels=vocab_size) - with mx.autograd.record(): - contextual_embeddings, pooled_out = bart_model_with_pooler( - input_ids, valid_length, input_ids, valid_length) - loss = label_smooth_loss(contextual_embeddings, input_ids) - loss.backward() + +def test_bart_cfg_registry(): + assert len(bart_cfg_reg.list_keys()) > 0 + +@pytest.mark.parametrize('cfg_key', bart_cfg_reg.list_keys()) +def test_bart_cfg(cfg_key): + cfg = BartModel.get_cfg(cfg_key) + cfg.defrost() + cfg.MODEL.vocab_size = 32 + cfg.freeze() + model = BartModel.from_cfg(cfg) + model.initialize() + model.hybridize() + cfg.defrost() + cfg.MODEL.layout = 'TN' + cfg.freeze() + model_tn = BartModel.from_cfg(cfg) + model_tn.share_parameters(model.collect_params()) + model_tn.hybridize() mx.npx.waitall() diff --git a/tests/test_models_bert.py b/tests/test_models_bert.py index cb1feedc66..a0d9a8d742 100644 --- a/tests/test_models_bert.py +++ b/tests/test_models_bert.py @@ -1,5 +1,4 @@ import pytest -import numpy as np from numpy.testing import assert_allclose import mxnet as mx import tempfile @@ -12,6 +11,83 @@ def test_list_pretrained_bert(): assert len(list_pretrained_bert()) > 0 +@pytest.mark.parametrize('compute_layout', ['auto', 'NT', 'TN']) +def test_bert_small_cfg(compute_layout): + cfg = BertModel.get_cfg() + cfg.defrost() + 
cfg.MODEL.vocab_size = 100 + cfg.MODEL.units = 12 * 8 + cfg.MODEL.hidden_size = 64 + cfg.MODEL.num_layers = 2 + cfg.MODEL.num_heads = 2 + cfg.MODEL.compute_layout = compute_layout + cfg.freeze() + + # Generate TN layout + cfg_tn = cfg.clone() + cfg_tn.defrost() + cfg_tn.MODEL.layout = 'TN' + cfg_tn.freeze() + + # Sample data + batch_size = 4 + sequence_length = 16 + num_mask = 3 + inputs = mx.np.random.randint(0, 10, (batch_size, sequence_length)) + token_types = mx.np.random.randint(0, 2, (batch_size, sequence_length)) + valid_length = mx.np.random.randint(3, sequence_length, (batch_size,)) + masked_positions = mx.np.random.randint(0, 3, (batch_size, num_mask)) + + # Test for BertModel + bert_model = BertModel.from_cfg(cfg) + bert_model.initialize() + bert_model.hybridize() + contextual_embedding, pooled_out = bert_model(inputs, token_types, valid_length) + bert_model_tn = BertModel.from_cfg(cfg_tn) + bert_model_tn.share_parameters(bert_model.collect_params()) + bert_model_tn.hybridize() + contextual_embedding_tn, pooled_out_tn = bert_model_tn(inputs.T, token_types.T, valid_length) + assert_allclose(contextual_embedding.asnumpy(), + mx.np.swapaxes(contextual_embedding_tn, 0, 1).asnumpy(), + 1E-4, 1E-4) + assert_allclose(pooled_out.asnumpy(), pooled_out_tn.asnumpy(), 1E-4, 1E-4) + + # Test for BertForMLM + bert_mlm_model = BertForMLM(cfg) + bert_mlm_model.initialize() + bert_mlm_model.hybridize() + contextual_embedding, pooled_out, mlm_score = bert_mlm_model(inputs, token_types, + valid_length, masked_positions) + bert_mlm_model_tn = BertForMLM(cfg_tn) + bert_mlm_model_tn.share_parameters(bert_mlm_model.collect_params()) + bert_mlm_model_tn.hybridize() + contextual_embedding_tn, pooled_out_tn, mlm_score_tn =\ + bert_mlm_model_tn(inputs.T, token_types.T, valid_length, masked_positions) + assert_allclose(contextual_embedding.asnumpy(), + mx.np.swapaxes(contextual_embedding_tn, 0, 1).asnumpy(), + 1E-4, 1E-4) + assert_allclose(pooled_out.asnumpy(), pooled_out_tn.asnumpy(), 1E-4, 1E-4) + assert_allclose(mlm_score.asnumpy(), mlm_score_tn.asnumpy(), 1E-4, 1E-4) + + # Test for BertForPretrain + bert_pretrain_model = BertForPretrain(cfg) + bert_pretrain_model.initialize() + bert_pretrain_model.hybridize() + contextual_embedding, pooled_out, nsp_score, mlm_scores =\ + bert_pretrain_model(inputs, token_types, valid_length, masked_positions) + bert_pretrain_model_tn = BertForPretrain(cfg_tn) + bert_pretrain_model_tn.share_parameters(bert_pretrain_model.collect_params()) + bert_pretrain_model_tn.hybridize() + contextual_embedding_tn, pooled_out_tn, nsp_score_tn, mlm_scores_tn = \ + bert_pretrain_model_tn(inputs.T, token_types.T, valid_length, masked_positions) + assert_allclose(contextual_embedding.asnumpy(), + mx.np.swapaxes(contextual_embedding_tn, 0, 1).asnumpy(), + 1E-4, 1E-4) + assert_allclose(pooled_out.asnumpy(), pooled_out_tn.asnumpy(), 1E-4, 1E-4) + assert_allclose(nsp_score.asnumpy(), nsp_score_tn.asnumpy(), 1E-4, 1E-4) + assert_allclose(mlm_score.asnumpy(), mlm_score_tn.asnumpy(), 1E-4, 1E-4) + + @pytest.mark.remote_required @pytest.mark.parametrize('model_name', list_pretrained_bert()) def test_bert_get_pretrained(model_name): diff --git a/tests/test_models_electra.py b/tests/test_models_electra.py index 8866cd7921..17f9420a07 100644 --- a/tests/test_models_electra.py +++ b/tests/test_models_electra.py @@ -3,14 +3,68 @@ from numpy.testing import assert_allclose import mxnet as mx import tempfile -from gluonnlp.models.electra import ElectraModel, ElectraDiscriminator, ElectraGenerator,\ 
+from gluonnlp.models.electra import ElectraModel, ElectraDiscriminator,\ + ElectraGenerator,\ list_pretrained_electra, get_pretrained_electra, get_generator_cfg mx.npx.set_np() +def test_list_pretrained_electra(): + assert len(list_pretrained_electra()) > 0 + + +def get_test_cfg(): + cfg = ElectraModel.get_cfg() + cfg.defrost() + cfg.MODEL.vocab_size = 100 + cfg.MODEL.units = 12 * 8 + cfg.MODEL.hidden_size = 128 + cfg.MODEL.num_heads = 2 + cfg.MODEL.num_layers = 2 + cfg.freeze() + return cfg + + +@pytest.mark.parametrize('compute_layout', ['auto', 'NT', 'TN']) +def test_electra_model(compute_layout): + cfg = get_test_cfg() + cfg.defrost() + cfg.MODEL.compute_layout = compute_layout + cfg.freeze() + + # Generate TN layout + cfg_tn = cfg.clone() + cfg_tn.defrost() + cfg_tn.MODEL.layout = 'TN' + cfg_tn.freeze() + + # Sample data + batch_size = 4 + sequence_length = 16 + num_mask = 3 + inputs = mx.np.random.randint(0, 10, (batch_size, sequence_length)) + token_types = mx.np.random.randint(0, 2, (batch_size, sequence_length)) + valid_length = mx.np.random.randint(3, sequence_length, (batch_size,)) + masked_positions = mx.np.random.randint(0, 3, (batch_size, num_mask)) + + electra_model = ElectraModel.from_cfg(cfg) + electra_model.initialize() + electra_model.hybridize() + contextual_embedding, pooled_out = electra_model(inputs, token_types, valid_length) + electra_model_tn = ElectraModel.from_cfg(cfg_tn) + electra_model_tn.share_parameters(electra_model.collect_params()) + electra_model_tn.hybridize() + contextual_embedding_tn, pooled_out_tn = electra_model_tn(inputs.T, token_types.T, valid_length) + assert_allclose(contextual_embedding.asnumpy(), + np.swapaxes(contextual_embedding_tn.asnumpy(), 0, 1), + 1E-4, 1E-4) + assert_allclose(pooled_out.asnumpy(), pooled_out_tn.asnumpy(), + 1E-4, 1E-4) + + @pytest.mark.remote_required @pytest.mark.parametrize('model_name', list_pretrained_electra()) -def test_bert_get_pretrained(model_name): +def test_electra_get_pretrained(model_name): assert len(list_pretrained_electra()) > 0 with tempfile.TemporaryDirectory() as root: cfg, tokenizer, backbone_params_path, (disc_params_path, gen_params_path) =\ @@ -34,6 +88,5 @@ def test_bert_get_pretrained(model_name): electra_disc_model.backbone_model.token_pos_embed.collect_params(), electra_disc_model.backbone_model.embed_layer_norm.collect_params()) - electra_gen_model = ElectraGenerator(cfg) electra_gen_model.backbone_model.load_parameters(backbone_params_path) diff --git a/tests/test_models_mobilebert.py b/tests/test_models_mobilebert.py index bfd1e3d882..d7f22ac533 100644 --- a/tests/test_models_mobilebert.py +++ b/tests/test_models_mobilebert.py @@ -12,9 +12,85 @@ def test_list_pretrained_mobilebert(): assert len(list_pretrained_mobilebert()) > 0 +@pytest.mark.parametrize('compute_layout', ['auto', 'TN', 'NT']) +def test_mobilebert_model_small_cfg(compute_layout): + cfg = MobileBertModel.get_cfg() + cfg.defrost() + cfg.MODEL.vocab_size = 100 + cfg.MODEL.num_layers = 2 + cfg.MODEL.hidden_size = 128 + cfg.MODEL.num_heads = 2 + cfg.MODEL.compute_layout = compute_layout + cfg.freeze() + + # Generate TN layout + cfg_tn = cfg.clone() + cfg_tn.defrost() + cfg_tn.MODEL.layout = 'TN' + cfg_tn.freeze() + + batch_size = 4 + sequence_length = 16 + num_mask = 3 + inputs = mx.np.random.randint(0, 10, (batch_size, sequence_length)) + token_types = mx.np.random.randint(0, 2, (batch_size, sequence_length)) + valid_length = mx.np.random.randint(3, sequence_length, (batch_size,)) + masked_positions = mx.np.random.randint(0, 3, 
(batch_size, num_mask)) + + mobile_bert_model = MobileBertModel.from_cfg(cfg) + mobile_bert_model.initialize() + mobile_bert_model.hybridize() + mobile_bert_model_tn = MobileBertModel.from_cfg(cfg_tn) + mobile_bert_model_tn.share_parameters(mobile_bert_model.collect_params()) + mobile_bert_model_tn.hybridize() + contextual_embedding, pooled_out = mobile_bert_model(inputs, token_types, valid_length) + contextual_embedding_tn, pooled_out_tn = mobile_bert_model_tn(inputs.T, + token_types.T, valid_length) + assert_allclose(contextual_embedding.asnumpy(), + np.swapaxes(contextual_embedding_tn.asnumpy(), 0, 1), + 1E-4, 1E-4) + assert_allclose(pooled_out.asnumpy(), pooled_out_tn.asnumpy(), 1E-4, 1E-4) + + # Test for MobileBertForMLM + mobile_bert_mlm_model = MobileBertForMLM(cfg) + mobile_bert_mlm_model.initialize() + mobile_bert_mlm_model.hybridize() + mobile_bert_mlm_model_tn = MobileBertForMLM(cfg_tn) + mobile_bert_mlm_model_tn.share_parameters(mobile_bert_mlm_model.collect_params()) + mobile_bert_model_tn.hybridize() + contextual_embedding, pooled_out, mlm_scores = mobile_bert_mlm_model(inputs, token_types, + valid_length, + masked_positions) + contextual_embedding_tn, pooled_out_tn, mlm_scores_tn =\ + mobile_bert_mlm_model_tn(inputs.T, token_types.T, valid_length, masked_positions) + assert_allclose(contextual_embedding.asnumpy(), + np.swapaxes(contextual_embedding_tn.asnumpy(), 0, 1), + 1E-4, 1E-4) + assert_allclose(pooled_out_tn.asnumpy(), pooled_out.asnumpy(), 1E-4, 1E-4) + assert_allclose(mlm_scores_tn.asnumpy(), mlm_scores.asnumpy(), 1E-4, 1E-4) + + # Test for MobileBertForPretrain + mobile_bert_pretrain_model = MobileBertForPretrain(cfg) + mobile_bert_pretrain_model.initialize() + mobile_bert_pretrain_model.hybridize() + mobile_bert_pretrain_model_tn = MobileBertForPretrain(cfg_tn) + mobile_bert_pretrain_model_tn.share_parameters(mobile_bert_pretrain_model.collect_params()) + mobile_bert_pretrain_model_tn.hybridize() + contextual_embedding, pooled_out, nsp_score, mlm_scores =\ + mobile_bert_pretrain_model(inputs, token_types, valid_length, masked_positions) + contextual_embedding_tn, pooled_out_tn, nsp_score_tn, mlm_scores_tn = \ + mobile_bert_pretrain_model_tn(inputs.T, token_types.T, valid_length, masked_positions) + assert_allclose(contextual_embedding.asnumpy(), + np.swapaxes(contextual_embedding_tn.asnumpy(), 0, 1), + 1E-4, 1E-4) + assert_allclose(pooled_out.asnumpy(), pooled_out_tn.asnumpy(), 1E-4, 1E-4) + assert_allclose(nsp_score.asnumpy(), nsp_score_tn.asnumpy(), 1E-4, 1E-4) + assert_allclose(mlm_scores.asnumpy(), mlm_scores_tn.asnumpy(), 1E-4, 1E-4) + + @pytest.mark.remote_required @pytest.mark.parametrize('model_name', list_pretrained_mobilebert()) -def test_bert_get_pretrained(model_name): +def test_mobilebert_get_pretrained(model_name): with tempfile.TemporaryDirectory() as root: cfg, tokenizer, backbone_params_path, mlm_params_path =\ get_pretrained_mobilebert(model_name, load_backbone=True, load_mlm=True, root=root) diff --git a/tests/test_models_roberta.py b/tests/test_models_roberta.py index 9511c51472..bedf85f027 100644 --- a/tests/test_models_roberta.py +++ b/tests/test_models_roberta.py @@ -2,6 +2,7 @@ import numpy as np import mxnet as mx import tempfile +from numpy.testing import assert_allclose from gluonnlp.models.roberta import RobertaModel, RobertaForMLM, \ list_pretrained_roberta, get_pretrained_roberta from gluonnlp.loss import LabelSmoothCrossEntropyLoss @@ -13,6 +14,59 @@ def test_list_pretrained_roberta(): assert len(list_pretrained_roberta()) > 0 
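With the change at the top of this patch, `PRETRAINED_URL[...]['cfg']` may hold an in-memory `CfgNode` instead of a YAML path (as done for XLM-R here), so the usual loading flow stays the same but can skip one download. A sketch of that flow; the construction and parameter-loading steps are the standard ones, not introduced by this patch:

```python
import mxnet as mx
from gluonnlp.models.roberta import RobertaModel, get_pretrained_roberta

mx.npx.set_np()

cfg, tokenizer, params_path, mlm_params_path = get_pretrained_roberta('fairseq_roberta_base')
# `cfg` may already be a CfgNode built from the registered config, in which case
# get_pretrained_roberta does not download or clone_merge a separate .yml file.
model = RobertaModel.from_cfg(cfg)
model.load_parameters(params_path)
```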
+@pytest.mark.parametrize('compute_layout', ['auto', 'TN', 'NT'])
+def test_roberta_small_config(compute_layout):
+    cfg = RobertaModel.get_cfg()
+    cfg.defrost()
+    cfg.MODEL.vocab_size = 1000
+    cfg.MODEL.num_layers = 2
+    cfg.MODEL.hidden_size = 128
+    cfg.MODEL.num_heads = 2
+    cfg.MODEL.compute_layout = compute_layout
+    cfg.freeze()
+
+    # Generate TN layout
+    cfg_tn = cfg.clone()
+    cfg_tn.defrost()
+    cfg_tn.MODEL.layout = 'TN'
+    cfg_tn.freeze()
+
+    batch_size = 4
+    sequence_length = 16
+    num_mask = 3
+    inputs = mx.np.random.randint(0, 10, (batch_size, sequence_length))
+    valid_length = mx.np.random.randint(3, sequence_length, (batch_size,))
+    masked_positions = mx.np.random.randint(0, 3, (batch_size, num_mask))
+
+    roberta_model = RobertaModel.from_cfg(cfg)
+    roberta_model.initialize()
+    roberta_model.hybridize()
+    contextual_embeddings, pooled_out = roberta_model(inputs, valid_length)
+    roberta_model_tn = RobertaModel.from_cfg(cfg_tn)
+    roberta_model_tn.share_parameters(roberta_model.collect_params())
+    roberta_model_tn.hybridize()
+    contextual_embeddings_tn, pooled_out_tn = roberta_model_tn(inputs.T, valid_length)
+    assert_allclose(np.swapaxes(contextual_embeddings_tn.asnumpy(), 0, 1),
+                    contextual_embeddings.asnumpy(), 1E-4, 1E-4)
+    assert_allclose(pooled_out_tn.asnumpy(), pooled_out.asnumpy(), 1E-4, 1E-4)
+
+    # Test for RobertaForMLM
+    roberta_mlm_model = RobertaForMLM(cfg)
+    roberta_mlm_model.initialize()
+    roberta_mlm_model.hybridize()
+    contextual_embedding, pooled_out, mlm_scores = roberta_mlm_model(inputs, valid_length,
+                                                                     masked_positions)
+    roberta_mlm_model_tn = RobertaForMLM(cfg_tn)
+    roberta_mlm_model_tn.share_parameters(roberta_mlm_model.collect_params())
+    roberta_mlm_model_tn.hybridize()
+    contextual_embedding_tn, pooled_out_tn, mlm_scores_tn =\
+        roberta_mlm_model_tn(inputs.T, valid_length, masked_positions)
+    assert_allclose(np.swapaxes(contextual_embedding_tn.asnumpy(), 0, 1),
+                    contextual_embedding.asnumpy(), 1E-4, 1E-4)
+    assert_allclose(pooled_out_tn.asnumpy(), pooled_out.asnumpy(), 1E-4, 1E-4)
+    assert_allclose(mlm_scores_tn.asnumpy(), mlm_scores.asnumpy(), 1E-4, 1E-4)
+
+
 @pytest.mark.remote_required
 @pytest.mark.parametrize('model_name', list_pretrained_roberta())
 def test_roberta(model_name):
diff --git a/tests/test_models_transformer.py b/tests/test_models_transformer.py
index 2727ad107c..96cb60ee1d 100644
--- a/tests/test_models_transformer.py
+++ b/tests/test_models_transformer.py
@@ -33,6 +33,23 @@ def test_transformer_encoder_decoder(pre_norm, num_enc_layers, num_dec_layers):
     encoded_mem = enc(src_data, src_valid_length)
     full_decode_out = dec(dst_data, dst_valid_length, encoded_mem, src_valid_length)

+    # Test for the TN layout
+    enc_tn = TransformerEncoder(units=units, hidden_size=64, num_layers=num_enc_layers, num_heads=4,
+                                dropout=0.0, pre_norm=pre_norm, layout='TN')
+    enc_tn.share_parameters(enc.collect_params())
+    dec_tn = TransformerDecoder(units=units, hidden_size=64, num_layers=num_dec_layers, num_heads=4,
+                                dropout=0.0, pre_norm=pre_norm, layout='TN')
+    dec_tn.share_parameters(dec.collect_params())
+    enc_tn.hybridize()
+    dec_tn.hybridize()
+    encoded_mem_tn = enc_tn(mx.np.swapaxes(src_data, 0, 1), src_valid_length)
+    full_decode_out_tn = dec_tn(mx.np.swapaxes(dst_data, 0, 1), dst_valid_length,
+                                encoded_mem_tn, src_valid_length)
+    assert_allclose(encoded_mem_tn.asnumpy(),
+                    mx.np.swapaxes(encoded_mem, 0, 1).asnumpy(), 1E-5, 1E-5)
+    assert_allclose(full_decode_out_tn.asnumpy(),
+                    mx.np.swapaxes(full_decode_out, 0, 1).asnumpy(), 1E-5, 1E-5)
+
     # Test the consistency via shifting the data and the valid_length
     for i in range(1, dst_valid_length.asnumpy().min()):
         for partial_decode_out in [dec(dst_data[:, :(-i), :],
@@ -52,11 +69,11 @@ def test_transformer_encoder_decoder(pre_norm, num_enc_layers, num_dec_layers):
     states = dec.layers[0].init_states(batch_size, h_out.ctx, h_out.dtype)
     h_out_from_incremental = []
     for i in range(tgt_seq_length):
-        ele_h_out, states = dec.layers[0].incremental_decode(mx, dst_data[:, i:(i + 1), :], states,
+        ele_h_out, states = dec.layers[0].incremental_decode(mx, dst_data[:, i, :], states,
                                                              encoded_mem, src_valid_length,
                                                              enc_mem_attn_mask)
         h_out_from_incremental.append(ele_h_out)
-    h_out_from_incremental = mx.np.concatenate(h_out_from_incremental, axis=1)
+    h_out_from_incremental = mx.np.stack(h_out_from_incremental, axis=1)

     for i in range(batch_size):
         val_length = dst_valid_length[i].asnumpy()
@@ -66,10 +83,10 @@ def test_transformer_encoder_decoder(pre_norm, num_enc_layers, num_dec_layers):
     states = dec.init_states(batch_size, src_data.ctx, src_data.dtype)
     final_out_from_incremental = []
     for i in range(tgt_seq_length):
-        ele_final_out, states = dec.incremental_decode(mx, dst_data[:, i:(i + 1), :],
+        ele_final_out, states = dec.incremental_decode(mx, dst_data[:, i, :],
                                                        states, encoded_mem, src_valid_length)
         final_out_from_incremental.append(ele_final_out)
-    final_out_from_incremental = mx.np.concatenate(final_out_from_incremental, axis=1)
+    final_out_from_incremental = mx.np.stack(final_out_from_incremental, axis=1)
     for i in range(batch_size):
         val_length = dst_valid_length[i].asnumpy()
         assert_allclose(final_out_from_incremental[i, :val_length, :].asnumpy(),
@@ -85,12 +102,13 @@ def test_transformer_encoder_decoder(pre_norm, num_enc_layers, num_dec_layers):
                                              (2, 3, 16, 24)])
 @pytest.mark.parametrize('enc_recurrent', [False, True])
 @pytest.mark.parametrize('dec_recurrent', [False, True])
-@pytest.mark.parametrize('tie_weights', [False, True])
-def test_transformer_model(train_hybridize, inference_hybridize,
+@pytest.mark.parametrize('tie_weights,layout', [(False, 'NT'), (True, 'NT'), (True, 'TN')])
+def test_transformer_nmt_model(train_hybridize, inference_hybridize,
                            enc_pre_norm, dec_pre_norm,
                            enc_units, dec_units,
                            enc_num_layers, dec_num_layers,
-                           enc_recurrent, dec_recurrent, tie_weights):
+                           enc_recurrent, dec_recurrent, tie_weights,
+                           layout):
     src_seq_length = 20
     tgt_seq_length = 15
     src_vocab_size = 32
@@ -100,24 +118,25 @@ def test_transformer_model(train_hybridize, inference_hybridize,
     else:
         shared_embed = True
     model = TransformerModel(src_vocab_size=src_vocab_size,
-                                 tgt_vocab_size=tgt_vocab_size,
-                                 max_src_length=src_seq_length,
-                                 max_tgt_length=tgt_seq_length,
-                                 enc_units=enc_units,
-                                 enc_hidden_size=64,
-                                 enc_num_heads=4,
-                                 enc_num_layers=enc_num_layers,
-                                 enc_pre_norm=enc_pre_norm,
-                                 enc_recurrent=enc_recurrent,
-                                 dec_units=dec_units,
-                                 dec_hidden_size=64,
-                                 dec_num_heads=4,
-                                 dec_num_layers=dec_num_layers,
-                                 dec_pre_norm=dec_pre_norm,
-                                 dec_recurrent=dec_recurrent,
-                                 shared_embed=shared_embed,
-                                 tie_weights=tie_weights,
-                                 dropout=0.0)
+                             tgt_vocab_size=tgt_vocab_size,
+                             max_src_length=src_seq_length,
+                             max_tgt_length=tgt_seq_length,
+                             enc_units=enc_units,
+                             enc_hidden_size=64,
+                             enc_num_heads=4,
+                             enc_num_layers=enc_num_layers,
+                             enc_pre_norm=enc_pre_norm,
+                             enc_recurrent=enc_recurrent,
+                             dec_units=dec_units,
+                             dec_hidden_size=64,
+                             dec_num_heads=4,
+                             dec_num_layers=dec_num_layers,
+                             dec_pre_norm=dec_pre_norm,
+                             dec_recurrent=dec_recurrent,
+                             shared_embed=shared_embed,
+                             tie_weights=tie_weights,
+                             dropout=0.0,
+                             layout=layout)
     inference_model = TransformerNMTInference(model=model)
     model.initialize()
     if train_hybridize:
@@ -136,10 +155,16 @@ def test_transformer_cfg_registry():
 def test_transformer_cfg(cfg_key):
     cfg = TransformerModel.get_cfg(cfg_key)
     cfg.defrost()
-    cfg.MODEL.src_vocab_size = 1000
-    cfg.MODEL.tgt_vocab_size = 1000
+    cfg.MODEL.src_vocab_size = 32
+    cfg.MODEL.tgt_vocab_size = 32
     cfg.freeze()
     model = TransformerModel.from_cfg(cfg)
     model.initialize()
     model.hybridize()
+    cfg.defrost()
+    cfg.MODEL.layout = 'TN'
+    cfg.freeze()
+    model_tn = TransformerModel.from_cfg(cfg)
+    model_tn.share_parameters(model.collect_params())
+    model_tn.hybridize()
     mx.npx.waitall()
diff --git a/tests/test_models_xlmr.py b/tests/test_models_xlmr.py
index f8f9ec76fe..ff9c41fdfd 100644
--- a/tests/test_models_xlmr.py
+++ b/tests/test_models_xlmr.py
@@ -2,7 +2,7 @@ import numpy as np
 import mxnet as mx
 import tempfile
-from gluonnlp.models.xlmr import XLMRModel, XLMRForMLM, \
+from gluonnlp.models.xlmr import XLMRModel, \
     list_pretrained_xlmr, get_pretrained_xlmr
 from gluonnlp.loss import LabelSmoothCrossEntropyLoss

@@ -29,7 +29,7 @@ def test_xlmr():
     # test forward
     batch_size = 1
-    seq_length = 8
+    seq_length = 4
     vocab_size = len(tokenizer.vocab)
     input_ids = mx.np.array(
         np.random.randint(
diff --git a/tools/batch/docker/Dockerfile b/tools/batch/docker/Dockerfile
index d2868239b3..a9ef4aaad4 100644
--- a/tools/batch/docker/Dockerfile
+++ b/tools/batch/docker/Dockerfile
@@ -4,6 +4,8 @@ FROM nvidia/cuda:10.2-cudnn7-devel-ubuntu18.04
     build-essential \
     locales \
     cmake \
+    wget \
+    subversion \
     git \
     curl \
     vim \
diff --git a/tools/batch/docker/README.md b/tools/batch/docker/README.md
new file mode 100644
index 0000000000..80efb0d9d1
--- /dev/null
+++ b/tools/batch/docker/README.md
@@ -0,0 +1,22 @@
+# Updating the Docker Image for AWS Batch
+
+Our AWS Batch jobs pull their Docker image from 747303060528.dkr.ecr.us-east-1.amazonaws.com/gluon-nlp-1. To
+update the image:
+- Update the Dockerfile.
+- Make sure docker and docker-compose, as well as the docker Python package, are installed.
+- Export the AWS account credentials as environment variables.
+- cd to the same folder as the Dockerfile and execute the following:
+
+```bash
+# Log in to ECR; this runs the login command returned by the AWS CLI.
+$(aws ecr get-login --no-include-email --region us-east-1)
+
+# Build the image from the Dockerfile and name it gluon-nlp-1.
+docker build -t gluon-nlp-1 .
+
+# Tag the new build as gluon-nlp-1:latest, which AWS Batch pulls from.
+docker tag gluon-nlp-1:latest 747303060528.dkr.ecr.us-east-1.amazonaws.com/gluon-nlp-1:latest
+
+# Push the updated image.
+docker push 747303060528.dkr.ecr.us-east-1.amazonaws.com/gluon-nlp-1:latest
+```