In [2]:
%reload_ext autoreload
%autoreload 2
%matplotlib inline

In [3]:
from fastai import *
from fastai.text import *

In [4]:
path = Path('/home/jupyter/chat-data')

### Setup Learner

In [5]:
data_bunch = 'data_clas_export_7_14.pkl'
trained_model = 'chat-clas-7-12_2'
encoder = 'chat-lm-encoder-7-12_2'

In [6]:
data_clas = load_data(path, data_bunch, bs=32)
clas_learn = text_classifier_learner(data_clas, AWD_LSTM, drop_mult=0.5)
clas_learn.load(trained_model)
clas_learn.load_encoder(encoder)

In [7]:
clas_learn.predict('good day')

(Category greet,
 tensor(37),
 tensor([1.7734e-03, 3.4433e-05, 8.3470e-04, 9.5550e-05, 2.4194e-05, 6.0020e-04,
         1.5237e-03, 2.3544e-04, 1.4157e-01, 2.4288e-05, 1.8255e-04, 7.9431e-05,
         3.7898e-04, 9.6639e-05, 2.4013e-01, 3.4481e-04, 9.8123e-05, 6.4609e-06,
         3.0558e-04, 4.8295e-03, 2.8677e-02, 3.3994e-02, 1.6960e-04, 6.2135e-05,
         2.8435e-04, 1.5340e-04, 7.2886e-05, 1.9466e-04, 2.7943e-04, 4.1249e-05,
         2.4435e-05, 5.0937e-02, 1.5473e-03, 1.5553e-03, 1.1050e-04, 1.6630e-02,
         5.7694e-05, 4.2145e-01, 2.6813e-03, 4.3913e-04, 2.7011e-04, 8.7943e-05,
         7.3565e-04, 5.2236e-03, 2.8214e-03, 1.6263e-03, 3.2046e-04, 2.4389e-03,
         3.3950e-02]))

In [8]:
clas_learn.data.classes

['affirm',
 'ask_builder',
 'ask_faq_channels',
 'ask_faq_community_size',
 'ask_faq_languages',
 'ask_faq_opensource',
 'ask_faq_platform',
 'ask_faq_python_version',
 'ask_faq_slots',
 'ask_faq_tutorials',
 'ask_faq_voice',
 'ask_faq_what_is_forum',
 'ask_how_contribute',
 'ask_howbuilt',
 'ask_howdoing',
 'ask_howold',
 'ask_isbot',
 'ask_languagesbot',
 'ask_question_in_forum',
 'ask_restaurant',
 'ask_time',
 'ask_weather',
 'ask_whatismyname',
 'ask_whatisrasa',
 'ask_whatspossible',
 'ask_when_next_event',
 'ask_wherefrom',
 'ask_which_events',
 'ask_whoami',
 'ask_whoisit',
 'ask_why_contribute',
 'bye',
 'canthelp',
 'contact_sales',
 'deny',
 'enter_data',
 'explain',
 'greet',
 'how_to_get_started',
 'human_handoff',
 'install_rasa',
 'next_step',
 'nlu_generation_tool_recommendation',
 'out_of_scope',
 'pipeline_recommendation',
 'signup_newsletter',
 'switch',
 'technical_question',
 'thank']

In [17]:
len(clas_learn.data.classes)

49

### Install rasa

In [9]:
# !pip install rasa

### Create a config using fastai classifier

In [11]:
# make config which uses the pretrained_embeddings_spacy pipeline except switches in FastaiClassifier for the SklearnIntentClassifier

fastai_config = """language: "en"

pipeline:
- name: "SpacyNLP"
- name: "SpacyTokenizer"
- name: "SpacyFeaturizer"
- name: "RegexFeaturizer"
- name: "CRFEntityExtractor"
- name: "EntitySynonymMapper"
- name: "FastaiClassifier"
    
policies:
  - name: MemoizationPolicy
  - name: KerasPolicy
  - name: MappingPolicy"""

In [12]:
store fastai_config > ../fastai_config.yml

Writing 'fastai_config' (str) to file '../fastai_config.yml'.


### Hack to make rasa pick up our new component
Just added the import for FastaiClassifier and added this to the list of component_classes

In [13]:
%%writefile /opt/anaconda3/lib/python3.7/site-packages/rasa/nlu/registry.py
"""This is a somewhat delicate package. It contains all registered components
and preconfigured templates.

Hence, it imports all of the components. To avoid cycles, no component should
import this in module scope."""

import logging
import typing
from typing import Any, Dict, List, Optional, Text, Type

from rasa.nlu.classifiers.embedding_intent_classifier import EmbeddingIntentClassifier
from rasa.nlu.classifiers.keyword_intent_classifier import KeywordIntentClassifier
from rasa.nlu.classifiers.mitie_intent_classifier import MitieIntentClassifier
from rasa.nlu.classifiers.sklearn_intent_classifier import SklearnIntentClassifier
# added this
from rasa.nlu.classifiers.fastai_nlu import FastaiClassifier


from rasa.nlu.extractors.crf_entity_extractor import CRFEntityExtractor
from rasa.nlu.extractors.duckling_http_extractor import DucklingHTTPExtractor
from rasa.nlu.extractors.entity_synonyms import EntitySynonymMapper
from rasa.nlu.extractors.mitie_entity_extractor import MitieEntityExtractor
from rasa.nlu.extractors.spacy_entity_extractor import SpacyEntityExtractor
from rasa.nlu.featurizers.count_vectors_featurizer import CountVectorsFeaturizer
from rasa.nlu.featurizers.mitie_featurizer import MitieFeaturizer
from rasa.nlu.featurizers.ngram_featurizer import NGramFeaturizer
from rasa.nlu.featurizers.regex_featurizer import RegexFeaturizer
from rasa.nlu.featurizers.spacy_featurizer import SpacyFeaturizer
from rasa.nlu.model import Metadata
from rasa.nlu.tokenizers.jieba_tokenizer import JiebaTokenizer
from rasa.nlu.tokenizers.mitie_tokenizer import MitieTokenizer
from rasa.nlu.tokenizers.spacy_tokenizer import SpacyTokenizer
from rasa.nlu.tokenizers.whitespace_tokenizer import WhitespaceTokenizer
from rasa.nlu.utils.mitie_utils import MitieNLP
from rasa.nlu.utils.spacy_utils import SpacyNLP
from rasa.utils.common import class_from_module_path

if typing.TYPE_CHECKING:
    from rasa.nlu.components import Component
    from rasa.nlu.config import RasaNLUModelConfig, RasaNLUModelConfig

logger = logging.getLogger(__name__)


# Classes of all known components. If a new component should be added,
# its class name should be listed here.
component_classes = [
    # utils
    SpacyNLP,
    MitieNLP,
    # tokenizers
    MitieTokenizer,
    SpacyTokenizer,
    WhitespaceTokenizer,
    JiebaTokenizer,
    # extractors
    SpacyEntityExtractor,
    MitieEntityExtractor,
    CRFEntityExtractor,
    DucklingHTTPExtractor,
    EntitySynonymMapper,
    # featurizers
    SpacyFeaturizer,
    MitieFeaturizer,
    NGramFeaturizer,
    RegexFeaturizer,
    CountVectorsFeaturizer,
    # classifiers
    SklearnIntentClassifier,
    MitieIntentClassifier,
    KeywordIntentClassifier,
    EmbeddingIntentClassifier,
    # added this
    FastaiClassifier,
]

# Mapping from a components name to its class to allow name based lookup.
registered_components = {c.name: c for c in component_classes}

# DEPRECATED ensures compatibility, will be remove in future versions
old_style_names = {
    "nlp_spacy": "SpacyNLP",
    "nlp_mitie": "MitieNLP",
    "ner_spacy": "SpacyEntityExtractor",
    "ner_mitie": "MitieEntityExtractor",
    "ner_crf": "CRFEntityExtractor",
    "ner_duckling_http": "DucklingHTTPExtractor",
    "ner_synonyms": "EntitySynonymMapper",
    "intent_featurizer_spacy": "SpacyFeaturizer",
    "intent_featurizer_mitie": "MitieFeaturizer",
    "intent_featurizer_ngrams": "NGramFeaturizer",
    "intent_entity_featurizer_regex": "RegexFeaturizer",
    "intent_featurizer_count_vectors": "CountVectorsFeaturizer",
    "tokenizer_mitie": "MitieTokenizer",
    "tokenizer_spacy": "SpacyTokenizer",
    "tokenizer_whitespace": "WhitespaceTokenizer",
    "tokenizer_jieba": "JiebaTokenizer",
    "intent_classifier_sklearn": "SklearnIntentClassifier",
    "intent_classifier_mitie": "MitieIntentClassifier",
    "intent_classifier_keyword": "KeywordIntentClassifier",
    "intent_classifier_tensorflow_embedding": "EmbeddingIntentClassifier",
}

# To simplify usage, there are a couple of model templates, that already add
# necessary components in the right order. They also implement
# the preexisting `backends`.
registered_pipeline_templates = {
    "pretrained_embeddings_spacy": [
        "SpacyNLP",
        "SpacyTokenizer",
        "SpacyFeaturizer",
        "RegexFeaturizer",
        "CRFEntityExtractor",
        "EntitySynonymMapper",
        "SklearnIntentClassifier",
    ],
    "keyword": ["KeywordIntentClassifier"],
    "supervised_embeddings": [
        "WhitespaceTokenizer",
        "RegexFeaturizer",
        "CRFEntityExtractor",
        "EntitySynonymMapper",
        "CountVectorsFeaturizer",
        "EmbeddingIntentClassifier",
    ],
}


def pipeline_template(s: Text) -> Optional[List[Dict[Text, Text]]]:
    components = registered_pipeline_templates.get(s)

    if components:
        # converts the list of components in the configuration
        # format expected (one json object per component)
        return [{"name": c} for c in components]

    else:
        return None


def get_component_class(component_name: Text) -> Type["Component"]:
    """Resolve component name to a registered components class."""

    if component_name not in registered_components:
        if component_name not in old_style_names:
            try:
                return class_from_module_path(component_name)
            except Exception:
                raise Exception(
                    "Failed to find component class for '{}'. Unknown "
                    "component name. Check your configured pipeline and make "
                    "sure the mentioned component is not misspelled. If you "
                    "are creating your own component, make sure it is either "
                    "listed as part of the `component_classes` in "
                    "`rasa.nlu.registry.py` or is a proper name of a class "
                    "in a module.".format(component_name)
                )
        else:
            # DEPRECATED ensures compatibility, remove in future versions
            logger.warning(
                "DEPRECATION warning: your nlu config file "
                "contains old style component name `{}`, "
                "you should change it to its class name: `{}`."
                "".format(component_name, old_style_names[component_name])
            )
            component_name = old_style_names[component_name]

    return registered_components[component_name]


def load_component_by_meta(
    component_meta: Dict[Text, Any],
    model_dir: Text,
    metadata: Metadata,
    cached_component: Optional["Component"],
    **kwargs: Any
) -> Optional["Component"]:
    """Resolves a component and calls its load method.

    Inits it based on a previously persisted model.
    """

    # try to get class name first, else create by name
    component_name = component_meta.get("class", component_meta["name"])
    component_class = get_component_class(component_name)
    return component_class.load(
        component_meta, model_dir, metadata, cached_component, **kwargs
    )


def create_component_by_config(
    component_config: Dict[Text, Any], config: "RasaNLUModelConfig"
) -> Optional["Component"]:
    """Resolves a component and calls it's create method.

    Inits it based on a previously persisted model.
    """

    # try to get class name first, else create by name
    component_name = component_config.get("class", component_config["name"])
    component_class = get_component_class(component_name)
    return component_class.create(component_config, config)

Overwriting /opt/anaconda3/lib/python3.7/site-packages/rasa/nlu/registry.py


### Actually create our FastaiClassifer component

In [26]:
%%writefile /opt/anaconda3/lib/python3.7/site-packages/rasa/nlu/classifiers/fastai_nlu.py
from rasa.nlu.components import Component
from rasa.nlu import utils
from rasa.nlu.model import Metadata

from fastai import *
from fastai.text import *
import os

import typing
from typing import Any, Optional, Text, Dict


class FastaiClassifier(Component):
    """A pre-trained fastai classifier component"""

    name = "fastai-nlu"
    provides = ["intent"]
    requires = []
    defaults = {}
    language_list = ["en"]
    path = Path('/home/jupyter/chat-data')
    data_bunch = 'data_clas_export_7_14.pkl'
    trained_model = 'chat-clas-7-12_2'
    encoder = 'chat-lm-encoder-7-12_2'
    data_clas = load_data(path, data_bunch, bs=32)
    clas_learn = text_classifier_learner(data_clas, AWD_LSTM, drop_mult=0.5)
    clas_learn.load(trained_model)
    clas_learn.load_encoder(encoder)

    def __init__(self, component_config=None):
        super(FastaiClassifier, self).__init__(component_config)

    def train(self, training_data, cfg, **kwargs):
        """Not needed, because the the model is pretrained"""
        pass


    def process(self, message, **kwargs):
        """Retrieve the text message, pass it to the classifier
            and append the prediction results to the message class."""
        
#         see example: https://github.com/RasaHQ/rasa/blob/master/rasa/nlu/classifiers/sklearn_intent_classifier.py#L136
        
#         print("message.text!!", message.text)
        predictions = FastaiClassifier.clas_learn.predict(message.text)
        sorted_predictions = sorted(
            zip(FastaiClassifier.clas_learn.data.classes, map(float, predictions[2])),
            key=lambda p: p[1],
            reverse=True
        )
#         print("sorted_predictions!", sorted_predictions)
        intent = {"name": sorted_predictions[0][0], "confidence": sorted_predictions[0][1]}

        intent_ranking = [
            {"name": intent_name, "confidence": score}
            for intent_name, score in sorted_predictions
        ]
#         print("fastai intent_ranking!!", intent_ranking)
        
        message.set("intent", intent, add_to_output=True)
        message.set("intent_ranking", intent_ranking, add_to_output=True)

    def persist(self, file_name, model_dir):
        """Pass because a pre-trained model is already persisted"""

        pass




Overwriting /opt/anaconda3/lib/python3.7/site-packages/rasa/nlu/classifiers/fastai_nlu.py


### Install spacy en libraries since using in the pipeline

In [None]:
# !python -m spacy download en_core_web_md
# !python -m spacy link en_core_web_md en

### Even though our fastai model is pre-trained, need to run rasa train so that pipeline as a whole is trained and rasa creates a zipped model file which will then be used in testing

In [85]:
! rasa train nlu --config ../fastai_config.yml --nlu ../data/nlu.md

[94mTraining NLU model...[0m
2019-07-22 01:20:47 [1;30mINFO    [0m [34mrasa.nlu.utils.spacy_utils[0m  - Trying to load spacy model with name 'en'
2019-07-22 01:20:59 [1;30mINFO    [0m [34mrasa.nlu.components[0m  - Added 'SpacyNLP' to component cache. Key 'SpacyNLP-en'.
2019-07-22 01:20:59 [1;30mINFO    [0m [34mrasa.nlu.training_data.loading[0m  - Training data format of '/tmp/tmpeoac2qcz/693e8fac4d6b47ce9a26aeec80e1e5f1_nlu.md' is 'md'.
2019-07-22 01:20:59 [1;30mINFO    [0m [34mrasa.nlu.training_data.training_data[0m  - Training data stats: 
	- intent examples: 2333 (51 distinct intents)
	- Found intents: 'ask_whatspossible', 'ask_time', 'explain', 'ask_builder', 'greet', 'bye', 'ask_whoami', 'contact_sales', 'affirm', 'nlu_generation_tool_recommendation', 'ask_faq_channels', 'enter_data', 'ask_faq_voice', 'pipeline_recommendation', 'deny', 'out_of_scope', 'ask_faq_platform', 'ask_whatismyname', 'ask_faq_community_size', 'ask_isbot', 'human_handoff', 'ask_faq_tutorial

### Test our pipeline with the fastai classifier

In [86]:
! rasa test nlu --config ../fastai_config.yml --nlu ../data/nlu.md --model /models/nlu-20190722-012104.tar.gz

2019-07-22 01:21:38 [1;30mINFO    [0m [34mrasa.nlu.components[0m  - Added 'SpacyNLP' to component cache. Key 'SpacyNLP-en'.
2019-07-22 01:21:38 [1;30mINFO    [0m [34mrasa.nlu.training_data.loading[0m  - Training data format of '/tmp/tmpgw_zutae/3c337b8bdbc04721b5d47aec5ddecc7a_nlu.md' is 'md'.
2019-07-22 01:21:38 [1;30mINFO    [0m [34mrasa.nlu.training_data.training_data[0m  - Training data stats: 
	- intent examples: 2333 (51 distinct intents)
	- Found intents: 'affirm', 'next_step', 'ask_how_contribute', 'enter_data', 'ask_whoami', 'human_handoff', 'install_rasa', 'ask_whatspossible', 'canthelp', 'ask_restaurant', 'ask_why_contribute', 'deny', 'ask_faq_community_size', 'thank', 'ask_languagesbot', 'ask_question_in_forum', 'ask_faq_tutorials', 'ask_faq_python_version', 'out_of_scope', 'ask_when_next_event', 'pipeline_recommendation', 'ask_howdoing', 'nlu_info', 'ask_faq_platform', 'ask_which_events', 'explain', 'greet', 'ask_faq_slots', 'ask_whatismyname', 'ask_faq_languag

### Accuracy for the fastai_classifier: 0.7453921988855551
Getting 0 precision and recall on ask_faq_differencecorenlu and nlu_info which makes sense since those weren't in the trained classes. Also, getting 0 precision and recall on switch even though this was at least a class on the learner (though only trained with one example).

This is because when I created my json training data, I filtered out anything that had a bracket since I didn't want to confuse the intent classification model with the syntax that happened to be in named entity recognition training data. 

In [89]:
!head -n 40 errors.json

[
  {
    "text": "No, I mean how it is possible to use Skype as channel?",
    "intent": "ask_faq_channels",
    "intent_prediction": {
      "name": "technical_question",
      "confidence": 0.812785267829895
    }
  },
  {
    "text": "What makes core and nlu different?",
    "intent": "ask_faq_differencecorenlu",
    "intent_prediction": {
      "name": "technical_question",
      "confidence": 0.649851381778717
    }
  },
  {
    "text": "what is the main difference between core and nlu?",
    "intent": "ask_faq_differencecorenlu",
    "intent_prediction": {
      "name": "nlu_generation_tool_recommendation",
      "confidence": 0.2822674512863159
    }
  },
  {
    "text": "what is the primary difference between core and nlu?",
    "intent": "ask_faq_differencecorenlu",
    "intent_prediction": {
      "name": "technical_question",
      "confidence": 0.20713557302951813
    }
  },
  {
    "text": "what would you say the difference is between cor

In [18]:
pretrained_embeddings_spacy_config = """language: "en"

pipeline:
- name: "SpacyNLP"
- name: "SpacyTokenizer"
- name: "SpacyFeaturizer"
- name: "RegexFeaturizer"
- name: "CRFEntityExtractor"
- name: "EntitySynonymMapper"
- name: "SklearnIntentClassifier"
    
policies:
  - name: MemoizationPolicy
  - name: KerasPolicy
  - name: MappingPolicy"""

In [19]:
store pretrained_embeddings_spacy_config > ../pretrained_embeddings_spacy_config.yml

Writing 'pretrained_embeddings_spacy_config' (str) to file '../pretrained_embeddings_spacy_config.yml'.


In [20]:
! rasa train nlu --config ../pretrained_embeddings_spacy_config.yml --nlu ../data

[94mTraining NLU model...[0m
2019-07-21 23:21:05 [1;30mINFO    [0m [34mrasa.nlu.utils.spacy_utils[0m  - Trying to load spacy model with name 'en'
2019-07-21 23:21:18 [1;30mINFO    [0m [34mrasa.nlu.components[0m  - Added 'SpacyNLP' to component cache. Key 'SpacyNLP-en'.
2019-07-21 23:21:18 [1;30mINFO    [0m [34mrasa.nlu.training_data.loading[0m  - Training data format of '/tmp/tmpe4hcojgh/f570a500047341d58c489edfa64b6ce8_nlu.md' is 'md'.
2019-07-21 23:21:18 [1;30mINFO    [0m [34mrasa.nlu.training_data.training_data[0m  - Training data stats: 
	- intent examples: 2333 (51 distinct intents)
	- Found intents: 'ask_wherefrom', 'switch', 'ask_faq_platform', 'ask_whatismyname', 'install_rasa', 'ask_howdoing', 'ask_whoisit', 'ask_howold', 'ask_faq_python_version', 'ask_builder', 'canthelp', 'nlu_info', 'nlu_generation_tool_recommendation', 'next_step', 'ask_whoami', 'ask_when_next_event', 'ask_how_contribute', 'ask_faq_slots', 'affirm', 'thank', 'ask_question_in_forum', 'ask_

In [22]:
! rasa test nlu --config ../pretrained_embeddings_spacy_config.yml --nlu ../data --model /models/nlu-20190721-232225.tar.gz

2019-07-21 23:27:26 [1;30mINFO    [0m [34mrasa.nlu.components[0m  - Added 'SpacyNLP' to component cache. Key 'SpacyNLP-en'.
2019-07-21 23:27:26 [1;30mINFO    [0m [34mrasa.nlu.training_data.loading[0m  - Training data format of '/tmp/tmpgb7w6l5k/f22855a099eb45b59334a9a49f119865_nlu.md' is 'md'.
2019-07-21 23:27:26 [1;30mINFO    [0m [34mrasa.nlu.training_data.training_data[0m  - Training data stats: 
	- intent examples: 2333 (51 distinct intents)
	- Found intents: 'ask_whatisrasa', 'ask_time', 'technical_question', 'ask_faq_python_version', 'affirm', 'ask_faq_community_size', 'canthelp', 'human_handoff', 'ask_languagesbot', 'signup_newsletter', 'ask_faq_channels', 'greet', 'explain', 'ask_isbot', 'ask_whatspossible', 'ask_why_contribute', 'ask_whoami', 'enter_data', 'ask_how_contribute', 'ask_faq_opensource', 'ask_howdoing', 'ask_restaurant', 'ask_faq_differencecorenlu', 'deny', 'switch', 'pipeline_recommendation', 'ask_wherefrom', 'ask_whoisit', 'next_step', 'thank', 'how_to

### Accuracy for the sklearn:  0.9652807543934848
### But not good test since did not split the data for training/testing

### Let's actually be thoughtful about our training and test data

In [30]:
# will split 80% training/20% test by default
! rasa data split nlu --nlu ../data --out ../data/train-test-split

2019-07-21 23:58:09 [1;30mINFO    [0m [34mrasa.nlu.training_data.loading[0m  - Training data format of '/tmp/tmp7_8vuook/342aa43845ee4bc5abb0dfbed6599f3e_nlu.md' is 'md'.
2019-07-21 23:58:09 [1;30mINFO    [0m [34mrasa.nlu.training_data.training_data[0m  - Training data stats: 
	- intent examples: 2333 (51 distinct intents)
	- Found intents: 'ask_how_contribute', 'signup_newsletter', 'technical_question', 'ask_languagesbot', 'ask_builder', 'ask_faq_opensource', 'ask_whoami', 'explain', 'pipeline_recommendation', 'ask_restaurant', 'ask_whatspossible', 'ask_faq_differencecorenlu', 'ask_faq_python_version', 'ask_faq_platform', 'contact_sales', 'ask_faq_community_size', 'ask_isbot', 'ask_faq_what_is_forum', 'ask_faq_channels', 'ask_howdoing', 'canthelp', 'deny', 'nlu_generation_tool_recommendation', 'how_to_get_started', 'ask_question_in_forum', 'human_handoff', 'switch', 'thank', 'ask_why_contribute', 'ask_faq_slots', 'ask_wherefrom', 'greet', 'ask_weather', 'ask_faq_tutorials', 'a

In [31]:
!ls ../data/train-test-split

test_data.md  training_data.md


### Train and test sklearn model with split dataset

In [79]:
! rasa train nlu --config ../pretrained_embeddings_spacy_config.yml --nlu ../data/train-test-split/training_data.md

[94mTraining NLU model...[0m
2019-07-22 01:14:54 [1;30mINFO    [0m [34mrasa.nlu.utils.spacy_utils[0m  - Trying to load spacy model with name 'en'
2019-07-22 01:15:06 [1;30mINFO    [0m [34mrasa.nlu.components[0m  - Added 'SpacyNLP' to component cache. Key 'SpacyNLP-en'.
2019-07-22 01:15:06 [1;30mINFO    [0m [34mrasa.nlu.training_data.loading[0m  - Training data format of '/tmp/tmpgquqf7r7/f2b21bfdc5d4486dbd22a62cd5a9ff97_training_data.md' is 'md'.
2019-07-22 01:15:06 [1;30mINFO    [0m [34mrasa.nlu.training_data.training_data[0m  - Training data stats: 
	- intent examples: 1845 (51 distinct intents)
	- Found intents: 'ask_faq_python_version', 'ask_isbot', 'nlu_generation_tool_recommendation', 'ask_why_contribute', 'next_step', 'ask_how_contribute', 'ask_whoisit', 'ask_faq_languages', 'ask_faq_opensource', 'contact_sales', 'signup_newsletter', 'out_of_scope', 'ask_faq_differencecorenlu', 'ask_faq_channels', 'ask_faq_slots', 'ask_faq_tutorials', 'ask_wherefrom', 'bye', 'a

In [80]:
! rasa test nlu --config ../pretrained_embeddings_spacy_config.yml --nlu ../data/train-test-split/test_data.md --model /models/nlu-20190722-011555.tar.gz

2019-07-22 01:16:41 [1;30mINFO    [0m [34mrasa.nlu.components[0m  - Added 'SpacyNLP' to component cache. Key 'SpacyNLP-en'.
2019-07-22 01:16:41 [1;30mINFO    [0m [34mrasa.nlu.training_data.loading[0m  - Training data format of '/tmp/tmpz3m54df2/c5ed54271c0543a3b404a3ce3bd946b8_test_data.md' is 'md'.
2019-07-22 01:16:41 [1;30mINFO    [0m [34mrasa.nlu.training_data.training_data[0m  - Training data stats: 
	- intent examples: 488 (51 distinct intents)
	- Found intents: 'ask_howbuilt', 'enter_data', 'out_of_scope', 'ask_howold', 'ask_faq_platform', 'ask_which_events', 'ask_faq_what_is_forum', 'ask_when_next_event', 'canthelp', 'ask_isbot', 'ask_howdoing', 'ask_faq_python_version', 'ask_faq_voice', 'nlu_info', 'ask_faq_tutorials', 'affirm', 'ask_faq_languages', 'ask_question_in_forum', 'ask_faq_channels', 'ask_why_contribute', 'ask_restaurant', 'next_step', 'ask_whoisit', 'ask_faq_slots', 'ask_how_contribute', 'signup_newsletter', 'ask_whoami', 'ask_builder', 'ask_weather', 'as

### Accuracy for sklearn after doing proper train/test split of data:  0.7704918032786885

In [82]:
!head -n 40 errors.json

[
  {
    "text": "sure thing",
    "intent": "affirm",
    "intent_prediction": {
      "name": "out_of_scope",
      "confidence": 0.20372628719007818
    }
  },
  {
    "text": "yop",
    "intent": "affirm",
    "intent_prediction": {
      "name": "greet",
      "confidence": 0.5586580846377027
    }
  },
  {
    "text": "hm, i'd like that",
    "intent": "affirm",
    "intent_prediction": {
      "name": "enter_data",
      "confidence": 0.18092331036937292
    }
  },
  {
    "text": "definitely yes without a doubt",
    "intent": "affirm",
    "intent_prediction": {
      "name": "out_of_scope",
      "confidence": 0.30912818975232753
    }
  },
  {
    "text": "yas",
    "intent": "affirm",
    "intent_prediction": {
      "name": "greet",
      "confidence": 0.2328896679673974
    }


### Let's try rasa's other default NLU pipeline while we're at it -- the supervised_embeddings pipeline.
This is intended to be used when you have more training data than with the pretrained_embeddings_spacy pipeline. 
See: https://medium.com/rasa-blog/supervised-word-vectors-from-scratch-in-rasa-nlu-6daf794efcd8
This model's approach is more similar to fastai since it is not using pre-trained embeddings.

In [34]:
supervised_embeddings_config = """language: "en"

pipeline:
- name: "WhitespaceTokenizer"
- name: "RegexFeaturizer"
- name: "CRFEntityExtractor"
- name: "EntitySynonymMapper"
- name: "CountVectorsFeaturizer"
- name: "EmbeddingIntentClassifier"
    
policies:
  - name: MemoizationPolicy
  - name: KerasPolicy
  - name: MappingPolicy"""

In [35]:
store supervised_embeddings_config > ../supervised_embeddings_config.yml

Writing 'supervised_embeddings_config' (str) to file '../supervised_embeddings_config.yml'.


In [36]:
! rasa train nlu --config ../supervised_embeddings_config.yml --nlu ../data/train-test-split/training_data.md

[94mTraining NLU model...[0m
2019-07-22 00:24:43 [1;30mINFO    [0m [34mrasa.nlu.training_data.loading[0m  - Training data format of '/tmp/tmpbylofgch/85b666b9d150430386513fd8989d04f6_training_data.md' is 'md'.
2019-07-22 00:24:43 [1;30mINFO    [0m [34mrasa.nlu.training_data.training_data[0m  - Training data stats: 
	- intent examples: 1845 (51 distinct intents)
	- Found intents: 'ask_faq_languages', 'ask_whatismyname', 'nlu_info', 'out_of_scope', 'signup_newsletter', 'bye', 'ask_faq_slots', 'ask_whoisit', 'ask_builder', 'switch', 'canthelp', 'ask_faq_opensource', 'greet', 'ask_which_events', 'ask_wherefrom', 'thank', 'ask_howbuilt', 'enter_data', 'deny', 'affirm', 'ask_restaurant', 'ask_whatisrasa', 'ask_isbot', 'ask_faq_what_is_forum', 'ask_faq_differencecorenlu', 'ask_faq_python_version', 'pipeline_recommendation', 'ask_when_next_event', 'ask_faq_community_size', 'contact_sales', 'ask_how_contribute', 'ask_why_contribute', 'ask_faq_platform', 'ask_faq_tutorials', 'ask_weath

In [73]:
! rasa test nlu --config ../supervised_embeddings_config.yml --nlu ../data/train-test-split/test_data.md --model /models/nlu-20190722-002541.tar.gz



2019-07-22 01:04:39 [1;30mINFO    [0m [34mrasa.nlu.training_data.loading[0m  - Training data format of '/tmp/tmpyenmit9_/2d763df6e21043f0bf5aaf5c66999ca9_test_data.md' is 'md'.
2019-07-22 01:04:39 [1;30mINFO    [0m [34mrasa.nlu.training_data.training_data[0m  - Training data stats: 
	- intent examples: 488 (51 distinct intents)
	- Found intents: 'ask_question_in_forum', 'ask_time', 'ask_builder', 'explain', 'nlu_info', 'nlu_generation_tool_recommendation', 'ask_faq_languages', 'ask_languagesbot', 'ask_when_next_event', 'ask_faq_python_version', 'ask_whoisit', 'deny', 'ask_howbuilt', 'human_handoff', 'technical_question', 'ask_whatisrasa', 'out_of_scope', 'ask_how_contribute', 'ask_weather', 'switch', 'ask_howdoing', 'ask_howold', 'ask_faq_voice', 'bye', 'pipeline_recommendation', 'greet', 'ask_why_contribute', 'how_to_get_started', 'ask_faq_slots', 'ask_restaurant', 'enter_data', 'ask_isbot', 'affirm', 'ask_whoami', 'ask_whatspossible', 'signup_newsletter', 'ask_faq_opensource'

### Accuracy for supervised embeddings classifier after doing proper train/test split of data:  0.7950819672131147

In [65]:
%%html
<img src="hist.png" width="500" height="500">

In [67]:
%%html
<img src="confmat.png" width="1000" height="1000">

In [72]:
!head -n 40 errors.json

[
  {
    "text": "yop",
    "intent": "affirm",
    "intent_prediction": {
      "name": "",
      "confidence": 0.0
    }
  },
  {
    "text": "hm, i'd like that",
    "intent": "affirm",
    "intent_prediction": {
      "name": "enter_data",
      "confidence": 0.8679656386375427
    }
  },
  {
    "text": "yas",
    "intent": "affirm",
    "intent_prediction": {
      "name": "",
      "confidence": 0.0
    }
  },
  {
    "text": "who build yoi",
    "intent": "ask_builder",
    "intent_prediction": {
      "name": "ask_whoisit",
      "confidence": 0.7670379281044006
    }
  },
  {
    "text": "diffrence between rasa core and rasa nlu",
    "intent": "ask_faq_differencecorenlu",
    "intent_prediction": {
      "name": "how_to_get_started",
      "confidence": 0.7743602991104126
    }


## Note that it looks like rasa test nlu is ignoring the --models argument and using last trained model so be cafeful about how run tests and evaluate results