# REFERENCES

**Information Extraction**
- https://www.analyticsvidhya.com/blog/2020/06/nlp-project-information-extraction/
- https://medium.com/analytics-vidhya/introduction-to-information-extraction-using-python-and-spacy-858f5d6416ca

**Chatbot**
- https://medium.com/predict/create-your-chatbot-using-python-nltk-761cd0aeaed3
- https://medium.com/swlh/a-chatbot-in-python-using-nltk-938a37a9eacc

**Intent**
- https://medium.com/walmartglobaltech/joint-intent-classification-and-entity-recognition-for-conversational-commerce-35bf69195176
- https://medium.com/analytics-vidhya/machine-learning-intent-classification-221ecded7c74
- https://colab.research.google.com/github/deepmipt/dp_notebooks/blob/master/DP_autoFAQ.ipynb (!)
- https://towardsdatascience.com/a-brief-introduction-to-intent-classification-96fda6b1f557
- https://medium.com/artefact-engineering-and-data-science/nlu-benchmark-for-intent-detection-and-named-entity-recognition-in-call-center-conversations-f58e5b4c8d3d
- https://medium.com/iambot/ai-assistance-with-pytext-6308d896566d

**NER**
· Simple Entities
· Composite Entities
· Entity Roles
· Entity Lists
· Regular Expressions
· Prebuilt Models
- https://github.com/DhruvilKarani/NER-Blog/blob/master/analysis.ipynb
- https://towardsdatascience.com/named-entity-recognition-with-nltk-and-spacy-8c4a7d88e7da
- https://towardsdatascience.com/named-entity-recognition-ner-using-keras-bidirectional-lstm-28cd3f301f54
- https://towardsdatascience.com/named-entity-recognition-ner-meeting-industrys-requirement-by-applying-state-of-the-art-deep-698d2b3b4ede
- https://towardsdatascience.com/deep-learning-for-named-entity-recognition-3-reusing-a-bidirectional-lstm-cnn-on-clinical-text-e84bd28052df
- https://medium.com/@b.terryjack/nlp-pretrained-named-entity-recognition-7caa5cd28d7b

## NLP

In [3]:
import os
import nltk
import random
import string
import re

from nltk.stem import wordnet, PorterStemmer # to perform lemmatization
from nltk.corpus import stopwords # for stop words
from nltk import pos_tag # for parts of speech
from nltk import word_tokenize # to create tokens

from sklearn.feature_extraction.text import CountVectorizer # to perform bow
from sklearn.feature_extraction.text import TfidfVectorizer # to perform tfidf
from sklearn.metrics import pairwise_distances # to perform cosine similarity

import tensorflow as tf
import tensorflow_datasets as tfds
import numpy as np
import pandas as pd
# from sklearn.datasets import fetch_openml
# import stanfordnlp

In [4]:
# Configure CUDA device visibility, then (if any GPU is visible) cap the
# memory TensorFlow may use on the first one.
# Enumerate CUDA devices in PCI bus order.
os.environ["CUDA_DEVICE_ORDER"] = "PCI_BUS_ID";
# NOTE(review): "-1" presumably hides every CUDA device from TensorFlow, in
# which case `gpus` below is empty and the memory-limit branch never runs --
# confirm this CPU-only setup is intended.
os.environ["CUDA_VISIBLE_DEVICES"] = "-1"

gpus = tf.config.list_physical_devices('GPU')
if gpus:
    # Restrict how much memory TensorFlow allocates on the first GPU.
    # NOTE(review): the limit passed below is 10256 (presumably MB, i.e. about
    # 10 GB); an earlier revision of this comment said 1GB -- confirm which
    # value is intended.
    try:
        tf.config.experimental.set_virtual_device_configuration(
            gpus[0],
            [tf.config.experimental.VirtualDeviceConfiguration(memory_limit=10256)])
        logical_gpus = tf.config.experimental.list_logical_devices('GPU')
        print(len(gpus), "Physical GPUs,", len(logical_gpus), "Logical GPUs")
    except RuntimeError as e:
        # Virtual devices must be set before GPUs have been initialized
        print(e)

In [5]:
# stanfordnlp.download('en') 

In [None]:
# Intent labels (intent + entity type).
intent_sets = [
    'greet',
    'new',
    'update',
    'finish',
    'query',
    'no',
]

# Slot name -> collected values; every slot starts with its own empty list.
# possible data types: tabular, image, and text (?) -- training and testing set for each
#   task            : regression or classification
#   data_source     : upload, url, or built-in
#   target_variable : specific name or undefined
#   dataset         : dataset name or filepath
#   delivery        : on-web or email
slot_sets = {
    slot_name: []
    for slot_name in (
        "task",
        "data_source",
        "target_variable",
        "dataset",
        "delivery",
    )
}

# Pipeline description assembled from the user's query sentences.
constructed_pipeline = ''

In [None]:
# possible tasks: tabular classification, tabular regression, image classification, image regression, text classification, and text regression
states = ['standby', 'inquire', 'inference', 'running', 'deliver'] # possible states of the CA and AutoML Engine
# Per-conversation slot values; None marks a slot the user has not filled yet.
user_slot = {"method": None, "task": None, "data_source": None, "dataset": None, "target": None, "delivery": 'chat'}


def intent_classification():
    """Classify the intent of the current user utterance (not implemented).

    The previous body returned the undefined name ``intent`` and therefore
    failed with ``NameError`` on every call. The stub now states explicitly
    that the classifier has not been written yet.

    Raises:
        NotImplementedError: always, until an intent classifier is provided.
    """
    raise NotImplementedError("intent_classification is not implemented yet")

In [None]:
# Training utterances grouped by conversation phase; each phase starts with
# its own empty list.
train_data = {
    phase: []
    for phase in ('start', 'ongoing', 'interrupt', 'end')
}

# error handling

In [1]:
# generating data set (may use dictionary based synonym replacement)

99

In [2]:
# simple keyword matching?
# Rule-based grammar matching	

In [3]:
# list of available Datasets (built-in or .csv, .txt, .xls, folders with .png or .jpg), Algorithms (just simply ML or DL), and Tasks

In [5]:
def text_normalization(text):
    """Tokenize, lemmatize and POS-tag ``text``, dropping English stop words.

    The text is not lower-cased; only the stop-word comparison is
    case-insensitive.

    Args:
        text: Raw user utterance as a string.

    Returns:
        A list of ``(lemma, pos_tag)`` tuples as produced by ``pos_tag`` on
        the lemmatized tokens, in input order, without the entries whose
        lower-cased lemma is an NLTK English stop word or ``'please'``.
    """
    stop_words = set(stopwords.words('english'))
    stop_words.add('please')

    # Replace every character that is not an ASCII letter or digit with a
    # space. The class previously used the range ``A-z``, which also covers
    # ``[ \ ] ^ _`` and the backtick, so those symbols were left in the text.
    text = re.sub(r'[^a-zA-Z0-9]', ' ', text)
    tokens = nltk.word_tokenize(text)

    lemmatizer = wordnet.WordNetLemmatizer()
    # First letter of the POS tag -> part-of-speech code given to the
    # lemmatizer; every other tag is lemmatized as a noun.
    pos_by_tag_initial = {'V': 'v', 'J': 'a', 'R': 'r'}
    lemmas = [
        lemmatizer.lemmatize(token, pos_by_tag_initial.get(tag[:1], 'n'))
        for token, tag in pos_tag(tokens, tagset=None)
    ]

    # Tag again so the returned tags belong to the lemmatized forms, then
    # drop the stop words.
    return [
        (lemma, tag)
        for lemma, tag in pos_tag(lemmas)
        if lemma.lower() not in stop_words
    ]

In [6]:
# Example utterance for trying the pipeline by hand.
user_query = 'Hi! Please build an ML model that can classify 10 digits with MNIST data. Thanks~'

# Keyword vocabularies. The state handlers compare lower-cased tokens against
# these lists, so every entry is lower case.

# Conversation control.
GREETING_INPUTS = ["hello", "hi", "greetings", "sup", "what's up","hey", "morning", "afternoon", "evening", "night"]
CONFIRM_WORDS = ["yes", "yep", "okay", "ok", "sure", "certainly", "definitely", "absolutely", "go ahead", "cool", "right", "of course"]
DENY_WORDS = ["no", "nope", "na", "not yet", "not sure", "more", "not", "don't", "do not", "again"]
END_WORDS = ["goodbye", "bye", "see you later", "thanks", "thank", "thank you"]

# Modelling method (classic machine learning vs. deep learning).
ML_METHODS = ['machine learning', 'ml', 'machine']
DL_METHODS = ['deep learning', 'dl', 'neural network', 'nn', 'neural', 'network', 'deep']

# Task type.
CLASSIFICATIONS = ['class', 'classify', 'classification', 'classifier', 'discrete output']
REGRESSIONS = ['regress', 'regression', 'regressor', 'continuous output']

# Phrases announcing that the user brings their own data.
DATA_SOURCES = ['upload', '<url>', 'this data', 'this dataset', 'my data', 'my dataset']

# Data modality.
IMAGE_TYPES = ['image', 'picture', 'figure', 'art', 'draw', 'photo', 'photograph', 'portrait', 'painting', 'visual', 'illustration', 'symbol', 'view', 'vision', 'sketch', 'icon']
# 'draft' was previously misspelled 'darft' and so could never match.
TEXT_TYPES = ['text', 'word', 'message', 'writing', 'script', 'content', 'document', 'passage', 'context', 'essay', 'manuscript', 'paper', 'language', 'letter', 'written', 'write', 'character', 'note', 'draft']
TABLE_TYPES = ['structured', 'structure', 'tabular', 'table', 'relation', 'database', 'dataframe', 'frame', 'normal', 'excel', 'csv', 'file', 'summary', 'process']
# AVAIL_DATASETS = pd.read_csv('openml_datasets.csv')['name'].apply(lambda x: x.lower()).to_list()
# Built-in dataset names, taken from tensorflow_datasets.
AVAIL_DATASETS = tfds.list_builders()

# Phrase templates that name the target variable.  # regex ?
TARGET_VAR = [
    '<name> value',
    'predict <name>',
    'forecast <name>',
    'classify <name>',
]

# Keywords that request delivery by e-mail.
DELIVERY = [
    'by email',
    'by e-mail',
    'email',
    'e-mail',
]

# Canned replies for greetings.
GREETING_RESPONSES = [
    "Yep, It's nice to see you here! 🙌🏻",
    "Hey~",
    "*nods*",
    "Hi there!",
    "Hello",
    "I am glad! You are talking to me~~~",
]

# Canned replies for ending the conversation.
END_RESPONSES = [
    "See you then! 🙌🏻",
    "Bye~",
    "Goodluck!",
    "Hope to see you again~",
    "Goodbye!~",
    "Thanks~",
]

In [9]:
user_slot = {"method": None, "task": None, "data_source": None, "data_type": None, "dataset": None, "target": None, "delivery": 'chat'}

def standby_state(user_message):
    text = ''
    current_state = 'standby'
    global user_slot
    
    for message in user_message:
        if message.lower() in GREETING_INPUTS:
            text += random.choice(GREETING_RESPONSES) + '\n'
        elif message.lower() in END_WORDS:
            current_state = 'end'
            text = random.choice(END_RESPONSES)
        elif message.lower() in ML_METHODS:
            user_slot['method'] = 'ml'
        elif message.lower() in DL_METHODS:
            user_slot['method'] = 'dl'
        elif message.lower() in IMAGE_TYPES:
            user_slot['data_type'] = 'image'
        elif message.lower() in TEXT_TYPES:
            user_slot['data_type'] = 'text'
        elif message.lower() in TABLE_TYPES:
            user_slot['data_type'] = 'table'
        elif message.lower() in CLASSIFICATIONS:
            user_slot['task'] = 'cls'
        elif message.lower() in REGRESSIONS:
            user_slot['task'] = 'reg'
        elif message.lower() in AVAIL_DATASETS:
            user_slot['dataset'] = message.lower()
            user_slot['data_source'] = 'built_in'
            user_slot['target'] = 'label'
            tfds.load(message.lower())
        elif message.lower() in DATA_SOURCES:
            user_slot['data_source'] = 'user_define'
            user_slot['dataset'] = 'filepath'
        elif message.lower() in DELIVERY:
            user_slot['delivery'] = 'email'

    
    if current_state != 'end':
        if user_slot['method'] != None and user_slot['task'] != None and user_slot['data_source'] != None and user_slot['dataset'] != None and user_slot['target'] != None and user_slot['delivery'] != None:
            text += f"All you requested are well received! \n These are your requirements: {user_slot['method']}, {user_slot['task']}, {user_slot['dataset']}, {user_slot['target']}, and {user_slot['delivery']} \n Do you want to proceed?"
            current_state = 'await'

        else:
            text += 'I need more things to complete. Please specify the following list:\n'
            for key, value in user_slot.items():
                if value == None:
                    text += '- ' + re.sub('_', ' ', key) + '\n'

            current_state = 'active'
    else:
        current_state = 'standby'
        
    return text, current_state, user_slot


def active_state(user_message):
    text = ''
    current_state = 'active'
    global user_slot

    for message in user_message:
        if message.lower() in GREETING_INPUTS:
            text += random.choice(["Yep! We've already greeted!", "I've already known you~", "Please go to the next step!"]) + '\n'
        elif message.lower() in END_WORDS:
            current_state = 'standby'
            text = random.choice(END_RESPONSES)
        elif message.lower() in ML_METHODS:
            user_slot['method'] = 'ml'
        elif message.lower() in DL_METHODS:
            user_slot['method'] = 'dl'
        elif message.lower() in IMAGE_TYPES:
            user_slot['data_type'] = 'image'
        elif message.lower() in TEXT_TYPES:
            user_slot['data_type'] = 'text'
        elif message.lower() in TABLE_TYPES:
            user_slot['data_type'] = 'table'
        elif message.lower() in CLASSIFICATIONS:
            user_slot['task'] = 'cls'
        elif message.lower() in REGRESSIONS:
            user_slot['task'] = 'reg'
        elif message.lower() in AVAIL_DATASETS:
            user_slot['dataset'] = message.lower()
            user_slot['data_source'] = 'built_in'
            user_slot['target'] = 'label'
            tfds.load(message.lower())
        elif message.lower() in DATA_SOURCES:
            user_slot['data_source'] = 'user_define'
            user_slot['dataset'] = 'filepath'
        elif message.lower() in DELIVERY:
            user_slot['delivery'] = 'email'
    
    if current_state != 'standby':
        if user_slot['method'] != None and user_slot['task'] != None and user_slot['data_source'] != None and user_slot['dataset'] != None and user_slot['target'] != None and user_slot['delivery'] != None:
            text += f"All you requested are well received! \n These are your requirements: {user_slot['method']}, {user_slot['task']}, {user_slot['dataset']}, {user_slot['target']}, and {user_slot['delivery']} \n Do you want to proceed?"
            current_state = 'await'
        else:
            text += 'I need more things to complete. Please specify the following list:\n'
            for key, value in user_slot.items():
                if value == None:
                    text += '- ' + re.sub('_', ' ', key) + '\n'
            current_state = 'active'

    return text, current_state, user_slot

def await_state(user_message):
    text = ''
    current_state = 'await'
    global user_slot
    
    for message in user_message:
        if message.lower() in GREETING_INPUTS:
            text += random.choice(["Yep! We've already greeted!", "I've already known you~", "Please go to the next step!"]) + '\n'
        elif message.lower() in END_WORDS:
            current_state = 'standby'
            text = random.choice(END_RESPONSES)
        elif message.lower() in CONFIRM_WORDS:
            current_state = 'building'
        elif message.lower() in DENY_WORDS:
            current_state = 'await'
            text += 'Umm... Please check your requirement~'
        elif message.lower() in ML_METHODS:
            user_slot['method'] = 'ml'
        elif message.lower() in DL_METHODS:
            user_slot['method'] = 'dl'
        elif message.lower() in IMAGE_TYPES:
            user_slot['data_type'] = 'image'
        elif message.lower() in TEXT_TYPES:
            user_slot['data_type'] = 'text'
        elif message.lower() in TABLE_TYPES:
            user_slot['data_type'] = 'table'
        elif message.lower() in CLASSIFICATIONS:
            user_slot['task'] = 'cls'
        elif message.lower() in REGRESSIONS:
            user_slot['task'] = 'reg'
        elif message.lower() in AVAIL_DATASETS:
            user_slot['dataset'] = message.lower()
            user_slot['data_source'] = 'built_in'
            user_slot['target'] = 'label'
            tfds.load(message.lower())
        elif message.lower() in DATA_SOURCES:
            user_slot['data_source'] = 'user_define'
            user_slot['dataset'] = 'filepath'
        elif message.lower() in DELIVERY:
            user_slot['delivery'] = 'email'
    
    
    if current_state != 'building' and current_state != 'standby':   
        if user_slot['method'] != None and user_slot['task'] != None and user_slot['data_source'] != None and user_slot['dataset'] != None and user_slot['target'] != None and user_slot['delivery'] != None:
            text += f"All you requested are well received! \n These are your requirements: {user_slot['method']}, {user_slot['task']}, {user_slot['dataset']}, {user_slot['target']}, and {user_slot['delivery']} \n Do you want to proceed?"
            current_state = 'await'
    elif current_state == 'building':
        
        
    return text, current_state, user_slot

def building_state(user_message):
    text = ''
    current_state = 'building'
    global user_slot

    return text, current_state, user_slot

def response_text(current_state, user_message):
    """Run the handler of ``current_state`` on the user's utterance.

    Only the handler that belongs to ``current_state`` is called. The
    previous dict literal called all four handlers on every message, so each
    of them updated the slots and triggered its side effects regardless of
    the state the conversation was in.

    Args:
        current_state: One of ``'standby'``, ``'active'``, ``'await'`` or
            ``'building'``.
        user_message: Raw utterance string typed by the user.

    Returns:
        ``(text, next_state, user_slot)`` as produced by the handler. When
        the handler produced no text and the state did not change, ``text``
        is replaced by an "I don't understand" message. The tuple shape is
        kept in that case so callers can always unpack three values.

    Raises:
        KeyError: if ``current_state`` is not a known state.
    """
    tokens = [token for token, _ in text_normalization(user_message)]

    handlers = {
        'standby': standby_state,
        'active': active_state,
        'await': await_state,
        'building': building_state,
    }
    text, next_state, slots = handlers[current_state](tokens)

    # A silent state transition (e.g. await -> building) is a valid outcome,
    # so the fallback applies only when nothing happened at all.
    if text == '' and next_state == current_state:
        text = "I am sorry. I don't understand you. Please refer to the following examples."

    return text, next_state, slots

In [None]:
# The conversation starts in the idle state with a greeting from the bot.
current_state = 'standby'
print("Bot:", "Hi 👋🏻! I'm your model builder🧑🏻‍💻~ Just tell me which model do you want by simply following the examples below👇🏻.")

# Endless read-respond loop: input() never returns the None sentinel, so
# iter() keeps yielding one utterance per turn. Each turn passes the current
# state to response_text and carries the returned state into the next turn.
for user_query in iter(input, None):
    response, current_state, user_slots = response_text(current_state, user_query)
    print(response, current_state)

Bot: Hi 👋🏻! I'm your model builder🧑🏻‍💻~ Just tell me which model do you want by simply following the examples below👇🏻.


 


I need more things to complete. Please specify the following list:
- method
- task
- data source
- data type
- dataset
- target
 active


 


I need more things to complete. Please specify the following list:
- method
- task
- data source
- data type
- dataset
- target
 active


 


I need more things to complete. Please specify the following list:
- method
- task
- data source
- data type
- dataset
- target
 active


 I want an image classifier with MNIST data set.


I need more things to complete. Please specify the following list:
- method
 active


 I want a deep learning-based image classifier with MNIST data set.


I need more things to complete. Please specify the following list:
- method
 active


 I want a DL model for image classification with MNIST data set.


All you requested are well received! 
 These are your requirements: dl, cls, mnist, label, and chat 
 Do you want to proceed? await


 yes


 building


## AutoML

In [None]:
import tensorflow as tf
import autokeras as ak
import autosklearn as ask

In [None]:
def build_model(specs):
    """Placeholder for model construction; ``specs`` is not used yet.

    Returns:
        None.
    """
    return None