# Chatbot
by Muhammad Jaysy Ansharulloh
## Import datasets

In [None]:
# Locations of the two dataset files, given relative to the working directory.
intents_filepath = 'datasets/intents.json'
entities_filepath = 'datasets/entities.json'

In [None]:
import json

In [None]:
# Parse both dataset files into Python objects.
# The encoding is passed explicitly: without it open() falls back to the
# platform's locale encoding, which can mis-decode non-ASCII text in the
# JSON files on some systems.
with open(intents_filepath, encoding='utf-8') as fi:
    intents = json.load(fi)

with open(entities_filepath, encoding='utf-8') as fe:
    entities = json.load(fe)

In [None]:
# Dump the parsed intents data to check that it loaded.
print(intents)

In [None]:
# Dump the parsed entities data to check that it loaded.
print(entities)

## JSON to dataframe

In [None]:
import pandas as pd

In [None]:
# Flatten the parsed intents into a table. Both names are bound to the very
# same DataFrame object (no copy is made), so raw_df_intents simply keeps a
# second handle on the unprocessed data.
raw_df_intents = df_intents = pd.json_normalize(intents)

In [None]:
# Bare expression: shown as the cell's output when run in a notebook.
df_intents

## Process Text
### download and import required libraries and dictionaries

In [None]:
import nltk

# Language whose stop-word list is loaded in a later cell.
language = 'english'

# Data packages needed by the tokenizers, the stop-word filter and the
# WordNet lemmatizer used below. Recent NLTK releases load 'punkt_tab' in
# sent_tokenize/word_tokenize; 'punkt' is kept for older installations.
for resource in ('punkt', 'punkt_tab', 'stopwords', 'wordnet'):
    nltk.download(resource)

In [None]:
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer, WordNetLemmatizer
from nltk.corpus import wordnet

# Stop words for `language`, held in a set because the preprocessing loop
# tests every token for membership in it.
stop_words = set(stopwords.words(language))

In [None]:
import re

### text preprocessing

In [None]:
# Map each intent to the flat list of normalised tokens taken from all of its
# example texts (lower-cased, punctuation stripped, stop words removed,
# lemmatized as verbs).
intent_dictionary = {}

# Loop invariants: build these once instead of once per row / per sentence.
non_word_pattern = re.compile(r'[^\w]')
lemmatizer = WordNetLemmatizer()

for _, row in df_intents.iterrows():
    # Replace every non-word character of the lower-cased text with a space.
    cleaned_text = non_word_pattern.sub(' ', row['text'].lower())

    words = []
    # NOTE(review): punctuation has already been stripped at this point, so
    # sentence splitting has little left to split on; it is kept so that the
    # tokenisation result stays exactly as before.
    for sentence in nltk.sent_tokenize(cleaned_text):
        # Drop stop words, then lemmatize the remaining tokens as verbs.
        # No stemming is applied.
        words.extend(
            lemmatizer.lemmatize(token, wordnet.VERB)
            for token in nltk.word_tokenize(sentence)
            if token not in stop_words
        )

    # An intent gets an entry even when none of its tokens survive filtering.
    intent_dictionary.setdefault(row['intent'], []).extend(words)

In [None]:
# Bare expression: shown as the cell's output when run in a notebook.
intent_dictionary

### dictionary to json to dataframe

In [None]:
# One record per intent: the intent name plus all of its tokens joined into a
# single space-separated string.
json_intent = [
    {'intent': intent_name, 'text': ' '.join(tokens)}
    for intent_name, tokens in intent_dictionary.items()
]

In [None]:
# Dump the per-intent records built in the previous cell.
print(json_intent)

In [None]:
# Rebind df_intents to the aggregated data: one row per record in json_intent,
# with the record keys 'intent' and 'text' as columns.
df_intents = pd.DataFrame(json_intent)

In [None]:
# Bare expression: shown as the cell's output when run in a notebook.
df_intents

## Scoring

In [None]:
from sklearn.feature_extraction.text import CountVectorizer

In [None]:
# The 'text' column of df_intents as a plain Python list, one string per row.
texts = df_intents['text'].tolist()

In [None]:
# Bare expression: shown as the cell's output when run in a notebook.
texts

### create model

In [None]:
# Fit a CountVectorizer with default settings on the preprocessed texts and
# keep the transformed result (densified with .toarray() in later cells).
count_vectorizer = CountVectorizer()
intent_values = count_vectorizer.fit_transform(texts)

### model to dataframe

In [None]:
# Column labels for the count matrix: one per term in the fitted vocabulary.
# get_feature_names() was deprecated in scikit-learn 1.0 and removed in 1.2,
# so prefer get_feature_names_out() and fall back only on older releases.
# list() keeps feature_names a plain list, as it was before.
if hasattr(count_vectorizer, 'get_feature_names_out'):
    feature_names = list(count_vectorizer.get_feature_names_out())
else:
    feature_names = count_vectorizer.get_feature_names()
# Last expression of the cell: shown as the cell's output in a notebook.
pd.DataFrame(intent_values.toarray(), columns=feature_names)

## Create ML Model

In [None]:
# Feature matrix: the vectorizer output converted to a dense array.
X = intent_values.toarray()
# Labels: the first column of df_intents, which is 'intent' given the key
# order used when the json_intent records were built.
y = df_intents.iloc[:, 0]

In [None]:
# Inspect the feature matrix and the label column.
print(X)
print(y)

### train test split

In [None]:
from sklearn.model_selection import train_test_split

In [None]:
# Hold out part of the data for evaluation, using the default split size.
# random_state is pinned so that re-running the cell yields the same split;
# without it every run shuffles differently.
# NOTE(review): df_intents holds one aggregated row per intent, so each label
# that lands in the test split is absent from the training split. Confirm
# this is the intended setup before fitting a classifier on it.
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=42)

In [None]:
# Inspect the four pieces produced by the train/test split.
print(X_train)
print(X_test)
print(y_train)
print(y_test)