NLP-BASED CHATBOT

In [9]:
#TOKENIZATION
from nltk.tokenize import word_tokenize
#STOPWORDS LIST
from nltk.corpus import stopwords
#stemming
from nltk.stem import PorterStemmer
#lemmatization
from nltk.stem import WordNetLemmatizer
import string
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
import numpy as np
import nltk
# Fetch the NLTK data packages this notebook relies on. Packages that are
# already present are reported as up to date instead of being fetched again.
nltk.download('stopwords')  # stopword corpus read via stopwords.words("english")
nltk.download('punkt')  # tokenizer data; presumably required by word_tokenize
nltk.download('punkt_tab')  # NOTE(review): which punkt package is needed depends on the NLTK version - confirm
nltk.download('wordnet')  # WordNet data; presumably required by WordNetLemmatizer

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package punkt_tab to /root/nltk_data...
[nltk_data]   Package punkt_tab is already up-to-date!
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


True

In [10]:

# Example utterances grouped by the intent they should be classified as.
# Insertion order matters: it fixes the order of the flattened list below.
_EXAMPLES_BY_INTENT = {
    "greet": [
        "hello",
        "hi",
        "hello there",
        "hey",
        "good morning",
        "good afternoon",
        "good evening",
    ],
    "weather": [
        "what is the weather today",
        "what is the temperature today",
        "what is the humidity today",
        "is it raining",
    ],
    "open_web": [
        "open google",
        "open youtube",
        "open facebook",
        "go to google",
        "go to youtube",
        "go to facebook",
    ],
    "exit": [
        "bye",
        "goodbye",
        "see you later",
        "bye bye",
        "quit",
    ],
}

# Flat list of (utterance, intent) pairs used as the labelled training set.
training_data = [
    (utterance, intent_name)
    for intent_name, utterances in _EXAMPLES_BY_INTENT.items()
    for utterance in utterances
]

In [11]:
# Split the (utterance, intent) pairs into two parallel lists:
# the raw texts are the model inputs (X), the intent names are the targets (y).
sentences = [utterance for utterance, _ in training_data]
labels = [intent_name for _, intent_name in training_data]

In [12]:
def preprocess_text(documents, verbose=True):
    """Normalise raw text documents into cleaned, space-joined token strings.

    Each document is lowercased, tokenised with ``word_tokenize``, stripped of
    English stopwords and punctuation-only tokens, lemmatised as a verb with
    ``WordNetLemmatizer`` and finally reduced to purely alphabetic tokens.

    Parameters
    ----------
    documents : iterable of str, or str
        Texts to clean. A bare string is treated as a single document instead
        of being iterated character by character.
    verbose : bool, default True
        When True, print the intermediate result of every step for each
        document (the function's original tracing behaviour). Pass False to
        silence the trace, e.g. when cleaning user messages for prediction.

    Returns
    -------
    list of str
        One cleaned string per input document, in input order. A document
        whose tokens are all filtered out yields an empty string.
    """
    # A str is itself iterable; without this guard "hello" would be processed
    # as five one-character documents.
    if isinstance(documents, str):
        documents = [documents]

    # Loop-invariant resources, built once per call rather than once per
    # document. Sets give constant-time membership tests.
    english_stopwords = set(stopwords.words("english"))
    punctuation_chars = set(string.punctuation)
    wnet = WordNetLemmatizer()

    def trace(*parts):
        # Emit a debug line only when tracing is requested.
        if verbose:
            print(*parts)

    cleaned_documents = []
    for doc in documents:
        # Step 1: lowercase
        raw_text = doc.lower()
        trace("After lowercase: ", raw_text)

        # Step 2: tokenise
        tokens = word_tokenize(raw_text)
        trace("Tokens:", tokens)

        # Step 3: drop stopwords
        filtered_tokens = [word for word in tokens if word not in english_stopwords]
        trace("Filtered Tokens :", filtered_tokens)

        # Step 4: drop tokens made up solely of punctuation characters.
        # A substring test against string.punctuation would miss multi-character
        # tokens such as "..."; checking every character catches them.
        clean_tokens = [
            word
            for word in filtered_tokens
            if not all(ch in punctuation_chars for ch in word)
        ]
        trace("After removing punctuations:", clean_tokens)

        # Step 5: lemmatise, treating every token as a verb ("v")
        lemmatized_words = [wnet.lemmatize(word, "v") for word in clean_tokens]
        trace("After Lemmatization :", lemmatized_words)

        # Step 6: keep purely alphabetic tokens; anything containing digits or
        # symbols (e.g. "linked.in") is discarded here.
        final_tokens = [word for word in lemmatized_words if word.isalpha()]
        trace("Final Tokens:", final_tokens)

        cleaned_text = " ".join(final_tokens)
        trace("Cleaned Text:", cleaned_text)

        cleaned_documents.append(cleaned_text)
        trace("=" * 50)
    return cleaned_documents

# Clean every training sentence; the result is what the TF-IDF vectorizer is fitted on.
cleaned_sentences = preprocess_text(sentences)

After lowercase:  hello
Tokens: ['hello']
Filtered Tokens : ['hello']
After removing punctuations: ['hello']
After Lemmatization : ['hello']
Final Tokens: ['hello']
Cleaned Text: hello
After lowercase:  hi
Tokens: ['hi']
Filtered Tokens : ['hi']
After removing punctuations: ['hi']
After Lemmatization : ['hi']
Final Tokens: ['hi']
Cleaned Text: hi
After lowercase:  hello there
Tokens: ['hello', 'there']
Filtered Tokens : ['hello']
After removing punctuations: ['hello']
After Lemmatization : ['hello']
Final Tokens: ['hello']
Cleaned Text: hello
After lowercase:  hey
Tokens: ['hey']
Filtered Tokens : ['hey']
After removing punctuations: ['hey']
After Lemmatization : ['hey']
Final Tokens: ['hey']
Cleaned Text: hey
After lowercase:  good morning
Tokens: ['good', 'morning']
Filtered Tokens : ['good', 'morning']
After removing punctuations: ['good', 'morning']
After Lemmatization : ['good', 'morning']
Final Tokens: ['good', 'morning']
Cleaned Text: good morning
After lowercase:  good afternoo

In [13]:
# Convert the cleaned sentences into TF-IDF feature vectors. The fitted
# vectorizer is kept because new messages must be transformed with it later.
vectorizer=TfidfVectorizer()
X=vectorizer.fit_transform(cleaned_sentences)
# Train the intent classifier on the TF-IDF matrix and the intent labels.
model=LogisticRegression()
model.fit(X,labels)

In [14]:
# Try the trained pipeline on a message that is not in the training data.
user_msg="open linked.in"
# Apply the same cleaning as for the training sentences; the message is wrapped
# in a list because preprocess_text iterates over documents. Note that
# "linked.in" is not purely alphabetic, so only "open" survives the cleaning.
processed=preprocess_text([user_msg])
# transform (not fit_transform): reuse the vocabulary learned during training.
user_vector=vectorizer.transform(processed)
prediction=model.predict(user_vector)
print("Prediction is :",prediction)

After lowercase:  open linked.in
Tokens: ['open', 'linked.in']
Filtered Tokens : ['open', 'linked.in']
After removing punctuations: ['open', 'linked.in']
After Lemmatization : ['open', 'linked.in']
Final Tokens: ['open']
Cleaned Text: open
Prediction is : ['open_web']


In [15]:
import joblib

In [16]:
# Persist the trained classifier and the fitted vectorizer to the working
# directory; both are needed to classify a new message.
# NOTE(review): preprocess_text is not part of these artifacts, so whatever
# loads them presumably has to apply the same cleaning first - confirm.
joblib.dump(model,"intent_model.pkl")
joblib.dump(vectorizer,"vectorizer.pkl")

['vectorizer.pkl']