# Context Classification

In [1]:
# Import spaCy and load the 'en_core_web_sm' pipeline; `nlp` is what the
# cleaning function below calls to parse a sentence into tokens.
import spacy
nlp = spacy.load('en_core_web_sm')

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
# Build the punctuation and stop-word collections used to filter tokens
import string
punct = string.punctuation  # a single str, so `x in punct` is a substring test
from spacy.lang.en.stop_words import STOP_WORDS
stopwords = list(STOP_WORDS) # list copy of spaCy's English stop-word set

In [3]:
# creating a function for data cleaning
def text_data_cleaning(sentence):
  """Normalise a sentence into a list of filtered, lower-cased tokens.

  The sentence is parsed with the module-level `nlp` pipeline. Each token is
  replaced by its lemma (lower-cased and stripped of surrounding whitespace),
  except tokens whose lemma is the "-PRON-" placeholder, which keep their
  lower-cased surface form. Tokens found in `stopwords` or in `punct` are
  then dropped.

  Args:
    sentence: Text to parse with `nlp`.

  Returns:
    The surviving normalised tokens, in their original order.
  """
  normalised = [
    tok.lower_ if tok.lemma_ == "-PRON-" else tok.lemma_.lower().strip()
    for tok in nlp(sentence)
  ]

  # Stopwords and punctuation removal
  return [
    word for word in normalised
    if not (word in stopwords or word in punct)
  ]

In [4]:
# import dependencies
import pickle
# Load the saved model from file.
# NOTE(review): pickle.load can execute arbitrary code while unpickling, so
# only load 'Context_selector.pkl' when it comes from a trusted source.
# NOTE(review): the recorded output of this cell shows warnings that link to
# scikit-learn's model-persistence limitations page; presumably the pickle was
# written by a different scikit-learn version. Confirm, and re-export if so.
with open('Context_selector.pkl', 'rb') as f:
    model = pickle.load(f)

  model = pickle.load(f)
https://scikit-learn.org/stable/model_persistence.html#security-maintainability-limitations
https://scikit-learn.org/stable/model_persistence.html#security-maintainability-limitations
https://scikit-learn.org/stable/model_persistence.html#security-maintainability-limitations
https://scikit-learn.org/stable/model_persistence.html#security-maintainability-limitations
https://scikit-learn.org/stable/model_persistence.html#security-maintainability-limitations


### Testing with user examples

In [5]:
# Ask the loaded classifier for the context of a food-related request
sent = ["give me some eastern food suggestions"]
prediction = model.predict(sent)
print(prediction)

['suggestions']


In [6]:
# Ask the loaded classifier for the context of a book-related request
sent = ["Can you add some sci-fi novels ?"]
prediction = model.predict(sent)
print(prediction)

['suggestions']


## Using a Bi-gram Model to Predict the Suitable Response

#### Using n-grams - spaCy

In [15]:
# Using an n-gram (bi-gram) model to predict the suitable response
import spacy

# Load the spaCy model
nlp = spacy.load('en_core_web_sm')


def _token_ngrams(doc, n):
    """Return each run of `n` consecutive tokens of `doc` as a space-joined string.

    Args:
        doc: A parsed spaCy document (any sliceable sequence of tokens works).
        n: Number of tokens per n-gram.

    Returns:
        The n-gram strings in document order; empty when `doc` has fewer
        than `n` tokens.
    """
    return [
        ' '.join(str(token) for token in doc[i:i+n])
        for i in range(len(doc)-n+1)
    ]


# Define the list of possible responses
response_list = ["Michael Crichton wrote Jurassic Park", "Lee Child wrote Killing Floor", "Michael Crichton wrote Andromeda Strain"]

# Define the user input
user_input = "Who wrote Andromeda Strain?"

# Convert the user input to a spaCy Doc object
doc = nlp(user_input)

# Define the number of tokens per n-gram (2 = bi-gram)
n = 2

# Generate the n-grams from the user input
user_ngrams = _token_ngrams(doc, n)
# Built once here because it does not change between responses
user_ngram_set = set(user_ngrams)

# Loop through the possible responses and find the one with the highest n-gram overlap.
# Matching is exact (case-sensitive, punctuation tokens included). If no response
# shares an n-gram with the input, best_response stays "".
max_overlap = 0
best_response = ""
for response in response_list:
    # Convert the response to a spaCy Doc object
    response_doc = nlp(response)

    # Generate the n-grams from the response
    response_ngrams = _token_ngrams(response_doc, n)

    # Count the distinct n-grams shared with the user input
    overlap = len(user_ngram_set & set(response_ngrams))

    # Update the best response only on a strictly higher overlap, so the
    # earliest response wins ties
    if overlap > max_overlap:
        max_overlap = overlap
        best_response = response

# Print the best response
print(best_response)


Michael Crichton wrote Andromeda Strain


In [18]:
# n-grams of the last response the matching loop processed (the variable is
# reassigned on every iteration)
response_ngrams

['Michael Crichton', 'Crichton wrote', 'wrote Andromeda', 'Andromeda Strain']

In [19]:
# n-grams extracted from the user input
user_ngrams

['Who wrote', 'wrote Andromeda', 'Andromeda Strain', 'Strain ?']

In [20]:
# Highest count of distinct n-grams that any response shared with the user input
max_overlap

2

#### Using ngrams - NLTK

In [23]:
import nltk
from nltk.tokenize import word_tokenize
from nltk.util import ngrams

# Candidate answers to choose from
response_list = [
    "Michael Crichton wrote Jurassic Park",
    "Lee Child wrote Killing Floor",
    "Michael Crichton wrote Andromeda Strain",
]

# Question to answer
user_input = "Who wrote Andromeda Strain ?"

# Number of tokens per n-gram (2 = bi-gram)
n = 2

# Tokenize the question and collect its n-grams as tuples of tokens
user_tokens = word_tokenize(user_input)
user_ngrams = list(ngrams(user_tokens, n))

# Walk the candidates, keeping the first one whose overlap strictly beats the
# best seen so far; best_response stays "" when nothing overlaps
max_overlap = 0
best_response = ""
for response in response_list:
    # Tokenize the candidate and build its n-grams
    response_tokens = word_tokenize(response)
    response_ngrams = list(ngrams(response_tokens, n))

    # Number of distinct n-grams shared with the question
    overlap = len(set(response_ngrams).intersection(user_ngrams))

    # Skip candidates that do not improve on the current best
    if overlap <= max_overlap:
        continue
    max_overlap = overlap
    best_response = response

# Show the winning response
print(best_response)


Michael Crichton wrote Andromeda Strain


## Using a Bi-gram Model for Probability Estimation

In [11]:
# Estimate n-gram probabilities as relative frequencies
import collections

# Input sentence, split on whitespace into word tokens
text = "The quick brown fox jumps over the lazy dog"
tokens = text.split()

# Number of tokens per n-gram (2 = bi-gram)
n = 2

# Slide a window of width n across the tokens.
# NOTE: this rebinds the name `ngrams`, which the NLTK cell imports from nltk.util.
ngrams = [tuple(tokens[start:start+n]) for start in range(len(tokens)-n+1)]

# Tally how often each n-gram occurs, and how many there are overall
freq_table = collections.Counter(ngrams)
total_ngrams = len(ngrams)

# Probability of an n-gram = its count divided by the total number of n-grams
prob_table = {gram: count / total_ngrams for gram, count in freq_table.items()}

# Report both tables
print("Frequency table:")
print(freq_table)
print("Probability table:")
print(prob_table)


Frequency table:
Counter({('The', 'quick'): 1, ('quick', 'brown'): 1, ('brown', 'fox'): 1, ('fox', 'jumps'): 1, ('jumps', 'over'): 1, ('over', 'the'): 1, ('the', 'lazy'): 1, ('lazy', 'dog'): 1})
Probability table:
{('The', 'quick'): 0.125, ('quick', 'brown'): 0.125, ('brown', 'fox'): 0.125, ('fox', 'jumps'): 0.125, ('jumps', 'over'): 0.125, ('over', 'the'): 0.125, ('the', 'lazy'): 0.125, ('lazy', 'dog'): 0.125}
