In [1]:
import pandas as pd
import sklearn
import nltk
nltk.download('punkt')

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\arvee\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!


True

In [2]:
columns = ['sent', 'class']
rows = []

rows = [['This is my book', 'stmt'], 
        ['They are novels', 'stmt'],
        ['have you read this book', 'question'],
        ['who is the author', 'question'],
        ['what are the characters', 'question'],
        ['This is how I bought the book', 'stmt'],
        ['I like fictions', 'stmt'],
        ['what is your favorite book', 'question']]

training_data = pd.DataFrame(rows, columns=columns)
training_data

Unnamed: 0,sent,class
0,This is my book,stmt
1,They are novels,stmt
2,have you read this book,question
3,who is the author,question
4,what are the characters,question
5,This is how I bought the book,stmt
6,I like fictions,stmt
7,what is your favorite book,question


In [3]:
from sklearn.feature_extraction.text import CountVectorizer

stmt_docs = [row['sent'] for index,row in training_data.iterrows() if row['class'] == 'stmt']

vec_s = CountVectorizer()
X_s = vec_s.fit_transform(stmt_docs)
tdm_s = pd.DataFrame(X_s.toarray(), columns=vec_s.get_feature_names())

tdm_s

Unnamed: 0,are,book,bought,fictions,how,is,like,my,novels,the,they,this
0,0,1,0,0,0,1,0,1,0,0,0,1
1,1,0,0,0,0,0,0,0,1,0,1,0
2,0,1,1,0,1,1,0,0,0,1,0,1
3,0,0,0,1,0,0,1,0,0,0,0,0


In [4]:
q_docs = [row['sent'] for index,row in training_data.iterrows() if row['class'] == 'question']

vec_q = CountVectorizer()
X_q = vec_q.fit_transform(q_docs)
tdm_q = pd.DataFrame(X_q.toarray(), columns=vec_q.get_feature_names())

tdm_q

Unnamed: 0,are,author,book,characters,favorite,have,is,read,the,this,what,who,you,your
0,0,0,1,0,0,1,0,1,0,1,0,0,1,0
1,0,1,0,0,0,0,1,0,1,0,0,1,0,0
2,1,0,0,1,0,0,0,0,1,0,1,0,0,0
3,0,0,1,0,1,0,1,0,0,0,1,0,0,1


In [5]:
word_list_s = vec_s.get_feature_names();    
count_list_s = X_s.toarray().sum(axis=0) 
freq_s = dict(zip(word_list_s,count_list_s))
freq_s

{'are': 1,
 'book': 2,
 'bought': 1,
 'fictions': 1,
 'how': 1,
 'is': 2,
 'like': 1,
 'my': 1,
 'novels': 1,
 'the': 1,
 'they': 1,
 'this': 2}

In [6]:
word_list_q = vec_q.get_feature_names();    
count_list_q = X_q.toarray().sum(axis=0) 
freq_q = dict(zip(word_list_q,count_list_q))
freq_q

{'are': 1,
 'author': 1,
 'book': 2,
 'characters': 1,
 'favorite': 1,
 'have': 1,
 'is': 2,
 'read': 1,
 'the': 2,
 'this': 1,
 'what': 2,
 'who': 1,
 'you': 1,
 'your': 1}

In [7]:
prob_s = []
for word,count in zip(word_list_s, count_list_s):
    prob_s.append(count/len(word_list_s))
dict(zip(word_list_s, prob_s))

{'are': 0.08333333333333333,
 'book': 0.16666666666666666,
 'bought': 0.08333333333333333,
 'fictions': 0.08333333333333333,
 'how': 0.08333333333333333,
 'is': 0.16666666666666666,
 'like': 0.08333333333333333,
 'my': 0.08333333333333333,
 'novels': 0.08333333333333333,
 'the': 0.08333333333333333,
 'they': 0.08333333333333333,
 'this': 0.16666666666666666}

In [8]:
prob_q = []
for count in count_list_q:
    prob_q.append(count/len(word_list_q))
dict(zip(word_list_q, prob_q))

{'are': 0.07142857142857142,
 'author': 0.07142857142857142,
 'book': 0.14285714285714285,
 'characters': 0.07142857142857142,
 'favorite': 0.07142857142857142,
 'have': 0.07142857142857142,
 'is': 0.14285714285714285,
 'read': 0.07142857142857142,
 'the': 0.14285714285714285,
 'this': 0.07142857142857142,
 'what': 0.14285714285714285,
 'who': 0.07142857142857142,
 'you': 0.07142857142857142,
 'your': 0.07142857142857142}

In [9]:

from sklearn.feature_extraction.text import CountVectorizer

docs = [row['sent'] for index,row in training_data.iterrows()]

vec = CountVectorizer()
X = vec.fit_transform(docs)

total_features = len(vec.get_feature_names())
total_features

21

In [10]:
total_cnts_features_s = count_list_s.sum(axis=0)
total_cnts_features_q = count_list_q.sum(axis=0)

print(total_cnts_features_q)
print(total_cnts_features_s)

18
15


In [11]:
from nltk.tokenize import word_tokenize
new_sentence = 'what is the price of the book'
new_word_list = word_tokenize(new_sentence)

In [12]:
prob_s_with_ls = []
for word in new_word_list:
    if word in freq_s.keys():
        count = freq_s[word]
    else:
        count = 0
    prob_s_with_ls.append((count + 1)/(total_cnts_features_s + total_features))
stmt_prob = dict(zip(new_word_list,prob_s_with_ls))
print(stmt_prob)

{'what': 0.027777777777777776, 'is': 0.08333333333333333, 'the': 0.05555555555555555, 'price': 0.027777777777777776, 'of': 0.027777777777777776, 'book': 0.08333333333333333}


In [13]:
prob_q_with_ls = []
for word in new_word_list:
    if word in freq_q.keys():
        count = freq_q[word]
    else:
        count = 0
    prob_q_with_ls.append((count + 1)/(total_cnts_features_q + total_features))
quest_prob = dict(zip(new_word_list,prob_q_with_ls))
print(quest_prob)

{'what': 0.07692307692307693, 'is': 0.07692307692307693, 'the': 0.07692307692307693, 'price': 0.02564102564102564, 'of': 0.02564102564102564, 'book': 0.07692307692307693}


In [14]:
from nltk.tokenize import word_tokenize
sentence = "What is the price of the book?"
sentence = sentence.lower()
tokenizer = nltk.RegexpTokenizer(r"\w+")
new_words = tokenizer.tokenize(sentence)
print(new_words)
prob_quest = 1
prob_stmt = 1
for word in new_words:
    prob_quest *= quest_prob[word]
    prob_stmt *= stmt_prob[word]
print("P(What is the price of the book|Question) = ",prob_quest)
print("P(What is the price of the book|Stmt) = ",prob_stmt)

['what', 'is', 'the', 'price', 'of', 'the', 'book']
P(What is the price of the book|Question) =  1.7707368464359263e-09
P(What is the price of the book|Stmt) =  4.5939365799778324e-10


In [15]:
print("P(Stmt|What is the price of the book) = P(what is the price of the book|Stmt)*P(Stmt)")
print("P(Stmt|What is the price of the book) = ",prob_stmt*0.5)

P(Stmt|What is the price of the book) = P(what is the price of the book|Stmt)*P(Stmt)
P(Stmt|What is the price of the book) =  2.2969682899889162e-10


In [16]:
print("P(Question|What is the price of the book) = P(what is the price of the book|Question)*P(Question)")
print("P(Question|What is the price of the book) = ",prob_quest*0.5)

P(Question|What is the price of the book) = P(what is the price of the book|Question)*P(Question)
P(Question|What is the price of the book) =  8.853684232179632e-10


In [17]:
assert(prob_quest>prob_stmt)
print("Therefore the new sentence ‘What is the price of the book’ will be classified as ‘Question’")

Therefore the new sentence ‘What is the price of the book’ will be classified as ‘Question’
