# Bag of Words

In [66]:
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
nltk.download('punkt', quiet=True)
nltk.download('stopwords', quiet=True)
stop = set(stopwords.words("english"))

from sklearn.feature_extraction.text import CountVectorizer
vectorizer = CountVectorizer()

In [67]:
def bog(corpus):
    filtered_corpus = [" ".join([word.lower() for word in word_tokenize(document) if word.lower() not in stop]) for document in corpus]
    return np.array(vectorizer.fit_transform(filtered_corpus).toarray())

allsentences = ["Joe waited for the train", "The train was late", "Mary and Samantha took the bus"]
filtered_allsentences = [" ".join([word.lower() for word in word_tokenize(sentence) if word.lower() not in stop]) for sentence in allsentences]

X = vectorizer.fit_transform(allsentences).toarray()
print(X,'\n')
X = vectorizer.fit_transform(filtered_allsentences).toarray()
print(X)

[[0 0 1 1 0 0 0 1 0 1 1 0]
 [0 0 0 0 1 0 0 1 0 1 0 1]
 [1 1 0 0 0 1 1 1 1 0 0 0]] 

[[0 1 0 0 0 0 1 1]
 [0 0 1 0 0 0 1 0]
 [1 0 0 1 1 1 0 0]]


In [68]:
# Why the len diff?
print(allsentences)
X = vectorizer.fit_transform(allsentences).toarray()
print(len(X[0]))

# count number of unique words
counts = set()
for s in allsentences:
    counts.update(s.split())
len(counts)

['Joe waited for the train', 'The train was late', 'Mary and Samantha took the bus']
12


13

## Look up best sentence given query

In [69]:
import numpy as np
corpus = np.array(X)
corpus

array([[0, 0, 1, 1, 0, 0, 0, 1, 0, 1, 1, 0],
       [0, 0, 0, 0, 1, 0, 0, 1, 0, 1, 0, 1],
       [1, 1, 0, 0, 0, 1, 1, 1, 1, 0, 0, 0]], dtype=int64)

In [70]:
bog_corpus = bog(allsentences)

In [72]:
def look_up(query, top_hits=0):
    assert top_hits <= len(corpus), "top_hits must be less than or equal to the number of documents in the corpus"
    
    # filter query
    query = " ".join([word.lower() for word in word_tokenize(query) if word.lower() not in stop])
    query = vectorizer.transform([query]).toarray()
    query = np.array(query[0])

    product = query @ bog_corpus.T
    # return top 5 results
    print(product)
    return np.argpartition(product, -top_hits)[-top_hits:][::-1] if top_hits else [product.argmax()]
    # return product.argsort()[-top_hits:][::-1] if top_hits else [product.argmax()]

    # calculate cosine similarity
    return query @ corpus.T / (np.linalg.norm(query) * np.linalg.norm(corpus, axis=1))

print(look_up("What is Joe waiting for?"), '\n')
print(look_up("What is Joe waiting for?", 3), '\n')
print(look_up("Was Mary too late for the the?"), '\n')
print(look_up("Was Mary too late for the the?", 3), '\n')

[1 0 0]
[0] 

[1 0 0]
[2 0 1] 

[0 1 1]
[1] 

[0 1 1]
[2 1 0] 



In [73]:
A = np.array([9, 4, 4, 3, 3, 9, 0, 4, 6, 0])
A = np.arange(10)
# shuffle A
np.random.shuffle(A)
print(A)
ind = np.argpartition(A, -4)[-4:][::-1]
print(ind)
# print(A[ind])
# print(ind[np.argsort(A[ind])])
# sort A
# A.argsort()
# print(A)
# A.sort()[-5:][::-1]

[4 5 2 1 9 0 7 8 3 6]
[4 6 7 9]


## Applied to history data set

In [31]:
# Get data
import pickle
with open(f'Neural Search/data/ww2.pkl', 'rb') as f:
    df_data = pickle.load(f)
df_data.reset_index(inplace=True)
df_data.head()

Unnamed: 0,index,context,question
0,0,The Origins of the Second World War 1933–194...,"What is the focus of the book ""The Origins of ..."
1,1,The Origins of the Second World War 1933–194...,Who is Ruth Henig and what is her background i...
2,2,The Origins of the Second World War 1933–194...,What are some of the factors that the book ana...
3,3,The Origins of the Second World War 1933–194...,How has the second edition of the book been up...
4,4,The Origins of the Second World War 1933–194...,What is included in the Guide to Further Readi...


In [32]:
# Create the corpus
corpus = df_data['context'].values

In [33]:
corpus = list(set(corpus))[0:10]
bog_corpus = bog(corpus)
bog_corpus.shape

(10, 580)

In [34]:
look_up("The Origins of the Second World War")

(10, 580)


7

In [35]:
list(set(corpus))[7]

' As the American historian Sally Marks has noted, Hitler’s aims  were more vast, his ideology very different and his methods much more confrontational  than those of any previous German leaders By the 1990s, Taylor’s interpretation of  Hitler and of the origins of the Second World War had been vigorously rejected by the  great majority of historians researching inter-war history David Kaiser’s verdict, in  Modern Germany Reconsidered, edited by Gordon Martel and published in 1992, was  that Taylor’s views that ‘Hitler did not intend war in 1939 and lacked a real plan for the  conquest of Europe and of the world, and that other governments played a crucial role in  unleashing German expansionism’ can no longer be regarded as valid  However, by this time, a new debate had been raging for some years about the extent  to which the course of Nazi foreign policy had been decisively and exclusively shaped by  Hitler'