## Development

In [5]:
# All imports for the development section, grouped in one place so a fresh
# kernel fails immediately if a dependency is missing.
import nltk
import numpy as np
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from sklearn.feature_extraction.text import CountVectorizer

# NLTK data needed below; quiet=True suppresses the download progress output.
# Recent NLTK releases load the 'punkt_tab' resource for word_tokenize, while
# older releases use 'punkt', so both are requested.
nltk.download('punkt', quiet=True)
nltk.download('punkt_tab', quiet=True)
nltk.download('stopwords', quiet=True)

# English stop words; tokens are lower-cased before being compared against it.
stop = set(stopwords.words("english"))

# Shared vectorizer: bow() re-fits it on a corpus and look_up() reuses the
# fitted vocabulary to encode queries, so the two must be used as a pair.
vectorizer = CountVectorizer()

In [5]:
def bow(corpus):
    """Build a bag-of-words count matrix for ``corpus``.

    Each document is tokenised with ``word_tokenize``, lower-cased and
    stripped of the tokens found in ``stop``; the surviving tokens are
    re-joined with single spaces and handed to the module-level
    ``vectorizer``.

    Side effect: the module-level ``vectorizer`` is re-fitted on this corpus,
    replacing whatever vocabulary it held before.

    Parameters
    ----------
    corpus : iterable of str
        The documents to vectorise.

    Returns
    -------
    numpy.ndarray
        Dense array with one row per document and one column per vocabulary
        term of the freshly fitted vectorizer.
    """
    cleaned_docs = []
    for doc in corpus:
        lowered = (token.lower() for token in word_tokenize(doc))
        cleaned_docs.append(" ".join(token for token in lowered if token not in stop))

    term_counts = vectorizer.fit_transform(cleaned_docs)
    return np.array(term_counts.toarray())

# Toy corpus used for the experiments in this section.
allsentences = [
    "Joe waited for the train",
    "The train was late",
    "Mary and Samantha took the bus",
]

# Lower-case every token and drop stop words (the same filtering bow() does),
# then glue the remaining tokens back into one string per sentence.
filtered_allsentences = [
    " ".join(tok for tok in map(str.lower, word_tokenize(sentence)) if tok not in stop)
    for sentence in allsentences
]

# Count matrix for the raw sentences: stop words remain in the vocabulary.
X = vectorizer.fit_transform(allsentences).toarray()
print(X, '\n')

# Count matrix for the filtered sentences. The vectorizer is re-fitted, so the
# columns here refer to a different vocabulary than the matrix printed above.
X = vectorizer.fit_transform(filtered_allsentences).toarray()
print(X)

[[0 0 1 1 0 0 0 1 0 1 1 0]
 [0 0 0 0 1 0 0 1 0 1 0 1]
 [1 1 0 0 0 1 1 1 1 0 0 0]] 

[[0 1 0 0 0 0 1 1]
 [0 0 1 0 0 0 1 0]
 [1 0 0 1 1 1 0 0]]


In [6]:
# Why the len diff?
# CountVectorizer lower-cases its input by default, so "The" and "the" end up
# in the same column, whereas the plain str.split() count below is
# case-sensitive and treats them as two different words.
print(allsentences)
# Note: this re-fits the shared vectorizer on the unfiltered sentences.
X = vectorizer.fit_transform(allsentences).toarray()
print(X.shape[1])

# count number of unique words (whitespace-split, case-sensitive)
counts = {word for s in allsentences for word in s.split()}
len(counts)

['Joe waited for the train', 'The train was late', 'Mary and Samantha took the bus']
12


13

## Look up best sentence given query

In [7]:
# Vectorise the toy sentences with bow(). Besides returning the count matrix,
# this re-fits the shared vectorizer on the stop-word-filtered sentences, which
# is the vocabulary look_up() will use to encode queries.
corpus = bow(allsentences)
corpus

array([[0, 1, 0, 0, 0, 0, 1, 1],
       [0, 0, 1, 0, 0, 0, 1, 0],
       [1, 0, 0, 1, 1, 1, 0, 0]], dtype=int64)

In [8]:
def look_up(query, corpus, top_hits=1):
    """Return the indices of the documents that best match ``query``.

    The query is tokenised, lower-cased and stripped of stop words, encoded
    with the module-level ``vectorizer`` (which must already be fitted), and
    scored against every row of ``corpus`` with a dot product. Scores are raw
    count overlaps with no length normalisation, so a document that repeats
    the query terms many times scores higher than one that mentions them once.

    Parameters
    ----------
    query : str
        Free-text query.
    corpus : numpy.ndarray
        Document-term matrix, one row per document. Its columns must match
        the vocabulary the vectorizer is currently fitted on (i.e. the matrix
        from the most recent ``bow`` call); otherwise the matrix product
        fails on mismatched shapes.
    top_hits : int, default 1
        Number of document indices to return.

    Returns
    -------
    numpy.ndarray
        Row indices into ``corpus`` ordered from best to worst score, of
        length ``top_hits`` (empty when ``top_hits`` is 0).

    Raises
    ------
    AssertionError
        If ``top_hits`` is negative or larger than the number of documents.
    """
    assert top_hits <= len(corpus), "top_hits must be less than or equal to the number of documents in the corpus"
    assert top_hits >= 0, "top_hits must be non-negative"

    # filter query
    tokens = [word.lower() for word in word_tokenize(query)]
    filtered_query = " ".join(word for word in tokens if word not in stop)

    # transform() returns a 1 x vocabulary matrix; take its single row.
    query_vector = vectorizer.transform([filtered_query]).toarray()[0]
    scores = query_vector @ corpus.T

    # Reverse first, then truncate. Slicing with [-top_hits:] would return
    # *every* index for top_hits == 0, because -0 == 0.
    return scores.argsort()[::-1][:top_hits]

# Sanity check: querying with the first sentence verbatim should return that
# sentence's own row index.
look_up("Joe waited for the train", corpus, top_hits=1)

array([0], dtype=int64)

## Applied to history data set

In [9]:
import pickle

In [10]:
# Get book
book_name = 'ww2'
embedding_length = 1000
embeddings_path = f'neural_search/Embeddings/{book_name}_context_embeddings_{embedding_length}.pkl'

# NOTE: unpickling can execute arbitrary code; only load files from a trusted
# source.
with open(embeddings_path, 'rb') as f:
    context_embeddings = pickle.load(f)

# The keys of the loaded mapping are used as the text passages of the book
# (presumably each key is a passage and each value its embedding; verify
# against whatever produced the file). `book` is only ever bound to this list;
# the book's short name lives in `book_name`.
book = list(context_embeddings.keys())

In [11]:
# Vectorise every passage of the book. This re-fits the shared vectorizer, so
# from here on look_up() encodes queries with the book's vocabulary.
corpus = bow(book)
corpus.shape

(251, 4736)

In [12]:
# Index of the passage with the highest raw term-count overlap with the query
# (top_hits defaults to 1).
look_up("The Origins of the Second World War", corpus)

array([239], dtype=int64)

In [13]:
# 239 is the index returned by the look-up in the previous cell; update it if
# the query or the underlying book data changes.
# The hit is a passage that repeats the query title several times, which the
# unnormalised count-based score rewards.
book[239]

' Bell’s The Origins of the Second World War in Europe (London, 1986) Slightly dated but still useful is Esmonde Robertson’s edited collection  of essays, The Origins of the Second World War (London, 1971), with contributions from  A.J.P.Taylor, Alan Bullock, Hugh Trevor-Roper and Tim Mason amongst others Clearly  students will want to see why there was so much controversy over Taylor’s interpretation  by reading A.J.P.Taylor, The Origins of the Second World War (London, 1961; second  edition, with new introduction, 1963) This should be read in conjunction with two  stimulating collections of essays edited by Gordon Martel, The Origins of the Second  World War Reconsidered: The A.J.P.Taylor Debate After Twenty-Five Years (London,  1986) and The Origins of the Second World War Reconsidered: A.J.P.Taylor and the  Historians (London, 1999)'