# Semantic search

Using sentence_transformers

In [3]:
from sentence_transformers import SentenceTransformer, util
import torch 


In [4]:
corpus = ['A man is eating food.',
          'A man is eating a piece of bread.',
          'The girl is carrying a baby.',
          'A man is riding a horse.',
          'A woman is playing violin.',
          'Two men pushed carts through the woods.',
          'A man is riding a white horse on an enclosed ground.',
          'A monkey is playing drums.',
          'A cheetah is running behind its prey.'
          ]

In [5]:
model = SentenceTransformer('all-MiniLM-L6-v2')
embeddings = model.encode(corpus, convert_to_tensor=True)

In [6]:
queries = ['A man is eating pasta.', 
    'Someone in a gorilla costume is playing a set of drums.', 
    'A cheetah chases prey on across a field.']

In [7]:
top_k = min(5, len(corpus))
for query in queries:
    query_embedding = model.encode(query, convert_to_tensor=True)

    scores = util.cos_sim(query_embedding, embeddings)[0]
    top_results = torch.topk(scores, k=top_k)

    print(f"Query: {query}")
    print(f"Top 5 most similar sentences in corpus:")

    for score, i in zip(top_results[0], top_results[1]):
        print(corpus[i], f"(Score: {score:.4f})")

Query: A man is eating pasta.
Top 5 most similar sentences in corpus:
A man is eating food. (Score: 0.7035)
A man is eating a piece of bread. (Score: 0.5272)
A man is riding a horse. (Score: 0.1889)
A man is riding a white horse on an enclosed ground. (Score: 0.1047)
A cheetah is running behind its prey. (Score: 0.0980)
Query: Someone in a gorilla costume is playing a set of drums.
Top 5 most similar sentences in corpus:
A monkey is playing drums. (Score: 0.6433)
A woman is playing violin. (Score: 0.2564)
A man is riding a horse. (Score: 0.1389)
A man is riding a white horse on an enclosed ground. (Score: 0.1191)
A cheetah is running behind its prey. (Score: 0.1080)
Query: A cheetah chases prey on across a field.
Top 5 most similar sentences in corpus:
A cheetah is running behind its prey. (Score: 0.8253)
A man is eating food. (Score: 0.1399)
A monkey is playing drums. (Score: 0.1292)
A man is riding a white horse on an enclosed ground. (Score: 0.1097)
A man is riding a horse. (Score: 

In [8]:
# Digging deeper - what is in embeddings?
embeddings.size()

# The tensor is 9x384. 9 sentences in the corpus with each embedding of length
# 384. This is what an individual embedding looks like. We will compute cosine
# similarity against these vectors given the query embedding.
embeddings[0]

tensor([ 3.3242e-02,  4.4061e-03, -6.2769e-03,  4.8379e-02, -1.3870e-01,
        -3.3617e-02,  1.0113e-01, -5.4385e-02, -4.3248e-02, -3.9941e-02,
         7.7863e-03, -1.2749e-02, -6.6830e-02, -1.7387e-02,  4.7451e-02,
        -5.7724e-02,  1.0189e-01, -9.1167e-04,  8.2261e-02, -5.0342e-02,
         6.7730e-02,  4.0877e-02, -3.5802e-02, -1.0068e-01, -6.6935e-03,
        -5.3169e-02,  1.0034e-01, -5.4614e-02, -2.2848e-02,  1.3839e-02,
         7.4866e-02, -6.1788e-02,  6.3922e-02,  1.6239e-02, -5.3230e-02,
        -3.8608e-02,  3.1528e-02, -8.1153e-02, -3.3143e-02, -5.3849e-04,
        -3.9607e-03, -1.5273e-02, -9.8642e-04,  9.5799e-02, -5.4292e-02,
         1.8457e-02, -1.0714e-01,  1.3888e-02,  3.9407e-02, -2.6924e-02,
        -9.1599e-02, -1.1420e-02,  3.3814e-02, -2.5844e-02,  6.4262e-02,
         1.2114e-02,  2.1777e-02,  9.1483e-02, -1.0504e-01, -2.1919e-02,
         3.1334e-02, -5.5160e-02,  2.8510e-02, -2.4123e-02,  4.9336e-02,
        -6.8366e-02, -1.9275e-02, -1.2098e-02, -2.4

In [1]:
# But for long text, we need to chunk and compute embeddings for each chunk.
text = """
I’ve long thought that Zig was an interesting programming language,
potentially more interesting than Rust in many respects given that Zig seems
to be targetting a more modern C-like language replacement whereas Rust firmly
looks like it is trying to take C++ out back like ol' yeller. Rust is
powerful, but the language is complicated, and no I’m not talking about the
borrow-checker (a completely genius idea) but the language itself is vast and
complex. Try and read a moderately complex Rust crate and it can be mind
boggling to work out what is going on.

On the other hand Zig, with a strong ethos guided by Andrew Kelley, has this
guiding light that there should be one way to do something in the language,
and that is something that I really appreciate in language design.

Last year I attempted to do Advent of Code 2020 in Zig, but the language was
just a little too fresh for me to get into. The documentation was basically
non-existent, and even getting the tools and working how to use them was too
confusing for me. On day one I gave up and switched to Rust instead. This year
though I was determined to try the whole challenge in Zig, and what a
difference a year has made to the language! The community is now massive,
there are GitHub templates for Advent of Code to just get you coding, and the
Zig documentation is so rich and detailed that I could pick up some of the
basic concepts quite quickly.

So now that I’ve completed Advent of Code 2021, I thought I’d share the good
and the bad about Zig, and some summary thoughts on the language.

Note: I’m assuming a base level of understanding about what Zig is here, there
are plenty of guides on the language available elsewhere!

The Good⌗ The best thing about Zig is that the language is small. There isn’t
even a foreach for like structure and Andrew has stated ‘While loops work, why
add another way?' and I really appreciate this approach. It means I am not
wondering about what tool to reach for when I want to do something, there is a
single tool with a single use. Especially when learning a language (for myself
and for anyone else that would want to pick it up) - brevity is key. I think
Rust got lost in trying to nicely provide so much of what C++ badly provides
that random users of the language looking at any arbitrary code written in
Rust suffer for the sheer breadth of the language. Zig’s approach here meant I
could read the standard library code and understand what it was doing (even
with all the comptime type fun!).

Nullability is fun in Zig - the fact that optionality is built into the
language with the ? prefix on types (so ?i32 is maybe-a-32-bit-integer) and
that they have combined this with pointers so that you can assume that any
pointer (*i32 for instance) isn’t null. This is great for the compiler, great
for the optimizer, and I think also great for the user.

How things are brought in from the standard library or general foreign code is
interesting:

const std = @import("std"); const print = std.debug.print; There is a builtin
compiler marco @import that does the heavy lifting of pulling in the code, and
then you assign this into a const some_var variable. This is really neat
because you could call that whatever you wanted (to avoid naming conflicts).
Also when you want to pull in definitions from within an imported package you
just use the same mechanism of assigning the package.with.a.thing.in-it into a
constant variable. Most other languages have a using foo::bar::haz::baz; type
mechanism for this, but having it use the same mechanism for a bunch of
different things means that you don’t have to switch in your head to another
tool. I hadn’t considered this language concept before using Zig, and its a
very good idea!

The fact all containers take an allocator on intialization, and you can only
get a heap pointer via an allocator is genius in Zig. Memory isn’t free, and
allocations are not cheap, and so making getting at heap allocations harder by
explicitly getting them through an allocator is a great thing.

Also the error mechanism in Zig is wonderful. Zig has this special prefix for
a type (for example !u32 means ‘an error or a u32') and you can cascade errors
from deep in Zig code with the try statement. So var x = try foo(); means x is
equal to the result of foo() unless there was an error in the result. If there
was an error, return from the function with the error now. This meant that you
don’t have the messy littering of if conditionals after every function that
you typically get in C, but you also don’t have the complete disaster that is
exceptions in C++/C#. Rust has a similar mechanism to this, but they use the
clunkier Result<T, E>. While Zig has effectively added another thing for the
frontend to handle by adding in a ! prefix on the types, the language is
certainly nicer for it.

The Bad⌗ There are a collection of things in Zig that I didn’t like. All
languages have things that any random subset of users won’t like, so I am not
saying Zig should change any of these or anything like that.

Initializing arrays is weird in Zig. Lets say you want to have a 0 initialized
array, you declare it like [_]u8{0} ** 4 which means I want an array, of type
u8, that is initialized to 0 and is 4 elements long. You get used to the
syntax, but it’s not intuitive.

For loops are a bit strange too - you write for (items) |item| {}, which means
you specify the container before the per-element variable. Mentally I think of
for as for something in many_things {} and so in Zig I constantly had to write
it wrong and then rewrite. Also you use the | character in Zig quite a lot,
and while this may just be a problem with Apple UK keyboards, actually getting
to the | character on my laptop was uncomfortable. When doing C/C++ or Rust,
you use the | character much less and so the pain of writing the character was
something I never noticed before. Jonathan Blow has gone on the record to say
that with his language, Jai, he spent a lot of time working out how easy it
would be to type common things, such that the more common an operation in the
language, the easier it would be to type. That seems to be missing here (well
at least for Apple UK keyboard layouts, I’d need to write Zig extensively on
another layout to know whether this was a universal thing!).

Switch statements where you want to have multiple arguments resolve to the
same code I wrote as a | b, whereas in Zig it is a, b. Nothing major with
this, but I constantly tripped up on this.

Zig test was a bit clunky - you have to specify the file you want to test. So
to test src/foo.zig, you’d do zig test src/foo.zig. I wanted something more
like Rust’s cargo test that’d find all tests and run them. Maybe Zig does have
this but I just didn’t find it?

And how you declare functions is a little strange. Like a function in a struct
would be:

const foo = struct { pub fn bar() {}
};
Everything in Zig is const x = blah;, so why are functions not const bar =
function() {};?

The Ugly⌗ Zig is still a little raw in a few areas. Some compile errors are
less than useful. For instance if you forgot to put !T on a return type, but
were using try in the body of the function, the compiler error was very
confusing. This is only really an issue for new Zig users (like I was when I
first hit this), because you quickly learn that when the compiler spits out
something less than useful and you are using try, check the return type first.
Occassionally Zig would spit out 100’s of lines of notes after an error,
giving me flashbacks to the C++ template mess errors you’d get.

The builtin compiler macros (that start with @) are a bit confusing. Some of
them have a leading uppercase, others a lowercase, and I never did work out
any pattern to them. Is it @as or @As? I still couldn’t tell you without
looking at the manual.

The type system in Zig is loose in some ways and tight in others. If Zig can
detect the type of the right hand side of a variable declaration, you don’t
need an explicit type. But if you had something like var x = 0; you have to
specify a type. It’d be nice for users (but obviously harder for the compiler
team!) if the compiler would be able to deduce these types too.

But the worst bit about Zig at present is the standard library documentation
is broken and non-existent. This is probably the one reason I wouldn’t
recommend Zig more generally at present, because I resorted to looking at the
source files of the standard library on GitHub to work out what I could do
with what provided stuff in the standard library. I know there is a plan that
with the new compiler frontend (written in Zig!) to fix this, so its just a
time problem.

Conclusion⌗ Overall my gut feeling is that Zig is about ready for developing
with for people like myself (coders that don’t mind a bit of pain to a lot of
benefit), but it is not quite ready for more general usage. Fixing the
standard library documentation would be my biggest priority if I worked on
Zig, because I think that is the only thing holding back general usage of the
toolchain.

One nugget of knowledge I’ve worked out though - Zig is not a replacement for
C. It is another replacement for C++. comptime which while amazingly powerful,
already has echoes of the hard to reason about C++ template code or Rust
generic mess, and there are still quite a few bits of syntatic sugar hiding
the real cost of certain operations (like the try error handling, there is
implicit branches everywhere when you use that).

This isn’t to say Zig is any lesser by being a much better C++ replacement
rather than a C replacement in my estimation, infact I’d argue that aslong as
Zig doesn’t fall into Rust’s trap of constantly adding yet more ways to do the
same damn thing and making the language that little bit harder for new people
to onboard with, then Zig once it hits a stable language around 1.0 will be my
recommended tool going forward.

I really enjoyed doing Advent of Code in Zig, and I think I’ll be writing more
software in Zig going forward. I’d highly recommend you check out the language
and the community around the language are a great group of people that have
been super helpful with my dumb onboarding questions."""

In [10]:
# Experiment (this may not be very efficient) - use an existing tokenizer
# to compute the chunks with an overlapping stride, convert those input_ids
# back into sentences and then compute the embeddings of those sentences

max_length = 384
stride = 128

tokens = model.tokenizer(
        text=text, 
        max_length=max_length, 
        truncation=True, 
        padding=True,
        return_overflowing_tokens=True, 
        stride=stride)


In [17]:
chunk_embeddings = []
for chunk in tokens["input_ids"]:
    sentence = model.tokenizer.decode(chunk)
    chunk_embeddings.append(model.encode(sentence, convert_to_tensor=True))


In [18]:
chunk_embeddings[0]

tensor([-1.4138e-01,  6.6156e-02,  3.2931e-02,  3.6521e-02,  4.7144e-02,
        -6.8262e-02,  7.5102e-03,  1.0493e-01, -2.3423e-02, -7.1747e-02,
        -8.0774e-02, -4.4349e-03,  2.6063e-02,  2.7176e-02, -3.3186e-02,
        -4.5205e-02, -1.8922e-02, -2.7432e-03, -8.8667e-03, -6.5326e-02,
        -3.9385e-02,  7.4160e-03,  8.4470e-02, -7.0141e-02,  6.5789e-02,
         1.7664e-02, -3.7019e-02,  3.7525e-02,  5.5856e-02,  3.7320e-02,
        -3.8438e-02,  1.0208e-01,  1.7007e-02, -2.7974e-02, -2.7059e-02,
         2.9268e-02,  1.6998e-02, -9.2559e-02, -6.0728e-02, -3.6281e-02,
        -9.5547e-02,  3.1904e-02, -2.9157e-02, -4.7804e-02,  2.8787e-02,
         2.6205e-02,  2.6024e-02, -5.3515e-02, -8.4800e-02, -3.4277e-02,
        -9.1479e-02, -4.9731e-02,  5.8830e-02, -4.1282e-02,  5.7669e-02,
         3.2735e-02,  1.3112e-03, -1.9877e-02, -3.2481e-03, -1.6073e-02,
        -2.0735e-02, -1.1423e-02,  1.3211e-02,  2.8932e-02,  4.0752e-02,
        -1.0955e-02,  1.0758e-02,  5.9942e-02, -5.8