## **Environment detection and dependency installation**

In [None]:
# Perform Google Colab installs (if running in Google Colab)
import os

if "COLAB_GPU" in os.environ:
    print("[INFO] Running in Google Colab, installing requirements.")
    !pip install -U torch # requires torch 2.1.1+ (for efficient sdpa implementation)
    !pip install tqdm # for progress bars
    !pip install sentence-transformers # for embedding models
    !pip install accelerate # for quantization model loading
    !pip install bitsandbytes # for quantizing models (less storage space)
#     !pip install flash-attn --no-build-isolation # for faster attention mechanism = faster LLM inference

[INFO] Running in Google Colab, installing requirements.


## **Import libraries**

In [None]:
import pandas as pd
import random
from spacy.lang.en import English
from tqdm import tqdm
import re
from sentence_transformers import SentenceTransformer
import torch
import numpy as np
re.compile('<title>(.*)title>')

re.compile(r'<title>(.*)title>', re.UNICODE)

## **Data Input**

In [None]:
# Load the raw text document by parsing it as a tab-separated file.
# NOTE(review): with read_csv's defaults the first line of the file becomes the
# column header (see the "Skip to main content" column name in the output) and
# blank lines are skipped, so this frame has fewer rows than the file has lines.
# NOTE(review): default CSV quoting is still active; a line containing a double
# quote may open a quoted field that swallows the following lines (the
# multi-line chunks containing "\r" further down suggest this happens) —
# confirm, and consider header=None / quoting=csv.QUOTE_NONE.
# NOTE(review): despite its name, `text_path` holds a DataFrame, not a path.
text_path = pd.read_csv("IELTS.txt", sep="\t")

In [None]:
text_path

Unnamed: 0,Skip to main content
0,Texts
1,Video
2,Audio
3,Software
4,Images
...,...
19926,University Printing House
19927,Shaftesbury Road
19928,Cambridge CB2 8BS
19929,UK


In [None]:
def open_and_read_txt(txt_path: str) -> list[dict]:
    """Read a UTF-8 text file and collect simple statistics for every line.

    Args:
        txt_path: Path of the text file to read.

    Returns:
        One dict per line of the file, in file order, with the keys:
        ``line_number`` (1-based), ``line_char_count``, ``line_word_count``,
        ``line_sentence_count_raw``, ``line_token_count`` and ``text``
        (the line with surrounding whitespace removed).

    Raises:
        OSError: If the file cannot be opened.
        UnicodeDecodeError: If the file is not valid UTF-8.
    """
    pages_and_texts = []

    # Stream the file line by line instead of loading every line up front.
    with open(txt_path, "r", encoding="utf-8") as file:
        for line_number, line in enumerate(file, start=1):
            text = line.strip()  # Remove surrounding whitespace, including the newline
            pages_and_texts.append({
                "line_number": line_number,  # Current line number, starting from 1
                "line_char_count": len(text),  # Number of characters
                # split() without an argument ignores repeated whitespace and
                # yields 0 words for an empty line (split(" ") reported 1).
                "line_word_count": len(text.split()),
                # Rough sentence count based on ". " separators; an empty line has none.
                "line_sentence_count_raw": len(text.split(". ")) if text else 0,
                "line_token_count": len(text) / 4,  # Estimated number of tokens (1 token is about 4 characters)
                "text": text,  # The text content of the current line
            })

    return pages_and_texts


# Call the function to read the IELTS.txt file
txt_path = "IELTS.txt"
lines_and_texts = open_and_read_txt(txt_path)

# View the results of the first two rows
lines_and_texts[:2]

[{'line_number': 1,
  'line_char_count': 20,
  'line_word_count': 4,
  'line_sentence_count_raw': 1,
  'line_token_count': 5.0,
  'text': 'Skip to main content'},
 {'line_number': 2,
  'line_char_count': 0,
  'line_word_count': 1,
  'line_sentence_count_raw': 1,
  'line_token_count': 0.0,
  'text': ''}]

In [None]:
random.sample(lines_and_texts, k=3)

[{'line_number': 3075,
  'line_char_count': 0,
  'line_word_count': 1,
  'line_sentence_count_raw': 1,
  'line_token_count': 0.0,
  'text': ''},
 {'line_number': 3155,
  'line_char_count': 0,
  'line_word_count': 1,
  'line_sentence_count_raw': 1,
  'line_token_count': 0.0,
  'text': ''},
 {'line_number': 24954,
  'line_char_count': 0,
  'line_word_count': 1,
  'line_sentence_count_raw': 1,
  'line_token_count': 0.0,
  'text': ''}]

In [None]:
df = pd.DataFrame(lines_and_texts)
df.head()

Unnamed: 0,line_number,line_char_count,line_word_count,line_sentence_count_raw,line_token_count,text
0,1,20,4,1,5.0,Skip to main content
1,2,0,1,1,0.0,
2,3,5,1,1,1.25,Texts
3,4,0,1,1,0.0,
4,5,5,1,1,1.25,Video


In [None]:
# Get stats
df.describe().round(2)

Unnamed: 0,line_number,line_char_count,line_word_count,line_sentence_count_raw,line_token_count
count,40321.0,40321.0,40321.0,40321.0,40321.0
mean,20161.0,20.31,4.24,1.09,5.08
std,11639.81,26.61,4.55,0.31,6.65
min,1.0,0.0,1.0,1.0,0.0
25%,10081.0,0.0,1.0,1.0,0.0
50%,20161.0,3.0,1.0,1.0,0.75
75%,30241.0,42.0,7.0,1.0,10.5
max,40321.0,117.0,32.0,6.0,29.25


## **Data and text processing**

#### **Use spaCy and tqdm for sentence segmentation and statistics on text**

In [None]:
# Minimal demo of spaCy sentence segmentation.
# Install instructions: https://spacy.io/usage
from spacy.lang.en import English

# Blank English pipeline with the sentencizer component added
# (see https://spacy.io/api/sentencizer/)
nlp = English()
nlp.add_pipe("sentencizer")

# Run the pipeline on a two-sentence example and check the sentence count
doc = nlp("This is a sentence. This another sentence.")
assert sum(1 for _ in doc.sents) == 2

# Display the sentences found in the example document
[*doc.sents]

[This is a sentence., This another sentence.]

In [None]:
# Build a blank English spaCy pipeline containing only the sentencizer
nlp = English()
nlp.add_pipe("sentencizer")

# One record per DataFrame row: the stripped string form of the row's first column
pages_and_texts = [{"text": str(row[0]).strip()} for row in text_path.values]

# Segment every record into sentences and record how many were found
for item in tqdm(pages_and_texts):
    # Keep each sentence as a plain string rather than a spaCy span
    item["sentences"] = [str(sentence) for sentence in nlp(item["text"]).sents]
    item["page_sentence_count_spacy"] = len(item["sentences"])

# Show the first two processed records
for page in pages_and_texts[:2]:
    print(page)

100%|██████████| 19931/19931 [00:02<00:00, 8967.13it/s]

{'text': 'Texts', 'sentences': ['Texts'], 'page_sentence_count_spacy': 1}
{'text': 'Video', 'sentences': ['Video'], 'page_sentence_count_spacy': 1}





In [None]:
# Inspect an example
random.sample(pages_and_texts, k=1)

[{'text': 'C', 'sentences': ['C'], 'page_sentence_count_spacy': 1}]

In [None]:
df = pd.DataFrame(pages_and_texts)
df.describe().round(2)

Unnamed: 0,page_sentence_count_spacy
count,19931.0
mean,1.19
std,2.12
min,1.0
25%,1.0
50%,1.0
75%,1.0
max,295.0


#### **Chunking ten sentences together**

In [None]:
# Define split size to turn groups of sentences into chunks
num_sentence_chunk_size = 10

# Create a function that splits a list into consecutive slices of the desired size
def split_list(input_list: list,
               slice_size: int) -> list[list[str]]:
    """
    Splits the input_list into consecutive sublists of size slice_size.

    The last sublist holds the remainder and may be shorter. For example, a list
    of 17 sentences with slice_size=10 is split into two sublists of lengths
    10 and 7. An empty input_list yields an empty list.

    Args:
        input_list: The list to split (here: a list of sentence strings).
        slice_size: Maximum number of items per sublist; must be >= 1.

    Returns:
        A list of sublists covering input_list in order, without overlap.

    Raises:
        ValueError: If slice_size is smaller than 1.
    """
    # A negative step would make range() empty and silently drop every item,
    # so reject non-positive sizes explicitly.
    if slice_size < 1:
        raise ValueError(f"slice_size must be a positive integer, got {slice_size}")
    return [input_list[i:i + slice_size] for i in range(0, len(input_list), slice_size)]

# Group each record's sentences into chunks of at most num_sentence_chunk_size
# sentences and store how many chunks that produced
for item in tqdm(pages_and_texts):
    item["sentence_chunks"] = split_list(item["sentences"], num_sentence_chunk_size)
    item["num_chunks"] = len(item["sentence_chunks"])

100%|██████████| 19931/19931 [00:00<00:00, 680671.52it/s]


In [None]:
# Sample an example from the group (note: many samples have only 1 chunk as they have <=10 sentences total)
random.sample(pages_and_texts, k=1)

[{'text': "fh ii ii really doesn't suit ihe way we work these* days. Its",
  'sentences': ["fh ii ii really doesn't suit ihe way we work these* days.",
   'Its'],
  'page_sentence_count_spacy': 2,
  'sentence_chunks': [["fh ii ii really doesn't suit ihe way we work these* days.",
    'Its']],
  'num_chunks': 1}]

In [None]:
# Create a DataFrame to get stats
df = pd.DataFrame(pages_and_texts)
df.describe().round(2)

Unnamed: 0,page_sentence_count_spacy,num_chunks
count,19931.0,19931.0
mean,1.19,1.0
std,2.12,0.21
min,1.0,1.0
25%,1.0,1.0
50%,1.0,1.0
75%,1.0,1.0
max,295.0,30.0


#### **Splitting each chunk into its own item**

In [None]:
# Flatten the per-record sentence chunks into one dict per chunk
pages_and_chunks = []

# Iterate over each text block
for item in tqdm(pages_and_texts):
    for sentence_chunk in item["sentence_chunks"]:  # Iterate over each chunk
        chunk_dict = {}  # Store information about the current chunk

        # Records built from the plain-text file carry no "page_number" key,
        # so this is None unless an upstream step added one
        chunk_dict["page_number"] = item.get("page_number", None)

        # Join the sentences and collapse every run of whitespace (spaces, "\n",
        # "\r", tabs) into a single space; replacing only "\n" left "\r" runs in
        # the text, which inflated the character and token counts below
        joined_sentence_chunk = " ".join(" ".join(sentence_chunk).split())
        # ".A" -> ". A": insert the space missing after a full stop that is
        # directly followed by a capital letter (the previous pattern matched
        # ". A" and replaced it with itself, i.e. it did nothing).
        # NOTE(review): this also splits dotted abbreviations such as "U.K".
        joined_sentence_chunk = re.sub(r"\.([A-Z])", r". \1", joined_sentence_chunk)
        chunk_dict["sentence_chunk"] = joined_sentence_chunk

        # Gather statistics about the chunk
        chunk_dict["chunk_char_count"] = len(joined_sentence_chunk)  # Character count
        chunk_dict["chunk_word_count"] = len(joined_sentence_chunk.split())  # Word count (whitespace-separated)
        chunk_dict["chunk_token_count"] = len(joined_sentence_chunk) / 4  # Estimate token count (1 token ≈ 4 characters)

        # Add the current chunk to the list
        pages_and_chunks.append(chunk_dict)

# View statistics: how many chunks there are
print(f"Total chunks: {len(pages_and_chunks)}")

# Example print of the first two chunks
for chunk in pages_and_chunks[:2]:
    print(chunk)


100%|██████████| 19931/19931 [00:00<00:00, 254172.03it/s]


Total chunks: 19960
{'page_number': None, 'sentence_chunk': 'Texts', 'chunk_char_count': 5, 'chunk_word_count': 1, 'chunk_token_count': 1.25}
{'page_number': None, 'sentence_chunk': 'Video', 'chunk_char_count': 5, 'chunk_word_count': 1, 'chunk_token_count': 1.25}


In [None]:
# View a random sample
random.sample(pages_and_chunks, k=1)

[{'page_number': None,
  'sentence_chunk': 'Test Tip Pay attention',
  'chunk_char_count': 22,
  'chunk_word_count': 4,
  'chunk_token_count': 5.5}]

Now we've broken our whole textbook into chunks of 10 sentences or fewer. Note that `page_number` is `None` for every chunk, because the source is a plain-text file that carries no page information.

In [None]:
# Get stats about our chunks
df = pd.DataFrame(pages_and_chunks)
df.describe().round(2)

Unnamed: 0,chunk_char_count,chunk_word_count,chunk_token_count
count,19960.0,19960.0,19960.0
mean,41.26,7.67,10.31
std,58.21,11.59,14.55
min,1.0,1.0,0.25
25%,16.0,3.0,4.0
50%,41.0,7.0,10.25
75%,60.0,11.0,15.0
max,2346.0,438.0,586.5


We found that chunks that are too short (token count ≤ 30) may lack sufficient contextual information, so the embeddings generated from them are not meaningful enough. We therefore keep only chunks with a token count greater than 30.

#### **Select token_length >30**

In [None]:
# Print five randomly sampled chunks whose estimated token count is at or below the threshold
min_token_length = 30
for row_index, chunk_row in df[df["chunk_token_count"] <= min_token_length].sample(5).iterrows():
    print(f'Chunk token count: {chunk_row["chunk_token_count"]} | Text: {chunk_row["sentence_chunk"]}')

Chunk token count: 1.75 | Text: Writing
Chunk token count: 3.25 | Text: party starter
Chunk token count: 2.75 | Text: attach it):
Chunk token count: 5.75 | Text: Choose TWO letters. A-E


Hmm looks like some of our chunks have quite a low token count.

How about we check for samples with less than 30 tokens (about the length of a sentence) and see if they are worth keeping?

In [None]:
# Keep only the chunks whose estimated token count is strictly above the
# threshold, converted to a list of dicts (one per chunk)
pages_and_chunks_over_min_token_len = (
    df.loc[df["chunk_token_count"] > min_token_length]
    .to_dict(orient="records")
)
pages_and_chunks_over_min_token_len[:2]

[{'page_number': None,
  'sentence_chunk': '[fudging from] the complexity of the material that has been collected from different parts of the landscape \r and brought to the site, they | the people] must have had an elementary knowledge of chemistry to be able to \r combine these materials to produce ibis form. Its not a straightforward process,™ said Henshilwood. \r \r \r 1 *2 Scanning involves searching a text quickly for a specific piece \r of information. Practise scanning the passage for the words/ \r numbers in the box. \r \r \r 75,000 100,000 200,000 artefacts ochre \r \r \r 48 \r \r \r \r \r \r \r \r \r Reading skills \r \r \r 2 Using words from the passage \r \r Their are several types of question that ask you to write a word and/or \r number from the passage. \r \r * You will be told the maximum number of words to write. \r \r * You must only write words that are in the passage. Make sure you \r copy the spelling correctly, \r \r 1 ^ ^ need to change the words in the passage 

#### **Embedding our text chunks**
Our goal is to turn each of our chunks into a numerical representation (an embedding vector, where a vector is a sequence of numbers arranged in order).

In [None]:
 !pip install sentence-transformers



In [None]:
# !pip install --upgrade --force-reinstall torchvision torchaudio torchtext torch

In [None]:
# Load the "all-mpnet-base-v2" embedding model, initially onto the CPU
# (note: GPU will often be *much* faster than CPU)
embedding_model = SentenceTransformer(
    model_name_or_path="all-mpnet-base-v2",
    device="cpu",
)


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


How about we add an embedding field to each of our chunk items, processing them one at a time?

In [None]:
%%time

# Send the model to the GPU
embedding_model.to("cuda") # requires a GPU installed, for reference on my local machine, I'm using a NVIDIA RTX 4090

# Create embeddings one by one on the GPU
for item in tqdm(pages_and_chunks_over_min_token_len):
    item["embedding"] = embedding_model.encode(item["sentence_chunk"])

100%|██████████| 30/30 [00:01<00:00, 24.88it/s]

CPU times: user 1.83 s, sys: 281 ms, total: 2.12 s
Wall time: 1.45 s





How about batch processing?

In [None]:
# Collect the text of every retained chunk, in order, so it can be encoded in batches
text_chunks = [chunk_record["sentence_chunk"] for chunk_record in pages_and_chunks_over_min_token_len]

In [None]:
%%time

# Embed all texts in batches
text_chunk_embeddings = embedding_model.encode(text_chunks,
                                               batch_size=32, # you can use different batch sizes here for speed/performance, I found 32 works well for this use case
                                               convert_to_tensor=True) # optional to return embeddings as tensor instead of array

text_chunk_embeddings

CPU times: user 465 ms, sys: 7.39 ms, total: 472 ms
Wall time: 395 ms


tensor([[ 0.0071, -0.0755, -0.0205,  ...,  0.0258, -0.0396,  0.0141],
        [ 0.0259, -0.0702, -0.0171,  ...,  0.0283, -0.0479, -0.0148],
        [ 0.0564, -0.0397, -0.0207,  ...,  0.0192, -0.0396, -0.0039],
        ...,
        [ 0.0109,  0.0319, -0.0289,  ...,  0.0763,  0.0237, -0.0272],
        [ 0.0297, -0.0098, -0.0201,  ...,  0.0699,  0.0285, -0.0238],
        [ 0.0270, -0.0286,  0.0103,  ...,  0.0457, -0.0323, -0.0239]],
       device='cuda:0')

#### **Save embeddings to file**


In [None]:
# Write the retained chunk records to a CSV file (without the index column).
# CSV stores every value as text, so the "embedding" column has to be parsed
# back into numbers after loading.
embeddings_df_save_path = "text_chunks_and_embeddings_df.csv"
text_chunks_and_embeddings_df = pd.DataFrame(pages_and_chunks_over_min_token_len)
text_chunks_and_embeddings_df.to_csv(embeddings_df_save_path, index=False)

In [None]:
# Import saved file and view
text_chunks_and_embedding_df_load = pd.read_csv(embeddings_df_save_path)
text_chunks_and_embedding_df_load.head()

Unnamed: 0,page_number,sentence_chunk,chunk_char_count,chunk_word_count,chunk_token_count,embedding
0,,[fudging from] the complexity of the material ...,1381,293,345.25,[ 7.10453186e-03 -7.55081177e-02 -2.05419790e-...
1,,You do not \r need to write full sentences or ...,1113,220,278.25,[ 2.58541796e-02 -7.01962784e-02 -1.70616377e-...
2,,49 \r \r \r \r \r \r \r \r \r \r \r \r \r \r \...,722,153,180.5,[ 5.64004555e-02 -3.96810472e-02 -2.07439456e-...
3,,"1 For Question 4, which word/s in the passage ...",1628,302,407.0,[ 6.36236519e-02 -6.75108954e-02 -3.08494326e-...
4,,50 \r \r \r \r \r \r \r \r \r Reading skills \...,993,204,248.25,[-1.61707476e-02 -7.01474622e-02 -4.12495732e-...


#### **Chunking and embedding questions**

In [None]:
# Use the GPU when PyTorch reports one as available, otherwise the CPU
device = "cuda" if torch.cuda.is_available() else "cpu"

# Reload the saved chunk texts and embeddings
text_chunks_and_embedding_df = pd.read_csv("text_chunks_and_embeddings_df.csv")

# The CSV holds each embedding as a bracketed, space-separated string;
# drop the brackets and parse the numbers back into a NumPy array
text_chunks_and_embedding_df["embedding"] = text_chunks_and_embedding_df["embedding"].apply(
    lambda embedding_text: np.fromstring(embedding_text.strip("[]"), sep=" ")
)

# One dict per chunk, for later lookups
pages_and_chunks = text_chunks_and_embedding_df.to_dict(orient="records")

# Stack the per-chunk arrays into one float32 tensor on the chosen device
# (the parsed NumPy arrays are float64; torch defaults to float32)
embeddings = torch.tensor(
    np.array(text_chunks_and_embedding_df["embedding"].tolist()),
    dtype=torch.float32,
).to(device)
embeddings.shape

torch.Size([30, 768])

#### **Similarity search**

In [None]:
text_chunks_and_embedding_df.head()

Unnamed: 0,page_number,sentence_chunk,chunk_char_count,chunk_word_count,chunk_token_count,embedding
0,,[fudging from] the complexity of the material ...,1381,293,345.25,"[0.00710453186, -0.0755081177, -0.020541979, 0..."
1,,You do not \r need to write full sentences or ...,1113,220,278.25,"[0.0258541796, -0.0701962784, -0.0170616377, 0..."
2,,49 \r \r \r \r \r \r \r \r \r \r \r \r \r \r \...,722,153,180.5,"[0.0564004555, -0.0396810472, -0.0207439456, 0..."
3,,"1 For Question 4, which word/s in the passage ...",1628,302,407.0,"[0.0636236519, -0.0675108954, -0.0308494326, 0..."
4,,50 \r \r \r \r \r \r \r \r \r Reading skills \...,993,204,248.25,"[-0.0161707476, -0.0701474622, -0.0412495732, ..."


In [None]:
embeddings[0]

tensor([ 7.1045e-03, -7.5508e-02, -2.0542e-02,  5.3325e-02, -7.2728e-02,
        -2.5395e-02,  1.0261e-02,  5.3646e-02, -1.3570e-02,  4.7879e-03,
         6.0068e-02, -1.1683e-02,  8.1880e-02,  1.5494e-02, -4.0406e-02,
        -1.0858e-02,  4.4307e-02, -2.1993e-02, -2.7438e-02,  2.4333e-02,
        -2.9353e-02,  2.8078e-02, -8.1387e-03, -5.5367e-02, -2.4519e-02,
         1.2610e-02, -3.0707e-02, -3.1068e-02,  1.2954e-02, -6.4856e-02,
        -1.5680e-02,  3.6647e-02, -3.9773e-02, -1.8977e-02,  2.2121e-06,
        -5.6490e-02, -2.3480e-02,  1.3385e-02, -5.0297e-02,  2.9676e-02,
         7.0152e-02,  5.9260e-02,  4.5085e-02, -9.0790e-03, -2.8074e-03,
         4.1529e-03,  2.0545e-02,  5.0574e-02,  3.6151e-02,  1.8902e-02,
         1.0677e-02, -1.4560e-02,  4.3371e-02, -1.5906e-02,  9.5423e-02,
         9.4999e-03,  6.2746e-03,  1.3461e-02,  5.6207e-02,  1.2438e-01,
        -2.8209e-02,  3.6967e-02, -1.4678e-02,  1.2529e-02,  1.0953e-02,
        -1.3343e-02,  3.8065e-02, -8.5211e-02,  5.8