In [1]:
import os

pdf_path = "human-nutrition-text.pdf"

if not os.path.exists(pdf_path):
  print("File doesn't exist.")

In [2]:
import fitz #PyMuPDF
from tqdm.auto import tqdm #Progress bars

def text_formatter(text: str) -> str:
    cleaned_text = text.replace("\n", " ").strip() 

    return cleaned_text

def open_and_read_pdf(pdf_path: str) -> list[dict]:
    doc = fitz.open(pdf_path)
    pages_and_texts = []
    for page_number, page in tqdm(enumerate(doc)):
        text = page.get_text()
        text = text_formatter(text)
        pages_and_texts.append({"page_number": page_number -41,
                               "page_char_count": len(text),
                               "page_word_count": len(text.split(" ")),
                               "page_sentence_count_raw": len(text.split(". ")),
                               "page_token_count": len(text) /4, 
                               "text": text})
    return pages_and_texts

pages_and_texts = open_and_read_pdf(pdf_path=pdf_path)
pages_and_texts[:2]                    

0it [00:00, ?it/s]

[{'page_number': -41,
  'page_char_count': 29,
  'page_word_count': 4,
  'page_sentence_count_raw': 1,
  'page_token_count': 7.25,
  'text': 'Human Nutrition: 2020 Edition'},
 {'page_number': -40,
  'page_char_count': 0,
  'page_word_count': 1,
  'page_sentence_count_raw': 1,
  'page_token_count': 0.0,
  'text': ''}]

In [3]:
import random 

random.sample(pages_and_texts, k=3)

[{'page_number': 405,
  'page_char_count': 1140,
  'page_word_count': 209,
  'page_sentence_count_raw': 9,
  'page_token_count': 285.0,
  'text': 'Image by  Braden  Collum on  unsplash.co m / CC0  Proteins in a Nutshell  UNIVERSITY OF HAWAI‘I AT MĀNOA FOOD SCIENCE AND HUMAN  NUTRITION PROGRAM AND HUMAN NUTRITION PROGRAM    Proteins are long chains of amino acids folded into precise  structures that determine their functions, which are in the tens of  thousands. They are the primary construction materials of the body  serving as building blocks for bone, skin, hair, muscle, hormones,  and antibodies. Without them we cannot breakdown or build  macromolecules, grow, or heal from a wound. Too little protein  impairs bodily functions and too much can lead to chronic disease.  Eat proteins in moderation, at least 10 percent of the calories you  take in and not more than 35 percent. Proteins are in a variety  of foods. More complete sources are in animal-based foods, but  choose those low in 

In [4]:
import pandas as pd

df = pd.DataFrame(pages_and_texts)
df.head(10) 

Unnamed: 0,page_number,page_char_count,page_word_count,page_sentence_count_raw,page_token_count,text
0,-41,29,4,1,7.25,Human Nutrition: 2020 Edition
1,-40,0,1,1,0.0,
2,-39,320,54,1,80.0,Human Nutrition: 2020 Edition UNIVERSITY OF ...
3,-38,212,32,1,53.0,Human Nutrition: 2020 Edition by University of...
4,-37,797,147,3,199.25,Contents Preface University of Hawai‘i at Mā...
5,-36,976,179,3,244.0,Lifestyles and Nutrition University of Hawai‘...
6,-35,1037,191,1,259.25,The Cardiovascular System University of Hawai...
7,-34,1047,188,3,261.75,"Indicators of Health: Body Mass Index, Body Fa..."
8,-33,947,170,3,236.75,Chloride University of Hawai‘i at Mānoa Food ...
9,-32,1024,189,3,256.0,The Functions of Carbohydrates in the Body Un...


In [5]:
df.describe().round(2)

Unnamed: 0,page_number,page_char_count,page_word_count,page_sentence_count_raw,page_token_count
count,1208.0,1208.0,1208.0,1208.0,1208.0
mean,562.5,1148.0,199.5,10.52,287.0
std,348.86,560.38,95.83,6.55,140.1
min,-41.0,0.0,1.0,1.0,0.0
25%,260.75,762.0,134.0,5.0,190.5
50%,562.5,1231.5,216.0,10.0,307.88
75%,864.25,1603.5,272.0,15.0,400.88
max,1166.0,2308.0,430.0,39.0,577.0


In [6]:
from spacy.lang.en import English

nlp = English()

#sentencizer pipeline
nlp.add_pipe("sentencizer")

for item in tqdm(pages_and_texts):
    item["sentences"] = list(nlp(item["text"]).sents)
    
    item["sentences"] = [str(sentence) for sentence in item["sentences"]]
    
    item["page_sentence_count_spacy"] = len(item["sentences"])

  0%|          | 0/1208 [00:00<?, ?it/s]

In [7]:
random.sample(pages_and_texts, k=1)

[{'page_number': 1131,
  'page_char_count': 227,
  'page_word_count': 34,
  'page_sentence_count_raw': 2,
  'page_token_count': 56.75,
  'text': 'An interactive or media element has been  excluded from this version of the text. You can  view it online here:  http://pressbooks.oer.hawaii.edu/ humannutrition2/?p=568  Undernutrition, Overnutrition, and Malnutrition  |  1131',
  'sentences': ['An interactive or media element has been  excluded from this version of the text.',
   'You can  view it online here:  http://pressbooks.oer.hawaii.edu/ humannutrition2/?p=568  Undernutrition, Overnutrition, and Malnutrition  |  1131'],
  'page_sentence_count_spacy': 2}]

In [8]:
#chunking
num_sentence_chunk_size = 10 

def split_list(input_list: list, 
               slice_size: int) -> list[list[str]]:
    
    return [input_list[i:i + slice_size] for i in range(0, len(input_list), slice_size)]

for item in tqdm(pages_and_texts):
    item["sentence_chunks"] = split_list(input_list=item["sentences"],
                                         slice_size=num_sentence_chunk_size)
    item["num_chunks"] = len(item["sentence_chunks"])

  0%|          | 0/1208 [00:00<?, ?it/s]

In [9]:
df = pd.DataFrame(pages_and_texts)
df.describe().round(2)

Unnamed: 0,page_number,page_char_count,page_word_count,page_sentence_count_raw,page_token_count,page_sentence_count_spacy,num_chunks
count,1208.0,1208.0,1208.0,1208.0,1208.0,1208.0,1208.0
mean,562.5,1148.0,199.5,10.52,287.0,10.32,1.53
std,348.86,560.38,95.83,6.55,140.1,6.3,0.64
min,-41.0,0.0,1.0,1.0,0.0,0.0,0.0
25%,260.75,762.0,134.0,5.0,190.5,5.0,1.0
50%,562.5,1231.5,216.0,10.0,307.88,10.0,1.0
75%,864.25,1603.5,272.0,15.0,400.88,15.0,2.0
max,1166.0,2308.0,430.0,39.0,577.0,28.0,3.0


In [10]:
import re

#splitting each chunk into its own item (for references)
pages_and_chunks = []
for item in tqdm(pages_and_texts):
    for sentence_chunk in item["sentence_chunks"]:
        chunk_dict = {}
        chunk_dict["page_number"] = item["page_number"]
        
        joined_sentence_chunk = "".join(sentence_chunk).replace("  ", " ").strip()
        joined_sentence_chunk = re.sub(r'\.([A-Z])', r'. \1', joined_sentence_chunk) 
        chunk_dict["sentence_chunk"] = joined_sentence_chunk

        #stats about the chunk
        chunk_dict["chunk_char_count"] = len(joined_sentence_chunk)
        chunk_dict["chunk_word_count"] = len([word for word in joined_sentence_chunk.split(" ")])
        chunk_dict["chunk_token_count"] = len(joined_sentence_chunk) / 4 
        
        pages_and_chunks.append(chunk_dict)

len(pages_and_chunks)

  0%|          | 0/1208 [00:00<?, ?it/s]

1843

In [11]:
df = pd.DataFrame(pages_and_chunks)
df.describe().round(2)

Unnamed: 0,page_number,chunk_char_count,chunk_word_count,chunk_token_count
count,1843.0,1843.0,1843.0,1843.0
mean,583.38,734.1,112.74,183.52
std,347.79,447.51,71.24,111.88
min,-41.0,12.0,3.0,3.0
25%,280.5,315.0,45.0,78.75
50%,586.0,745.0,115.0,186.25
75%,890.0,1118.0,173.0,279.5
max,1166.0,1830.0,297.0,457.5


In [12]:
#remove smallest chunks
min_token_length = 30
pages_and_chunks_over_min_token_len = df[df["chunk_token_count"] > min_token_length].to_dict(orient="records")
pages_and_chunks_over_min_token_len[:2]

[{'page_number': -39,
  'sentence_chunk': 'Human Nutrition: 2020 Edition UNIVERSITY OF HAWAI‘I AT MĀNOA FOOD SCIENCE AND HUMAN NUTRITION PROGRAM ALAN TITCHENAL, SKYLAR HARA, NOEMI ARCEO CAACBAY, WILLIAM MEINKE-LAU, YA-YUN YANG, MARIE KAINOA FIALKOWSKI REVILLA, JENNIFER DRAPER, GEMADY LANGFELDER, CHERYL GIBBY, CHYNA NICOLE CHUN, AND ALLISON CALABRESE',
  'chunk_char_count': 308,
  'chunk_word_count': 42,
  'chunk_token_count': 77.0},
 {'page_number': -38,
  'sentence_chunk': 'Human Nutrition: 2020 Edition by University of Hawai‘i at Mānoa Food Science and Human Nutrition Program is licensed under a Creative Commons Attribution 4.0 International License, except where otherwise noted.',
  'chunk_char_count': 210,
  'chunk_word_count': 30,
  'chunk_token_count': 52.5}]

### Preprocessing done, now onto embeddings

In [13]:
from sentence_transformers import SentenceTransformer

embedding_model = SentenceTransformer(model_name_or_path="all-mpnet-base-v2", #Can use AI server for this
                                      device="cpu")



In [14]:
%%time

#Test with AI server
embedding_model.to("cpu")

for item in tqdm(pages_and_chunks_over_min_token_len):
    item["embedding"] = embedding_model.encode(item["sentence_chunk"])

  0%|          | 0/1680 [00:00<?, ?it/s]

CPU times: user 7min 48s, sys: 3min, total: 10min 48s
Wall time: 1min 46s


In [15]:
#Save to file
text_chunks_and_embeddings_df = pd.DataFrame(pages_and_chunks_over_min_token_len)
embeddings_df_save_path = "text_chunks_and_embeddings_df.csv"
text_chunks_and_embeddings_df.to_csv(embeddings_df_save_path, index=False)