In [1]:
def remove_escape_characters(text: str) -> str:
    return text.replace("\n", " ").replace("·", "")

In [2]:
import os
from langchain_community.document_loaders import PyPDFLoader

RESUME_FOLDER_PATH = "../data/resumes"

# Step 1: Read documents
resume_docs = []
for filename in os.listdir(RESUME_FOLDER_PATH):
    if filename.endswith("pdf"):
        filepath = os.path.join(RESUME_FOLDER_PATH, filename)
        loader = PyPDFLoader(filepath)
        async for page in loader.alazy_load():
            page.page_content = remove_escape_characters(page.page_content)
            resume_docs.append(page)

resume_docs

[Document(metadata={'source': '../data/resumes/resume-software-engg.pdf', 'page': 0}, page_content="Shah Rukh Khan   Software Engineer   Mumbai, India    (123) 456-789    yourname@resumeworded.com    linkedin.com/in/your-profile    EXPERIENCE   Google February 2019 - Present   Software Engineer II   Successfully migrated business application to the Cloud, reducing operational cost by 30% and increasing system performance   Implemented microservices architecture, enhancing system scalability and reliability by 40%   Reduced time to fix bugs by 25% through effective use of test-driven development and continuous  integration strategies   Simplified and undated codebase, increasing software efficiency and readability by 35%   Optimized service by integrating Machine Learning models, boosting customer satisfaction by 15%   Resume Worded June 2016 - January 2019   Software Engineer   Built a secure data storage system, facilitating a 20% increase in system load capacity   Developed a scalabl

In [31]:
# Step 2: Make chunks with documents
from langchain_text_splitters import RecursiveCharacterTextSplitter

splitter = RecursiveCharacterTextSplitter(
    chunk_size=100, 
    chunk_overlap = 60,
    length_function = len,
)

resume_chunks = splitter.split_documents(resume_docs)
resume_chunks

[Document(metadata={'source': '../data/resumes/resume-software-engg.pdf', 'page': 0}, page_content='Shah Rukh Khan   Software Engineer   Mumbai, India    (123) 456-789    yourname@resumeworded.com'),
 Document(metadata={'source': '../data/resumes/resume-software-engg.pdf', 'page': 0}, page_content='India    (123) 456-789    yourname@resumeworded.com    linkedin.com/in/your-profile    EXPERIENCE'),
 Document(metadata={'source': '../data/resumes/resume-software-engg.pdf', 'page': 0}, page_content='linkedin.com/in/your-profile    EXPERIENCE   Google February 2019 - Present   Software Engineer'),
 Document(metadata={'source': '../data/resumes/resume-software-engg.pdf', 'page': 0}, page_content='Google February 2019 - Present   Software Engineer II   Successfully migrated business'),
 Document(metadata={'source': '../data/resumes/resume-software-engg.pdf', 'page': 0}, page_content='Software Engineer II   Successfully migrated business application to the Cloud, reducing'),
 Document(metadata

In [32]:
# Step 3: Embed documents
from langchain_huggingface import HuggingFaceEmbeddings
from sentence_transformers import SentenceTransformer



def get_embedding_function():
    embedding_model_path = "models--sentence-transformers--all-mpnet-base-v2"

    if not os.path.exists(embedding_model_path):
        embeddings_model = SentenceTransformer("sentence-transformers/all-mpnet-base-v2", cache_folder=".")
    else:
        pass
    
    # Load the model using HuggingFaceEmbeddings
    try:
        embeddings_model = HuggingFaceEmbeddings(model_name=embedding_model_path,
                                                model_kwargs={'device': 'cuda',},
                                                encode_kwargs={'normalize_embeddings': True})
    except OSError as e:
        print(f"Error loading model: {e}")
        # Handle the error appropriately, e.g., return None or raise a custom exception
        return None
    return embeddings_model

In [33]:
from transformers import AutoTokenizer, AutoModel
import torch
import torch.nn.functional as F

#Mean Pooling - Take attention mask into account for correct averaging
def mean_pooling(model_output, attention_mask):
    token_embeddings = model_output[0] #First element of model_output contains all token embeddings
    input_mask_expanded = attention_mask.unsqueeze(-1).expand(token_embeddings.size()).float()
    return torch.sum(token_embeddings * input_mask_expanded, 1) / torch.clamp(input_mask_expanded.sum(1), min=1e-9)

def get_sentence_embeddings(input_sentences: list[str]):
    # Sentences we want sentence embeddings for
    sentences = input_sentences

    # Load model from HuggingFace Hub
    tokenizer = AutoTokenizer.from_pretrained('sentence-transformers/all-mpnet-base-v2')
    model = AutoModel.from_pretrained('sentence-transformers/all-mpnet-base-v2')

    # Tokenize sentences
    encoded_input = tokenizer(sentences, padding=True, truncation=True, return_tensors='pt')

    # Compute token embeddings
    with torch.no_grad():
        model_output = model(**encoded_input)

    # Perform pooling
    sentence_embeddings = mean_pooling(model_output, encoded_input['attention_mask'])

    # Normalize embeddings
    sentence_embeddings = F.normalize(sentence_embeddings, p=2, dim=1)

    return sentence_embeddings

ex = get_sentence_embeddings(["My name is Harsha", "I am not a student"])
ex_1 = get_sentence_embeddings(["My name is Harsha"])
ex_2 = get_sentence_embeddings(["I am not a student"])



In [34]:
ex_c = torch.cat((ex_1, ex_2), dim=0)
ex_c.size()

torch.Size([2, 768])

In [35]:
torch.equal(ex, ex_c)

False

In [36]:
torch.eq(ex, ex_c)

tensor([[False, False, False,  ..., False, False, False],
        [False, False, False,  ...,  True, False, False]])

In [37]:
for chunk in resume_chunks:
    print(len(chunk.page_content))

96
97
95
86
88
98
94
88
91
93
93
94
96
99
96
97
98
97
96
87
99
89
97
97
86
94
99
97
97
99
97
94
96
89
98
99
96
97
94
97
96
93
89
86
99
93
99
95
98
91
87
99
99
97
93
89
94
92
96
85
95
96
99
97
92
94
95
99
88
93
98
52
68
96
92
97
92
92
96
93
97
90
90
91
91
97
97
97
98
82
96
90
91
99
99
92
95
96
92
98
98
98
94
91
98
93
97
95
91
90
96
98
95
93
97
90
92
96
96
95
96
98
97
91
94
98
98
93
90
97
93
88
91
85
90
87
87
88
85
88
91
76
94
99
96
92
94
95
84
91
96
94
93
97
98
95
95
93
89
90
99
98
99
99
95
98
91
98
96
87
99
86
96
94
99
89
94
94
97
99
99
93
99
91
94
90
91
97
91
99
96
97
89
75
97
89
93
93
98
90
92
98
97
94
97
93
96
98
94
96
96
94
94
99
99
93
98
98
92
97
97
99
93
97
93
97
97
98
94
87
97
94
94
95
90
96
95
97
96
91
