In [14]:
import os
from urllib.request import urlretrieve
import numpy as np
from langchain_community.embeddings import HuggingFaceBgeEmbeddings
from langchain_community.llms import HuggingFacePipeline
from langchain.chains.question_answering import load_qa_chain
from langchain_community.document_loaders import PyPDFLoader
from langchain_community.document_loaders import PyPDFDirectoryLoader
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain_community.vectorstores import FAISS
from langchain.chains import RetrievalQA
from langchain.prompts import PromptTemplate

In [2]:
# Load pdf files in the local directory
current_directory = os.getcwd()
def document_loader(folder, chunk_size = 700, chunk_overlap  = 50):

    loader = PyPDFDirectoryLoader(folder)

    docs_before_split = loader.load()
    text_splitter = RecursiveCharacterTextSplitter(
        chunk_size = chunk_size,
        chunk_overlap = chunk_overlap,
    )
    docs_after_split = text_splitter.split_documents(docs_before_split)
    return docs_after_split

docs_after_split = document_loader(current_directory, chunk_size = 700, chunk_overlap  = 50)

In [3]:
docs_after_split

[Document(metadata={'source': '/home/onyxia/work/RAG/CV RAZIG_Ilias_en.pdf', 'page': 0}, page_content='RAZIG Ilias   \n \n       | razigilias@gmail.com  | +33 07.83.74.23.94  \n \nEnthusiastic d ata scientist with a financial background, looking for an opportunity in data science. Eager to leverage my passion for machine \nlearning to drive analysis at the intersection of data science and business expertise.  \n \nEDUCATION  \n09/2023 – 09/2024  ▪ MS Data Science ENSAE, Institut Polytechnique de Paris  \nMain  courses  : \n· Deep Learning  : Models and Optimization  \n· Reinforcement Learning  \n· Machine Learning for NLP  \n· Deploiement of Data Science projects  \n· Machine Learning for portfolio management and trading  \n· Bayesian Statistics  \n· Advanced convex optimization  Paris , \nFRANCE'),
 Document(metadata={'source': '/home/onyxia/work/RAG/CV RAZIG_Ilias_en.pdf', 'page': 0}, page_content='· Advanced convex optimization  Paris , \nFRANCE  \n09/2019 – 09/2023  ▪ Magistère Ban

In [4]:
# We load the langchain embedding model that is on hugging face
huggingface_embeddings = HuggingFaceBgeEmbeddings(
    model_name="BAAI/bge-small-en-v1.5",
    model_kwargs={'device':'cpu'}, 
    encode_kwargs={'normalize_embeddings': True}
)

  from tqdm.autonotebook import tqdm, trange


In [5]:
vectorstore = FAISS.from_documents(docs_after_split, huggingface_embeddings)

query = """What formation has followed this candidate based on his CV ?"""  

relevant_documents = vectorstore.similarity_search(query)
print(f'There are {len(relevant_documents)} documents retrieved which are relevant to the query. Display the first one:\n')
print(relevant_documents[0].page_content)

There are 4 documents retrieved which are relevant to the query. Display the first one:

FRANCE  
09/2017 - 06/2020  ▪ Bachelor’s degree in Economics at PARIS University Panthéon -Assas . Received with honours.  Paris,  
FRANCE  
EXP ERIENCES  
05/2024 – 11/2024 ▪ Optimization  Engineer , EDF  
· Statistical and stochastic modelling  of temperatures and gas prices  
· Monte -Carlo methods  for simulation  
· Time Series  
· Machine Learning  
· Creation of algorithm to opti mize the margin of gas portfolio  Paris,  
FRANCE  
03/202 2 – 09/2023  ▪ Data Analyst,  Covéa Finance  
· Maintenance of internal financial databases using APIs (Bloomberg, Factset)  
· Participation on fund creation project with quantitative tools ( statistical methods , clustering  and


In [7]:
import transformers
import torch
from transformers import AutoTokenizer

model = "meta-llama/Llama-2-7b-chat-hf"

tokenizer = AutoTokenizer.from_pretrained(model)
pipeline = transformers.pipeline(
    "text-generation",
    model=model,
    torch_dtype=torch.float16,
    device_map="auto",
)



model.safetensors.index.json:   0%|          | 0.00/26.8k [00:00<?, ?B/s]

Downloading shards:   0%|          | 0/2 [00:00<?, ?it/s]

model-00001-of-00002.safetensors:   0%|          | 0.00/9.98G [00:00<?, ?B/s]

model-00002-of-00002.safetensors:   0%|          | 0.00/3.50G [00:00<?, ?B/s]

Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

generation_config.json:   0%|          | 0.00/188 [00:00<?, ?B/s]

In [19]:
# Use similarity searching algorithm and return 3 most relevant documents.
retriever = vectorstore.as_retriever(search_type="similarity", search_kwargs={"k": 3})
llm = HuggingFacePipeline(pipeline=pipeline)


# Create a Retrieval Question Answer Chain
qa_chain = RetrievalQA.from_chain_type(llm=llm, chain_type="stuff", retriever=retriever)


In [20]:
test = qa_chain({"query" : "What can you tell me about this curriculum ?"})
print(test['result'])

  warn_deprecated(


Use the following pieces of context to answer the question at the end. If you don't know the answer, just say that you don't know, don't try to make up an answer.

• English : advanced level (C2)  
• Arabic, Spanish  : basics

· Benchmarking LLM (llama -2) with other machine learning approaches for determining the family status of a person  
· Garbage detection on images with neural network (ResNet 50)  
· Creation of an evolving sentiment indicator of central bank speech with XGBoost  
· Creation of movie recommendation application by  multiple approaches (NLP, Matrix factorization)  
· Using an LSTM model to estimate the expected return on financial securities  
I.T Skills  
• Programming languages  :  Python , R, C++,  PostgreSQL ,  VBA  
• Tools and technologies  :  Git, Sickit -Learn, PyTorch, FastAPI , 
Docker, Argo CD  Langu ages 
• French  : fluent  
• English : advanced level (C2)

RAZIG Ilias   
 
       | razigilias@gmail.com  | +33 07.83.74.23.94  
 
Enthusiastic d ata scie