# LLMs e IA Generativa

## Chatbot de CVs

In [None]:
# Configuración del entorno
import hashlib
import os

import PyPDF2
from dotenv import load_dotenv
from groq import Groq
from langchain.text_splitter import RecursiveCharacterTextSplitter
from pinecone import Pinecone, ServerlessSpec
from sentence_transformers import SentenceTransformer

# Load variables from a local .env file (if present) into the environment so
# the os.getenv() lookups for the API keys in later cells can find them.
load_dotenv()

In [3]:
# Pinecone client setup: the API key comes from the environment instead of
# being hard-coded in the notebook.
pinecone_api_key = os.environ.get("PINECONE_API_KEY")
pc = Pinecone(api_key=pinecone_api_key)


In [4]:
# Load and process PDFs
pdf_dir = "docs"  # Directory where the PDF files are stored

# os.listdir() returns entries in arbitrary order, so the names are sorted to
# keep the notebook reproducible from run to run.  The extension check is
# case-insensitive so files such as "Resume.PDF" are not silently skipped.
pdf_files = sorted(
    entry for entry in os.listdir(pdf_dir) if entry.lower().endswith(".pdf")
)

In [5]:
def extract_text_from_pdf(file_path):
    """Return the text of every page of a PDF, concatenated in page order.

    Args:
        file_path: Path of the PDF file to read.

    Returns:
        A single string with the extracted text of all pages joined
        back-to-back (no separator is inserted between pages).
    """
    with open(file_path, 'rb') as pdf_handle:
        pdf_reader = PyPDF2.PdfReader(pdf_handle)
        # The join is evaluated before the file is closed, so every page is
        # read while the handle is still open.
        return "".join(page.extract_text() for page in pdf_reader.pages)

In [6]:
# Process every PDF: one record per file holding its name and extracted text
documents = [
    {
        "filename": pdf_name,
        "text": extract_text_from_pdf(os.path.join(pdf_dir, pdf_name)),
    }
    for pdf_name in pdf_files
]

In [7]:
# Sentence-embedding model configuration.  `embedding_model_dim` is also used
# as the dimension of the Pinecone index, so it has to match the size of the
# vectors produced by the model named here.
embedding_model_name = "all-MiniLM-L6-v2"
embedding_model_dim = 384

embedding_model = SentenceTransformer(embedding_model_name)

def generate_embedding(text):
    """Embed ``text`` with the notebook's sentence-transformer model.

    Args:
        text: Input passed straight to ``embedding_model.encode``.

    Returns:
        The result of ``encode(...).tolist()``, i.e. the embedding as a plain
        Python list rather than the object returned by the model.
    """
    encoded = embedding_model.encode(text)
    return encoded.tolist()

In [8]:
# Create the Pinecone index only when it does not exist yet

index_name = "cvs-embeddings"

existing_indexes = pc.list_indexes()
index_names = [existing.name for existing in existing_indexes]

if index_name in index_names:
    print(f"Index with name {index_name} already created. Skipping!")
else:
    print("Index doesn't exist. Creating!")
    # The index dimension must equal the embedding size used by the notebook.
    serverless_spec = ServerlessSpec(cloud="aws", region="us-east-1")
    pc.create_index(
        name=index_name,
        dimension=embedding_model_dim,
        metric="cosine",
        spec=serverless_spec,
    )

Index with name cvs-embeddings already created. Skipping!


In [9]:
# Open a handle to the Pinecone index
index = pc.Index(index_name)

# Names of the documents loaded from the local directory
documents_names = [document['filename'] for document in documents]

# Refresh the list of index names known to Pinecone (not used by the loop below)
existing_indexes = pc.list_indexes()
index_names = [existing.name for existing in existing_indexes]

# Number of vectors sent per upsert request; batching avoids one network
# round-trip per chunk.
UPSERT_BATCH_SIZE = 100


def _chunk_vector_id(filename, position):
    """Build a vector ID that is unique per (document, chunk position).

    Using only the chunk position as the ID makes every document reuse the IDs
    "0", "1", ..., so each newly uploaded document would overwrite the vectors
    of the previous one.  Hashing the file name keeps the ID short and
    ASCII-only regardless of the characters in the name.
    """
    digest = hashlib.sha1(filename.encode("utf-8")).hexdigest()[:16]
    return f"{digest}-{position}"


# Recursive chunking; created once and reused for every document instead of
# being rebuilt on each loop iteration.
text_splitter = RecursiveCharacterTextSplitter(chunk_size=500, chunk_overlap=50)

# Placeholder query vector: the metadata filter does the actual selection, the
# vector only has to have the dimension of the index.
dummy_vector = [0.0] * embedding_model_dim

for doc in documents:
    document_name = doc['filename']
    print("Processing document: " + document_name)

    # A single match is enough to know the file has already been uploaded.
    # Vectors stored by earlier runs are still detected because the check
    # relies on the 'filename' metadata, not on the vector IDs.
    query_results = index.query(
        vector=dummy_vector,
        top_k=1,
        filter={'filename': {'$eq': document_name}},
    )

    if query_results['matches']:
        print(f"File '{document_name}' found on index. Upserting aborted...")
        continue

    print(f"File '{document_name}' not found. Initializing upserting!!..")

    # One (id, embedding, metadata) tuple per chunk of the document
    vectors = [
        (
            _chunk_vector_id(document_name, position),
            generate_embedding(chunk_text),
            {"filename": document_name, "chunk": chunk_text},
        )
        for position, chunk_text in enumerate(text_splitter.split_text(doc["text"]))
    ]

    # Upload the vectors to Pinecone in batches.  A document without any
    # extractable text yields no vectors and therefore sends no request.
    for start in range(0, len(vectors), UPSERT_BATCH_SIZE):
        index.upsert(vectors=vectors[start:start + UPSERT_BATCH_SIZE])



Processing document: Javier Villagra - Resume.pdf
File 'Javier Villagra - Resume.pdf' found on index. Upserting aborted...


In [10]:
# Similarity search helper
def find_similar(query, top_k=1):
    """Return the indexed chunks most similar to ``query``.

    Args:
        query: Text to search for; it is embedded with ``generate_embedding``.
        top_k: Maximum number of matches to request from the index.  Defaults
            to 1, which is what the function always requested before this
            parameter existed.

    Returns:
        The response of ``index.query`` with the metadata of each match
        included.
    """
    query_embedding = generate_embedding(query)
    return index.query(vector=query_embedding, top_k=top_k, include_metadata=True)


# Run a sample question against the index and show the raw response
query = "Does he know Java?"
response = find_similar(query)

print("Closest chunk:", response, sep="\n")


Closest chunk:
{'matches': [{'id': '8',
              'metadata': {'chunk': "with the channel's ESB.\n"
                                    'Conducted effective troubleshooting and '
                                    'resolved issues with coding, design and '
                                    'infrastructure.Banco Macro, Argentina\n'
                                    'Page 1Universidad de Buenos AiresSYSTEMS '
                                    'ENGINEERING2001 - 2008Microservices\n'
                                    'Java EEAgile Management\n'
                                    'Fintech\n'
                                    'Universidad de Buenos AiresPOSTGRADUATE '
                                    'DEGREE,\n'
                                    'AI SPECIALIST2023 - Machine '
                                    'LearningArtificial Intelligence\n'
                                    'Deep Learning\n'
                                    'PythonEXPERIENCE\n'
               

In [10]:
# Groq client setup: the API key comes from the environment
groq_api_key = os.getenv('GROQ_API_KEY')

client = Groq(
    api_key=groq_api_key,
)

# Build the context from the retrieved chunks.  An empty result would make
# `response['matches'][0]` fail with a bare IndexError, so the situation is
# reported explicitly (same exception type, clearer message).
matches = response['matches']
if not matches:
    raise IndexError(
        "The index returned no matches for the query; there is no context to send to the model."
    )

# Every returned chunk is used; with a single match this is exactly that chunk.
context = "\n\n".join(match['metadata']['chunk'] for match in matches)

# NOTE(review): confirm this model ID is still served by Groq; hosted model
# IDs can be retired over time.
chat_completion = client.chat.completions.create(
    messages=[
        {
            "role": "user",
            "content": f"Answer this question: '{query}' using this information: {context}",
        }
    ],
    model="llama3-8b-8192",
)

print(chat_completion.choices[0].message.content)

A nice question!

According to the information provided, the answer is: Yes, he knows Java!

Not only does he have experience with Java EE (Enterprise Edition), but he also mentions Microservices, which is a design pattern often associated with Java development. Additionally, he lists Java EE as one of his skills, which indicates that he has a strong background in Java programming.
