In [110]:
from langchain.document_loaders import TextLoader

# Read the resume narrative from disk; only the first loaded document is used.
loader = TextLoader('resume_narrative.txt')
data = loader.load()

# Raw text of the first document; this is what gets split and embedded later.
# (The stray mid-cell `data[0].metadata` expression was dropped: it displayed
# nothing because it was not the last expression of the cell.)
text = data[0].page_content


In [111]:
# Show the metadata recorded for the first loaded document.
data[0].metadata

{'source': 'resume_narrative.txt'}

In [112]:
from langchain.text_splitter import RecursiveCharacterTextSplitter

# Splitter configuration: separators are listed from coarsest to finest.
splitter_settings = dict(
    separators=['\n\n', '\n', ' '],
    chunk_size=200,
    chunk_overlap=50,
)
r_splitter = RecursiveCharacterTextSplitter(**splitter_settings)

# Break the resume text into overlapping chunks.
chunks = r_splitter.split_text(text)

In [113]:
# Inspect the chunks produced by the splitter.
chunks

['Personal Information',
 'Santosh Kumar is a seasoned Software Engineer based in Bangalore, India, with a deep-seated passion for artificial intelligence and machine learning. With a robust portfolio that includes significant',
 'With a robust portfolio that includes significant contributions to various tech fields and a remarkable academic background, Santosh has established himself as a versatile and fast-learning',
 'himself as a versatile and fast-learning professional. His journey is marked by a persistent drive to bridge the gap between innovative research and practical applications, a goal he pursues through',
 'practical applications, a goal he pursues through a variety of collaborative projects and independent ventures.',
 'Santosh holds a Bachelor of Technology degree in Computer Science and Engineering from Dr. B. C. Roy Engineering College, where he graduated with a CGPA of 7.92. His academic journey was not only a',
 'CGPA of 7.92. His academic journey was not only a peri

In [114]:
from transformers import AutoTokenizer, AutoModel
import torch

# Tokenizer and encoder are both loaded from the same checkpoint.
MODEL_NAME = 'sentence-transformers/all-MiniLM-L6-v2'
tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)
model = AutoModel.from_pretrained(MODEL_NAME)

# Function to generate embeddings
def generate_embedding(text):
    """Embed ``text`` by mean-pooling the encoder's last hidden state.

    Uses the module-level ``tokenizer`` and ``model``.

    Args:
        text: Input passed straight to ``tokenizer`` (a string in this
            notebook).

    Returns:
        numpy.ndarray with one row per input sequence; a single string
        yields shape ``(1, hidden_size)``.
    """
    inputs = tokenizer(text, return_tensors='pt', padding=True, truncation=True)
    # Inference only: no need to record an autograd graph.
    with torch.no_grad():
        token_states = model(**inputs).last_hidden_state
    # Weight every position by the attention mask so padded positions do not
    # dilute the mean; for an unpadded input this equals a plain mean.
    mask = inputs['attention_mask'].unsqueeze(-1).to(token_states.dtype)
    embeddings = (token_states * mask).sum(dim=1) / mask.sum(dim=1).clamp(min=1e-9)
    return embeddings.detach().numpy()

# One embedding per chunk, kept in the same order as `chunks`.
embeddings = list(map(generate_embedding, chunks))


In [115]:
# Number of generated embeddings (one is produced per chunk).
len(embeddings)

48

In [116]:
# Number of text chunks, for comparison with the embedding count.
len(chunks)

48

In [117]:
# Shape of one embedding (index 47); its last dimension has to equal the
# `dim` declared for the vector field in the Milvus schema.
embeddings[47].shape

(1, 384)

In [125]:
from pymilvus import connections, CollectionSchema, FieldSchema, DataType, Collection, utility

# Connect to Milvus unless this kernel already has a "default" connection.
# The utility / Collection calls below need an active connection, so a fresh
# kernel would otherwise fail here.
if not connections.has_connection("default"):
    connections.connect(alias="default", host="127.0.0.1", port="19530")

COLLECTION_NAME = "resume_collection"
EMBEDDING_DIM = 384  # must equal the width of the vectors produced by generate_embedding

# Define fields for the collection
fields = [
    # Primary key generated by Milvus (auto_id), so it is not supplied on insert.
    FieldSchema(name="id", dtype=DataType.INT64, is_primary=True, auto_id=True),
    FieldSchema(name="embedding", dtype=DataType.FLOAT_VECTOR, dim=EMBEDDING_DIM),
    # The source text is stored next to its vector so search hits can return it.
    FieldSchema(name="text_chunk", dtype=DataType.VARCHAR, max_length=500),
]

# Create the schema
schema = CollectionSchema(fields, "Schema for storing text embeddings")

# WARNING: destructive. Any existing collection with this name is dropped so
# the cell can be re-run from scratch.
if utility.has_collection(COLLECTION_NAME):
    utility.drop_collection(COLLECTION_NAME)

# Create the collection
collection = Collection(name=COLLECTION_NAME, schema=schema)


In [126]:
# Flatten each embedding to a 1-D vector for the FLOAT_VECTOR field.
flattened_embeddings = [embedding.flatten() for embedding in embeddings]

# Every vector must be paired with exactly one text chunk.
assert len(flattened_embeddings) == len(chunks), "embeddings and chunks are misaligned"

# Column-ordered payload matching the schema's non-auto fields
# (embedding, text_chunk); `id` is omitted because the schema uses auto_id.
# Named `entities` rather than `data` so the documents loaded earlier under
# `data` are not overwritten (re-running `data[0].metadata` would break).
entities = [
    flattened_embeddings,  # vectors
    chunks,  # text chunks
]

# Insert into Milvus; the returned mutation result is shown as the cell output.
collection.insert(entities)

(insert count: 48, delete count: 0, upsert count: 0, timestamp: 451369721061703686, success count: 48, err count: 0, cost: 0)

In [128]:
# Index settings for the vector field: IVF_FLAT with L2 distance.
index_params = dict(
    index_type="IVF_FLAT",
    metric_type="L2",
    params=dict(nlist=128),
)

# Build the index on the `embedding` field; the returned status is displayed.
collection.create_index(field_name="embedding", index_params=index_params)

Status(code=0, message=)

In [129]:
# Flush first so the rows inserted earlier are written out.
collection.flush()

# Then load the collection so it can be searched by the cells that follow.
collection.load()

In [133]:
# `tokenizer`, `model` and `generate_embedding` already exist from the
# embedding cell earlier in the notebook, so they are reused here instead of
# loading the checkpoint a second time and redefining the function.

# Example query text
query_text = "where does Santosh Work?"

# Milvus takes each query vector as a flat list of floats. Flatten first
# because generate_embedding may return a (1, dim) array.
query_embedding = generate_embedding(query_text).flatten().tolist()

# Search parameters; the metric matches the "L2" metric the index was built with.
search_params = {
    "metric_type": "L2",
    "params": {"nprobe": 10},
}


In [137]:
results = collection.search(
    data=[query_embedding],  # one query vector
    anns_field="embedding",  # vector field to search
    param=search_params,
    limit=10,  # number of nearest entries to retrieve
    expr=None,  # no scalar filter
    # The stored vectors are never displayed, so they are not fetched.
    output_fields=["id", "text_chunk"],
)

# `results` holds one hit list per query vector; a single query was sent.
hits = results[0]
print(len(hits))
for hit in hits:
    print(f"ID: {hit.id}")
    print(f"Text Chunk: {hit.entity.get('text_chunk')}")
    print("="*50)

10
ID: 451358733081252905
Text Chunk: Santosh has played a crucial role in mentoring interns and collaborating with cross-functional teams to meet the Karnataka State Government Forest Department's requirements.
ID: 451358733081252886
Text Chunk: Santosh Kumar is a seasoned Software Engineer based in Bangalore, India, with a deep-seated passion for artificial intelligence and machine learning. With a robust portfolio that includes significant
ID: 451358733081252893
Text Chunk: Santosh is a prolific learner, constantly acquiring new skills to stay at the forefront of technological advancements. His technical toolkit includes TensorFlow, OpenCV, React.js, Node.js, and
ID: 451358733081252898
Text Chunk: Santosh currently serves as a Junior Software Engineer at Pacecom Technologies Pvt Ltd in Bangalore, India. In this role, he has led several image processing tasks on drone-captured imagery to
ID: 451358733081252907
Text Chunk: During the COVID-19 lockdown, Santosh ventured into freelancin

In [207]:
# Query used by the retrieval cell that follows (its own `query_text`
# assignment is commented out, so this value is the one that gets embedded).
query_text = "What works has he done?"


In [208]:
from pymilvus import connections, Collection
from transformers import AutoTokenizer, AutoModel
import torch

# Connect to Milvus unless a "default" connection already exists;
# Collection() needs an active connection.
if not connections.has_connection("default"):
    connections.connect(alias="default", host="127.0.0.1", port="19530")

# Open the existing collection and make sure it is loaded for searching.
collection = Collection(name="resume_collection")
collection.load()

# Load the model and tokenizer
tokenizer = AutoTokenizer.from_pretrained('sentence-transformers/all-MiniLM-L6-v2')
model = AutoModel.from_pretrained('sentence-transformers/all-MiniLM-L6-v2')

# Function to generate embeddings
def generate_embedding(text):
    """Embed ``text`` by mean-pooling the encoder's last hidden state.

    Uses the module-level ``tokenizer`` and ``model``.

    Args:
        text: Input passed straight to ``tokenizer`` (a string in this
            notebook).

    Returns:
        Flattened 1-D numpy.ndarray holding the pooled embedding.
    """
    inputs = tokenizer(text, return_tensors='pt', padding=True, truncation=True)
    # Inference only: no need to record an autograd graph.
    with torch.no_grad():
        token_states = model(**inputs).last_hidden_state
    # Weight every position by the attention mask so padded positions do not
    # dilute the mean; for an unpadded input this equals a plain mean.
    mask = inputs['attention_mask'].unsqueeze(-1).to(token_states.dtype)
    embeddings = (token_states * mask).sum(dim=1) / mask.sum(dim=1).clamp(min=1e-9)
    return embeddings.detach().numpy().flatten()

# `query_text` is taken from the preceding cell.

# Generate embedding for the query text
query_embedding = generate_embedding(query_text).tolist()

# Search parameters; the metric matches the "L2" metric the index was built with.
search_params = {
    "metric_type": "L2",
    "params": {"nprobe": 10},
}

results = collection.search(
    data=[query_embedding],  # one query vector
    anns_field="embedding",  # vector field to search
    param=search_params,
    limit=10,  # number of nearest entries to retrieve
    expr=None,  # no scalar filter
    # Only the text is used for the context, so the stored vectors are not fetched.
    output_fields=["text_chunk"],
)

# Combine the retrieved text chunks into a single context
# (results[0] is the hit list for the single query vector).
retrieved_text = "\n\n".join(hit.entity.get("text_chunk") for hit in results[0])

print("Retrieved Text Chunks for Context:")
print(retrieved_text)
print("="*50)


Retrieved Text Chunks for Context:
himself as a versatile and fast-learning professional. His journey is marked by a persistent drive to bridge the gap between innovative research and practical applications, a goal he pursues through

internships that laid a solid foundation for his career. He is also a published researcher, with his work on Image Steganography being recognized at an IEEE conference.

practical applications, a goal he pursues through a variety of collaborative projects and independent ventures.

CGPA of 7.92. His academic journey was not only a period of acquiring theoretical knowledge but also a time of engaging in practical projects and internships that laid a solid foundation for his

capable of answering queries about his projects and work experience. This application allows HR and recruiters to interactively explore his resume, enhancing user engagement and usability.

With a robust portfolio that includes significant contributions to various tech fields and a rem

In [209]:
# genai.configure(api_key='Enter you palm api key here')

In [210]:
import google.generativeai as palm

import os
from getpass import getpass

# Read the PaLM API key from the PALM_API_KEY environment variable, prompting
# for it as a fallback, so the secret is never stored in the notebook.
palm.configure(api_key=os.environ.get("PALM_API_KEY") or getpass("PaLM API key: "))

# Ask PaLM to answer a query from the retrieved context
def generate_palm_response(chunks, query):
    """Return PaLM's text completion for ``query`` given ``chunks`` as context.

    Args:
        chunks: Retrieved context, interpolated into the prompt as-is.
        query: The question to answer.

    Returns:
        The ``result`` attribute of the ``palm.generate_text`` response.
    """
    prompt = (
        f"Here are some relevant text chunks:\n\n{chunks}\n\n"
        f"Based on the information in these chunks, can you tell me:\n{query}\n\n"
        "Answer:"
    )
    generation_settings = {
        "model": 'models/text-bison-001',
        "candidate_count": 1,  # single candidate
        "temperature": 0.7,  # sampling temperature
        "max_output_tokens": 800,  # cap on response length
    }
    completion = palm.generate_text(prompt=prompt, **generation_settings)
    return completion.result

# Answer the current query from the retrieved context, then show both.
human_like_answer = generate_palm_response(chunks=retrieved_text, query=query_text)
print(query_text, "\n")
print("Human-like Answer:", human_like_answer, sep="\n")


What works has he done? 

Human-like Answer:
- developed a web application that is capable of answering queries about his projects and work experience. This application allows HR and recruiters to interactively explore his resume, enhancing user engagement and usability.
- developed three full-stack applications using React, Node.js, and MongoDB, which were hosted on AWS, improving user experience and functionality.
- trained AI/ML models to classify images and generate text.
- worked on a project to develop an image steganography technique that was recognized at an IEEE conference.


In [198]:
import streamlit as st
from pymilvus import connections, Collection
from transformers import AutoTokenizer, AutoModel
import torch
import google.generativeai as palm

import os

# Read the PaLM API key from the PALM_API_KEY environment variable so the
# secret is never committed with the app source.
PALM_API_KEY = os.environ.get("PALM_API_KEY", "")
if not PALM_API_KEY:
    # Fail early with a visible message instead of erroring on the first request.
    st.error("Set the PALM_API_KEY environment variable before starting the app.")
    st.stop()
palm.configure(api_key=PALM_API_KEY)

# Initialize the Milvus connection unless a "default" connection already
# exists; Collection() needs an active connection.
if not connections.has_connection("default"):
    connections.connect(alias="default", host="127.0.0.1", port="19530")

# Open the existing collection and make sure it is loaded for searching.
collection = Collection(name="resume_collection")
collection.load()

# Load the model and tokenizer
# NOTE(review): Streamlit re-executes the script on each interaction, so this
# load is repeated every time; consider caching it if the installed Streamlit
# version offers a resource cache.
tokenizer = AutoTokenizer.from_pretrained('sentence-transformers/all-MiniLM-L6-v2')
model = AutoModel.from_pretrained('sentence-transformers/all-MiniLM-L6-v2')

# Function to generate embeddings
def generate_embedding(text):
    """Embed ``text`` by mean-pooling the encoder's last hidden state.

    Uses the module-level ``tokenizer`` and ``model``.

    Args:
        text: Input passed straight to ``tokenizer`` (the user's query string
            in this app).

    Returns:
        Flattened 1-D numpy.ndarray holding the pooled embedding.
    """
    inputs = tokenizer(text, return_tensors='pt', padding=True, truncation=True)
    # Inference only: no need to record an autograd graph.
    with torch.no_grad():
        token_states = model(**inputs).last_hidden_state
    # Weight every position by the attention mask so padded positions do not
    # dilute the mean; for an unpadded input this equals a plain mean.
    mask = inputs['attention_mask'].unsqueeze(-1).to(token_states.dtype)
    embeddings = (token_states * mask).sum(dim=1) / mask.sum(dim=1).clamp(min=1e-9)
    return embeddings.detach().numpy().flatten()

# Ask PaLM to answer a query from the retrieved context
def generate_palm_response(chunks, query):
    """Return PaLM's text completion for ``query`` given ``chunks`` as context.

    Args:
        chunks: Retrieved context, interpolated into the prompt as-is.
        query: The user's question.

    Returns:
        The ``result`` attribute of the ``palm.generate_text`` response.
    """
    prompt = (
        f"Here are some relevant text chunks related to your query:\n\n{chunks}\n\n"
        f"Based on the information in these chunks, please answer the following query:\n{query}\n\n"
        "Answer:"
    )
    generation_settings = {
        "model": 'models/text-bison-001',
        "candidate_count": 1,  # single candidate
        "temperature": 0.7,  # sampling temperature
        "max_output_tokens": 800,  # cap on response length
    }
    completion = palm.generate_text(prompt=prompt, **generation_settings)
    return completion.result

# Streamlit app
st.title("PaLM and Milvus Integration Demo")

query_text = st.text_input("Enter your query text:", "")

if st.button("Search and Generate Response"):
    # Whitespace-only input is treated the same as no input.
    if query_text.strip():
        # Generate embedding for the query text
        query_embedding = generate_embedding(query_text).tolist()

        # Search parameters; the metric matches the "L2" metric used elsewhere
        # for this collection's index.
        search_params = {
            "metric_type": "L2",
            "params": {"nprobe": 10},
        }

        results = collection.search(
            data=[query_embedding],  # one query vector
            anns_field="embedding",  # vector field to search
            param=search_params,
            limit=10,  # number of nearest entries to retrieve
            expr=None,  # no scalar filter
            # Only the text is used for the context, so the stored vectors
            # are not fetched.
            output_fields=["text_chunk"],
        )

        # Combine the retrieved text chunks into a single context
        # (results[0] is the hit list for the single query vector).
        retrieved_text = "\n\n".join(
            hit.entity.get("text_chunk") for hit in results[0]
        )

        st.write("Retrieved Text Chunks for Context:")
        st.write(retrieved_text)

        # Generate the response using PaLM
        human_like_answer = generate_palm_response(retrieved_text, query_text)

        st.write("Human-like Answer:")
        st.write(human_like_answer)
    else:
        st.write("Please enter a query text.")


In [196]:
pip install ipywidgets


Note: you may need to restart the kernel to use updated packages.
