#### Step-1: Exploring data

In [2]:
import pandas as pd
df = pd.read_csv('data/Resume/processed_resume.csv')
df.head()

Unnamed: 0,content,id,name,email,candidate_id,resume_url,hiring_manager
0,HR ADMINISTRATOR/MARKETING ASSOCIATE\...,1,name_1,emailid1@mail.com,1,https://www.resume.com/resume_1,Manager3
1,"HR SPECIALIST, US HR OPERATIONS ...",2,name_2,emailid2@mail.com,2,https://www.resume.com/resume_2,Manager3
2,HR DIRECTOR Summary Over 2...,3,name_3,emailid3@mail.com,3,https://www.resume.com/resume_3,Manager1
3,HR SPECIALIST Summary Dedica...,4,name_4,emailid4@mail.com,4,https://www.resume.com/resume_4,Manager4
4,HR MANAGER Skill Highlights ...,5,name_5,emailid5@mail.com,5,https://www.resume.com/resume_5,Manager5


In [3]:
from pprint import pprint
pprint(df.iloc[0]['content'])

('         HR ADMINISTRATOR/MARKETING ASSOCIATE\n'
 '\n'
 'HR ADMINISTRATOR       Summary     Dedicated Customer Service Manager with '
 '15+ years of experience in Hospitality and Customer Service Management.   '
 'Respected builder and leader of customer-focused teams; strives to instill a '
 'shared, enthusiastic commitment to customer service.         '
 'Highlights         Focused on customer satisfaction  Team management  '
 'Marketing savvy  Conflict resolution techniques     Training and '
 'development  Skilled multi-tasker  Client relations specialist           '
 'Accomplishments      Missouri DOT Supervisor Training Certification  '
 'Certified by IHG in Customer Loyalty and Marketing by Segment   Hilton '
 'Worldwide General Manager Training Certification  Accomplished Trainer for '
 'cross server hospitality systems such as    Hilton OnQ  ,   Micros    Opera '
 'PMS   , Fidelio    OPERA    Reservation System (ORS) ,   Holidex    '
 'Completed courses and seminars in custo

#### Step-2: Setup weaviate

1. Create a weaviate cluster 
2. Generate API key
3. Pip install python client

In [4]:
!pip install weaviate-client



#### Step-3: Let us define Resume Schema

Our Resume class will have 2 properties. Let us also rename the columns
- application_id -> associated with the resume
- chunk_text -> content


In [4]:
df = df.rename(columns={'id': 'application_id'})

In [5]:
import os
import weaviate
from dotenv import load_dotenv

load_dotenv()

True

In [6]:
# Setting up client object
# Note, we are using OpenAI API to generate the vector embeddings. You can use any other vector embedding as well.

client = weaviate.Client(
    url=os.environ["WEAVIATE_URL"],
    auth_client_secret=weaviate.AuthApiKey(api_key=os.environ['WEAVIATE_APIKEY']),
    additional_headers={
        "X-OpenAI-Api-Key": os.environ['OPENAI_APIKEY']
    }
    )

In [7]:
# Test connection
print(client.is_live())
print(client.is_ready())

True
True


In [17]:
class_obj = {
    "class": "Resume",
    "description": "Resume of candidates applying for a job",
    "vectorIndexType": "hnsw",
    "vectorIndexConfig": {
        "distance": "cosine", # Similarity metric
        "efConstruction": 128, # Default value - trades off index build speed against index search speed
        "maxConnections": 64 # Default value - max number of connections supported
    },
    "vectorizer": "text2vec-openai",
    "properties": [
        {
            "dataType": [
                "string"
            ],
            "description": "ID of the application",
            "name": "application_id",
            "indexInverted": True,
            "moduleConfig": {
                "text2vec-openai": {
                  "skip": True
                }
              }
        },
        {
            "dataType": [
                "text"
            ],
            "description": "Content of the resume",
            "name": "content",
            "indexInverted": True,
            "moduleConfig": {
                "text2vec-openai": {
                  "skip": False,
                  "vectorizePropertyName": True
                }
              }
        }
    ]
}

In [18]:
# # If you need to delete existing class
client.schema.delete_class("Resume")

In [19]:
client.schema.create_class(class_obj)

In [12]:
client.schema.get()

{'classes': [{'class': 'Resume',
   'description': 'Resume of candidates applying for a job',
   'invertedIndexConfig': {'bm25': {'b': 0.75, 'k1': 1.2},
    'cleanupIntervalSeconds': 60,
    'stopwords': {'additions': None, 'preset': 'en', 'removals': None}},
   'moduleConfig': {'text2vec-openai': {'model': 'ada',
     'modelVersion': '002',
     'type': 'text',
     'vectorizeClassName': True}},
   'properties': [{'dataType': ['text'],
     'description': 'ID of the application',
     'indexFilterable': True,
     'indexSearchable': True,
     'moduleConfig': {'text2vec-openai': {'skip': True,
       'vectorizePropertyName': False}},
     'name': 'application_id',
     'tokenization': 'whitespace'},
    {'dataType': ['text'],
     'description': 'Content of the resume',
     'indexFilterable': True,
     'indexSearchable': True,
     'moduleConfig': {'text2vec-openai': {'skip': False,
       'vectorizePropertyName': True}},
     'name': 'content',
     'tokenization': 'word'}],
   're

In [13]:
result = (
    client.query.get(
    "Resume", 
    [
        "application_id", 
        "content"
    ]
    )
    .with_limit(1)
    .do()
)
print(result)

{'data': {'Get': {'Resume': []}}}


#### Step-4 Load data into weaviate db

In [14]:
client.batch.shape

(0, 0)

In [15]:
df.head(1)

Unnamed: 0,content,application_id,name,email,candidate_id,resume_url,hiring_manager
0,HR ADMINISTRATOR/MARKETING ASSOCIATE\...,1,name_1,emailid1@mail.com,1,https://www.resume.com/resume_1,Manager3


In [20]:
df.dtypes

content           object
application_id     int64
name              object
email             object
candidate_id       int64
resume_url        object
hiring_manager    object
dtype: object

In [21]:
# While loading we also need to chunk it into smaller pieces
# The right chunk size needs to be defined after experimentation

from langchain.text_splitter import RecursiveCharacterTextSplitter

df['application_id'] = df['application_id'].astype(str)
bulk_objects = df[["application_id", "content"]].T.to_dict().values()
with client.batch as batch:
    for obj in bulk_objects:
        batch.add_data_object(obj, class_name="Resume")
        print(client.batch.shape)
        # Chunking the text - uncomment if interested in chunking
        # Update the schema name to ResumeChunks and make related changes
        # text = obj["content"]
        # splitter = RecursiveCharacterTextSplitter(chunk_size=5000, chunk_overlap=500, separators=["."])
        # chunks = splitter.split_text(text)
        # for chunk in chunks:
        #     obj = {
        #         "application_id": obj["application_id"],
        #         "chunk_content": chunk
        #     }
        #     batch.add_data_object(obj, class_name="ResumeChunks")

(1, 0)
(2, 0)
(3, 0)
(4, 0)
(5, 0)
(6, 0)
(7, 0)
(8, 0)
(9, 0)
(10, 0)
(11, 0)
(12, 0)
(13, 0)
(14, 0)
(15, 0)
(16, 0)
(17, 0)
(18, 0)
(19, 0)
(20, 0)
(21, 0)
(22, 0)
(23, 0)
(24, 0)
(25, 0)
(26, 0)
(27, 0)
(28, 0)
(29, 0)
(30, 0)
(31, 0)
(32, 0)
(33, 0)
(34, 0)
(35, 0)
(36, 0)
(37, 0)
(38, 0)
(39, 0)
(40, 0)
(41, 0)
(42, 0)
(43, 0)
(44, 0)
(45, 0)
(46, 0)
(47, 0)
(48, 0)
(49, 0)
(50, 0)
(51, 0)
(52, 0)
(53, 0)
(54, 0)
(55, 0)
(56, 0)
(57, 0)
(58, 0)
(59, 0)
(60, 0)
(61, 0)
(62, 0)
(63, 0)
(64, 0)
(65, 0)
(66, 0)
(67, 0)
(68, 0)
(69, 0)
(70, 0)
(71, 0)
(72, 0)
(73, 0)
(74, 0)
(75, 0)
(76, 0)
(77, 0)
(78, 0)
(79, 0)
(80, 0)
(81, 0)
(82, 0)
(83, 0)
(84, 0)
(85, 0)
(86, 0)
(87, 0)
(88, 0)
(89, 0)
(90, 0)
(91, 0)
(92, 0)
(93, 0)
(94, 0)
(95, 0)
(96, 0)
(97, 0)
(98, 0)
(99, 0)
(100, 0)
(101, 0)
(102, 0)
(103, 0)
(104, 0)
(105, 0)
(106, 0)
(107, 0)
(108, 0)
(109, 0)
(110, 0)
(111, 0)
(112, 0)
(113, 0)
(114, 0)
(115, 0)
(116, 0)
(117, 0)
(118, 0)
(119, 0)
(120, 0)
(121, 0)
(122, 0)
(123, 0)
(



In [40]:
# Let us run the same code to see if we have any entry in the database

result = (
    client.query.get(
    "Resume", 
    [
        "application_id", 
        "content"
    ]
    )
    .with_limit(1)
    .do()
)
print(result)

{'data': {'Get': {'Resume': [{'application_id': 995, 'content': "         SALES           Summary     Account Manager focused on maximizing sales by managing all accounts systematically and logically. Believes consistency and dedication build the most successful business partnerships.\xa0  \xa0Excel\xa0 at building a loyal customer base to achieve both short and long-term organization sales goals.        Highlights          Superior communication skills  Cheerful and energetic  Effective team player       Staff training and development  Resolution-oriented  Dependable and reliable            Accomplishments     Responsible for training new employees in all aspects of distribution and sales.   Ensured one hundred percent compliance with all company rules and regulations.   Established new accounts and client interaction.   Monitored sales goals and expectations for multiple brands.   Improved product placement at both on premise and off premise accounts.   Managed quality assurance for 

In [30]:
# Let us now check if all of the records are loaded

print("Size of our dataframe: ", df.shape[0])
result = (
    client.query
        .aggregate("Resume")
        .with_meta_count()
        .do()

)
print("Number of rows in our db", result["data"]["Aggregate"]["Resume"])

Size of our dataframe:  2484
Number of rows in our db [{'meta': {'count': 2484}}]
