In [1]:
import vertexai
import numpy as np
import pandas as pd
from google.cloud import storage, aiplatform
from vertexai.language_models import TextEmbeddingInput, TextEmbeddingModel
from vertexai.generative_models import GenerativeModel
import vertexai.generative_models as generative_models

In [None]:
# Notebook-wide configuration: GCP project and the source CSV on Cloud Storage.
project = "vtxdemos"
dataset = "gs://vtxdemos-vsearch-datasets/stgwell_data/employees_v3.csv"

# Initialise the Vertex AI SDK *before* instantiating any model so that both
# models below are created against the configured project/region rather than
# whatever ambient defaults happen to be present in the environment.
vertexai.init(project=project, location="us-central1")

storage_client = storage.Client(project=project)
# Embedding model used to vectorise the generated paragraphs.
emb_model = TextEmbeddingModel.from_pretrained("text-embedding-004")
# Generative model used to turn structured rows into prose.
model = GenerativeModel("gemini-1.5-pro-001")

In [3]:
# Sampling parameters passed to every `generate_content` call in this notebook.
generation_config = dict(
    max_output_tokens=8192,
    temperature=1,
    top_p=0.95,
)

# Every harm category listed here is mapped to the same BLOCK_NONE threshold.
safety_settings = dict.fromkeys(
    (
        generative_models.HarmCategory.HARM_CATEGORY_HATE_SPEECH,
        generative_models.HarmCategory.HARM_CATEGORY_DANGEROUS_CONTENT,
        generative_models.HarmCategory.HARM_CATEGORY_SEXUALLY_EXPLICIT,
        generative_models.HarmCategory.HARM_CATEGORY_HARASSMENT,
    ),
    generative_models.HarmBlockThreshold.BLOCK_NONE,
)

In [4]:
# Load the employee CSV referenced by `dataset` into a DataFrame.
df = pd.read_csv(filepath_or_buffer=dataset)
# Print the column / non-null / dtype summary for a quick sanity check.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 12982 entries, 0 to 12981
Data columns (total 35 columns):
 #   Column                 Non-Null Count  Dtype  
---  ------                 --------------  -----  
 0   id                     12982 non-null  object 
 1   email                  12981 non-null  object 
 2   name                   12982 non-null  object 
 3   first_name             9453 non-null   object 
 4   last_name              9453 non-null   object 
 5   job_title              12982 non-null  object 
 6   department             12103 non-null  object 
 7   company                12982 non-null  object 
 8   location               11019 non-null  object 
 9   office_address_1       12982 non-null  object 
 10  office_address_2       8731 non-null   object 
 11  city_state_zip         10602 non-null  object 
 12  created_at             12982 non-null  object 
 13  updated_at             12982 non-null  object 
 14  profile_picture        152 non-null    object 
 15  pr

In [5]:
def llm_preprocess(context: "str | dict") -> str:
  """Ask the generative model to rewrite structured data as one paragraph.

  Args:
    context: The structured record to describe. It is interpolated into the
      prompt via its string form, so a plain string or a dict both work.

  Returns:
    The text of the model response, or the sentinel string
    "non response from llm" when the response carries no readable text.

  Raises:
    Whatever `model.generate_content` raises; only the extraction of
    `response.text` is guarded.
  """
  prompt = f"""Given the following structured data, generate a detailed and coherent paragraph that includes all the provided information. Ensure the paragraph is well-organized and flows naturally, highlighting key details such as names, dates, locations, events, descriptions, education, experience, skills and any other relevant information.
  {context}
  """
  response = model.generate_content(
      [prompt],
      generation_config=generation_config,
      safety_settings=safety_settings,
  )

  # Reading `.text` can fail when the response has no usable candidate.
  # Catch `Exception` rather than using a bare `except:` so that
  # KeyboardInterrupt / SystemExit still stop a long-running loop.
  try:
    return response.text
  except Exception:
    return "non response from llm"

In [None]:
# Columns handed to the LLM for each row, in the order they appear in the prompt.
profile_fields = [
    "id",
    "email",
    "first_name",
    "last_name",
    "name",
    "job_title",
    "department",
    "company",
    "location",
    "office_address_1",
    "office_address_2",
    "city_state_zip",
    "phone_country_code",
    "linkedin_experiences",
    "linkedin_education",
    "linkedin_skills",
    "linkedin_interests",
    "linkedin_languages",
    "created_at",
    "updated_at",
]

# Accumulators, reset on every run of this cell so re-running does not append
# to results from a previous execution.
embeddings_list = []
text_for_embeddings = []

# Drop columns that are empty for every row, then replace the remaining
# missing values with an explicit placeholder string for the prompt.
df = df.dropna(axis=1, how='all').fillna("none information")
for index, row in df.iterrows():
  text = {field: row[field] for field in profile_fields}
  r = llm_preprocess(text)
  # Store the generated paragraph (the exact text that gets embedded), not the
  # raw input dict, so the saved text matches the saved vector.
  text_for_embeddings.append(r)
  embeddings_list.append(emb_model.get_embeddings([TextEmbeddingInput(r, "SEMANTIC_SIMILARITY")])[0].values)
  print(r)

In [None]:
# Attach the per-row results collected by the embedding loop as new columns.
for column, values in (
    ("embedding", embeddings_list),
    ("gem_text", text_for_embeddings),
):
  df[column] = values

In [None]:
# Persist the enriched frame to a CSV file in the current working directory.
df.to_csv(path_or_buf="embeddings.csv")