In [None]:
!pip install cohere hnswlib unstructured -q

In [None]:
!rm -r profiles profiles.zip

In [None]:
# Colab-only: google.colab is available inside a Colab runtime.
from google.colab import files

# Upload the profiles archive into the working directory; it is expected to be
# named profiles.zip, which is what the following unzip step looks for.
uploaded = files.upload()

Saving profiles.zip to profiles.zip


In [None]:
!unzip profiles.zip

Archive:  profiles.zip
   creating: profiles/
  inflating: profiles/arts1.json     
  inflating: profiles/arts2.json     
  inflating: profiles/arts3.json     
  inflating: profiles/arts4.json     
  inflating: profiles/arts5.json     
  inflating: profiles/backend1.json  
  inflating: profiles/backend2.json  
  inflating: profiles/backend3.json  
  inflating: profiles/backend4.json  
  inflating: profiles/backend5.json  
  inflating: profiles/business1.json  
  inflating: profiles/business2.json  
  inflating: profiles/business3.json  
  inflating: profiles/business4.json  
  inflating: profiles/business5.json  
  inflating: profiles/c_suite1.json  
  inflating: profiles/c_suite2.json  
  inflating: profiles/c_suite3.json  
  inflating: profiles/c_suite4.json  
  inflating: profiles/c_suite5.json  
  inflating: profiles/finance1.json  
  inflating: profiles/finance2.json  
  inflating: profiles/finance3.json  
  inflating: profiles/finance4.json  
  inflating: profiles/finance5.json  

In [None]:
import json
import os
import uuid
from pprint import pprint
from typing import Dict, List, Tuple

import cohere
import hnswlib
from unstructured.chunking.title import chunk_by_title
from unstructured.partition.html import partition_html

# The API key is never written into the notebook: take it from the environment,
# and fall back to a hidden interactive prompt when the variable is not set.
COHERE_API_KEY = os.environ.get("COHERE_API_KEY")
if not COHERE_API_KEY:
    from getpass import getpass
    COHERE_API_KEY = getpass("Cohere API key: ")

# Shared Cohere client used by every cell below.
co = cohere.Client(COHERE_API_KEY)

In [None]:
import os
import json

def read_json_file(file_path):
    """
    Load and return the parsed contents of one JSON file.

    Parameters:
    file_path: Path of the JSON file to read.

    Returns:
    The deserialized JSON value, exactly as produced by ``json.load``.
    """
    # Decode as UTF-8 explicitly: without it open() falls back to the platform
    # locale, which can mis-decode or reject non-ASCII characters in a profile.
    with open(file_path, 'r', encoding='utf-8') as file:
        return json.load(file)

def json_files_to_list_of_dicts(directory_path):
    """
    Load every ``.json`` file found directly inside a directory.

    Parameters:
    directory_path: Directory to scan (not recursive).

    Returns:
    A list with one parsed JSON value per ``.json`` file, ordered by filename.
    """
    # os.listdir() yields entries in arbitrary order. Positions in the returned
    # list are used as profile indexes later on, so sort to keep them stable
    # from one run (and one machine) to the next.
    json_names = sorted(
        name for name in os.listdir(directory_path) if name.endswith('.json')
    )
    return [
        read_json_file(os.path.join(directory_path, name)) for name in json_names
    ]

# Folder that holds the unpacked profile JSON files.
directory_path = 'profiles'
# One parsed dict per .json file; a profile's position in this list is what
# later cells use as its index.
profiles = json_files_to_list_of_dicts(directory_path)

# Sanity check: how many profiles were loaded.
print(len(profiles))

75


In [None]:
def preprocess(dict):
    """
    Flatten one profile dict into a single labelled, newline-separated string.

    Reads the 'experience', 'skills', 'education' and 'background' keys of the
    profile. Each experience entry contributes its position, duration, location
    and responsibilities; each education entry its degree, university, location
    and graduation year.

    Returns:
    str: The text with 'Background', 'Experiences', 'Education' and 'Skills'
    sections, in that order.
    """
    experience_blocks = []
    for job in dict["experience"]:
        # Responsibilities are string-concatenated, one per line.
        duties = "".join(duty + "\n" for duty in job["responsibilities"])
        experience_blocks.append(
            f"Position: {job['position']}\n"
            f"Duration: {job['duration']}\n"
            f"Location: {job['location']}\n"
            f"Responsibilities: {duties}\n"
        )
    experience_text = "".join(experience_blocks)

    skill_text = "".join(f"{skill}\n" for skill in dict["skills"])

    education_text = "".join(
        f"Degree: {school['degree']}\n"
        f"University: {school['university']}\n"
        f"Location: {school['location']}\n"
        f"Graduation Year: {school['graduation_year']}\n"
        for school in dict["education"]
    )

    return (
        f"Background: {dict['background']}\n"
        f"Experiences: {experience_text}\n"
        f"Education: {education_text}\n"
        f"Skills: {skill_text}\n"
    )

In [None]:
class Documents:
  """
  In-memory vector store over a list of profile dicts.

  On construction every source is flattened to text with `preprocess`, embedded
  through the Cohere client `co`, and added to an hnswlib index. `retrieve`
  then runs a nearest-neighbour search followed by a Cohere rerank.

  Attributes:
  sources: The profile dicts, in the order given.
  docs: One {'text', 'id'} dict per source; 'id' is the position in `sources`.
  docs_embs: One embedding per entry of `docs`, in the same order.
  retrieve_top_k: Number of candidates requested from the vector index.
  rerank_top_k: Number of candidates kept after reranking.
  """

  def __init__(self, sources: List[Dict[str, str]]):
    self.sources = sources
    self.docs = []
    self.docs_embs = []
    self.retrieve_top_k = 20
    self.rerank_top_k = 5
    self.load()
    self.embed()
    self.index()

  def load(self) -> None:
    """
    Converts every source into a {'text', 'id'} document.
    """
    for index, source in enumerate(self.sources):
      self.docs.append({'text': preprocess(source), 'id': index})

  def embed(self) -> None:
    """
    Embeds the documents using the Cohere API, in batches.
    """
    print("Embedding documents...")

    batch_size = 90
    self.docs_len = len(self.docs)

    for i in range(0, self.docs_len, batch_size):
      # Slicing past the end is safe, so no explicit upper bound is needed.
      batch = self.docs[i : i + batch_size]
      texts = [item['text'] for item in batch]
      docs_embs_batch = co.embed(
        texts=texts,
        model="embed-english-v3.0",
        input_type="search_document"
      ).embeddings
      self.docs_embs.extend(docs_embs_batch)

  def index(self) -> None:
    """
    Indexes the document embeddings for nearest-neighbour retrieval.
    """
    print("Indexing documents...")

    # NOTE(review): assigning to self.index replaces this bound method on the
    # instance, so index() can only run once per object. The attribute name is
    # kept because retrieve() (and possibly outside callers) read it.
    # dim must equal the length of each embedding vector; 1024 is assumed to be
    # what the embedding model above returns — confirm if the model changes.
    self.index = hnswlib.Index(space="ip", dim=1024)
    self.index.init_index(max_elements=self.docs_len, ef_construction=512, M=64)
    self.index.add_items(self.docs_embs, list(range(len(self.docs_embs))))

    print(f"Indexing complete with {self.index.get_current_count()} documents.")

  def retrieve(self, query: str) -> Tuple[List[Dict], List[Dict]]:
    """
    Retrieves the documents that best match the given query.

    Parameters:
    query (str): The query to retrieve documents for.

    Returns:
    Tuple[List[Dict], List[Dict]]: `(sources_retrieved, docs_retrieved)`, both
    ordered best match first. `docs_retrieved` holds {'text', 'id'} dicts and
    `sources_retrieved` the matching entries of `self.sources`.
    """
    # Never ask the index for more neighbours than there are documents: a
    # k-NN query that cannot be filled fails instead of returning fewer rows.
    k = min(self.retrieve_top_k, len(self.docs))
    if k == 0:
      return [], []

    query_emb = co.embed(
      texts=[query],
      model="embed-english-v3.0",
      input_type="search_query"
    ).embeddings

    # knn_query returns (labels, distances); [0][0] is the label row for the
    # single query. Convert the labels to plain ints for list indexing.
    labels = self.index.knn_query(query_emb, k=k)[0][0]
    doc_ids = [int(label) for label in labels]

    docs_to_rerank = [self.docs[doc_id] for doc_id in doc_ids]

    rerank_results = co.rerank(
      query=query,
      documents=docs_to_rerank,
      top_n=min(self.rerank_top_k, len(docs_to_rerank)),
      model="rerank-english-v2.0",
    )

    # Use the `results` list when the response exposes one; otherwise the
    # response object itself is iterated, as before.
    hits = getattr(rerank_results, "results", rerank_results)

    # hit.index is a position within docs_to_rerank; map it back to a doc id.
    doc_ids_reranked = [doc_ids[hit.index] for hit in hits]

    docs_retrieved = [
      {"text": self.docs[doc_id]['text'], "id": self.docs[doc_id]['id']}
      for doc_id in doc_ids_reranked
    ]
    sources_retrieved = [self.sources[doc['id']] for doc in docs_retrieved]

    return sources_retrieved, docs_retrieved

In [None]:
class SearchEngine:
  """
  Finds and explains co-founder matches for a profile held in a `Documents` store.

  Uses the module-level Cohere client `co` for summarization and chat, and
  `preprocess` to turn a profile dict into text.
  """

  # Keys that are not the person's name; whatever other key a profile carries
  # is treated as the one holding the name.
  _NON_NAME_KEYS = ('profile_url', 'background', 'experience', 'education', 'skills')

  def __init__(self, docs: "Documents"):
    # Keep the store that was passed in rather than reading a global.
    self.documents = docs
    self.conversation_id = str(uuid.uuid4())

  @staticmethod
  def _summary_text(response):
    """Returns the `summary` text of a summarize response (or the value itself if it has none)."""
    return getattr(response, "summary", response)

  def retrieve_and_summarize(self, profile_index: int):
    """
    Finds matches for one profile and asks the chat model to justify each.

    Parameters:
    profile_index (int): Position of the query profile in `self.documents.sources`.

    Returns:
    dict: {"matches": [...]}, one entry per match with the keys 'name',
    'title' (always empty) and 'summary_of_matches' (the chat model's text).
    The query profile itself is never listed as a match.
    """
    profile = self.documents.sources[profile_index]
    # Interpolate the summary text, not the response object, into later prompts.
    summarized_profile = self._summary_text(co.summarize(text=preprocess(profile)))

    sources, docs = self.retrieve_sources(summarized_profile)

    result = []
    for source, doc in zip(sources, docs):
      # The query profile is indexed too; do not match a person with themselves.
      if doc["id"] == profile_index:
        continue

      summarized_doc = self._summary_text(co.summarize(text=doc["text"]))
      response_why = co.chat(
          message=f"Can you explain why the following profile: {summarized_doc}, is a good match for this profile: {summarized_profile}, as co-founder?",
          model="command",
          temperature=0.9
      )

      name_key = [key for key in source.keys() if key not in self._NON_NAME_KEYS]
      print(f"Matched {len(result) + 1} People...")
      result.append({
          # Fall back to an empty name instead of raising when no such key exists.
          "name": source[name_key[0]] if name_key else "",
          "title": "",
          "summary_of_matches": response_why.text
      })

    return { "matches": result }

  def retrieve_sources(self, summarized_profile: str):
    """
    Retrieves candidate co-founders for a summarized profile.

    Returns:
    The `(sources, docs)` pair produced by `self.documents.retrieve`.
    """
    sources, docs = self.documents.retrieve(f"Find a co-founder with complementary skillset given the following profile?: {summarized_profile}")
    return sources, docs

In [None]:
# Build the store: constructing Documents flattens, embeds and indexes every
# loaded profile (the embedding step calls the Cohere API).
documents = Documents(profiles)

Embedding documents...
Indexing documents...
Indexing complete with 75 documents.


In [None]:
# Create the search engine on top of the indexed profiles.
search_engine = SearchEngine(documents)

In [None]:
# Find co-founder matches for the profile at index 2; this makes several Cohere
# API calls (summarize, embed, rerank, chat).
result = search_engine.retrieve_and_summarize(2)

sources: [{'startup_enthusiast': 'Sophia Patel', 'profile_url': 'https://www.linkedin.com/in/sophiapatel/', 'background': 'Sophia Patel is a forward-thinking entrepreneur who thrives on innovation and disruption. She has a history of founding and scaling startups in both the tech and healthcare sectors. Currently, Sophia is the Co-Founder and CEO of a healthtech startup focused on revolutionizing remote patient monitoring and healthcare access.', 'experience': [{'position': 'Co-Founder & CEO at HealthTech Innovations', 'duration': 'Jan 2017 - Present', 'location': 'Boston, MA, USA', 'responsibilities': ["Lead the company's mission to improve healthcare access", 'Developed telemedicine and remote monitoring solutions', 'Secured partnerships and funding for expansion']}, {'position': 'Co-Founder & COO at TechMed Solutions', 'duration': 'Jun 2012 - Dec 2016', 'location': 'San Diego, CA, USA', 'responsibilities': ['Launched and scaled a healthcare technology startup', 'Managed day-to-day o

In [None]:
# Display the matches dict ({'matches': [...]}) as the cell output.
result

{'matches': [{'name': 'Matthew Turner',
   'title': '',
   'summary_of_matches': "Based on the provided profiles, Matthew Turner and Emily Patel both appear to be well-qualified entrepreneurs with complementary skills and experience relevant to founding and leading a startup.\n\nMatthew Turner has extensive experience in founding and building e-commerce startups, specifically in the fashion industry. His expertise in brand development, creative direction, and strategic partnerships would be invaluable for any startup looking to establish a strong brand identity and differentiate itself in the competitive online retail space. Furthermore, his background in business administration demonstrates a strong understanding of business fundamentals, which is crucial for any entrepreneurial venture.\n\nEmily Patel, on the other hand, brings a diverse set of skills and experience to the table. With a background in fashion design and business administration, she has honed her abilities in startup l