### Importing Libraries and Loading Keys

In [10]:
from azure.core.credentials import AzureKeyCredential
from azure.search.documents import SearchClient
from azure.search.documents.indexes import SearchIndexClient
from azure.search.documents.models import VectorizedQuery
from azure.search.documents.indexes.models import (
    SearchIndex,
    SearchField,
    SearchFieldDataType,
    SimpleField,
    SearchableField,
    VectorSearch,
    VectorSearchProfile,
    HnswAlgorithmConfiguration,
)

from dotenv import dotenv_values
from openai import OpenAI

import csv

In [11]:
# Load every setting from the dotenv file (path is relative to the working directory).
env_name = ".env"
config = dotenv_values(env_name)

# Azure AI Search details: service endpoint, index name and API key.
service_endpoint, index_name, search_key = (
    config[setting]
    for setting in ("search_endpoint", "index_name", "search_api_key")
)

# OpenAI API key (read by get_embedding when it builds its client).
openapi_key = config["openapi_key"]


### Prepare Data Source

In [12]:
# Converts the csv file into "startups.txt": a Python list literal with one dict
# per startup, shaped like the documents that are added to the index.
#
# "DescriptionVector" is written as a get_embedding(...) *call*, not as a vector,
# so no embedding request is made here; the text has to be evaluated as Python
# later to obtain real documents.

startupscsv = config["startupscsv"]
# newline="" lets the csv module do its own newline handling, which it needs for
# quoted fields that contain line breaks.
with open(startupscsv, "r", newline="") as csvFile, open("startups.txt", "w") as txtFile:
    header = ("Name", "StartupName", "Location", "StartupStage", "Industry")
    reader = csv.DictReader(csvFile, header)
    next(reader, None)  # skip the first csv row; column names come from `header`
    txtFile.write("[" + "\n")

    # Number the records themselves instead of using reader.line_num, which counts
    # physical lines and drifts when rows are blank or span several lines.
    for startup_id, row in enumerate(reader, start=1):
        # Written as a string because the index declares StartupID as a String key
        # (the hard-coded sample documents use "1", "2", ... as well).
        newDict = {"StartupID": str(startup_id)}
        newDict.update(row)

        newDict_location = newDict["Location"]
        newDict_seedstage = newDict["StartupStage"]
        newDict_industry = newDict["Industry"]
        newDict_descvector = f"A startup in {newDict_location} in {newDict_seedstage} Stage focusing on {newDict_industry}"

        # repr() every key, value and the description so quotes or backslashes in
        # the csv data cannot break the generated Python text.
        fields = ", ".join(f"{key!r}: {value!r}" for key, value in newDict.items())
        startup = f"{{{fields}, 'DescriptionVector': get_embedding({newDict_descvector!r})}}"
        txtFile.write(startup + ',\n')

    txtFile.write("]")


In [13]:
# Loads the generated "startups.txt" as raw text.
#
# NOTE: startups_data is a *string* of Python source (a list literal whose
# DescriptionVector entries are get_embedding(...) calls), not a list of
# document dicts, so it is not directly usable as the `documents` argument of
# an upload. To use the csv data, either paste the text in place of the list
# inside get_startups(), or evaluate it once get_embedding is defined (only do
# that with a file you generated yourself, since evaluating runs its contents).
with open("./startups.txt", "r") as file:
    startups_data = file.read()


### Functions

In [14]:
def get_embedding(text, model="text-embedding-ada-002"):
    """Return the OpenAI embedding vector for ``text``.

    Args:
        text: String to embed; newline characters are replaced by spaces first.
        model: Name of the OpenAI embedding model to query.

    Returns:
        The ``embedding`` attribute of the first item in the API response.
    """
    # A fresh client is built on every call from the notebook-level ``openapi_key``.
    openai_client = OpenAI(api_key=openapi_key)

    single_line = text.replace("\n", " ")
    response = openai_client.embeddings.create(input=[single_line], model=model)
    return response.data[0].embedding

In [15]:
def get_startups_index(name: str):
    """Build the search index definition for the startup documents.

    Args:
        name: Name given to the returned index.

    Returns:
        A ``SearchIndex`` holding a string key field, five searchable text
        fields and one vector field bound to an HNSW vector-search profile.
    """
    vector_profile = "slt-vector-config"
    hnsw_config = "slt-algorithms-config"
    string_type = SearchFieldDataType.String

    # Document key.
    key_field = SimpleField(name="StartupID", type=string_type, key=True)

    # Plain full-text fields.
    text_fields = [
        SearchableField(name=field_name, type=string_type)
        for field_name in ("Name", "StartupName")
    ]

    # Full-text fields that are additionally sortable, filterable and facetable.
    # NOTE(review): confirm that the installed SDK version actually honours the
    # ``normalizer`` keyword on SearchableField.
    facet_fields = [
        SearchableField(
            name=field_name,
            type=string_type,
            sortable=True,
            filterable=True,
            facetable=True,
            normalizer='lowercase',
        )
        for field_name in ("Location", "StartupStage", "Industry")
    ]

    # Vector field; 1536 is presumably the output size of the embedding model
    # used by get_embedding -- confirm if the model is changed.
    embedding_field = SearchField(
        name="DescriptionVector",
        type=SearchFieldDataType.Collection(SearchFieldDataType.Single),
        searchable=True,
        vector_search_dimensions=1536,
        vector_search_profile_name=vector_profile,
    )

    # The profile referenced by the vector field points at the HNSW algorithm config.
    vector_config = VectorSearch(
        profiles=[
            VectorSearchProfile(
                name=vector_profile,
                algorithm_configuration_name=hnsw_config,
            )
        ],
        algorithms=[HnswAlgorithmConfiguration(name=hnsw_config)],
    )

    return SearchIndex(
        name=name,
        fields=[key_field, *text_fields, *facet_fields, embedding_field],
        vector_search=vector_config,
    )

In [16]:
def get_startups():
    """Return the ten built-in sample startup documents, each with its embedding.

    Every document has the keys ``StartupID``, ``Name``, ``StartupName``,
    ``Location``, ``StartupStage``, ``Industry`` and ``DescriptionVector``.
    The vector is produced by ``get_embedding`` from a sentence built out of
    the location, stage and industry, so this function calls ``get_embedding``
    once per startup.
    """
    # (name, location, stage, industry), listed in StartupID order starting at 1.
    sample_rows = (
        ("Aliah", "Singapore", "Seed", "Material Technology"),
        ("Bailey", "Ireland", "Seed", "Material Technology"),
        ("Clara", "South Korea", "Pre-A", "Women issues, Femtech"),
        ("Diane", "Africa", "Seed", "Logistic technology"),
        ("Eve", "Turkey", "Pre-A", "Sustainable technology"),
        ("Farida", "Egypt", "Seed", "Healthcare Technology"),
        ("Greta", "United Kingdom", "Pre-A", "Material Technology, Deeptech"),
        ("Hailey", "Thailand", "Seed", "Human Resource Technology"),
        ("Irina", "Australia", "Seed", "Education Technology"),
        ("Jina", "India", "Seed", "Medical Technology"),
    )

    startups = []
    for position, (person, location, stage, industry) in enumerate(sample_rows, start=1):
        startups.append(
            {
                "StartupID": str(position),
                "Name": person,
                "StartupName": f"Startup {position}",
                "Location": location,
                "StartupStage": stage,
                "Industry": industry,
                "DescriptionVector": get_embedding(
                    f"A startup in {location} in {stage} Stage focusing on {industry}"
                ),
            }
        )
    return startups

In [31]:
def vector_search(query, k=3):
    """Run a vector similarity search on the startup index and print each hit.

    Args:
        query: Free-text description; it is embedded with ``get_embedding``
            and compared against the ``DescriptionVector`` field.
        k: Number of nearest neighbours to request. Defaults to 3, the value
            that used to be hard-coded.

    Returns:
        None. Every result is printed as it is read.
    """
    vector_query = VectorizedQuery(
        vector=get_embedding(query),
        k_nearest_neighbors=k,
        fields="DescriptionVector",
    )

    # Use the client as a context manager so its transport is closed afterwards;
    # the results are consumed inside the block, while the client is still open.
    with SearchClient(service_endpoint, index_name, AzureKeyCredential(search_key)) as search_client:
        results = search_client.search(
            vector_queries=[vector_query],
            select=["Name", "StartupName", "Location", "StartupStage", "Industry"],
        )
        for result in results:
            print(result)
    

In [18]:
if __name__ == "__main__":
    # Driver cell: builds the service clients and lists the individual steps
    # (create index, upload documents, search). The steps are left commented
    # out; uncomment the one you need before running the cell.
    credential = AzureKeyCredential(search_key)
    # Client for index-level operations (used by the create/get index steps below).
    index_client = SearchIndexClient(service_endpoint, credential)
    
    # # Create index
    # index = get_startups_index(index_name)
    # index_client.create_index(index)

    # # Get index
    # index = index_client.get_index(index_name)

    # Client for document-level operations on `index_name` (used by the upload step below).
    client = SearchClient(service_endpoint, index_name, credential)
    
    # # Upload Documents
    
    # startups = get_startups() # Replace this to upload documents other than the built-in samples
    # client.upload_documents(documents=startups)

    # # Search the index
    # vector_search("A startup in Global in Seed to Series B Stage focusing on Deeptech or Medtech")
    # vector_search("A startup in South Asia in Seed to Series B Stage focusing on any technology")
